Turning textual code written in Inform 6 syntax into a linked list of tokens.
§1. The following code was sketched out on a long night flight to Hong Kong, but
there is otherwise nothing exotic about it. In as simple a way as possible, we
take a text, `from`, and break it into Inform 6 tokens. What we return is not
literally a linked list, but it amounts to the same thing: a single node
holding an unstructured run of tokens —
EXPRESSION_ISNT
T1
T2
T3
...
We follow the syntax of Inform 6, except that we have to look for three extra
syntaxes: {-braced-commands}, (+ Inform 7 interpolation +), and, if the
abbreviated syntax is allowed, also some cryptic notations such as *1.
The following scanner is basically a finite state machine, and these are the states:
enumerate NO_TOKSTATE 1 /* the initial state, returned to whenever raw material has been absorbed */
enumerate COMMENT_TOKSTATE /* currently scanning... an I6 comment ! ... */
enumerate DQUOTED_TOKSTATE /* ...double-quoted text */
enumerate SQUOTED_TOKSTATE /* ...single-quoted text */
enumerate WHITE_TOKSTATE /* ...whitespace */
enumerate TOK_TOKSTATE /* ...an actual token */
/* Tokenise the text |from|, beginning at character position |pos|, adding each
   token found to the schema |sch|. When |abbreviated| is |TRUE|, asterisk
   notations are looked for; when it is |FALSE|, braced commands and |(+ ... +)|
   interpolations are looked for instead. Characters are accumulated in
   |current_raw| and converted to tokens whenever the state of the scanner
   changes; |line_offset| counts the newlines passed so far and is handed to
   each token created. |no_quoted_inames| is not read by the code on this line,
   and |quoted_inames| is read only by the paragraph which breaks off a token. */
void Tokenisation::go(inter_schema *sch, text_stream *from, int pos, int abbreviated, int no_quoted_inames, void **quoted_inames) { inter_schema_token *preceding_token = NULL; int definition_length = Str::len(from); int line_offset = 0; text_stream *current_raw = Str::new(); int tokeniser_state = NO_TOKSTATE; for (; pos<definition_length; pos++) { inchar32_t c = Str::get_at(from, pos); if (Characters::is_whitespace(c)) { if (c == '\n') line_offset++; if ((tokeniser_state == TOK_TOKSTATE) || (tokeniser_state == NO_TOKSTATE)) { Absorb raw material, if any1.2; tokeniser_state = WHITE_TOKSTATE; PUT_TO(current_raw, ' '); } } else { if (tokeniser_state == WHITE_TOKSTATE) { Absorb raw material, if any1.2; tokeniser_state = NO_TOKSTATE; } } switch (tokeniser_state) { case DQUOTED_TOKSTATE: if (c == '"') { Absorb raw material, for sure1.3; tokeniser_state = NO_TOKSTATE; } else { PUT_TO(current_raw, c); } break; case SQUOTED_TOKSTATE: { int ends_here = FALSE; if (c == '\'') { ends_here = TRUE; if ((Str::len(current_raw) == 0) && (Str::get_at(from, pos+1) == '\'')) ends_here = FALSE; } if (ends_here) { Absorb raw material, for sure1.3; tokeniser_state = NO_TOKSTATE; } else { PUT_TO(current_raw, c); } break; } case COMMENT_TOKSTATE: if (c == '\n') tokeniser_state = NO_TOKSTATE; break; case WHITE_TOKSTATE: break; default: if (c == '!') { Absorb raw material, if any1.2; tokeniser_state = COMMENT_TOKSTATE; break; } if (c == '"') { Absorb raw material, if any1.2; tokeniser_state = DQUOTED_TOKSTATE; break; } if (c == '\'') { Absorb raw material, if any1.2; tokeniser_state = SQUOTED_TOKSTATE; break; } if ((c == '{') && (abbreviated == FALSE)) Look for a possible bracing1.5 else if ((c == '*') && (abbreviated == TRUE)) Look for a possible abbreviated command1.6 else if ((c == '(') && (Str::get_at(from, pos+1) == '+') && (abbreviated == FALSE)) Look for a possible Inform 7 fragment1.4 else Absorb a raw character1.1; break; } } Absorb raw material, if any1.2; }
§1.1. Absorb a raw character1.1 =
/* Append |c| to the pending raw text, and note that we are now inside a token. */
PUT_TO(current_raw, c);
tokeniser_state = TOK_TOKSTATE;
§1.2. Absorb raw material, if any1.2 =
/* Turn any pending raw text into tokens; whether or not there was any, the
   scanner then returns to its neutral state. */
if (Str::len(current_raw) > 0) {
	Absorb raw material, for sure1.3;
}
tokeniser_state = NO_TOKSTATE;
§1.3. Absorb raw material, for sure1.3 =
switch (tokeniser_state) { case WHITE_TOKSTATE: InterSchemas::add_token(sch, InterSchemas::new_token(WHITE_SPACE_ISTT, I" ", 0, 0, -1, line_offset)); break; case DQUOTED_TOKSTATE: Tokenisation::de_escape_text(current_raw); InterSchemas::add_token(sch, InterSchemas::new_token(DQUOTED_ISTT, current_raw, 0, 0, -1, line_offset)); break; case SQUOTED_TOKSTATE: Tokenisation::de_escape_sq_text(current_raw); InterSchemas::add_token(sch, InterSchemas::new_token(SQUOTED_ISTT, current_raw, 0, 0, -1, line_offset)); break; default: Look for individual tokens1.3.1; break; } Str::clear(current_raw); tokeniser_state = NO_TOKSTATE;
Look for a possible Inform 7 fragment1.4 =
int save_pos = pos, accept = FALSE; TEMPORARY_TEXT(source_text_fragment) pos += 2; while (Str::get_at(from, pos)) { if ((Str::get_at(from, pos-1) == '+') && (Str::get_at(from, pos) == ')')) { Str::delete_last_character(source_text_fragment); accept = TRUE; break; } PUT_TO(source_text_fragment, Str::get_at(from, pos++)); } if (accept) { Absorb raw material, if any1.2; Expand a fragment of Inform 7 text1.4.1; } else { inchar32_t c = '('; Absorb a raw character1.1; pos = save_pos; } DISCARD_TEXT(source_text_fragment)
- This code is used in §1.
Expand a fragment of Inform 7 text1.4.1 =
/* An empty fragment produces no token at all. */
if (Str::len(source_text_fragment) > 0) {
	inter_schema_token *fragment_token =
		InterSchemas::new_token(I7_ISTT, source_text_fragment, 0, 0, -1, line_offset);
	InterSchemas::add_token(sch, fragment_token);
}
- This code is used in §1.4.
§1.5. Material in braces sometimes indicates an inline command, but not always, because braces often occur innocently in I6 code. So we require the first character after the open-brace not to be white-space, and also not to be a pipe (though I've forgotten why). The text inside the braces is called a "bracing".
Look for a possible bracing1.5 =
int save_pos = pos++, accept = FALSE; TEMPORARY_TEXT(bracing) while (TRUE) { inchar32_t c = Str::get_at(from, pos); if (c == 0) break; if (c == '}') { accept = TRUE; break; } PUT_TO(bracing, c); pos++; } inchar32_t first = Str::get_first_char(bracing); if ((accept) && (first != ' ') && (first != '\t') && (first != '\n') && (first != '|')) { Absorb raw material, if any1.2; Parse a bracing into an inline command1.5.1; } else { inchar32_t c = '{'; Absorb a raw character1.1; pos = save_pos; } DISCARD_TEXT(bracing)
- This code is used in §1.
Parse a bracing into an inline command1.5.1 =
/* Create an |INLINE_ISTT| token for this bracing. Once the bracing has been
   decomposed, a non-empty command name is compared against the known inline
   commands to set |inline_command| (which is |unknown_ISINC| if nothing
   matches); for |primitive-definition| and |list-together| the operand is
   compared too, to set |inline_subcommand| (which is |no_ISINSC| if nothing
   matches). If the command name is empty, neither field is assigned here.
   The token is then added to the schema and becomes the |preceding_token|. */
inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, bracing, 0, 0, -1, line_offset); t->bracing = Str::duplicate(bracing); t->command = Str::new(); t->operand = Str::new(); t->operand2 = Str::new(); Decompose the bracing1.5.1.1; if (Str::len(t->command) > 0) { int c = unknown_ISINC, sc = no_ISINSC; if (Str::eq_wide_string(t->command, U"primitive-definition")) { c = primitive_definition_ISINC; if (Str::eq_wide_string(t->operand, U"repeat-through")) { sc = repeat_through_ISINSC; } else if (Str::eq_wide_string(t->operand, U"repeat-through-list")) { sc = repeat_through_list_ISINSC; } else if (Str::eq_wide_string(t->operand, U"number-of")) { sc = number_of_ISINSC; } else if (Str::eq_wide_string(t->operand, U"random-of")) { sc = random_of_ISINSC; } else if (Str::eq_wide_string(t->operand, U"total-of")) { sc = total_of_ISINSC; } else if (Str::eq_wide_string(t->operand, U"extremal")) { sc = extremal_ISINSC; } else if (Str::eq_wide_string(t->operand, U"function-application")) { sc = function_application_ISINSC; } else if (Str::eq_wide_string(t->operand, U"description-application")) { sc = description_application_ISINSC; } else if (Str::eq_wide_string(t->operand, U"solve-equation")) { sc = solve_equation_ISINSC; } else if (Str::eq_wide_string(t->operand, U"switch")) { sc = switch_ISINSC; } else if (Str::eq_wide_string(t->operand, U"break")) { sc = break_ISINSC; } else if (Str::eq_wide_string(t->operand, U"verbose-checking")) { sc = verbose_checking_ISINSC; } } else if (Str::eq_wide_string(t->command, U"new")) { c = new_ISINC; } else if (Str::eq_wide_string(t->command, U"new-list-of")) { c = new_list_of_ISINC; } else if (Str::eq_wide_string(t->command, U"printing-routine")) { c = printing_routine_ISINC; } else if (Str::eq_wide_string(t->command, U"ranger-routine")) { c = ranger_routine_ISINC; } else if (Str::eq_wide_string(t->command, U"indexing-routine")) { c = indexing_routine_ISINC; } else if (Str::eq_wide_string(t->command, U"next-routine")) { c = 
next_routine_ISINC; } else if (Str::eq_wide_string(t->command, U"previous-routine")) { c = previous_routine_ISINC; } else if (Str::eq_wide_string(t->command, U"strong-kind")) { c = strong_kind_ISINC; } else if (Str::eq_wide_string(t->command, U"weak-kind")) { c = weak_kind_ISINC; } else if (Str::eq_wide_string(t->command, U"object-kind")) { c = object_kind_ISINC; } else if (Str::eq_wide_string(t->command, U"backspace")) { c = backspace_ISINC; } else if (Str::eq_wide_string(t->command, U"erase")) { c = erase_ISINC; } else if (Str::eq_wide_string(t->command, U"open-brace")) { c = open_brace_ISINC; } else if (Str::eq_wide_string(t->command, U"close-brace")) { c = close_brace_ISINC; } else if (Str::eq_wide_string(t->command, U"label")) { c = label_ISINC; } else if (Str::eq_wide_string(t->command, U"counter")) { c = counter_ISINC; } else if (Str::eq_wide_string(t->command, U"counter-storage")) { c = counter_storage_ISINC; } else if (Str::eq_wide_string(t->command, U"counter-up")) { c = counter_up_ISINC; } else if (Str::eq_wide_string(t->command, U"counter-down")) { c = counter_down_ISINC; } else if (Str::eq_wide_string(t->command, U"counter-makes-array")) { c = counter_makes_array_ISINC; } else if (Str::eq_wide_string(t->command, U"by-reference")) { c = by_reference_ISINC; } else if (Str::eq_wide_string(t->command, U"by-reference-blank-out")) { c = by_reference_blank_out_ISINC; } else if (Str::eq_wide_string(t->command, U"reference-exists")) { c = reference_exists_ISINC; } else if (Str::eq_wide_string(t->command, U"lvalue-by-reference")) { c = lvalue_by_reference_ISINC; } else if (Str::eq_wide_string(t->command, U"by-value")) { c = by_value_ISINC; } else if (Str::eq_wide_string(t->command, U"make-true")) { c = make_true_ISINC; } else if (Str::eq_wide_string(t->command, U"box-quotation-text")) { c = box_quotation_text_ISINC; } else if (Str::eq_wide_string(t->command, U"try-action")) { c = try_action_ISINC; } else if (Str::eq_wide_string(t->command, 
U"try-action-silently")) { c = try_action_silently_ISINC; } else if (Str::eq_wide_string(t->command, U"return-value")) { c = return_value_ISINC; } else if (Str::eq_wide_string(t->command, U"return-value-from-rule")) { c = return_value_from_rule_ISINC; } else if (Str::eq_wide_string(t->command, U"property-holds-block-value")) { c = property_holds_block_value_ISINC; } else if (Str::eq_wide_string(t->command, U"mark-event-used")) { c = mark_event_used_ISINC; } else if (Str::eq_wide_string(t->command, U"rtp-code")) { c = rtp_code_ISINC; } else if (Str::eq_wide_string(t->command, U"rtp-location")) { c = rtp_location_ISINC; } else if (Str::eq_wide_string(t->command, U"my")) { c = my_ISINC; } else if (Str::eq_wide_string(t->command, U"unprotect")) { c = unprotect_ISINC; } else if (Str::eq_wide_string(t->command, U"copy")) { c = copy_ISINC; } else if (Str::eq_wide_string(t->command, U"initialise")) { c = initialise_ISINC; } else if (Str::eq_wide_string(t->command, U"matches-description")) { c = matches_description_ISINC; } else if (Str::eq_wide_string(t->command, U"match-right-relation-domain")) { c = match_right_relation_domain_ISINC; } else if (Str::eq_wide_string(t->command, U"match-left-relation-domain")) { c = match_left_relation_domain_ISINC; } else if (Str::eq_wide_string(t->command, U"now-matches-description")) { c = now_matches_description_ISINC; } else if (Str::eq_wide_string(t->command, U"arithmetic-operation")) { c = arithmetic_operation_ISINC; } else if (Str::eq_wide_string(t->command, U"say")) { c = say_ISINC; } else if (Str::eq_wide_string(t->command, U"show-me")) { c = show_me_ISINC; } else if (Str::eq_wide_string(t->command, U"segment-count")) { c = segment_count_ISINC; } else if (Str::eq_wide_string(t->command, U"final-segment-marker")) { c = final_segment_marker_ISINC; } else if (Str::eq_wide_string(t->command, U"list-together")) { c = list_together_ISINC; if (Str::eq_wide_string(t->operand, U"unarticled")) { sc = unarticled_ISINSC; } else if 
(Str::eq_wide_string(t->operand, U"articled")) { sc = articled_ISINSC; } } else if (Str::eq_wide_string(t->command, U"rescale")) { c = rescale_ISINC; } t->inline_command = c; t->inline_subcommand = sc; } InterSchemas::add_token(sch, t); preceding_token = t;
- This code is used in §1.5.
§1.5.1.1. A bracing can take any of the following forms:
{-command}
{-command:operand}
{-command:operand:operand2}
{-command:operand<property name}
{-command:operand>property name}
{some text}
{-annotation:some text}
We parse this with the command or annotation in command, the "some text"
or operand in bracing, the property name (if given) in extremal_property,
the direction of the < or > in extremal_property_sign, and the second,
optional, operand in operand2.
Decompose the bracing1.5.1.1 =
/* Only a bracing which begins with a hyphen is decomposed. Its characters are
   distributed between up to four fields: 1 is the command, 2 the operand,
   3 the second operand, and 4 (available only under |CORE_MODULE|) the name
   of a property following a |<| or |>| sign. Afterwards the bracing text is
   replaced by the operand. */
TEMPORARY_TEXT(pname)
if (Str::get_first_char(t->bracing) == '-') {
	int field = 1;
	int len = Str::len(t->bracing);
	for (int at = 1; at < len; at++) { /* position 0 holds the hyphen itself */
		inchar32_t ch = Str::get_at(t->bracing, at);
		if (field == 1) {
			if (ch == ':') field = 2;
			else PUT_TO(t->command, ch);
		} else if (field == 2) {
			if (ch == ':') field = 3;
			#ifdef CORE_MODULE
			else if (ch == '<') {
				t->extremal_property_sign = MEASURE_T_OR_LESS;
				field = 4;
			} else if (ch == '>') {
				t->extremal_property_sign = MEASURE_T_OR_MORE;
				field = 4;
			}
			#endif
			else PUT_TO(t->operand, ch);
		} else if (field == 3) {
			PUT_TO(t->operand2, ch);
		} else if (field == 4) {
			PUT_TO(pname, ch);
		}
	}
	#ifdef CORE_MODULE
	if (t->extremal_property_sign != MEASURE_T_EXACTLY) {
		wording W = Feeds::feed_text(pname);
		if (<property-name>(W)) t->extremal_property = <<rp>>;
	}
	#endif
	Str::copy(t->bracing, t->operand);
}
DISCARD_TEXT(pname)
- This code is used in §1.5.1.
§1.6. In abbreviated prototypes, *1 and *2 are placeholders, but a number
of modifiers are allowed. See Compilation Schemas (in calculus).
define GIVE_KIND_ID_ISSBM 1 /* set by the modifier |#| */
define GIVE_COMPARISON_ROUTINE_ISSBM 2 /* set by the modifier |_| */
define DEREFERENCE_PROPERTY_ISSBM 4 /* set by the modifiers |+| and (with lvalue context) the pipe character */
define ADOPT_LOCAL_STACK_FRAME_ISSBM 8 /* set by the modifiers |?| and |^| */
define CAST_TO_KIND_OF_OTHER_TERM_ISSBM 16 /* set by the modifier |<| */
define BY_REFERENCE_ISSBM 32 /* set by the modifiers |>| and |^| */
define LVALUE_CONTEXT_ISSBM 64 /* set by the modifiers |%| and (with dereferencing) the pipe character */
define STORAGE_AS_FUNCTION_ISSBM 128 /* set by the modifier |$| */
Look for a possible abbreviated command1.6 =
int at = pos; inchar32_t c = Str::get_at(from, ++at); int iss_bitmap = 0; switch (c) { case '!': I6Errors::issue_at_node(sch->node_tree, I"the '*!' schema notation has been abolished"); break; case '%': iss_bitmap = iss_bitmap | LVALUE_CONTEXT_ISSBM; c = Str::get_at(from, ++at); break; case '$': iss_bitmap = iss_bitmap | STORAGE_AS_FUNCTION_ISSBM; c = Str::get_at(from, ++at); break; case '#': iss_bitmap = iss_bitmap | GIVE_KIND_ID_ISSBM; c = Str::get_at(from, ++at); break; case '_': iss_bitmap = iss_bitmap | GIVE_COMPARISON_ROUTINE_ISSBM; c = Str::get_at(from, ++at); break; case '+': iss_bitmap = iss_bitmap | DEREFERENCE_PROPERTY_ISSBM; c = Str::get_at(from, ++at); break; case '|': iss_bitmap = iss_bitmap | (DEREFERENCE_PROPERTY_ISSBM + LVALUE_CONTEXT_ISSBM); c = Str::get_at(from, ++at); break; case '?': iss_bitmap = iss_bitmap | ADOPT_LOCAL_STACK_FRAME_ISSBM; c = Str::get_at(from, ++at); break; case '<': iss_bitmap = iss_bitmap | CAST_TO_KIND_OF_OTHER_TERM_ISSBM; c = Str::get_at(from, ++at); break; case '^': iss_bitmap = iss_bitmap | (ADOPT_LOCAL_STACK_FRAME_ISSBM + BY_REFERENCE_ISSBM); c = Str::get_at(from, ++at); break; case '>': iss_bitmap = iss_bitmap | BY_REFERENCE_ISSBM; c = Str::get_at(from, ++at); break; } if (Characters::isdigit(c)) { Absorb raw material, if any1.2; TEMPORARY_TEXT(T) for (int i=pos; i<=at; i++) PUT_TO(T, Str::get_at(from, i)); inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, T, 0, 0, -1, line_offset); t->bracing = Str::duplicate(T); t->inline_command = substitute_ISINC; t->inline_modifiers = iss_bitmap; t->constant_number = (int) c - (int) '1'; InterSchemas::add_token(sch, t); preceding_token = t; DISCARD_TEXT(T) pos = at; } else if (c == '&') { inter_schema_token *t = InterSchemas::new_token(INLINE_ISTT, I"*&", 0, 0, -1, line_offset); t->bracing = I"*&"; t->inline_command = combine_ISINC; t->inline_modifiers = iss_bitmap; InterSchemas::add_token(sch, t); preceding_token = t; pos = at; } else if (c == '-') { 
I6Errors::issue_at_node(sch->node_tree, I"the '*-' schema notation has been abolished"); } else if (c == '*') { inchar32_t c = '*'; Absorb a raw character1.1; pos = at; } else { inchar32_t c = '{'; Absorb a raw character1.1; }
- This code is used in §1.
§1.3.1. That leaves us with just the main case to handle: raw I6 code which is outside of quotation marks and commentary, and which doesn't include bracings or I7 interpolations. That might look like, for instance,
Frog + 2*Toad(
(there is no reason to suppose that this stretch of code is complete or matches parentheses); we must tokenise it into
Frog
WHITE SPACE
+
WHITE SPACE
2
*
Toad
(
We scan through the text until we reach the start of a new token, and then break off what we scanned through since the last time.
Look for individual tokens1.3.1 =
int L = Str::len(current_raw); int c_start = 0, escaped = FALSE; for (int p = 0; p < L; p++) { inchar32_t c1 = Str::get_at(current_raw, p), c2 = 0, c3 = 0; if (p < L-1) c2 = Str::get_at(current_raw, p+1); if (p < L-2) c3 = Str::get_at(current_raw, p+2); if (escaped == FALSE) { if ((c1 == '$') && ((p == 0) || (Characters::isalpha(Str::get_at(current_raw, p-1)) == FALSE))) Break off here for real, binary or hexadecimal notation1.3.1.1; if (c1 == '-') Break off here for negative number1.3.1.2; Break off here for operators1.3.1.3; } if (c1 == 0x00A7) escaped = escaped?FALSE:TRUE; } if (c_start < L) { int x = c_start, y = L-1; Break off a token1.3.1.4; }
- This code is used in §1.3.
§1.3.1.1. Recall that in I6 notation, a dollar introduces a non-decimal number, and the character after the initial dollar determines which:
$+3.14159E2
$$1001001
$1FE6
Break off here for real, binary or hexadecimal notation1.3.1.1 =
int x = c_start, y = p-1; Break off a token1.3.1.4; switch (c2) { case '+': case '-': x = p; y = p+1; while ((Str::get_at(current_raw, y+1) == '.') || (Str::get_at(current_raw, y+1) == 'E') || (Str::get_at(current_raw, y+1) == 'e') || (Characters::isdigit(Str::get_at(current_raw, y+1)))) y++; Break off a token1.3.1.4; p = y; c_start = p+1; continue; case '$': x = p; y = p+1; while ((Str::get_at(current_raw, y+1) == '0') || (Str::get_at(current_raw, y+1) == '1')) y++; Break off a token1.3.1.4; p = y; c_start = p+1; continue; default: x = p; y = p; while (Tokenisation::identchar(Str::get_at(current_raw, y+1))) y++; Break off a token1.3.1.4; p = y; c_start = p+1; continue; }
- This code is used in §1.3.1.
§1.3.1.2. A token beginning with a minus sign and continuing with digits may still
not be a negative number: it may be the binary subtraction operator.
For example, we need to tokenise x-1 as
x
-
1
and not as
x
-1
This requires context, that is, remembering what the previous token was.
Break off here for negative number1.3.1.2 =
if (((preceding_token == NULL) || (preceding_token->ist_type == OPEN_ROUND_ISTT) || (preceding_token->ist_type == OPERATOR_ISTT) || (preceding_token->ist_type == DIVIDER_ISTT)) && (c_start == p) && (!((abbreviated) && (preceding_token->ist_type == INLINE_ISTT)))) { int dc = p+1; while (Characters::isdigit(Str::get_at(current_raw, dc))) dc++; if (dc > p+1) { int x = c_start, y = p-1; Break off a token1.3.1.4; x = p; y = dc - 1; Break off a token1.3.1.4; p = y; c_start = p+1; continue; } }
- This code is used in §1.3.1.
§1.3.1.3. In I6, operators made of non-alphanumeric characters can be up to three
characters long, and we take the longest match: thus --> is a trigraph,
not the monograph - followed by the digraph ->.
We treat the @ sign as if it were alphanumeric for the sake of assembly
language opcodes such as @pull.
Break off here for operators1.3.1.3 =
int monograph = TRUE, digraph = FALSE, trigraph = FALSE; if ((Tokenisation::identchar(c1)) || (c1 == '_') || (c1 == '$')) monograph = FALSE; if (c1 == 0x00A7) monograph = FALSE; if ((c1 == '#') && (Characters::isalpha(c2))) monograph = FALSE; if ((c1 == '_') && (Characters::isalpha(c2))) monograph = FALSE; if ((c1 == '#') && (c2 == '#') && (Characters::isalpha(c3))) monograph = FALSE; if ((c1 == '@') && (Characters::isalpha(c2))) monograph = FALSE; if ((c1 == '+') && (c2 == '+')) digraph = TRUE; if ((c1 == '-') && (c2 == '-')) digraph = TRUE; if ((c1 == '>') && (c2 == '=')) digraph = TRUE; if ((c1 == '<') && (c2 == '=')) digraph = TRUE; if ((c1 == '=') && (c2 == '=')) digraph = TRUE; if ((c1 == '-') && (c2 == '>')) digraph = TRUE; if ((c1 == '.') && (c2 == '&')) digraph = TRUE; if ((c1 == '.') && (c2 == '#')) digraph = TRUE; if ((c1 == '~') && (c2 == '~')) digraph = TRUE; if ((c1 == '~') && (c2 == '=')) digraph = TRUE; if ((c1 == '&') && (c2 == '&')) digraph = TRUE; if ((c1 == '|') && (c2 == '|')) digraph = TRUE; if ((c1 == '>') && (c2 == '>')) digraph = TRUE; if ((c1 == ':') && (c2 == ':')) digraph = TRUE; if ((c1 == '-') && (c2 == '-') && (c3 == '>')) trigraph = TRUE; if (trigraph) { int x = c_start, y = p-1; Break off a token1.3.1.4; x = p; y = p+2; Break off a token1.3.1.4; p += 2; c_start = p+1; continue; } if (digraph) { int x = c_start, y = p-1; Break off a token1.3.1.4; x = p; y = p+1; Break off a token1.3.1.4; p++; c_start = p+1; continue; } if (monograph) { int x = c_start, y = p-1; Break off a token1.3.1.4; x = p; y = p; Break off a token1.3.1.4; c_start = p+1; continue; }
- This code is used in §1.3.1.
§1.3.1.4. In this code, the new token is between character positions x and y
inclusive; we ignore an empty token.
Break off a token1.3.1.4 =
/* Make a token from the characters of |current_raw| at positions |x| to |y|
   inclusive, doing nothing if that range is empty. Any token other than white
   space becomes the new |preceding_token|. */
if (y >= x) {
	TEMPORARY_TEXT(T)
	for (int i = x; i <= y; i++) PUT_TO(T, Str::get_at(current_raw, i));

	int is = RAW_ISTT;
	inter_ti which = 0;
	int which_rw = 0, which_number = -1, which_quote = -1;
	Identify this new token1.3.1.4.1;

	inter_schema_token *n = InterSchemas::new_token(is, T, which, which_rw, which_number, line_offset);
	#ifdef CORE_MODULE
	/* |which_quote| is derived from the text of the token, so it is checked
	   against the array actually supplied before being used as an index */
	if ((which_quote >= 0) && (which_quote < no_quoted_inames) && (quoted_inames != NULL))
		n->as_quoted = quoted_inames[which_quote];
	#endif
	InterSchemas::add_token(sch, n);
	if (n->ist_type != WHITE_SPACE_ISTT) preceding_token = n;
	DISCARD_TEXT(T)
}
§1.3.1.4.1. Finally, we identify what sort of token we're looking at. It would be elegant to reimplement this with a trie (e.g. using Tries and Avinues (in foundation)), but speed is not quite important enough to make it worthwhile.
define LOWEST_XBIP_VALUE HAS_XBIP /* the first of the values enumerated below */
enumerate HAS_XBIP 10000
enumerate HASNT_XBIP
enumerate READ_XBIP
enumerate OWNERKIND_XBIP
define HIGHEST_XBIP_VALUE OWNERKIND_XBIP /* the last of the values enumerated above */
Identify this new token1.3.1.4.1 =
/* Classify the token text |T|, setting |is| to its token type and, where
   relevant, |which| (for an operator), |which_rw| (for a reserved word or
   directive), |which_number| (for |true|, |false| and |nothing|) and
   |which_quote| (for names beginning |QUOTED_INAME_0_| or |QUOTED_INAME_1_|).
   The tests are made in sequence and a later one may overwrite the verdict of
   an earlier one, so their order matters: for example, any text beginning
   with a minus sign is first marked as a number, but is reclassified as an
   operator if |I6Operators::notation_to_BIP| returns a positive value for it.
   Note that |T| itself is rewritten in two cases: |I7_string| becomes
   |I7_String|, and |COMMA_WORD| becomes |comma_word|. */
if (Str::get_at(T, 0) == '@') is = OPCODE_ISTT; if (Str::get_at(T, 0) == 0x00A7) is = IDENTIFIER_ISTT; if ((Str::get_at(T, 0) == '#') && (Str::get_at(T, 1) == '#') && (Characters::isalpha(Str::get_at(T, 2)))) { is = IDENTIFIER_ISTT; LOOP_THROUGH_TEXT(P, T) { inchar32_t c = Str::get(P); if ((c != '_') && (c != '#') && (!Tokenisation::identchar(c))) is = RAW_ISTT; } } if ((Str::get_at(T, 0) == '#') && (Characters::isalpha(Str::get_at(T, 1)))) { is = IDENTIFIER_ISTT; LOOP_THROUGH_TEXT(P, T) { inchar32_t c = Str::get(P); if ((c != '_') && (c != '#') && (c != '$') && (!Tokenisation::identchar(c))) is = RAW_ISTT; } } if ((Str::get_at(T, 0) == '_') && (Characters::isalpha(Str::get_at(T, 1)))) { is = IDENTIFIER_ISTT; LOOP_THROUGH_TEXT(P, T) { inchar32_t c = Str::get(P); if ((c != '_') && (c != '#') && (!Tokenisation::identchar(c))) is = RAW_ISTT; } } if (Characters::isalpha(Str::get_at(T, 0))) { is = IDENTIFIER_ISTT; LOOP_THROUGH_TEXT(P, T) { inchar32_t c = Str::get(P); if ((c != '_') && (!Tokenisation::identchar(c))) is = RAW_ISTT; } if (Str::begins_with_wide_string(T, U"QUOTED_INAME_0_")) which_quote = 0; else if (Str::begins_with_wide_string(T, U"QUOTED_INAME_1_")) which_quote = 1; if (Str::eq(T, I"I7_string")) { Str::clear(T); WRITE_TO(T, "I7_String"); } if (Str::eq(T, I"COMMA_WORD")) { Str::clear(T); WRITE_TO(T, "comma_word"); } } if (Characters::isdigit(Str::get_at(T, 0))) { is = NUMBER_ISTT; LOOP_THROUGH_TEXT(P, T) { inchar32_t c = Str::get(P); if (!Characters::isdigit(c)) is = RAW_ISTT; } } if (Str::get_at(T, 0) == '$') { is = HEX_NUMBER_ISTT; inchar32_t c = Str::get_at(T, 1); if (c == '$') is = BIN_NUMBER_ISTT; if (c == '+') is = REAL_NUMBER_ISTT; if (c == '-') is = REAL_NUMBER_ISTT; } if (Str::get_at(T, 0) == '-') is = NUMBER_ISTT; if (Str::eq(T, I"false")) { is = NUMBER_ISTT; which_number = 0; } if (Str::eq(T, I"true")) { is = NUMBER_ISTT; which_number = 1; } if (Str::eq(T, I"nothing")) { is = NUMBER_ISTT; which_number = 0; } if (Str::eq(T, I"if")) { is = 
RESERVED_ISTT; which_rw = IF_I6RW; } if (Str::eq(T, I"else")) { is = RESERVED_ISTT; which_rw = ELSE_I6RW; } if (Str::eq(T, I"style")) { is = RESERVED_ISTT; which_rw = STYLE_I6RW; } if (Str::eq(T, I"return")) { is = RESERVED_ISTT; which_rw = RETURN_I6RW; } if (Str::eq(T, I"rtrue")) { is = RESERVED_ISTT; which_rw = RTRUE_I6RW; } if (Str::eq(T, I"rfalse")) { is = RESERVED_ISTT; which_rw = RFALSE_I6RW; } if (Str::eq(T, I"for")) { is = RESERVED_ISTT; which_rw = FOR_I6RW; } if (Str::eq(T, I"objectloop")) { is = RESERVED_ISTT; which_rw = OBJECTLOOP_I6RW; } if (Str::eq(T, I"while")) { is = RESERVED_ISTT; which_rw = WHILE_I6RW; } if (Str::eq(T, I"do")) { is = RESERVED_ISTT; which_rw = DO_I6RW; } if (Str::eq(T, I"until")) { is = RESERVED_ISTT; which_rw = UNTIL_I6RW; } if (Str::eq(T, I"print")) { is = RESERVED_ISTT; which_rw = PRINT_I6RW; } if (Str::eq(T, I"print_ret")) { is = RESERVED_ISTT; which_rw = PRINTRET_I6RW; } if (Str::eq(T, I"new_line")) { is = RESERVED_ISTT; which_rw = NEWLINE_I6RW; } if (Str::eq(T, I"give")) { is = RESERVED_ISTT; which_rw = GIVE_I6RW; } if (Str::eq(T, I"move")) { is = RESERVED_ISTT; which_rw = MOVE_I6RW; } if (Str::eq(T, I"remove")) { is = RESERVED_ISTT; which_rw = REMOVE_I6RW; } if (Str::eq(T, I"jump")) { is = RESERVED_ISTT; which_rw = JUMP_I6RW; } if (Str::eq(T, I"switch")) { is = RESERVED_ISTT; which_rw = SWITCH_I6RW; } if (Str::eq(T, I"default")) { is = RESERVED_ISTT; which_rw = DEFAULT_I6RW; } if (Str::eq(T, I"font")) { is = RESERVED_ISTT; which_rw = FONT_I6RW; } if (Str::eq(T, I"continue")) { is = RESERVED_ISTT; which_rw = CONTINUE_I6RW; } if (Str::eq(T, I"break")) { is = RESERVED_ISTT; which_rw = BREAK_I6RW; } if (Str::eq(T, I"quit")) { is = RESERVED_ISTT; which_rw = QUIT_I6RW; } if (Str::eq(T, I"restore")) { is = RESERVED_ISTT; which_rw = RESTORE_I6RW; } if (Str::eq(T, I"spaces")) { is = RESERVED_ISTT; which_rw = SPACES_I6RW; } if (Str::eq(T, I"read")) { is = RESERVED_ISTT; which_rw = READ_I6RW; } if (Str::eq(T, I"inversion")) { is = 
RESERVED_ISTT; which_rw = INVERSION_I6RW; } if (Str::eq_insensitive(T, I"#IFDEF")) { is = DIRECTIVE_ISTT; which_rw = IFDEF_I6RW; } if (Str::eq_insensitive(T, I"#IFNDEF")) { is = DIRECTIVE_ISTT; which_rw = IFNDEF_I6RW; } if (Str::eq_insensitive(T, I"#IFTRUE")) { is = DIRECTIVE_ISTT; which_rw = IFTRUE_I6RW; } if (Str::eq_insensitive(T, I"#IFFALSE")) { is = DIRECTIVE_ISTT; which_rw = IFFALSE_I6RW; } if (Str::eq_insensitive(T, I"#IFNOT")) { is = DIRECTIVE_ISTT; which_rw = IFNOT_I6RW; } if (Str::eq_insensitive(T, I"#ENDIF")) { is = DIRECTIVE_ISTT; which_rw = ENDIF_I6RW; } if (Str::eq_insensitive(T, I"#ORIGSOURCE")) { is = DIRECTIVE_ISTT; which_rw = ORIGSOURCE_I6RW; } if (Str::eq(T, I",")) is = COMMA_ISTT; if (Str::eq(T, I":")) is = COLON_ISTT; if (Str::eq(T, I"(")) is = OPEN_ROUND_ISTT; if (Str::eq(T, I")")) is = CLOSE_ROUND_ISTT; if (Str::eq(T, I"{")) is = OPEN_BRACE_ISTT; if (Str::eq(T, I"}")) is = CLOSE_BRACE_ISTT; if (Str::eq(T, I";")) is = DIVIDER_ISTT; if (Str::eq(T, I"::")) is = DCOLON_ISTT; inter_ti x = I6Operators::notation_to_BIP(T); if (x > 0) { is = OPERATOR_ISTT; which = x; }
- This code is used in §1.3.1.4.
§2. Inform 6 has a baroque set of not very self-consistent escape characters in
its double-quoted text syntax: here we take a deep breath, and plunge in. The
following converts text from I6 notation to a (composed) Unicode-encoded
string, in which every character has its literal meaning.
Note that the test case schemas of the building-test module exercises
the following function.
/* Rewrite |text| in place, converting it from I6 double-quoted string
   notation. The existing content is first copied into the temporary text
   |raw| and |text| is emptied; |raw| then has its white space normalised and
   is de-escaped back into |text|. */
void Tokenisation::de_escape_text(text_stream *text) { TEMPORARY_TEXT(raw) WRITE_TO(raw, "%S", text); Str::clear(text); Normalise the white space2.1; De-escape raw into text2.2; DISCARD_TEXT(raw) }
§2.1. Where a newline occurs inside double-quoted text, all whitespace either side of it is deleted, and the newline replaced by a single space.
Normalise the white space2.1 =
int run_start = -1, run_len = 0, run_includes = FALSE; for (int i=0; i<Str::len(raw); i++) { inchar32_t c = Str::get_at(raw, i); if ((c == ' ') || (c == '\t') || (c == '\n')) { if (run_start == -1) { run_start = i; run_len = 0; run_includes = FALSE; } run_len++; if (c == '\n') run_includes = TRUE; } else { if ((run_start >= 0) && (run_includes)) { Str::put_at(raw, run_start, ' '); for (int j=0; j<run_len-1; j++) Str::delete_nth_character(raw, run_start+1); i = run_start; } run_start = -1; } }
- This code is used in §2.
§2.2. I6 does not follow the C-like language convention of using backslash for
string escapes. Instead ^ marks a forced newline and ~ marks a double-quotation
mark. All other string escapes begin with @.
De-escape raw into text2.2 =
/* Copy |raw| into |text| one character at a time, expanding escapes as we go.
   At each position the Inform 7 |[unicode N]| notation is tried first. After
   that, |^| is written as a newline and |~| as a double-quotation mark, while
   |@| begins an escape: its content is extracted into |token|, its total
   length |skip| is stepped over, and it is then expanded as hexadecimal,
   decimal or a digraph. Any other character is copied unchanged. */
for (int i=0; i<Str::len(raw); i++) { De-escape the Inform 7 unicode escape2.2.1; inchar32_t c = Str::get_at(raw, i); switch (c) { case '^': PUT_TO(text, '\n'); break; case '~': PUT_TO(text, '\"'); break; case '@': { TEMPORARY_TEXT(token) int skip = 1, decimal = FALSE, hexadecimal = FALSE; Extract the escape token2.2.2; i += skip-1; if (hexadecimal) Expand hexadecimal Unicode value2.2.3 else if (decimal) Expand decimal ZSCII value2.2.4 else Expand TeX-style digraph2.2.5; DISCARD_TEXT(token) break; } default: PUT_TO(text, c); break; } }
- This code is used in §2.
§2.2.1. This is not an I6 notation at all. If a character outside the range allowed
by I6 in string literals is present in an I7 source text file — for example,
a capital Cyrillic ef — then it is converted internally by the compiler to
something like [unicode 1060], with 1060 here being the decimal code point
for the character.
We will recognise this notation and translate it back into Unicode. The reason for doing this, even though the stand-alone I6 compiler would not, is that it means I6 source fed into this tokeniser will be treated the same whether it comes from an Include directive in I7 source text, or whether it comes from a kit source file.
De-escape the Inform 7 unicode escape2.2.1 =
/* If |raw| has the notation |[unicode N]| at position |i|, where |N| consists
   only of decimal digits and has a positive value, write the character with
   code point |N| and move on past the closing bracket. In every other case
   |i| is left alone, so that the text is copied through literally. (Formerly
   |i| was moved to the closing bracket even when the value was rejected, which
   silently discarded the opening of the notation.) */
if (Str::includes_at(raw, i, I"[unicode ")) {
	int unicode_point = 0, closing_at = -1;
	for (int j=i+9; j<Str::len(raw); j++) { /* 9 is the length of the opening text */
		inchar32_t c = Str::get_at(raw, j);
		if (c == ']') { closing_at = j; break; }
		if (Characters::isdigit(c) == FALSE) break;
	}
	if (closing_at >= 0) unicode_point = Str::atoi(raw, i+9);
	if (unicode_point > 0) {
		PUT_TO(text, (inchar32_t) unicode_point);
		i = closing_at;
		continue;
	}
}
- This code is used in §2.2.
§2.2.2. There are three different forms for an @-escape. First, @{....} with
hexadecimal digits inside the braces; then @@... with decimal digits; and
otherwise @.. for any of the set of legal digraphs listed below. The
content represented by dots in these syntaxes we will store in token,
and skip will count the total length of the escape, in raw characters.
Thus for @{2af4} the skip count would be 7.
Extract the escape token2.2.2 =
inchar32_t d = Str::get_at(raw, i+1); if (d == '{') { skip++; while (Str::get_at(raw, i+skip)) { inchar32_t e = Str::get_at(raw, i+skip); skip++; if (e == '}') break; PUT_TO(token, e); } hexadecimal = TRUE; } else if (d == '@') { skip++; while (Characters::isdigit(Str::get_at(raw, i+skip))) { inchar32_t e = Str::get_at(raw, i+skip); skip++; PUT_TO(token, e); } decimal = TRUE; } else { PUT_TO(token, d); PUT_TO(token, Str::get_at(raw, i+2)); skip += 2; }
§2.2.3. The hex notation refers directly to Unicode code points, so all we need to do is convert the token from a string to hex and then put it as a character.
Expand hexadecimal Unicode value2.2.3 =
int N = 0; LOOP_THROUGH_TEXT(pos, token) { inchar32_t c = Str::get(pos); int D = Tokenisation::hex_val(c); if (D == -1) { N = -1; break; } N = 16*N + D; } if (N == -1) WRITE_TO(text, "?ERROR<%S>", token); else PUT_TO(text, (inchar32_t) N);
§2.2.4. Decimal notation is substantially more annoying, because it uses the ZSCII character set, not Unicode. ZSCII is (for our purposes at least) the same as ASCII in the range 0 to 127, but is then very unlike ISO Latin-1 (and thus Unicode) in the range 128 to 255. (Which is as far as it goes.) The following therefore converts ZSCII to Unicode code points. Note that ZSCII cannot be mapped faithfully into ISO Latin-1 alone: it contains the OE ligature, which is in a different Unicode page. See "Table 2B: Higher ZSCII Character Set" in the DM4.
Expand decimal ZSCII value2.2.4 =
int N = Str::atoi(token, 0); if ((N>0) && (N<128)) PUT_TO(text, (inchar32_t) N); else { switch (N) { case 155: PUT_TO(text, 0xE4); break; /* a-diarhesis */ case 156: PUT_TO(text, 0xF6); break; /* o-diarhesis */ case 157: PUT_TO(text, 0xFC); break; /* u-diarhesis */ case 158: PUT_TO(text, 0xC4); break; /* A-diarhesis */ case 159: PUT_TO(text, 0xD6); break; /* O-diarhesis */ case 160: PUT_TO(text, 0xDC); break; /* U-diarhesis */ case 161: PUT_TO(text, 0xDF); break; /* sharp s */ case 162: PUT_TO(text, 0xBB); break; /* close double-angle quotation mark */ case 163: PUT_TO(text, 0xAB); break; /* open double-angle quotation mark */ case 164: PUT_TO(text, 0xEB); break; /* e-diarhesis */ case 165: PUT_TO(text, 0xEF); break; /* i-diarhesis */ case 166: PUT_TO(text, 0xFF); break; /* y-diarhesis */ case 167: PUT_TO(text, 0xCB); break; /* E-diarhesis */ case 168: PUT_TO(text, 0xCF); break; /* I-diarhesis */ case 169: PUT_TO(text, 0xE1); break; /* a-acute */ case 170: PUT_TO(text, 0xE9); break; /* e-acute */ case 171: PUT_TO(text, 0xED); break; /* i-acute */ case 172: PUT_TO(text, 0xF3); break; /* o-acute */ case 173: PUT_TO(text, 0xFA); break; /* u-acute */ case 174: PUT_TO(text, 0xFD); break; /* y-acute */ case 175: PUT_TO(text, 0xC1); break; /* A-acute */ case 176: PUT_TO(text, 0xC9); break; /* E-acute */ case 177: PUT_TO(text, 0xCD); break; /* I-acute */ case 178: PUT_TO(text, 0xD3); break; /* O-acute */ case 179: PUT_TO(text, 0xDA); break; /* U-acute */ case 180: PUT_TO(text, 0xDD); break; /* Y-acute */ case 181: PUT_TO(text, 0xE0); break; /* a-grave */ case 182: PUT_TO(text, 0xE8); break; /* e-grave */ case 183: PUT_TO(text, 0xEC); break; /* i-grave */ case 184: PUT_TO(text, 0xF2); break; /* o-grave */ case 185: PUT_TO(text, 0xF9); break; /* u-grave */ case 186: PUT_TO(text, 0xC0); break; /* A-grave */ case 187: PUT_TO(text, 0xC8); break; /* E-grave */ case 188: PUT_TO(text, 0xCC); break; /* I-grave */ case 189: PUT_TO(text, 0xD2); break; /* O-grave */ case 190: 
PUT_TO(text, 0xD9); break; /* U-grave */ case 191: PUT_TO(text, 0xE2); break; /* a-circumflex */ case 192: PUT_TO(text, 0xEA); break; /* e-circumflex */ case 193: PUT_TO(text, 0xEE); break; /* i-circumflex */ case 194: PUT_TO(text, 0xF4); break; /* o-circumflex */ case 195: PUT_TO(text, 0xFB); break; /* u-circumflex */ case 196: PUT_TO(text, 0xC2); break; /* A-circumflex */ case 197: PUT_TO(text, 0xCA); break; /* E-circumflex */ case 198: PUT_TO(text, 0xCE); break; /* I-circumflex */ case 199: PUT_TO(text, 0xD4); break; /* O-circumflex */ case 200: PUT_TO(text, 0xDB); break; /* U-circumflex */ case 201: PUT_TO(text, 0xE6); break; /* a-ring */ case 202: PUT_TO(text, 0xC6); break; /* A-ring */ case 203: PUT_TO(text, 0xF8); break; /* o-stroke */ case 204: PUT_TO(text, 0xD8); break; /* O-stroke */ case 205: PUT_TO(text, 0xE3); break; /* a-tilde */ case 206: PUT_TO(text, 0xF1); break; /* n-tilde */ case 207: PUT_TO(text, 0xF5); break; /* o-tilde */ case 208: PUT_TO(text, 0xC3); break; /* A-tilde */ case 209: PUT_TO(text, 0xD1); break; /* N-tilde */ case 210: PUT_TO(text, 0xD5); break; /* O-tilde */ case 211: PUT_TO(text, 0xE6); break; /* ae */ case 212: PUT_TO(text, 0xC6); break; /* AE */ case 213: PUT_TO(text, 0xE7); break; /* c-cedilla */ case 214: PUT_TO(text, 0xC7); break; /* C-cedilla */ case 215: PUT_TO(text, 0xFE); break; /* thorn */ case 216: PUT_TO(text, 0xF0); break; /* eth */ case 217: PUT_TO(text, 0xDE); break; /* Thorn */ case 218: PUT_TO(text, 0xD0); break; /* Eth */ case 219: PUT_TO(text, 0xA3); break; /* pound sterling sign */ case 220: PUT_TO(text, 0x153); break; /* oe */ case 221: PUT_TO(text, 0x152); break; /* OE */ case 222: PUT_TO(text, 0xA1); break; /* inverted ! */ case 223: PUT_TO(text, 0xBF); break; /* inverted ? */ default: Unknown string token2.2.4.1; break; } }
§2.2.5. Now for the digraphs. For example, @'a is an a-acute, while @ss is a
German sharp s. Again, see the DM4 for the specification of these. A misprint
in the DM4 means that one part of that manual says that @cc is the syntax
for c-cedilla, and another says it is @,c. To be on the safe side, we
recognise both. For similar reasons, we recognise both @/o and @\o as
a Scandinavian o-stroke.
Expand TeX-style digraph2.2.5 =
inchar32_t c = Str::get_at(token, 0); inchar32_t d = Str::get_at(token, 1); switch (c) { case '\'': /* these are acute accents */ switch (d) { case 'a': PUT_TO(text, 0xE1); break; case 'e': PUT_TO(text, 0xE9); break; case 'i': PUT_TO(text, 0xED); break; case 'o': PUT_TO(text, 0xF3); break; case 'u': PUT_TO(text, 0xFA); break; case 'y': PUT_TO(text, 0xFD); break; case 'A': PUT_TO(text, 0xC1); break; case 'E': PUT_TO(text, 0xC9); break; case 'I': PUT_TO(text, 0xCD); break; case 'O': PUT_TO(text, 0xD3); break; case 'U': PUT_TO(text, 0xDA); break; case 'Y': PUT_TO(text, 0xDD); break; default: Unknown string token2.2.4.1; break; } break; case '`': /* these are grave accents */ switch (d) { case 'a': PUT_TO(text, 0xE0); break; case 'e': PUT_TO(text, 0xE8); break; case 'i': PUT_TO(text, 0xEC); break; case 'o': PUT_TO(text, 0xF2); break; case 'u': PUT_TO(text, 0xF9); break; case 'A': PUT_TO(text, 0xC0); break; case 'E': PUT_TO(text, 0xC8); break; case 'I': PUT_TO(text, 0xCC); break; case 'O': PUT_TO(text, 0xD2); break; case 'U': PUT_TO(text, 0xD9); break; default: Unknown string token2.2.4.1; break; } break; case '^': /* these are circumflex accents */ switch (d) { case 'a': PUT_TO(text, 0xE2); break; case 'e': PUT_TO(text, 0xEA); break; case 'i': PUT_TO(text, 0xEE); break; case 'o': PUT_TO(text, 0xF4); break; case 'u': PUT_TO(text, 0xFB); break; case 'A': PUT_TO(text, 0xC2); break; case 'E': PUT_TO(text, 0xCA); break; case 'I': PUT_TO(text, 0xCE); break; case 'O': PUT_TO(text, 0xD4); break; case 'U': PUT_TO(text, 0xDB); break; default: Unknown string token2.2.4.1; break; } break; case ':': /* these are diarhesis accents, that is, umlauts */ switch (d) { case 'a': PUT_TO(text, 0xE4); break; case 'e': PUT_TO(text, 0xEB); break; case 'i': PUT_TO(text, 0xEF); break; case 'o': PUT_TO(text, 0xF6); break; case 'u': PUT_TO(text, 0xFC); break; case 'y': PUT_TO(text, 0xFF); break; case 'A': PUT_TO(text, 0xC4); break; case 'E': PUT_TO(text, 0xCB); break; case 'I': PUT_TO(text, 
0xCF); break; case 'O': PUT_TO(text, 0xD6); break; case 'U': PUT_TO(text, 0xDC); break; case 'Y': PUT_TO(text, 0x0178); break; default: Unknown string token2.2.4.1; break; } break; case '~': /* these are tilde accents */ switch (d) { case 'a': PUT_TO(text, 0xE3); break; case 'n': PUT_TO(text, 0xF1); break; case 'o': PUT_TO(text, 0xF5); break; case 'A': PUT_TO(text, 0xC3); break; case 'N': PUT_TO(text, 0xD1); break; case 'O': PUT_TO(text, 0xD5); break; default: Unknown string token2.2.4.1; break; } break; case ',': case 'c': /* cedilla (a misprint in the DM4 means both are said to work) */ switch (d) { case 'c': PUT_TO(text, 0xE7); break; case 'C': PUT_TO(text, 0xC7); break; default: Unknown string token2.2.4.1; break; } break; case '\\': case '/': /* the Scandinavian slash thing */ switch (d) { case 'o': PUT_TO(text, 0xF8); break; case 'O': PUT_TO(text, 0xD8); break; default: Unknown string token2.2.4.1; break; } break; case 'a': /* joined ae */ switch (d) { case 'e': PUT_TO(text, 0xE6); break; default: Unknown string token2.2.4.1; break; } break; case 'A': /* joined AE */ switch (d) { case 'E': PUT_TO(text, 0xC6); break; default: Unknown string token2.2.4.1; break; } break; case 'e': /* lower-case Icelandic eth */ switch (d) { case 't': PUT_TO(text, 0xF0); break; default: Unknown string token2.2.4.1; break; } break; case 'E': /* capital Icelandic eth */ switch (d) { case 't': PUT_TO(text, 0xD0); break; default: Unknown string token2.2.4.1; break; } break; case 't': /* lower-case thorn */ switch (d) { case 'h': PUT_TO(text, 0xFE); break; default: Unknown string token2.2.4.1; break; } break; case 'T': /* capital thorn */ switch (d) { case 'h': PUT_TO(text, 0xCE); break; default: Unknown string token2.2.4.1; break; } break; case 'L': /* pound sign */ switch (d) { case 'L': PUT_TO(text, 0xA3); break; default: Unknown string token2.2.4.1; break; } break; case '!': /* inverted Spanish exclamation mark */ if (d == '!') PUT_TO(text, 0xA1); else Unknown string 
token2.2.4.1; break; case '?': /* inverted Spanish question mark */ if (d == '?') PUT_TO(text, 0xBF); else Unknown string token2.2.4.1; break; case '<': /* Double-angle open quotation mark */ if (d == '<') PUT_TO(text, 0xAB); else Unknown string token2.2.4.1; break; case '>': /* Double-angle close quotation mark */ if (d == '>') PUT_TO(text, 0xBB); else Unknown string token2.2.4.1; break; case 's': /* German sharp s */ if (d == 's') PUT_TO(text, 0xDF); else Unknown string token2.2.4.1; break; case 'o': /* joined oe and ring accent A */ switch (d) { case 'a': PUT_TO(text, 0xE5); break; case 'A': PUT_TO(text, 0xC5); break; case 'e': PUT_TO(text, 0x153); break; default: Unknown string token2.2.4.1; break; } break; case 'O': /* joined OE */ switch (d) { case 'E': PUT_TO(text, 0x152); break; default: Unknown string token2.2.4.1; break; } break; default: WRITE_TO(text, "TOKEN<%S>", token); break; }
§2.2.4.1. Unknown string token2.2.4.1 =
/* Fallback for an escape which cannot be expanded: write an at-sign followed by the
token text. NOTE(review): for the decimal form the token holds only the digits, so
the second at-sign of the original escape is not reproduced; confirm this is
acceptable. */
WRITE_TO(text, "@%S", token);
/* Returns the value, from 0 to 15, of the hexadecimal digit c, which may be in
either case; or -1 if c is not a hexadecimal digit. */
int Tokenisation::hex_val(inchar32_t c) {
	int digit = -1;
	if (('0' <= c) && (c <= '9')) digit = (int) (c - '0');
	else if (('A' <= c) && (c <= 'F')) digit = 10 + (int) (c - 'A');
	else if (('a' <= c) && (c <= 'f')) digit = 10 + (int) (c - 'a');
	return digit;
}
§4. And similarly for single-quoted text notation, which shares some of the same
conventions. In fact I6 for some reason does not support the @@... decimal
notation within character or dictionary literals, throwing an error if it
is used; but we'll recognise it anyway, for the sake of using the same code as
is given above.
The tricky thing here is that single-quoted literals are characters if they
contain one character and do not have a // marker, but dictionary literals
otherwise. We need to know which because ^ is an escape character for a
single quotation mark in a dictionary literal, but not a character literal.
/* Rewrites text in place with the escapes of a single-quoted literal expanded. The
original content is first copied into the temporary raw and text is cleared; the
first paragraph below then sets is_dictionary_word, and the second writes the
expanded form of raw back into text, consulting that flag. */
void Tokenisation::de_escape_sq_text(text_stream *text) { TEMPORARY_TEXT(raw) WRITE_TO(raw, "%S", text); Str::clear(text); int is_dictionary_word = FALSE; Determine if this is a character or dictionary literal4.1; Expand the literal text4.2; DISCARD_TEXT(raw) }
§4.1. Determine if this is a character or dictionary literal4.1 =
int char_count = 0; for (int i=0; i<Str::len(raw); i++) { if ((Str::get_at(raw, i) == '/') && (Str::get_at(raw, i+1) == '/')) { is_dictionary_word = TRUE; break; } char_count++; if (Str::get_at(raw, i) == '@') { TEMPORARY_TEXT(token) int skip = 1, decimal = FALSE, hexadecimal = FALSE; Extract the escape token2.2.2; i += skip-1; DISCARD_TEXT(token) } } if (char_count > 1) is_dictionary_word = TRUE;
- This code is used in §4.
§4.2. Expand the literal text4.2 =
/* Copy raw into text one character at a time, expanding each at-sign escape as
hexadecimal, decimal or digraph according to the flags set when it is extracted.
A caret becomes a single quotation mark only when is_dictionary_word is set. On
meeting a double slash, the paragraph invoked copies out the remainder of raw and
then breaks; that break must bind to this for loop, so the invocation has to stay
directly inside the loop body. */
for (int i=0; i<Str::len(raw); i++) { inchar32_t c = Str::get_at(raw, i); if ((c == '/') && (Str::get_at(raw, i+1) == '/')) Past this point escape characters do not apply4.2.1; if (c == '@') { TEMPORARY_TEXT(token) int skip = 1, decimal = FALSE, hexadecimal = FALSE; Extract the escape token2.2.2; if (hexadecimal) Expand hexadecimal Unicode value2.2.3 else if (decimal) Expand decimal ZSCII value2.2.4 else Expand TeX-style digraph2.2.5; i += skip-1; DISCARD_TEXT(token) } else { if ((c == '^') && (is_dictionary_word)) PUT_TO(text, '\''); else PUT_TO(text, c); } }
- This code is used in §4.
§4.2.1. Past this point escape characters do not apply4.2.1 =
while (i < Str::len(raw)) { PUT_TO(text, Str::get_at(raw, i)); i++; } break;
- This code is used in §4.2.
/* Returns TRUE if c can appear in an identifier, and FALSE otherwise. Alphanumeric
characters, underscores and backticks are accepted; the underscore has to be
tested for explicitly, since it is a legal Inform 6 identifier character but is
not alphanumeric. */
int Tokenisation::identchar(inchar32_t c) {
	if ((Characters::isalnum(c)) || (c == '_') || (c == '`')) return TRUE;
	return FALSE;
}