Line Classifiers

Simple matching grammars to decide what LP content is present in a given text.

§1. Classifiers. A line classifier is really just a list of rules:

/* A line classifier: its only state is the list of rules added to it. */
classdef ls_classifier {
    struct linked_list *rules; /* of ls_notation_rule */
}

/* Make a fresh classifier whose rule list starts out empty. */
ls_classifier *LineClassifiers::new(void) {
    ls_classifier *classifier = CREATE(ls_classifier);
    classifier->rules = NEW_LINKED_LIST(ls_notation_rule);
    return classifier;
}

/* Add the rule R to the rule list of the classifier lc. */
void LineClassifiers::add_rule(ls_classifier *lc, ls_notation_rule *R) {
    ADD_TO_LINKED_LIST(R, ls_notation_rule, lc->rules);
}

The structure ls_classifier is accessed in 3/pl, 3/tp and here.

§2. Rules. Each rule consists of three pieces: a condition, which has to hold for the match to be allowed at all; a textual pattern which the line has to match; and then the result if the rule should succeed.

/* One classification rule: all three parts are held by value inside the rule. */
classdef ls_notation_rule {
    struct ls_notation_rule_condition condition; /* provided this is met... */
    struct ls_notation_rule_pattern pattern;     /* and the line matches this... */
    struct ls_notation_rule_outcome outcome;     /* then we classify like so */
}

/* Create a rule from its three parts, each of which is copied into the new
   structure. The rule is not added to any classifier here. */
ls_notation_rule *LineClassifiers::new_rule(ls_notation_rule_condition condition,
    ls_notation_rule_pattern pattern, ls_notation_rule_outcome outcome) {
    ls_notation_rule *rule = CREATE(ls_notation_rule);
    rule->condition = condition;
    rule->pattern = pattern;
    rule->outcome = outcome;
    return rule;
}

The structure ls_notation_rule is accessed in 1/cln, 2/lc, 5/ts, 5/tc, 5/fm, 5/hf and here.

§3. Parsing. The following takes pattern text pt and tail text tail, and either adds a valid rule to the classifier and returns NULL, or does nothing and returns a non-empty error message as text.

/* Parse one rule from its pattern text pt and its tail text, which has the
   form "OUTCOME" or "OUTCOME if CONDITION". On success the rule is added to lc
   and NULL is returned; otherwise nothing is added and the error text is
   returned. The parts are parsed in the order condition, outcome, pattern,
   stopping at the first which reports an error. */
text_stream *LineClassifiers::parse_rule(ls_classifier *lc, text_stream *pt, text_stream *tail) {
    text_stream *problem = NULL;
    match_results results = Regexp::create_mr();
    ls_notation_rule_condition when = LineClassifiers::truth_condition();
    if (Regexp::match(&results, tail, U"(%c+) if (%c+)")) {
        /* split off the condition, leaving just the outcome text in tail */
        tail = results.exp[0];
        when = LineClassifiers::parse_condition(results.exp[1], &problem);
    }
    if (Str::len(problem) == 0) {
        ls_notation_rule_outcome result = LineClassifiers::parse_outcome(tail, &problem);
        if (Str::len(problem) == 0) {
            ls_notation_rule_pattern shape =
                LineClassifiers::parse_pattern(pt, Conventions::generic_set(), &problem);
            if (Str::len(problem) == 0)
                LineClassifiers::add_rule(lc,
                    LineClassifiers::new_rule(when, shape, result));
        }
    }
    Regexp::dispose_of(&results);
    return problem;
}

§4. An annoying subtlety here is that the pattern part of a rule depends on the conventions in play, and they might have changed since the classifier was first created. So the following reparses all of the patterns in light of the conventions currently in force. That shouldn't throw errors, because any errors should have come up earlier; but, better safe than sorry.

It would be possible to rewrite the matching code so that there was no need for this reparsing, but that would classify lines more slowly, and speed counts here. The reparsing is done only once per weave or tangle, so it's costing us basically nothing in overhead. What must be fast is the matching code applied to every line.

/* Rebuild the pattern of every rule in lc from the text it was originally
   parsed from, this time using the given conventions. Any error arising is
   issued as a web error with no location attached. */
void LineClassifiers::reparse_patterns_with_new_conventions(ls_classifier *lc,
    struct linked_list *conventions) {
    ls_notation_rule *rule;
    LOOP_OVER_LINKED_LIST(rule, ls_notation_rule, lc->rules) {
        text_stream *problem = NULL;
        text_stream *source_text = rule->pattern.parsed_from;
        rule->pattern = LineClassifiers::parse_pattern(source_text, conventions, &problem);
        if (Str::len(problem) > 0) {
            WebErrors::issue_at(problem, NULL);
        }
    }
}

§5. Matching. We find the first rule in the list which applies to the given text, and return it; if none apply, we return NULL. What matches may depend on what has been classified in previous lines, which forms the context.

If a match is made, then the content of any wildcards is written into the supplied wildcards array.

define TRACE_LCLASSIFIER FALSE

/* The circumstances in which a line is being classified: this is what the
   conditions attached to rules are tested against. */
classdef ls_classifier_context {
    struct ls_class *previously; /* how the previous line was classified */
    int single_file;             /* is this in a single-file web? */
    int whitespace_nature;       /* of the current line: a *_LINESHADE value */
    struct ls_notation *ntn;     /* notation currently in use */
} ls_classifier_context;

/* Run through the rules of lc and return the first one whose condition holds
   in the given context and whose pattern matches full_text; return NULL if
   there is none. The pattern is tested only when the condition has been met.
   Wildcard contents from the pattern match are written into wildcards. */
ls_notation_rule *LineClassifiers::match(ls_classifier *lc, text_stream *full_text,
    ls_classifier_context *context, text_stream **wildcards) {
    if (TRACE_LCLASSIFIER) {
        WRITE_TO(STDERR, "Match %S (wsn %d, pwsn %d, psft %d)\n",
            full_text, context->whitespace_nature,
            context->previously->whitespace_nature, context->previously->follows_title);
    }

    ls_notation_rule *rule;
    LOOP_OVER_LINKED_LIST(rule, ls_notation_rule, lc->rules) {
        if ((LineClassifiers::condition_met(&(rule->condition), context)) &&
            (LineClassifiers::match_pattern(&(rule->pattern), full_text, wildcards))) {
            if (TRACE_LCLASSIFIER) {
                WRITE_TO(STDERR, "Success with outcome %d\n", rule->outcome.outcome_ID);
            }
            return rule;
        }
    }

    return NULL;
}

The structure ls_classifier_context is accessed in 1/cln, 1/ws, 1/sw, 2/ls, 2/lc, 2/hs, 5/ts, 5/apacs, 6/rw, 6/bf and here.

§6. Conditions. The condition applied to a rule — for example, if following title — is turned into one of these:

/* A condition on a rule: a single atomic test, optionally negated. */
classdef ls_notation_rule_condition {
    int negated; /* if TRUE, means we must be not in the given context */
    int atomic_condition; /* one of the *_LSNRCAC values below */
} ls_notation_rule_condition;

/* The condition used when a rule has no "if" clause: un-negated ANY_LSNRCAC. */
ls_notation_rule_condition LineClassifiers::truth_condition(void) {
    ls_notation_rule_condition always;
    always.negated = FALSE;
    always.atomic_condition = ANY_LSNRCAC;
    return always;
}

The structure ls_notation_rule_condition is private to this section.

§7. The "atomic conditions" are as follows:

enumerate ANY_LSNRCAC 0 /* represents "always true" */

enumerate FIRST_LINE_LSNRCAC

enumerate FIRST_LINE_SF_LSNRCAC

enumerate FOLLOWING_TITLE_LSNRCAC

enumerate DEFINITION_LSNRCAC

enumerate EXTRACT_LSNRCAC

enumerate HOLON_LSNRCAC

enumerate TEXTEXTRACT_LSNRCAC

enumerate INDENTED_LSNRCAC

enumerate PTAG_SUPPORTED_LSNRCAC

/* Parse the text ct of a condition, such as "following title" or
   "not in holon context". A leading "not " is handled by parsing the rest
   recursively and flipping its negation, so that "not not X" means X.

   If the text is not recognised, *error is set to a new text describing the
   problem and the always-true condition is returned; otherwise *error is
   left untouched. */
ls_notation_rule_condition LineClassifiers::parse_condition(text_stream *ct, text_stream **error) {
    ls_notation_rule_condition condition = LineClassifiers::truth_condition();
    match_results mr = Regexp::create_mr();
    if (Regexp::match(&mr, ct, U"not (%c+)")) {
        condition = LineClassifiers::parse_condition(mr.exp[0], error);
        condition.negated = (condition.negated)?FALSE:TRUE;
        /* fall through to the shared exit so that mr is always disposed of */
    } else {
        int AC = -1; /* i.e., not recognised */
        if (Str::eq(ct, I"on first line"))                AC = FIRST_LINE_LSNRCAC;
        if (Str::eq(ct, I"on first line of only file"))   AC = FIRST_LINE_SF_LSNRCAC;
        if (Str::eq(ct, I"following title"))              AC = FOLLOWING_TITLE_LSNRCAC;
        if (Str::eq(ct, I"in definition context"))        AC = DEFINITION_LSNRCAC;
        if (Str::eq(ct, I"in extract context"))           AC = EXTRACT_LSNRCAC;
        if (Str::eq(ct, I"in textextract context"))       AC = TEXTEXTRACT_LSNRCAC;
        if (Str::eq(ct, I"in holon context"))             AC = HOLON_LSNRCAC;
        if (Str::eq(ct, I"in indented context"))          AC = INDENTED_LSNRCAC;
        if (Str::eq(ct, I"paragraph tags supported"))     AC = PTAG_SUPPORTED_LSNRCAC;
        /* It is AC which must be tested here: condition.atomic_condition was
           initialised to ANY_LSNRCAC and so can never be negative. */
        if (AC < 0) {
            *error = Str::new();
            WRITE_TO(*error, "unknown condition '%S'", ct);
            condition.atomic_condition = ANY_LSNRCAC;
        } else {
            condition.atomic_condition = AC;
        }
    }
    Regexp::dispose_of(&mr);
    return condition;
}

§8. Whether conditions hold or not depends on the surrounding context:

/* Decide whether the condition holds in the given context, returning TRUE or
   FALSE. The atomic condition is evaluated first, and the result is then
   inverted if the condition is negated. Any atomic condition without a case
   of its own below, which includes ANY_LSNRCAC, is taken to hold. */
int LineClassifiers::condition_met(ls_notation_rule_condition *condition, ls_classifier_context *context) {
    /* We are at the top if the previous line has no classification */
    int top_flag = FALSE;
    if (context->previously->major == UNCLASSIFIED_MAJLC) top_flag = TRUE;
    int applies = FALSE;
    switch (condition->atomic_condition) {
        case FIRST_LINE_LSNRCAC:
            if (top_flag) applies = TRUE; break;
        case FIRST_LINE_SF_LSNRCAC:
            if ((context->single_file) && (top_flag)) applies = TRUE; break;
        case FOLLOWING_TITLE_LSNRCAC:
            if (context->previously->follows_title) applies = TRUE; break;
        case INDENTED_LSNRCAC:
            /* A white line straight after a black one does not count; nor does
               any case where this line or the previous one is black */
            if ((context->whitespace_nature == WHITE_LINESHADE) &&
                (context->previously->whitespace_nature == BLACK_LINESHADE))
                applies = FALSE;
            else if ((context->whitespace_nature != BLACK_LINESHADE) &&
                (context->previously->whitespace_nature != BLACK_LINESHADE))
                applies = TRUE;
            break;
        case DEFINITION_LSNRCAC:
            if (LineClassification::definition_lines_can_follow(
                context->previously->major, context->previously->minor)) applies = TRUE;
            break;
        case EXTRACT_LSNRCAC:
            if (LineClassification::extract_lines_can_follow(
                context->previously->major, context->previously->minor)) applies = TRUE;
            break;
        case TEXTEXTRACT_LSNRCAC:
            /* extract lines can follow, but code lines cannot */
            if ((LineClassification::extract_lines_can_follow(
                context->previously->major, context->previously->minor)) &&
                (LineClassification::code_lines_can_follow(
                context->previously->major, context->previously->minor) == FALSE)) applies = TRUE;
            break;
        case HOLON_LSNRCAC:
            if (LineClassification::code_lines_can_follow(
                context->previously->major, context->previously->minor)) applies = TRUE;
            break;
        case PTAG_SUPPORTED_LSNRCAC:
            if (WebNotation::supports_paragraph_tags(context->ntn)) applies = TRUE;
            break; /* without this, control falls into the default and always applies */
        default:
            applies = TRUE;
            break;
    }
    if (condition->negated) applies = applies?FALSE:TRUE;
    return applies;
}

§9. Patterns. Now for the textual pattern. We have a very simple model: the line must, once trailing whitespace is removed, match a sequence of tokens, each of which is either fixed wording or a wildcard meaning "one or more characters". The wildcards on a given line are numbered from 0, and each can only appear once; but they need not occur in numerical order, and need not all be present.

define MAX_LSSRTOKENS (2*NO_DEFINED_LSWILDCARD_VALUES+1)

/* A parsed pattern: a fixed-capacity sequence of tokens, together with the
   text it was parsed from, which is kept so that it can be reparsed. */
classdef ls_notation_rule_pattern {
    int strip_indents;                       /* number of <INDENT> markers at the start */
    int no_tokens;                           /* how many entries of tokens are in use */
    struct ls_srtoken tokens[MAX_LSSRTOKENS];
    struct text_stream *parsed_from;         /* copy of the original pattern text */
} ls_notation_rule_pattern;

/* An empty pattern: no tokens, no indentation to strip, no source text. The
   token array itself is not initialised, only the count of tokens in use. */
ls_notation_rule_pattern LineClassifiers::new_pattern(void) {
    ls_notation_rule_pattern blank;
    blank.parsed_from = NULL;
    blank.no_tokens = 0;
    blank.strip_indents = 0;
    return blank;
}

The structure ls_notation_rule_pattern is private to this section.

§10. So here are the tokens:

/* A single pattern token: either fixed wording or a wildcard. */
classdef ls_srtoken {
    struct text_stream *fixed_content; /* the wording, or NULL for a wildcard */
    int wildcard;                      /* a *_LSWILDCARD value, or -1 for fixed wording */
    int whitespace;                    /* set by (WHITESPACE) after a wildcard */
    int nonwhitespace;                 /* set by (NONWHITESPACE) after a wildcard */
    int digital;                       /* set by (DIGITS) after a wildcard */
} ls_srtoken;

/* Make a fixed-wording token whose content is a fresh copy of the characters
   of text at positions from to to inclusive. */
ls_srtoken LineClassifiers::fixed_token(text_stream *text, int from, int to) {
    ls_srtoken token;
    token.wildcard = -1; /* i.e., this is not a wildcard */
    token.whitespace = FALSE;
    token.nonwhitespace = FALSE;
    token.digital = FALSE;
    token.fixed_content = Str::new();
    int pos = from;
    while (pos <= to) {
        PUT_TO(token.fixed_content, Str::get_at(text, pos));
        pos++;
    }
    return token;
}

/* Make a token for wildcard number n, with none of its restriction flags set.
   It is an internal error for n not to be a defined wildcard value. */
ls_srtoken LineClassifiers::wildcard_token(int n) {
    int in_range = ((n >= MATERIAL_LSWILDCARD) && (n < NO_DEFINED_LSWILDCARD_VALUES))?TRUE:FALSE;
    if (in_range == FALSE) internal_error("wildcard out of range");
    ls_srtoken token;
    token.wildcard = n;
    token.fixed_content = NULL; /* wildcards have no wording of their own */
    token.digital = FALSE;
    token.nonwhitespace = FALSE;
    token.whitespace = FALSE;
    return token;
}

The structure ls_srtoken is private to this section.

§11. The following parses source code such as @enum MATERIAL(NONWHITESPACE) from SECOND into a ls_notation_rule_pattern.

enumerate MATERIAL_LSWILDCARD 0

enumerate SECOND_LSWILDCARD

enumerate THIRD_LSWILDCARD

enumerate FOURTH_LSWILDCARD

enumerate OPTIONS_LSWILDCARD

enumerate RESIDUE_LSWILDCARD

/* Parse the pattern text pt into a token sequence, in three stages:
   (1) the text is copied, with leading <INDENT> markers counted and removed,
       and with markers such as <OPENHOLON> replaced by the text which the
       given conventions supply for them;
   (2) the result is divided into tokens, the wildcard names MATERIAL, SECOND,
       THIRD, FOURTH, OPTIONS and RESIDUE becoming wildcard tokens and the
       stretches of text between them fixed tokens, while (WHITESPACE),
       (NONWHITESPACE) and (DIGITS) set a flag on the wildcard just before;
   (3) the tokens are checked: no two wildcards may be adjacent, and none
       may be used more than once.

   On failure *error is set to an explanation, and the pattern returned
   should not be used; on success *error is left alone. The temporary text
   is discarded on every exit from the function, including those for errors. */
ls_notation_rule_pattern LineClassifiers::parse_pattern(text_stream *pt,
    linked_list *conventions, text_stream **error) {
    ls_notation_rule_pattern pattern = LineClassifiers::new_pattern();
    pattern.parsed_from = Str::duplicate(pt);
    TEMPORARY_TEXT(text)
    /* Stage 1: expand the markers */
    for (int i=0; i<Str::len(pt); i++) {
        if (Str::includes_at(pt, i, I"<INDENT>")) {
            if (Str::len(text) > 0) {
                *error = I"<INDENT> can be used only at the start of a pattern";
                DISCARD_TEXT(text)
                return pattern;
            }
            pattern.strip_indents++;
            i += Str::len(I"<INDENT>") - 1; continue;
        }
        if (Str::includes_at(pt, i, I"<OPENHOLON>")) {
            WRITE_TO(text, "%S",
                Conventions::get_textual_from(conventions, HOLON_NAME_SYNTAX_LSCONVENTION));
            i += Str::len(I"<OPENHOLON>") - 1; continue;
        }
        if (Str::includes_at(pt, i, I"<CLOSEHOLON>")) {
            WRITE_TO(text, "%S",
                Conventions::get_textual2_from(conventions, HOLON_NAME_SYNTAX_LSCONVENTION));
            i += Str::len(I"<CLOSEHOLON>") - 1; continue;
        }
        if (Str::includes_at(pt, i, I"<OPENFILEHOLON>")) {
            WRITE_TO(text, "%S",
                Conventions::get_textual_from(conventions, FILE_HOLON_NAME_SYNTAX_LSCONVENTION));
            i += Str::len(I"<OPENFILEHOLON>") - 1; continue;
        }
        if (Str::includes_at(pt, i, I"<CLOSEFILEHOLON>")) {
            WRITE_TO(text, "%S",
                Conventions::get_textual2_from(conventions, FILE_HOLON_NAME_SYNTAX_LSCONVENTION));
            i += Str::len(I"<CLOSEFILEHOLON>") - 1; continue;
        }
        if (Str::includes_at(pt, i, I"<OPENTAG>")) {
            WRITE_TO(text, "%S",
                Conventions::get_textual_from(conventions, TAGS_SYNTAX_LSCONVENTION));
            i += Str::len(I"<OPENTAG>") - 1; continue;
        }
        if (Str::includes_at(pt, i, I"<CLOSETAG>")) {
            WRITE_TO(text, "%S",
                Conventions::get_textual2_from(conventions, TAGS_SYNTAX_LSCONVENTION));
            i += Str::len(I"<CLOSETAG>") - 1; continue;
        }
        PUT_TO(text, Str::get_at(pt, i));
    }
    /* Stage 2: tokenise. Here from marks the start of the fixed wording not
       yet made into a token. */
    int from = 0;
    for (int i=0; i<Str::len(text); i++) {
        /* a wildcard can need two more tokens: stop if there is no room */
        if (pattern.no_tokens + 2 > MAX_LSSRTOKENS) break;
        if (Str::includes_at(text, i, I"MATERIAL")) {
            if (from < i) pattern.tokens[pattern.no_tokens++] = LineClassifiers::fixed_token(text, from, i-1);
            pattern.tokens[pattern.no_tokens++] = LineClassifiers::wildcard_token(MATERIAL_LSWILDCARD);
            from = i + Str::len(I"MATERIAL");
            i = from - 1; continue;
        }
        if (Str::includes_at(text, i, I"SECOND")) {
            if (from < i) pattern.tokens[pattern.no_tokens++] = LineClassifiers::fixed_token(text, from, i-1);
            pattern.tokens[pattern.no_tokens++] = LineClassifiers::wildcard_token(SECOND_LSWILDCARD);
            from = i + Str::len(I"SECOND");
            i = from - 1; continue;
        }
        if (Str::includes_at(text, i, I"THIRD")) {
            if (from < i) pattern.tokens[pattern.no_tokens++] = LineClassifiers::fixed_token(text, from, i-1);
            pattern.tokens[pattern.no_tokens++] = LineClassifiers::wildcard_token(THIRD_LSWILDCARD);
            from = i + Str::len(I"THIRD");
            i = from - 1; continue;
        }
        if (Str::includes_at(text, i, I"FOURTH")) {
            if (from < i) pattern.tokens[pattern.no_tokens++] = LineClassifiers::fixed_token(text, from, i-1);
            pattern.tokens[pattern.no_tokens++] = LineClassifiers::wildcard_token(FOURTH_LSWILDCARD);
            from = i + Str::len(I"FOURTH");
            i = from - 1; continue;
        }
        if (Str::includes_at(text, i, I"OPTIONS")) {
            if (from < i) pattern.tokens[pattern.no_tokens++] = LineClassifiers::fixed_token(text, from, i-1);
            pattern.tokens[pattern.no_tokens++] = LineClassifiers::wildcard_token(OPTIONS_LSWILDCARD);
            from = i + Str::len(I"OPTIONS");
            i = from - 1; continue;
        }
        if (Str::includes_at(text, i, I"RESIDUE")) {
            if (from < i) pattern.tokens[pattern.no_tokens++] = LineClassifiers::fixed_token(text, from, i-1);
            pattern.tokens[pattern.no_tokens++] = LineClassifiers::wildcard_token(RESIDUE_LSWILDCARD);
            from = i + Str::len(I"RESIDUE");
            i = from - 1; continue;
        }
        /* Each modifier must directly follow a wildcard: the last token made
           must be a wildcard, with no fixed wording pending since it (i == from) */
        if (Str::includes_at(text, i, I"(WHITESPACE)")) {
            if ((pattern.no_tokens == 0) || (pattern.tokens[pattern.no_tokens-1].wildcard < 0) || (i != from)) {
                *error = I"(WHITESPACE) can be used only immediately after a wildcard";
                DISCARD_TEXT(text)
                return pattern;
            }
            pattern.tokens[pattern.no_tokens-1].whitespace = TRUE;
            from = i + Str::len(I"(WHITESPACE)");
            i = from - 1; continue;
        }
        if (Str::includes_at(text, i, I"(NONWHITESPACE)")) {
            if ((pattern.no_tokens == 0) || (pattern.tokens[pattern.no_tokens-1].wildcard < 0) || (i != from)) {
                *error = I"(NONWHITESPACE) can be used only immediately after a wildcard";
                DISCARD_TEXT(text)
                return pattern;
            }
            pattern.tokens[pattern.no_tokens-1].nonwhitespace = TRUE;
            from = i + Str::len(I"(NONWHITESPACE)");
            i = from - 1; continue;
        }
        if (Str::includes_at(text, i, I"(DIGITS)")) {
            if ((pattern.no_tokens == 0) || (pattern.tokens[pattern.no_tokens-1].wildcard < 0) || (i != from)) {
                *error = I"(DIGITS) can be used only immediately after a wildcard";
                DISCARD_TEXT(text)
                return pattern;
            }
            pattern.tokens[pattern.no_tokens-1].digital = TRUE;
            from = i + Str::len(I"(DIGITS)");
            i = from - 1; continue;
        }
    }
    /* Any wording left over at the end becomes a final fixed token */
    if ((from < Str::len(text)) && (pattern.no_tokens < MAX_LSSRTOKENS))
        pattern.tokens[pattern.no_tokens++] = LineClassifiers::fixed_token(text, from, Str::len(text)-1);
    /* Stage 3: check how the wildcards have been used */
    int usages[NO_DEFINED_LSWILDCARD_VALUES];
    for (int i=0; i<NO_DEFINED_LSWILDCARD_VALUES; i++) usages[i] = 0;
    for (int i=0; i<pattern.no_tokens; i++)
        if (pattern.tokens[i].wildcard >= 0) {
            usages[pattern.tokens[i].wildcard]++;
            if ((i < pattern.no_tokens - 1) && (pattern.tokens[i+1].wildcard >= 0)) {
                pattern.no_tokens = 1;
                *error = I"two consecutive wildcards in pattern";
                DISCARD_TEXT(text)
                return pattern;
            }
        }
    for (int i=0; i<NO_DEFINED_LSWILDCARD_VALUES; i++)
        if (usages[i] > 1) {
            pattern.no_tokens = 1;
            *error = I"wildcards can be used only once each in a pattern";
            DISCARD_TEXT(text)
            return pattern;
        }
    DISCARD_TEXT(text)
    return pattern;
}

§12. So now we match text against a given pattern:

/* Test whether full_text matches the pattern. The work is done by the two
   named paragraphs in the body, each of which can return from this function
   directly: the first returns FALSE if the line is not indented enough, and
   the second returns TRUE on a successful match. The closing return FALSE is
   therefore reached only when the matching loop has given up. */
int LineClassifiers::match_pattern(ls_notation_rule_pattern *pattern, text_stream *full_text,
    text_stream **wildcards) {
    TEMPORARY_TEXT(text) /* the line with its leading indentation stripped */
    Reduce the line indentation to allow for  markers12.1;
    Try to find a match against the text12.2;
    DISCARD_TEXT(text)
    return FALSE;
}

§12.1. Each <INDENT> marker at the start of the pattern represents one tab's worth of white space to strip from the start of the line being matched. This does that:

Reduce the line indentation to allow for markers12.1 =

    /* Copy full_text into text, but only from the point where the required
       indentation has been passed. Here wsc is the width of white space seen
       so far, with a tab advancing it to the next multiple of 4, and on
       becomes TRUE once that width is exactly 4 per <INDENT> marker. */
    int wsc = 0, on = (pattern->strip_indents == 0)?TRUE:FALSE;
    for (int i=0; i<Str::len(full_text); i++) {
        inchar32_t c = Str::get_at(full_text, i);
        if (on) {
            PUT_TO(text, c);
        } else {
            if (c == ' ') wsc++;
            else if (c == '\t') wsc = (wsc/4+1)*4;
            if (wsc == 4*pattern->strip_indents) on = TRUE;
        }
    }
    if (on == FALSE) {
        /* that is, no match: insufficient indentation. The temporary text is
           discarded first, since this return skips the discard at the end of
           the enclosing function. */
        DISCARD_TEXT(text)
        return FALSE;
    }

This code is used in §12.

§12.2. And now we try to make a textual match. Note that we clear the wildcard variables each time, since otherwise we could have results from a previous partial but failed match lingering on into a successful one.

Note that an empty token list matches only a whitespace line, since the text we are matching has already had its whitespace at each end trimmed, so that a whitespace line leads to the empty text here.

Try to find a match against the text12.2 =

    /* Clear out anything left in the wildcards by an earlier attempt */
    for (int i=0; i<NO_DEFINED_LSWILDCARD_VALUES; i++)
        if (wildcards[i]) Str::clear(wildcards[i]);
    /* The tokens still to be matched run from match_from to match_to, and the
       characters of text still to be matched from p_from to p_to, inclusive */
    int match_from = 0, match_to = pattern->no_tokens - 1;
    int p_from = 0, p_to = Str::len(text) - 1;
    while (match_from <= match_to) {
        if (TRACE_LCLASSIFIER) WRITE_TO(STDERR, "Match tokens %d to %d, chars %c to %c\n",
            match_from, match_to, Str::get_at(text, p_from), Str::get_at(text, p_to));
        If the leftmost token is fixed text, check that it matches12.2.1;
        If the rightmost token is fixed text, check that it matches12.2.2;
        If only one token is left, it must be a wildcard, so copy the remaining text into it12.2.3;
        At this point, the leftmost tokens must be a wildcard followed by fixed text, so look ahead12.2.4;
    }
    /* Success needs both the tokens and the text to have been used up */
    if ((match_from > match_to) && (p_from > p_to)) {
        /* discard the temporary text, since this return skips the discard at
           the end of the enclosing function */
        DISCARD_TEXT(text)
        return TRUE;
    }
    if (TRACE_LCLASSIFIER) WRITE_TO(STDERR, "Failure\n");

This code is used in §12.

§12.2.1. If the leftmost token is fixed text, check that it matches12.2.1 =

    /* A fixed leftmost token must occur in text at p_from: if it does not, we
       break out of the matching loop; if it does, we step past the token and
       its wording and go round the loop again. */
    if (pattern->tokens[match_from].wildcard < 0) {
        text_stream *prefix = pattern->tokens[match_from].fixed_content;
        if (Str::includes_at(text, p_from, prefix) == FALSE) break;
        p_from += Str::len(prefix);
        match_from++;
        continue;
    }

This code is used in §12.2.

§12.2.2. If the rightmost token is fixed text, check that it matches12.2.2 =

    /* A fixed rightmost token must occur in text so as to end exactly at p_to:
       if it does not, we break out of the matching loop; if it does, we trim
       the token and its wording from the right and go round the loop again. */
    if (pattern->tokens[match_to].wildcard < 0) {
        text_stream *suffix = pattern->tokens[match_to].fixed_content;
        if (Str::includes_at(text, p_to - Str::len(suffix) + 1, suffix) == FALSE) break;
        p_to -= Str::len(suffix);
        match_to--;
        continue;
    }

This code is used in §12.2.

§12.2.3. If only one token is left, it must be a wildcard, so copy the remaining text into it12.2.3 =

    /* The one remaining token takes all of the text from p_from to p_to. The
       restriction flags on the token are then applied to that text, and any
       failure breaks out of the matching loop. */
    if (match_from == match_to) {
        text_stream *WT = wildcards[pattern->tokens[match_from].wildcard];
        Str::substr(WT, Str::at(text, p_from), Str::at(text, p_to+1));
        /* (NONWHITESPACE): must contain no space or tab */
        if ((pattern->tokens[match_from].nonwhitespace) &&
            ((Str::includes_character(WT, ' ')) || (Str::includes_character(WT, '\t'))))
            break;
        /* (WHITESPACE): must be white space only */
        if ((pattern->tokens[match_from].whitespace) && (Str::is_whitespace(WT) == FALSE))
            break;
        /* (DIGITS): every character must be a digit */
        if (pattern->tokens[match_from].digital) {
            int not_digital = FALSE;
            for (int i=0; i<Str::len(WT); i++)
                if (Characters::isdigit(Str::get_at(WT, i)) == FALSE)
                    not_digital = TRUE;
            if (not_digital) break;
        }
        /* both the text and the tokens are now used up */
        p_from = p_to + 1;
        match_from++;
        continue;
    }

This code is used in §12.2.

§12.2.4. The leftmost token must be a wildcard because if it were fixed wording then we would have checked it already; it cannot be the only token, because we've just handled that case; so there is a next token, which cannot be a wildcard because we cannot have two consecutive wildcards. And therefore...

Note that we use a non-greedy algorithm, i.e., we make the earliest match possible, and with no backtracking to check other possibilities. This is a deliberately simple parser, intended to work quickly on simple unambiguous grammars.

At this point, the leftmost tokens must be a wildcard followed by fixed text, so look ahead12.2.4 =

    /* Look for the earliest place where the fixed wording after the wildcard
       occurs. The search starts at p_from+1, so that the wildcard takes at
       least one character, and stops at l_to. On finding it, the text before
       it goes into the wildcard and both tokens are stepped past. */
    if (pattern->tokens[match_from+1].wildcard >= 0) internal_error("consecutive wildcard tokens");
    int lookahead = p_from+1, l_to = p_to - Str::len(pattern->tokens[match_from+1].fixed_content);
    for (; lookahead <= l_to; lookahead++)
        if (Str::includes_at(text, lookahead, pattern->tokens[match_from+1].fixed_content)) {
            text_stream *WT = wildcards[pattern->tokens[match_from].wildcard];
            Str::substr(WT, Str::at(text, p_from), Str::at(text, lookahead));
            p_from = lookahead + Str::len(pattern->tokens[match_from+1].fixed_content);
            match_from += 2;
            break;
        } else if ((pattern->tokens[match_from].nonwhitespace) && (Characters::is_whitespace(Str::get_at(text, lookahead))))
            lookahead = l_to + 1; /* white space in a (NONWHITESPACE) wildcard: end the search */
    /* if the search ran out without a find, the whole match has failed */
    if (lookahead > l_to) break;

This code is used in §12.2.

§13. Outcomes and their options. If successful, a rule produces an "outcome" such as namedholon or code, together perhaps with options such as earlyholonoption.

/* What a rule produces when it matches: an outcome ID, plus a bitmap of the
   options applied to it. */
classdef ls_notation_rule_outcome {
    int outcome_ID;            /* one of the *_LSNROID values below */
    int options_applied;       /* a bitmap of *_LSNROBIT values below */
    int new_paragraph;         /* does this line implicitly begin a new para? */
    struct text_stream *error; /* on a match, in fact throw this error */
} ls_notation_rule_outcome;

/* A blank outcome: NO_LSNROID, with no options, no new paragraph, no error. */
ls_notation_rule_outcome LineClassifiers::new_outcome(void) {
    ls_notation_rule_outcome blank;
    blank.error = NULL;
    blank.new_paragraph = FALSE;
    blank.options_applied = 0;
    blank.outcome_ID = NO_LSNROID;
    return blank;
}

/* Parse the outcome text ot of a rule. This is either of the form
   error "MESSAGE", or else is an outcome name which can be followed by any
   number of "with OPTION" clauses and then by "in new paragraph".

   If an option or the outcome name is not recognised, *error is set to a new
   text saying so; an unrecognised outcome is also replaced by commentary. */
ls_notation_rule_outcome LineClassifiers::parse_outcome(text_stream *ot, text_stream **error) {
    match_results mr = Regexp::create_mr();
    ls_notation_rule_outcome outcome = LineClassifiers::new_outcome();

    /* The error form is dealt with at once */
    if (Regexp::match(&mr, ot, U"error \"(%c+)\"")) {
        outcome.outcome_ID = COMMENTARY_LSNROID;
        outcome.error = Str::duplicate(mr.exp[0]);
        Regexp::dispose_of(&mr);
        return outcome;
    }

    if (Regexp::match(&mr, ot, U"(%c+) in new paragraph")) {
        ot = Str::duplicate(mr.exp[0]);
        outcome.new_paragraph = TRUE;
    }

    /* Strip the options off one at a time, working from the right */
    while (Regexp::match(&mr, ot, U"(%c+) with (%C+)")) {
        ot = Str::duplicate(mr.exp[0]);
        int opt = LineClassifiers::outcome_by_name(mr.exp[1]);
        if (opt == NO_LSNROID) {
            *error = Str::new();
            WRITE_TO(*error, "unknown option '%S'", mr.exp[1]);
            continue;
        }
        int B = LineClassifiers::option_bit(opt);
        if (B == -1) {
            *error = Str::new();
            WRITE_TO(*error, "an outcome, not an option: '%S'", mr.exp[1]);
            continue;
        }
        outcome.options_applied |= B;
    }

    /* What is left must be the name of the outcome itself */
    outcome.outcome_ID = LineClassifiers::outcome_by_name(ot);
    if (outcome.outcome_ID == NO_LSNROID) {
        *error = Str::new();
        WRITE_TO(*error, "unknown outcome '%S'", ot);
        outcome.outcome_ID = COMMENTARY_LSNROID;
    }
    Regexp::dispose_of(&mr);
    return outcome;
}

The structure ls_notation_rule_outcome is accessed in 2/ls, 2/lc and here.

§14. These outcome and option IDs share an enumeration; first, here are the outcomes. Note that NO_LSNROID is never the outcome of any rule: it's a value used to mean "nothing matched".

enumerate NO_LSNROID 0

enumerate AUDIO_LSNROID

enumerate BEGINPARAGRAPH_LSNROID

enumerate CAROUSELEND_LSNROID

enumerate CAROUSELSLIDE_LSNROID

enumerate CODE_LSNROID

enumerate COMMENTARY_LSNROID

enumerate DEFINITION_LSNROID

enumerate DEFINITIONCONTINUED_LSNROID

enumerate DOWNLOAD_LSNROID

enumerate EMBEDDEDVIDEO_LSNROID

enumerate ENDEXTRACT_LSNROID

enumerate ENUMERATION_LSNROID

enumerate EXTRACT_LSNROID

enumerate FIGURE_LSNROID

enumerate FILEHOLON_LSNROID

enumerate FORMATIDENTIFIER_LSNROID

enumerate HTML_LSNROID

enumerate INCLUDEFILE_LSNROID

enumerate MAKEDEFINITIONSHERE_LSNROID

enumerate MAKECLASSESHERE_LSNROID

enumerate NAMEDHOLON_LSNROID

enumerate NAMELESSHOLON_LSNROID

enumerate PARAGRAPHTAG_LSNROID

enumerate PARAGRAPHTITLING_LSNROID

enumerate PARTITION_LSNROID

enumerate PURPOSE_LSNROID

enumerate QUOTATION_LSNROID

enumerate TEXTASCODEEXTRACT_LSNROID

enumerate TEXTEXTRACT_LSNROID

enumerate TEXTEXTRACTTO_LSNROID

enumerate TITLE_LSNROID

enumerate VIDEO_LSNROID

§15. And here are the options which some of the above may be given:

enumerate HYPERLINKED_LSNROID

enumerate UNDISPLAYED_LSNROID

enumerate WEBWIDEHOLON_LSNROID

enumerate TANGLEDTOHOLON_LSNROID

enumerate VERYEARLYHOLON_LSNROID

enumerate EARLYHOLON_LSNROID

enumerate LATEHOLON_LSNROID

enumerate VERYLATEHOLON_LSNROID

enumerate CONTINUATION_LSNROID

enumerate SUPERHEADING_LSNROID

enumerate LEVEL1_LSNROID

enumerate LEVEL2_LSNROID

enumerate LEVEL3_LSNROID

enumerate LEVEL4_LSNROID

enumerate LEVEL5_LSNROID

enumerate SILENT_LSNROID

enumerate WITHPURPOSE_LSNROID

enumerate CAPTIONABOVE_LSNROID

enumerate CAPTIONBELOW_LSNROID

enumerate INCLUSIVE_LSNROID

enumerate DEFAULT_LSNROID

§16. The following converts outcome/option names to their enumerated values:

/* Convert the name of an outcome or of an option to its *_LSNROID value. The
   name must be matched exactly by Str::eq against one of the names below,
   and NO_LSNROID is returned if it matches none of them. */
int LineClassifiers::outcome_by_name(text_stream *outcome) {
    /* First the outcomes proper */
    if (Str::eq(outcome, I"audio"))                return AUDIO_LSNROID;
    if (Str::eq(outcome, I"beginparagraph"))       return BEGINPARAGRAPH_LSNROID;
    if (Str::eq(outcome, I"carouselend"))          return CAROUSELEND_LSNROID;
    if (Str::eq(outcome, I"carouselslide"))        return CAROUSELSLIDE_LSNROID;
    if (Str::eq(outcome, I"code"))                 return CODE_LSNROID;
    if (Str::eq(outcome, I"commentary"))           return COMMENTARY_LSNROID;
    if (Str::eq(outcome, I"definition"))           return DEFINITION_LSNROID;
    if (Str::eq(outcome, I"definitioncontinued"))  return DEFINITIONCONTINUED_LSNROID;
    if (Str::eq(outcome, I"download"))             return DOWNLOAD_LSNROID;
    if (Str::eq(outcome, I"embeddedvideo"))        return EMBEDDEDVIDEO_LSNROID;
    if (Str::eq(outcome, I"endextract"))           return ENDEXTRACT_LSNROID;
    if (Str::eq(outcome, I"enumeration"))          return ENUMERATION_LSNROID;
    if (Str::eq(outcome, I"extract"))              return EXTRACT_LSNROID;
    if (Str::eq(outcome, I"figure"))               return FIGURE_LSNROID;
    if (Str::eq(outcome, I"fileholon"))            return FILEHOLON_LSNROID;
    if (Str::eq(outcome, I"formatidentifier"))     return FORMATIDENTIFIER_LSNROID;
    if (Str::eq(outcome, I"html"))                 return HTML_LSNROID;
    if (Str::eq(outcome, I"includefile"))          return INCLUDEFILE_LSNROID;
    if (Str::eq(outcome, I"makedefinitionshere"))  return MAKEDEFINITIONSHERE_LSNROID;
    if (Str::eq(outcome, I"makeclasseshere"))      return MAKECLASSESHERE_LSNROID;
    if (Str::eq(outcome, I"namedholon"))           return NAMEDHOLON_LSNROID;
    if (Str::eq(outcome, I"namelessholon"))        return NAMELESSHOLON_LSNROID;
    if (Str::eq(outcome, I"paragraphtag"))         return PARAGRAPHTAG_LSNROID;
    if (Str::eq(outcome, I"paragraphtitling"))     return PARAGRAPHTITLING_LSNROID;
    if (Str::eq(outcome, I"partition"))            return PARTITION_LSNROID;
    if (Str::eq(outcome, I"purpose"))              return PURPOSE_LSNROID;
    if (Str::eq(outcome, I"quotation"))            return QUOTATION_LSNROID;
    if (Str::eq(outcome, I"textascodeextract"))    return TEXTASCODEEXTRACT_LSNROID;
    if (Str::eq(outcome, I"textextract"))          return TEXTEXTRACT_LSNROID;
    if (Str::eq(outcome, I"textextractto"))        return TEXTEXTRACTTO_LSNROID;
    if (Str::eq(outcome, I"title"))                return TITLE_LSNROID;
    if (Str::eq(outcome, I"video"))                return VIDEO_LSNROID;

    /* And now the options, arranged in groups */
    if (Str::eq(outcome, I"hyperlinkedoption"))    return HYPERLINKED_LSNROID;
    if (Str::eq(outcome, I"undisplayedoption"))    return UNDISPLAYED_LSNROID;

    if (Str::eq(outcome, I"webwideholonoption"))   return WEBWIDEHOLON_LSNROID;
    if (Str::eq(outcome, I"tangledtoholonoption")) return TANGLEDTOHOLON_LSNROID;
    if (Str::eq(outcome, I"veryearlyholonoption")) return VERYEARLYHOLON_LSNROID;
    if (Str::eq(outcome, I"earlyholonoption"))     return EARLYHOLON_LSNROID;
    if (Str::eq(outcome, I"lateholonoption"))      return LATEHOLON_LSNROID;
    if (Str::eq(outcome, I"verylateholonoption"))  return VERYLATEHOLON_LSNROID;

    if (Str::eq(outcome, I"continuationoption"))   return CONTINUATION_LSNROID;

    if (Str::eq(outcome, I"superheadingoption"))   return SUPERHEADING_LSNROID;
    if (Str::eq(outcome, I"subheading1option"))    return LEVEL1_LSNROID;
    if (Str::eq(outcome, I"subheading2option"))    return LEVEL2_LSNROID;
    if (Str::eq(outcome, I"subheading3option"))    return LEVEL3_LSNROID;
    if (Str::eq(outcome, I"subheading4option"))    return LEVEL4_LSNROID;
    if (Str::eq(outcome, I"subheading5option"))    return LEVEL5_LSNROID;

    if (Str::eq(outcome, I"silentoption"))         return SILENT_LSNROID;

    if (Str::eq(outcome, I"withpurposeoption"))    return WITHPURPOSE_LSNROID;

    if (Str::eq(outcome, I"captionaboveoption"))   return CAPTIONABOVE_LSNROID;
    if (Str::eq(outcome, I"captionbelowoption"))   return CAPTIONBELOW_LSNROID;

    /* NOTE(review): this is the only option name without an "option" suffix:
       confirm that this is what notation files are expected to use */
    if (Str::eq(outcome, I"inclusive"))            return INCLUSIVE_LSNROID;

    if (Str::eq(outcome, I"defaultoption"))        return DEFAULT_LSNROID;

    return NO_LSNROID; /* i.e., the name was not recognised */
}

§17. The following bits are high enough up that a valid options bitmap can never equal a valid outcome ID, but at present we make no use of this fact.

define HYPERLINKED_LSNROBIT 0x000100

define UNDISPLAYED_LSNROBIT 0x000200

define WEBWIDEHOLON_LSNROBIT 0x000400

define TANGLEDTOHOLON_LSNROBIT 0x000800

define VERYEARLYHOLON_LSNROBIT 0x001000

define EARLYHOLON_LSNROBIT 0x002000

define LATEHOLON_LSNROBIT 0x004000

define VERYLATEHOLON_LSNROBIT 0x008000

define CONTINUATION_LSNROBIT 0x010000

define SUPERHEADING_LSNROBIT 0x020000

define LEVEL1_LSNROBIT 0x040000

define LEVEL2_LSNROBIT 0x080000

define LEVEL3_LSNROBIT 0x100000

define LEVEL4_LSNROBIT 0x200000

define LEVEL5_LSNROBIT 0x400000

define SILENT_LSNROBIT 0x800000

define WITHPURPOSE_LSNROBIT 0x1000000

define CAPTIONABOVE_LSNROBIT 0x2000000

define CAPTIONBELOW_LSNROBIT 0x4000000

define INCLUSIVE_LSNROBIT 0x8000000

define DEFAULT_LSNROBIT 0x10000000

/* Convert the *_LSNROID value of an option to its *_LSNROBIT bit in the
   options bitmap. Any other value, which includes every outcome ID proper,
   produces -1 instead. */
int LineClassifiers::option_bit(int O) {
    int bit = -1; /* the result unless O turns out to be an option */
    switch (O) {
        case HYPERLINKED_LSNROID:    bit = HYPERLINKED_LSNROBIT; break;
        case UNDISPLAYED_LSNROID:    bit = UNDISPLAYED_LSNROBIT; break;

        case WEBWIDEHOLON_LSNROID:   bit = WEBWIDEHOLON_LSNROBIT; break;
        case TANGLEDTOHOLON_LSNROID: bit = TANGLEDTOHOLON_LSNROBIT; break;
        case VERYEARLYHOLON_LSNROID: bit = VERYEARLYHOLON_LSNROBIT; break;
        case EARLYHOLON_LSNROID:     bit = EARLYHOLON_LSNROBIT; break;
        case LATEHOLON_LSNROID:      bit = LATEHOLON_LSNROBIT; break;
        case VERYLATEHOLON_LSNROID:  bit = VERYLATEHOLON_LSNROBIT; break;

        case CONTINUATION_LSNROID:   bit = CONTINUATION_LSNROBIT; break;

        case SUPERHEADING_LSNROID:   bit = SUPERHEADING_LSNROBIT; break;
        case LEVEL1_LSNROID:         bit = LEVEL1_LSNROBIT; break;
        case LEVEL2_LSNROID:         bit = LEVEL2_LSNROBIT; break;
        case LEVEL3_LSNROID:         bit = LEVEL3_LSNROBIT; break;
        case LEVEL4_LSNROID:         bit = LEVEL4_LSNROBIT; break;
        case LEVEL5_LSNROID:         bit = LEVEL5_LSNROBIT; break;

        case SILENT_LSNROID:         bit = SILENT_LSNROBIT; break;

        case WITHPURPOSE_LSNROID:    bit = WITHPURPOSE_LSNROBIT; break;

        case CAPTIONABOVE_LSNROID:   bit = CAPTIONABOVE_LSNROBIT; break;
        case CAPTIONBELOW_LSNROID:   bit = CAPTIONBELOW_LSNROBIT; break;

        case INCLUSIVE_LSNROID:      bit = INCLUSIVE_LSNROBIT; break;

        case DEFAULT_LSNROID:        bit = DEFAULT_LSNROBIT; break;
    }
    return bit;
}