Conjugating verbs into the many different forms they can take.
- §2. Finding
- §6. Making conjugations
- §7. Follow instructions
- §8. Merge verb material
- §11. Parsing verb form numbers
- §12. Parsing tense and sense indicators
- §13. Parsing utilities
- §15. Errors
- §16. Testing
§1. We will need to turn a base form of a verb — in English, this is always the infinitive — into up to 123 variants; we manage this with quite an extensive data structure. There will typically only be a few dozen fully conjugated verbs in any source text, though, so the memory cost isn't too extreme. For English it looks wasteful, since so many forms are the same, but for French (say) they are almost all different.
/* One fully conjugated verb: its infinitive and two participles, plus one
   table of inflected forms (a verb_tabulation) for each known voice. */
classdef verb_conjugation {
	struct word_assemblage infinitive; /* not counting the "to", in English */
	struct word_assemblage past_participle;
	struct word_assemblage present_participle;
	struct verb_tabulation tabulations[NO_KNOWN_VOICES]; /* indexed by voice: see ACTIVE_VOICE, PASSIVE_VOICE */
	NATURAL_LANGUAGE_WORDS_TYPE *defined_in; /* the language passed when this conjugation was made */
	#ifdef LINGUISTICS_MODULE
	struct verb *vc_conjugates; /* set to NULL when the conjugation is created */
	#endif
	#ifdef VC_COMPILATION_INFLECTIONS_CALLBACK
	struct verb_conjugation_compilation_data compilation_data;
	#endif
	int auxiliary_only; /* used only as an auxiliary, e.g. the "have" in "I have gone" */
	int instance_of_verb; /* defines an instance of kind "verb" at run-time */
}

/* The inflected forms for a single voice. Both arrays are indexed as
   [tense][sense][person][number]. */
typedef struct verb_tabulation {
	struct word_assemblage to_be_auxiliary; /* use this if non-empty */
	struct word_assemblage vc_text[NO_KNOWN_TENSES][NO_KNOWN_SENSES][NO_KNOWN_PERSONS][NO_KNOWN_NUMBERS];
	/* the digit of a "++digit" marker met while merging this slot, or 0 if none was */
	int modal_auxiliary_usage[NO_KNOWN_TENSES][NO_KNOWN_SENSES][NO_KNOWN_PERSONS][NO_KNOWN_NUMBERS];
} verb_tabulation;
- The structure verb_conjugation is accessed in 2/plr and here.
- The structure verb_tabulation is private to this section.
/* Return the first existing conjugation whose infinitive equals the one
   given, or NULL if there is none. */
verb_conjugation *Conjugation::find_by_infinitive(word_assemblage infinitive) {
	verb_conjugation *candidate;
	LOOP_OVER(candidate, verb_conjugation) {
		if (WordAssemblages::eq(&infinitive, &(candidate->infinitive)))
			return candidate;
	}
	return NULL;
}
/* Return the first conjugation other than nvc itself which is equal to nvc
   in the sense of Conjugation::eq, or NULL if there is none (or if nvc is
   NULL). */
verb_conjugation *Conjugation::find_prior(verb_conjugation *nvc) {
	if (nvc) {
		verb_conjugation *other;
		LOOP_OVER(other, verb_conjugation) {
			if (other == nvc) continue; /* a conjugation is not its own predecessor */
			if (Conjugation::eq(nvc, other)) return other;
		}
	}
	return NULL;
}
/* Test whether two conjugations are identical: same infinitive and
   participles, and for every voice the same to-be auxiliary, the same text
   in every slot, and the same modal auxiliary usage in every slot.
   Returns TRUE or FALSE. */
int Conjugation::eq(verb_conjugation *nvc, verb_conjugation *vc) {
	/* the principal parts must agree before the tables are worth comparing */
	if (WordAssemblages::eq(&(nvc->infinitive), &(vc->infinitive)) == FALSE)
		return FALSE;
	if (WordAssemblages::eq(&(nvc->past_participle), &(vc->past_participle)) == FALSE)
		return FALSE;
	if (WordAssemblages::eq(&(nvc->present_participle), &(vc->present_participle)) == FALSE)
		return FALSE;

	for (int voice=0; voice<NO_KNOWN_VOICES; voice++) {
		verb_tabulation *left = &(nvc->tabulations[voice]);
		verb_tabulation *right = &(vc->tabulations[voice]);
		if (WordAssemblages::eq(&(left->to_be_auxiliary), &(right->to_be_auxiliary)) == FALSE)
			return FALSE;
		for (int t=0; t<NO_KNOWN_TENSES; t++)
			for (int s=0; s<NO_KNOWN_SENSES; s++)
				for (int p=0; p<NO_KNOWN_PERSONS; p++)
					for (int n=0; n<NO_KNOWN_NUMBERS; n++) {
						if (WordAssemblages::eq(&(left->vc_text[t][s][p][n]),
							&(right->vc_text[t][s][p][n])) == FALSE)
							return FALSE;
						if (left->modal_auxiliary_usage[t][s][p][n] !=
							right->modal_auxiliary_usage[t][s][p][n])
							return FALSE;
					}
	}
	return TRUE;
}
/* Write a conjugation to the stream: the principal parts, then one row per
   tense for each voice and sense. Empty slots are shown as "--". If the
   passive has a non-empty to-be auxiliary, only the active voice is
   tabulated and the auxiliary is printed at the end instead. */
void Conjugation::write(OUTPUT_STREAM, verb_conjugation *vc) {
	WRITE("Infinitive: %A / Present participle: %A / Past participle: %A^",
		&(vc->infinitive), &(vc->present_participle), &(vc->past_participle));

	word_assemblage *passive_aux = &(vc->tabulations[PASSIVE_VOICE].to_be_auxiliary);
	int passive_uses_to_be = WordAssemblages::nonempty(*passive_aux);
	int voices_shown = (passive_uses_to_be)?1:2;

	for (int voice=0; voice<voices_shown; voice++) {
		verb_tabulation *table = (voice == 0)?
			(&(vc->tabulations[ACTIVE_VOICE])):(&(vc->tabulations[PASSIVE_VOICE]));
		for (int sense=0; sense<NO_KNOWN_SENSES; sense++) {
			if (voice == 0) WRITE("Active "); else WRITE("Passive ");
			if (sense == 0) WRITE("positive^"); else WRITE("negative^");
			for (int tense=0; tense<NO_KNOWN_TENSES; tense++) {
				WRITE("Tense %d: ", tense);
				int slots_written = 0; /* so that " / " goes between slots, not before the first */
				for (int n=0; n<NO_KNOWN_NUMBERS; n++)
					for (int p=0; p<NO_KNOWN_PERSONS; p++) {
						if (slots_written > 0) WRITE(" / ");
						slots_written++;
						word_assemblage *wa = &(table->vc_text[tense][sense][p][n]);
						if (WordAssemblages::nonempty(*wa)) WRITE("%A", wa);
						else WRITE("--");
					}
				WRITE("^");
			}
		}
	}

	if (passive_uses_to_be)
		WRITE("Form passive as to be + %A\n", passive_aux);
}
§6. Making conjugations. The following will make more sense if read alongside the examples in "English Inflections", which explains the format in full.
The crucial early step here is Conjugation::follow_instructions, which has
two tasks to perform: it works out the numbered verb forms, and it chooses
which tabulation will be used. Verb form number 0 is always the base text,
and subsequent numbers include some which are universal across all verbs
(these have *_FORM_TYPE constants), and others which vary from one
conjugation to another.
/* Conjugate the verb with the given base text in language nl, supplying no
   irregular forms: a convenience wrapper for
   Conjugation::conjugate_with_overrides. */
verb_conjugation *Conjugation::conjugate(word_assemblage base_text,
	NATURAL_LANGUAGE_WORDS_TYPE *nl) {
	return Conjugation::conjugate_with_overrides(base_text, NULL, 0, nl);
}

/* Conjugate the verb with the given base text. overrides is an array of
   no_overrides word assemblages indexed by verb form number; each non-empty
   entry from index 1 upwards replaces the form which the grammar would
   otherwise have produced. An empty base_text is an internal error.
   Returns the newly created conjugation. */
verb_conjugation *Conjugation::conjugate_with_overrides(word_assemblage base_text,
	word_assemblage *overrides, int no_overrides, NATURAL_LANGUAGE_WORDS_TYPE *nl) {
	nl = DefaultLanguage::get(nl); /* presumably substitutes a default language for NULL -- confirm */
	if (WordAssemblages::nonempty(base_text) == FALSE)
		internal_error("No base text for verb conjugation");
	word_assemblage verb_forms[MAX_FORM_TYPES+1]; /* indexed by verb form number; 0 is the base text */
	Initialise all verb forms to the base text6.1;

	/* the grammar-driven pass sets numbered verb forms and picks the tabulation */
	int n = 1, aux_len = 0, avo_flag = FALSE, niv_flag = FALSE;
	nonterminal *tabulation = Conjugation::follow_instructions(verb_forms, &n, &aux_len, &avo_flag, &niv_flag, nl);

	/* overrides are applied after the grammar so that they take precedence */
	Override any verb forms with supplied irregularities6.2;
	Use the verb forms and the tabulation to make the conjugation6.3;
}
§6.1. Initialise all verb forms to the base text6.1 =
/* every entry, including the last at index MAX_FORM_TYPES, starts as the base text */
for (int form=0; form<=MAX_FORM_TYPES; form++) {
	verb_forms[form] = base_text;
}
- This code is used in §6.
§6.2. This feature is provided so that English verb definitions can override the usual grammatical rules, which enables us to create new irregular verbs. For example, Inform will by default make the past participle "blended" out of the verb "to blend", but a definition like
To blend (I blend, he blends, it is blent) ...
will cause "blent" to override "blended" in the PAST_PARTICIPLE_FORM_TYPE.
(Philip Larkin's poem "Church Going" uses "blent", but I've never seen
anybody else try this one on.)
Note that verb form 0 can't be overridden: that was the base text.
Override any verb forms with supplied irregularities6.2 =
for (int k=1; k<no_overrides; k++) if (WordAssemblages::nonempty(overrides[k])) verb_forms[k] = overrides[k];
- This code is used in §6.
§6.3. Use the verb forms and the tabulation to make the conjugation6.3 =
/* Create the conjugation object and copy in what has been worked out so far */
verb_conjugation *vc = CREATE(verb_conjugation);
#ifdef LINGUISTICS_MODULE
vc->vc_conjugates = NULL;
#endif
vc->infinitive = verb_forms[INFINITIVE_FORM_TYPE];
vc->present_participle = verb_forms[PRESENT_PARTICIPLE_FORM_TYPE];
vc->past_participle = verb_forms[PAST_PARTICIPLE_FORM_TYPE];
vc->defined_in = nl;
vc->auxiliary_only = avo_flag;
vc->instance_of_verb = (niv_flag)?FALSE:TRUE; /* the flag is negative: set means "not an instance" */
#ifdef VC_COMPILATION_INFLECTIONS_CALLBACK
VC_COMPILATION_INFLECTIONS_CALLBACK(vc);
#endif

/* now fill in the tables for each voice */
Start by blanking out all the passive and active slots6.3.1;
Work through the supplied tabulation, filling in slots as directed6.3.2;
return vc;
- This code is used in §6.
§6.3.1. Start by blanking out all the passive and active slots6.3.1 =
/* Every slot starts out empty. The modal auxiliary usage must be cleared as
   well as the text: Conjugation::eq compares it for every slot, but it is
   otherwise written only for those slots which a tabulation line fills in. */
vc->tabulations[ACTIVE_VOICE].to_be_auxiliary = WordAssemblages::lit_0();
vc->tabulations[PASSIVE_VOICE].to_be_auxiliary = WordAssemblages::lit_0();
for (int t=0; t<NO_KNOWN_TENSES; t++)
	for (int s=0; s<NO_KNOWN_SENSES; s++)
		for (int n=0; n<NO_KNOWN_NUMBERS; n++)
			for (int p=0; p<NO_KNOWN_PERSONS; p++) {
				vc->tabulations[ACTIVE_VOICE].vc_text[t][s][p][n] = WordAssemblages::lit_0();
				vc->tabulations[ACTIVE_VOICE].modal_auxiliary_usage[t][s][p][n] = 0;
				vc->tabulations[PASSIVE_VOICE].vc_text[t][s][p][n] = WordAssemblages::lit_0();
				vc->tabulations[PASSIVE_VOICE].modal_auxiliary_usage[t][s][p][n] = 0;
			}
- This code is used in §6.3.
§6.3.2. A tabulation is a sort of program laying out what to put in which slots, active or passive. Each production is a step in this program, and it consists of a "selector" followed by a "line". For example, the production:
a3 ( t1 avoir ) 3+*
contains six tokens; the selector is a3, and the line is made up from the
rest. (The selector is always just a single token.)
Work through the supplied tabulation, filling in slots as directed6.3.2 =
/* Run through every production of the tabulation nonterminal which is
   defined for the language nl. Each must be a fixed-word selector followed
   by at least one further token, the "line"; anything else is reported
   through Conjugation::error. */
production_list *pl;
for (pl = tabulation->first_pl; pl; pl = pl->next_pl) {
	if (nl == pl->definition_language) {
		for (production *pr = pl->first_pr; pr; pr = pr->next_pr) {
			ptoken *selector = pr->first_pt;
			ptoken *line = (selector)?(selector->next_pt):NULL; /* NULL if the row is empty or has only a selector */
			if ((selector) && (selector->ptoken_category == FIXED_WORD_PTC) && (line)) {
				Apply the given tabulation line to the slots selected6.3.2.1;
			} else Conjugation::error(base_text, tabulation, pr,
				"tabulation row doesn't consist of a selector and then text");
		}
	}
}
- This code is used in §6.3.
§6.3.2.1. Apply the given tabulation line to the slots selected6.3.2.1 =
/* What the selector asks for: active_set is TRUE for active, FALSE for
   passive; tense_set and sense_set are -1 to mean "all", or else the one
   tense or sense to fill; set_tba means the line gives the passive's to-be
   auxiliary. */
int active_set = NOT_APPLICABLE, tense_set = -1, sense_set = -1, set_tba = FALSE;
Parse the slot selector6.3.2.1.3;

if (set_tba)
	vc->tabulations[PASSIVE_VOICE].to_be_auxiliary =
		Conjugation::merge(line, POSITIVE_SENSE, IS_TENSE, FIRST_PERSON, SINGULAR_NUMBER,
			MAX_FORM_TYPES+1, verb_forms, nl, NULL);

/* merge the line afresh for every person and number of each selected tense and sense */
for (int tense=0; tense<NO_KNOWN_TENSES; tense++)
	for (int sense=0; sense<NO_KNOWN_SENSES; sense++)
		for (int n=0; n<NO_KNOWN_NUMBERS; n++)
			for (int p=0; p<NO_KNOWN_PERSONS; p++) {
				if ((sense_set >= 0) && (sense != sense_set)) continue;
				if ((tense_set >= 0) && (tense != tense_set)) continue;
				if (active_set) Apply to the active voice6.3.2.1.1
				else Apply to the passive voice6.3.2.1.2;
			}
- This code is used in §6.3.2.
§6.3.2.1.1. Apply to the active voice6.3.2.1.1 =
vc->tabulations[ACTIVE_VOICE].vc_text[tense][sense][p][n] = Conjugation::merge(line, sense, tense, p, n, MAX_FORM_TYPES+1, verb_forms, nl, &(vc->tabulations[ACTIVE_VOICE].modal_auxiliary_usage[tense][sense][p][n]));
- This code is used in §6.3.2.1.
§6.3.2.1.2. Apply to the passive voice6.3.2.1.2 =
vc->tabulations[PASSIVE_VOICE].vc_text[tense][sense][p][n] = Conjugation::merge(line, sense, tense, p, n, MAX_FORM_TYPES+1, verb_forms, nl, &(vc->tabulations[PASSIVE_VOICE].modal_auxiliary_usage[tense][sense][p][n]));
- This code is used in §6.3.2.1.
§6.3.2.1.3. The selector tells us which tense(s), sense(s) and voice(s) to apply the
line to; a3, for example, means active voice, tense 3, in both positive
and negative senses.
Parse the slot selector6.3.2.1.3 =
vocabulary_entry *ve = selector->ve_pt; inchar32_t *p = Vocabulary::get_exemplar(ve, FALSE); if (p[0] == 'a') active_set = TRUE; if (p[0] == 'p') active_set = FALSE; if (active_set == NOT_APPLICABLE) Conjugation::error(base_text, tabulation, pr, "tabulation row doesn't begin with 'a' or 'p'"); int at = 1; if (Characters::isdigit(p[at])) { tense_set = (int) (p[at++]-'1'); } if (p[at] == '+') { sense_set = 0; at++; } else if (p[at] == '-') { sense_set = 1; at++; } else if ((p[at] == '*') && (tense_set == -1) && (active_set == FALSE)) { set_tba = TRUE; at++; } if (p[at] != 0) { LOG("The selector here is: <%w>\n", p); Conjugation::error(base_text, tabulation, pr, "unrecognised selector in tabulation row"); }
- This code is used in §6.3.2.1.
§7. Follow instructions. That completes the top level of the routine, but it depended on two major
sub-steps: a preliminary pass called Conjugation::follow_instructions and
a routine to deal with the final results called Conjugation::merge.
Here's the first of these. Note that the routine indirects through three main
nonterminals; it always starts with <verb-conjugation-instructions> and
uses this to choose a "conjugation" nonterminal. It then chugs through
the conjugation, which ends by choosing a "tabulation". For example, in
English, the base text "do" passes through <verb-conjugation-instructions>,
which chooses the conjugation <to-do-conjugation>, which in turn sets some
participles and then chooses the tabulation <to-do-tabulation>.
/* The preliminary pass of conjugation. On entry verb_forms holds the base
   text; on exit further numbered forms may have been set, and the return
   value is the tabulation nonterminal chosen for this verb. The four int
   pointers are outputs, reset here before use: the highest verb form number
   written, the word position at which a wildcard matched, and the
   "auxiliary verb only" and "not an instance of verb" flags. Failure to
   choose a conjugation or a tabulation is reported via Conjugation::error. */
nonterminal *Conjugation::follow_instructions(word_assemblage *verb_forms,
	int *highest_form_written, int *aux_len, int *avo_flag, int *niv_flag,
	NATURAL_LANGUAGE_WORDS_TYPE *nl) {
	nonterminal *instructions_nt = <verb-conjugation-instructions>;
	nonterminal *tabulation_nt = NULL, *conjugation_nt = NULL;
	*highest_form_written = 1; *aux_len = 0; *avo_flag = FALSE; *niv_flag = FALSE;

	/* stage one: the instructions choose a conjugation */
	Pattern match on the base text to decide which conjugation to use7.1;
	if (conjugation_nt == NULL)
		Conjugation::error(verb_forms[0], instructions_nt, NULL,
			"the instructions here failed to choose a conjugation");

	/* stage two: the conjugation sets verb forms and chooses a tabulation */
	Process the conjugation and determine the tabulation7.2;
	if (tabulation_nt == NULL)
		Conjugation::error(verb_forms[0], conjugation_nt, NULL,
			"the conjugation here failed to choose a tabulation");
	return tabulation_nt;
}
§7.1. Pattern match on the base text to decide which conjugation to use7.1 =
/* Obtain the base text as an array of words, then offer it to each
   production of the instructions grammar defined for the language nl. */
vocabulary_entry **base_text_words;
int base_text_word_count;
WordAssemblages::as_array(&(verb_forms[BASE_FORM_TYPE]), &base_text_words, &base_text_word_count);
production_list *pl;
for (pl = instructions_nt->first_pl; pl; pl = pl->next_pl) {
	if (nl == pl->definition_language) {
		production *pr;
		/* every production is visited, even after a conjugation has been chosen */
		for (pr = pl->first_pr; pr; pr = pr->next_pr) {
			Try to match the base text against this production7.1.1;
		}
	}
}
- This code is used in §7.
§7.1.1. Each production in this language's <verb-conjugation-instructions> grammar
consists of a (possibly empty) pattern to match, followed by the name of a
nonterminal to use as the conjugation if it matches. For example, in
-querir <fr-querir-conjugation>
the pattern part is a single token, -querir, which matches if the base text
is a single word whose last six characters are "querir". A more complicated
case is:
be able to ... <to-be-able-to-auxiliary>
Here the wildcard ... matches one or more words, and the "auxiliary
infinitive" form is set to the part matched by ...: for example,
"be able to see" matches with auxiliary infinitive "see".
Try to match the base text against this production7.1.1 =
/* Find the final token of the production, which must be the nonterminal
   naming the conjugation; a production without one is malformed. */
ptoken *pt, *last = NULL;
int len = 0, malformed = FALSE;
for (pt = pr->first_pt; pt; pt = pt->next_pt) { last = pt; len++; }
if ((len >= 1) && (last->ptoken_category == NONTERMINAL_PTC)) {
	if (conjugation_nt == NULL) { /* i.e., if we have not yet chosen a conjugation */
		int failed_to_match = FALSE, wildcard_from = -1;
		Try to match the base text against the pattern part of the production7.1.1.1;
		if (failed_to_match == FALSE) {
			conjugation_nt = last->nt_pt;
			/* the adjoint infinitive starts as a copy of the base text, and is
			   truncated only when a wildcard began after the first word */
			verb_forms[ADJOINT_INFINITIVE_FORM_TYPE] = verb_forms[BASE_FORM_TYPE];
			if (wildcard_from > 0)
				WordAssemblages::truncate(
					&(verb_forms[ADJOINT_INFINITIVE_FORM_TYPE]), wildcard_from);
			*aux_len = wildcard_from;
		}
	}
} else malformed = TRUE;
if (malformed)
	Conjugation::error(verb_forms[BASE_FORM_TYPE], <verb-conjugation-instructions>, pr,
		"malformed line");
- This code is used in §7.1.
§7.1.1.1. Try to match the base text against the pattern part of the production7.1.1.1 =
int word_count = 0; for (pt = pr->first_pt; ((pt) && (pt != last)); pt = pt->next_pt) { if (pt->ptoken_category == FIXED_WORD_PTC) { if ((word_count < base_text_word_count) && (Conjugation::compare_ve_with_tails(base_text_words[word_count], pt->ve_pt))) word_count++; else failed_to_match = TRUE; } else if (pt->ptoken_category == MULTIPLE_WILDCARD_PTC) { wildcard_from = word_count; if (base_text_word_count <= word_count) failed_to_match = TRUE; /* must match at least one word */ } else malformed = TRUE; } if (wildcard_from == -1) { if (word_count != base_text_word_count) failed_to_match = TRUE; wildcard_from = 0; }
- This code is used in §7.1.1.
§7.2. In a conjugation, productions have two possible forms: either just a single nonterminal, which usually identifies the tabulation, or a number followed by some tokens.
Process the conjugation and determine the tabulation7.2 =
/* Run through every production of the conjugation defined for the language
   nl, dispatching on how many tokens it has. */
production_list *pl;
for (pl = conjugation_nt->first_pl; pl; pl = pl->next_pl) {
	if (nl == pl->definition_language) {
		production *pr;
		for (pr = pl->first_pr; pr; pr = pr->next_pr) {
			ptoken *pt;
			int len = 0, malformed = FALSE;
			for (pt = pr->first_pt; pt; pt = pt->next_pt) len++;
			switch (len) {
				case 1: /* a lone nonterminal: one of two marker flags, or else the tabulation */
					if (pr->first_pt->ptoken_category == NONTERMINAL_PTC) {
						if (pr->first_pt->nt_pt == <auxiliary-verb-only>)
							*avo_flag = TRUE;
						else if (pr->first_pt->nt_pt == <not-instance-of-verb-at-run-time>)
							*niv_flag = TRUE;
						else
							tabulation_nt = pr->first_pt->nt_pt; /* a later such line replaces an earlier one */
					} else malformed = TRUE;
					break;
				case 2: /* a verb form number followed by its content */
					Set a verb form from the conjugation line7.2.1;
					break;
				default: malformed = TRUE; break;
			}
			if (malformed)
				Conjugation::error(verb_forms[BASE_FORM_TYPE], conjugation_nt, pr,
					"malformed line");
		}
	}
}
- This code is used in §7.
§7.2.1. So here we check the more interesting case. The number identifies which verb form to set, and the token which follows it provides the content. For example:
2 having set to the literal text "having"
3 1+ed set to verb form 1 with "ed" suffixed
3 <en-trie-past-participle> run this trie on the base text and take the result
Set a verb form from the conjugation line7.2.1 =
/* The first token names the verb form to set; the second supplies its
   content, either as a trie nonterminal or as a fixed word to expand. */
ptoken *number_token = pr->first_pt;
ptoken *content_token = number_token->next_pt;
int form = Conjugation::ptoken_to_verb_form_number(number_token);
if (form < 0) {
	malformed = TRUE; /* the line did not begin with a valid form number */
} else {
	if (*highest_form_written < form) *highest_form_written = form;
	if (content_token->ptoken_category == FIXED_WORD_PTC) {
		verb_forms[form] =
			Conjugation::expand_with_endings(content_token->ve_pt, verb_forms);
	} else if (content_token->ptoken_category == NONTERMINAL_PTC) {
		/* run the trie defined by this nonterminal on the base text */
		verb_forms[form] =
			Inflect::first_word(
				verb_forms[BASE_FORM_TYPE],
				PreformUtilities::define_trie(content_token->nt_pt, TRIE_END,
					DefaultLanguage::get(nl)));
	} else {
		malformed = TRUE;
	}
}
- This code is used in §7.2.
§8. Merge verb material. Now the final main step. row points to a list of ptokens containing text,
and we have to copy that text into a word assemblage and return it.
In theory that's a one-line routine, but it's made complicated by the number
of special syntaxes which can go into the row of text. For example, if row
is only
will not do
then the word assemblage comes out to just "will not do"; but if it is
( t1 auxiliary-have ) done
then we consult tense 1 (present) of the verb "auxiliary-have", extract the relevant slot, then append "done". (This might produce "have done" or "has done" or "haven't done" or "hasn't done", depending on the current sense and person.) We call the business of extracting text from a different verb's conjugation "lifting".
There are other complications, too. See "English Inflections" for more.
/* Turn a row of ptokens into the text for one slot of a conjugation.
   sense, tense, person and number identify the slot being filled;
   ingredients is the array of numbered verb forms. If modal_following is
   non-NULL, it is set to 0, or to the digit of a "++digit" token if the row
   contains one. The result is passed through
   Conjugation::shorten_with_contractions before being returned. */
word_assemblage Conjugation::merge(ptoken *row,
	int sense, int tense, int person, int number, int num_ingredients, word_assemblage *ingredients,
	NATURAL_LANGUAGE_WORDS_TYPE *nl, int *modal_following) {
	if (modal_following) { *modal_following = 0; }
	word_assemblage wa = WordAssemblages::lit_0();
	int verb_form_to_lift = -1; /* a form number noted for the bracketed verb which follows, or -1 */
	ptoken *chunk;
	for (chunk = row; chunk; chunk = chunk->next_pt) {
		/* each paragraph below either handles the chunk and continues the
		   loop, or declines it; so the order matters, with the special
		   syntaxes tested before the general fixed-word case */
		A plus-plus-digit indicates auxiliary modal usage8.5;
		A form number followed by a bracketed verb lifts the relevant form8.3;
		A bracketed verb becomes a lift8.4;
		A fixed word is simply added to the result8.1;
		A nonterminal is a table of persons8.2;
		internal_error("Error in merge material line"); /* reached only if nothing handled the chunk */
	}
	return Conjugation::shorten_with_contractions(wa);
}
§8.1. To take the easiest case first. If we read a word like trailing, we simply
add it. But note that Conjugation::expand_with_endings has other tricks up its sleeve,
and might expand 3+ed to "trailed".
A fixed word is simply added to the result8.1 =
if (chunk->ptoken_category == FIXED_WORD_PTC) {
	/* expansion may turn the word into a verb form or into several words */
	word_assemblage expanded =
		Conjugation::expand_with_endings(chunk->ve_pt, ingredients);
	wa = WordAssemblages::join(wa, expanded);
	continue;
}
- This code is used in §8.
§8.2. If we read a nonterminal name, such as <fr-vivre-present>, then this must
be a grammar with six productions, giving the text to use for the six different
combinations of person and number. We consult person and number and extract
the production at position number*3 + person, counting from 0. For example, at
position 3 (that is, number 1 and person 0) we extract "vivons". Note that this
material is itself read in by a recursive use of Conjugation::merge(), because
this enables it to make use of the same fancy features we're allowing here.
A nonterminal is a table of persons8.2 =
if (chunk->ptoken_category == NONTERMINAL_PTC) {
	/* productions are counted from 0 within each production list, and the
	   one wanted is picked out by number and person together */
	int wanted = number*3 + person;
	for (production_list *list = chunk->nt_pt->first_pl; list; list = list->next_pl) {
		int index = 0;
		for (production *entry = list->first_pr; entry; entry = entry->next_pr, index++) {
			if (index != wanted) continue;
			/* the entry is itself merge material, so recurse on it */
			wa = WordAssemblages::join(wa,
				Conjugation::merge(entry->first_pt, sense, tense, person, number,
					num_ingredients, ingredients, nl, NULL));
		}
	}
	continue;
}
- This code is used in §8.
§8.3. A number followed by a verb in brackets, like so:
3 ( avoir )
expands to verb form 3 of this verb — the past participle of "avoir", which is "eu", as it happens. This is a special kind of lift. It isn't actually performed now; we make a note and carry it out when we reach the brackets, on the next iteration.
A form number followed by a bracketed verb lifts the relevant form8.3 =
/* a form number counts here only if a bracket token comes directly after it */
int form_number = Conjugation::ptoken_to_verb_form_number(chunk);
if (form_number >= 0) {
	if (Conjugation::ptoken_as_bracket(chunk->next_pt)) {
		verb_form_to_lift = form_number; /* acted on when the bracket is reached */
		continue;
	}
}
- This code is used in §8.
§8.4. And now the lift takes place. We might at this point have verb_form_to_lift
set, in which case we should lift a verb form, or we might not, in which case
we should lift an ordinary usage, such as third-person singular in a particular
tense. A lift can optionally change tense or sense: for example,
( t1 have )
lifts from the present tense of "to have". If there's no tense indicator,
the tense remains the current one. (It's also possible to change the sense from
positive to negative or vice versa with this, though I can't think of a
language where this would be useful.) Note that, once again, the text of the
infinitive passes through Conjugation::expand_with_endings, so that it can
make use of the numbered verb forms if we want it to.
A bracketed verb becomes a lift8.4 =
if (Conjugation::ptoken_as_bracket(chunk) == 1) {
	chunk = chunk->next_pt; /* move past open bracket */

	/* if there is a tense/sense indicator, use it, and move forward */
	int S = -1;
	int T = Conjugation::ptoken_to_tense_indicator(chunk, &S) - 1;
	if (T >= 0) chunk = chunk->next_pt; else T = tense;
	if (S == -1) S = sense;

	/* extract the text of the infinitive */
	word_assemblage verb_lifted = WordAssemblages::lit_0();
	while ((chunk) && (Conjugation::ptoken_as_bracket(chunk) != -1)) {
		verb_lifted = WordAssemblages::join(verb_lifted,
			Conjugation::expand_with_endings(chunk->ve_pt, ingredients));
		chunk = chunk->next_pt;
	}

	/* if the row ran out before a close bracket, chunk is now NULL, and the
	   loop we are inside would dereference it on continuing: so report the
	   malformed row rather than crash */
	if (chunk == NULL) internal_error("unclosed bracket in merge material line");

	/* reuse an existing conjugation of the lifted verb if there is one */
	verb_conjugation *aux = Conjugation::find_by_infinitive(verb_lifted);
	if (aux == NULL) aux = Conjugation::conjugate(verb_lifted, nl);
	if (aux == NULL) internal_error("can't conjugate lifted verb");

	/* a noted form number applies to this one bracket only, so clear the
	   note now: otherwise a later ordinary lift in the same row would
	   wrongly be treated as a form lift too */
	int form_to_lift = verb_form_to_lift;
	verb_form_to_lift = -1;

	switch (form_to_lift) {
		case 1: wa = WordAssemblages::join(wa, aux->infinitive); break;
		case 2: wa = WordAssemblages::join(wa, aux->present_participle); break;
		case 3: wa = WordAssemblages::join(wa, aux->past_participle); break;
		case -1: /* no form number was noted: an ordinary lift of one slot */
			wa = WordAssemblages::join(wa,
				aux->tabulations[ACTIVE_VOICE].vc_text[T][S][person][number]);
			break;
		default: internal_error("only parts 1, 2, 3 can be extracted");
	}
	continue;
}
- This code is used in §8.
§8.5. A plus-plus-digit indicates auxiliary modal usage8.5 =
if (chunk->ptoken_category == FIXED_WORD_PTC) {
	inchar32_t *text = Vocabulary::get_exemplar(chunk->ve_pt, TRUE);
	/* the marker is exactly three characters: two plus signs and a digit */
	int is_modal_marker = ((text[0] == '+') && (text[1] == '+') &&
		(Characters::isdigit(text[2])) && (text[3] == 0));
	if (is_modal_marker) {
		/* the marker adds no text: it only records its digit, if anyone asked */
		if (modal_following) *modal_following = ((int) text[2]) - ((int) '0');
		continue;
	}
}
- This code is used in §8.
§9. Whenever we read a single word, it passes through the following. A word
like "fish" will pass through unchanged; a number like "7" will convert
to verb form 7 in the current verb (for example, 2 becomes the present
participle); a plus sign joins two pieces together; and a tilde is a tie,
joining but with a space. Thus fish~to~fry becomes three words.
/* Expand a single word of merge material. If the word contains a '+' or '~'
   which is neither its first nor its last character, it is split at the
   first such character, both halves are expanded recursively, and the
   results are rejoined, with a space between them in the case of '~'.
   Otherwise a valid verb form number becomes that verb form, and anything
   else becomes a one-word assemblage. A NULL ve gives the empty assemblage. */
word_assemblage Conjugation::expand_with_endings(vocabulary_entry *ve,
	word_assemblage *verb_forms) {
	if (ve == NULL) return WordAssemblages::lit_0();

	inchar32_t *text = Vocabulary::get_exemplar(ve, TRUE);
	for (int at=0; text[at]; at++) {
		if (at == 0) continue; /* a joining character cannot come first... */
		if (text[at+1] == 0) continue; /* ...or last */
		if ((text[at] != '+') && (text[at] != '~')) continue;

		vocabulary_entry *front = Vocabulary::entry_for_partial_text(text, 0, at-1);
		vocabulary_entry *back =
			Vocabulary::entry_for_partial_text(text, at+1, Wide::len(text)-1);
		word_assemblage front_wa = Conjugation::expand_with_endings(front, verb_forms);
		word_assemblage back_wa = Conjugation::expand_with_endings(back, verb_forms);

		/* write both halves out as text and feed that back in as fresh words */
		TEMPORARY_TEXT(TEMP)
		WRITE_TO(TEMP, "%A", &front_wa);
		if (text[at] == '~') PUT_TO(TEMP, ' ');
		WRITE_TO(TEMP, "%A", &back_wa);
		wording W = Feeds::feed_text(TEMP);
		DISCARD_TEXT(TEMP)
		return WordAssemblages::from_wording(W);
	}

	int form = Conjugation::ve_to_verb_form_number(ve);
	if (form < 0) return WordAssemblages::lit_1(ve);
	return verb_forms[form];
}
§10. The final step in merging verb material is to pass the result through the following, which attends to contractions. (Most of the time it does nothing.) For example, suppose we have:
ne-' ai pas
The -' marker tells us that the word it attaches to should contract if a
vowel follows it. In this case that's what happens, so we convert to:
n'ai pas
On the other hand,
ne-' jette pas
would convert to
ne jette pas
with no contraction. Either way, though, we have to take some action when
we see a -' marker.
/* Act on any contraction markers in the word assemblage. A word ending in
   the two characters -' is a candidate: if a contraction is needed, the
   marker and the character before it are replaced by an apostrophe and the
   next word is fused on, so that the assemblage becomes one word shorter;
   if not, the marker is simply removed. A marker on the final word is left
   alone, since there is no following word to consult. */
word_assemblage Conjugation::shorten_with_contractions(word_assemblage wa) {
	vocabulary_entry **words;
	int word_count;
	WordAssemblages::as_array(&wa, &words, &word_count);
	int i;
	for (i=0; i<word_count-1; i++) {
		inchar32_t *p = Vocabulary::get_exemplar(words[i], TRUE);
		inchar32_t *q = Vocabulary::get_exemplar(words[i+1], TRUE);
		int j = Wide::len(p)-2; /* where the marker would begin, if there is one */
		if ((j >= 0) && (p[j] == '-') && (p[j+1] == '\'')) {
			TEMPORARY_TEXT(TEMP)
			int contract_this = FALSE;
			Decide whether a contraction is needed here10.1;
			if (contract_this) {
				int k;
				/* keep all but the last character before the marker, then fuse */
				for (k=0; k<j-1; k++) { WRITE_TO(TEMP, "%c", p[k]); }
				WRITE_TO(TEMP, "'%w", q);
				wording W = Feeds::feed_text(TEMP);
				words[i] = Lexer::word(Wordings::first_wn(W));
				/* close the gap left by the fused word: the count is reduced
				   first so that the shift never reads words[word_count],
				   which lies beyond the words in use */
				word_count--;
				for (k=i+1; k<word_count; k++) words[k] = words[k+1];
				WordAssemblages::truncate_to(&wa, word_count);
			} else {
				int k;
				/* no contraction: just strip the marker */
				for (k=0; k<j; k++) { WRITE_TO(TEMP, "%c", p[k]); }
				wording W = Feeds::feed_text(TEMP);
				words[i] = Lexer::word(Wordings::first_wn(W));
			}
			DISCARD_TEXT(TEMP)
		}
	}
	return wa;
}
§10.1. We contract if the following word starts with a (possibly accented) vowel, and we construe "y" (but not "h" or "w") as a vowel.
Decide whether a contraction is needed here10.1 =
inchar32_t incipit = q[0]; inchar32_t first = Characters::tolower(Characters::remove_accent(incipit)); if ((first == 'a') || (first == 'e') || (first == 'i') || (first == 'o') || (first == 'u') || (first == 'y')) contract_this = TRUE;
- This code is used in §10.
/* The verb form number which a ptoken expresses, or -1 if the ptoken is
   NULL, is not a fixed word, or is not a valid form number. */
int Conjugation::ptoken_to_verb_form_number(ptoken *pt) {
	if (pt == NULL) return -1;
	if (pt->ptoken_category != FIXED_WORD_PTC) return -1;
	return Conjugation::ve_to_verb_form_number(pt->ve_pt);
}

/* The verb form number which a vocabulary entry expresses: its literal
   number value if it is flagged as a number and lies in the range 0 to
   MAX_FORM_TYPES-1 inclusive, and otherwise -1. */
int Conjugation::ve_to_verb_form_number(vocabulary_entry *ve) {
	if (!(Vocabulary::test_vflags(ve, NUMBER_MC))) return -1;
	int value = Vocabulary::get_literal_number_value(ve);
	if ((value < 0) || (value >= MAX_FORM_TYPES)) return -1;
	return value;
}
/* Parse a tense indicator: the letter t, a digit, and optionally a '+' or
   '-'. Returns the tense as a number from 1 to NO_KNOWN_TENSES, or -1 if
   the ptoken is not a valid indicator. When the indicator ends in '+' or
   '-', *set_sense is set to 0 or 1 respectively; otherwise it is left
   unchanged. */
int Conjugation::ptoken_to_tense_indicator(ptoken *pt, int *set_sense) {
	if ((pt == NULL) || (pt->ptoken_category != FIXED_WORD_PTC)) return -1;

	inchar32_t *text = Vocabulary::get_exemplar(pt->ve_pt, FALSE);
	if (text[0] != 't') return -1;
	if (!(Characters::isdigit(text[1]))) return -1;

	int N = (int) (text[1] - '1' + 1);
	if ((N < 1) || (N > NO_KNOWN_TENSES)) return -1;

	if (text[2] == 0) return N; /* tense alone, with no sense given */
	if (text[3] != 0) return -1; /* at most one character may follow the digit */
	if (text[2] == '+') { *set_sense = 0; return N; }
	if (text[2] == '-') { *set_sense = 1; return N; }
	return -1;
}
/* Classify a ptoken as a bracket: 1 for an open bracket, -1 for a close
   bracket, and 0 for anything else, including NULL. */
int Conjugation::ptoken_as_bracket(ptoken *pt) {
	if (pt == NULL) return 0;
	if (pt->ptoken_category != FIXED_WORD_PTC) return 0;
	if (pt->ve_pt == OPENBRACKET_V) return 1;
	if (pt->ve_pt == CLOSEBRACKET_V) return -1;
	return 0;
}
/* Test a word against a pattern word. They match if they are the same
   vocabulary entry, or if the pattern begins with a hyphen and the word
   ends with the text which follows that hyphen. Returns TRUE or FALSE. */
int Conjugation::compare_ve_with_tails(vocabulary_entry *ve, vocabulary_entry *pattern) {
	if (ve == pattern) return TRUE;

	inchar32_t *tail = Vocabulary::get_exemplar(pattern, FALSE);
	if (tail[0] != '-') return FALSE; /* only hyphenated patterns match by ending */

	inchar32_t *word = Vocabulary::get_exemplar(ve, FALSE);
	/* where in the word the ending would have to begin: negative if the word is too short */
	int start = Wide::len(word) - (Wide::len(tail) - 1);
	for (int i=1; tail[i]; i++) {
		int j = start + (i-1);
		if ((j < 0) || (tail[i] != word[j])) return FALSE;
	}
	return TRUE;
}
/* Report an error in the conjugation grammar and halt. base_text is the
   verb being conjugated; nt and pr are the nonterminal and the production
   at fault (callers pass a NULL pr when no one production is to blame).
   The report is made by PreformUtilities::error, after which the process
   exits with status 1, so this function does not return. */
void Conjugation::error(word_assemblage base_text, nonterminal *nt,
	production *pr, char *message) {
	PreformUtilities::error(base_text, nt, pr, message);
	exit(1);
}
§16. Testing. Similarly, the following helps translators by giving them unit tests for their conjugations:
Test verb (internal) with appuyer.
/* Unit test: conjugate the verb whose base text is W in the language nl,
   write out the resulting conjugation, then destroy it again. */
void Conjugation::test(OUTPUT_STREAM, wording W, NATURAL_LANGUAGE_WORDS_TYPE *nl) {
	word_assemblage base = WordAssemblages::from_wording(W);
	verb_conjugation *vc = Conjugation::conjugate(base, nl);
	if (vc) {
		Conjugation::write(OUT, vc);
		DESTROY(vc, verb_conjugation); /* made only for the test, so not kept */
	} else {
		WRITE("Failed test\n");
	}
}
§17. This is for testing English only; it helps with the test suite cases derived from our dictionary of 14,000 or so present and past participles.
/* Unit test: conjugate the verb whose base text is W in the default
   language, write out just its infinitive and participles, then destroy
   the conjugation again. */
void Conjugation::test_participle(OUTPUT_STREAM, wording W) {
	word_assemblage base = WordAssemblages::from_wording(W);
	verb_conjugation *vc = Conjugation::conjugate(base, DefaultLanguage::get(NULL));
	if (vc) {
		Conjugation::write_participle(OUT, vc);
		DESTROY(vc, verb_conjugation); /* made only for the test, so not kept */
	} else {
		WRITE("Failed test\n");
	}
}

/* Write one line giving the infinitive, present participle and past
   participle of a conjugation, set in a sample sentence. */
void Conjugation::write_participle(OUTPUT_STREAM, verb_conjugation *vc) {
	word_assemblage *inf = &(vc->infinitive);
	word_assemblage *present = &(vc->present_participle);
	word_assemblage *past = &(vc->past_participle);
	WRITE("To %A: he is %A; it was %A.\n", inf, present, past);
}
§18. As noted above, these nonterminals have no parsing function, and are used only as markers in verb conjugations.
<auxiliary-verb-only> internal { ==> { fail nonterminal }; } <not-instance-of-verb-at-run-time> internal { ==> { fail nonterminal }; }
- This is Preform grammar, not regular C code.