// Copyright (c) 2018, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

import 'dart:io';
import 'dart:typed_data';

import 'package:characters/src/grapheme_clusters/constants.dart';

import 'debug_names.dart';
import 'string_literal_writer.dart';

// Builder for state automata used to find
// next/previous grapheme cluster break.

// The automaton states are described below, and the code builds tables
// for those automatons, then writes the table bytes as a string literal.

//////////////////////////////////////////////////////////////////////////////
// Transition table for grapheme cluster break automaton.
// For each previous state and each input character category,
// emit a new state and whether to break before that input character.
// The table uses `!` to mark a break before the input character,
// and then the output state.
//
// We do not care that there is no break between a start-of-text and
// an end-of-text (an empty text). We could handle that with one extra
// state, but it will never matter for the code using this table.
//
// Stored as string for comparison to actual generated automaton.
// (`_writeForwardTable` compares it to the `_generateTable` output and
// reports a mismatch on stderr.)
const expectedAutomatonDescription = r''' Stat: Cat : CR Ctl Otr Ext Spc Reg Pic LF Pre L V T LV LVT OInC ZWJ EInE EInL EoT : ----------------------------------------------------------------------------------------------------- Brk :!CR !Brk !Otr !Otr !Otr !Reg !Pic !Brk !Pre !L !V !T !V !T !InC !Otr !Otr !Otr ! - : CR :!CR !Brk !Otr !Otr !Otr !Reg !Pic Brk !Pre !L !V !T !V !T !InC !Otr !Otr !Otr ! - : Otr :!CR !Brk !Otr Otr Otr !Reg !Pic !Brk !Pre !L !V !T !V !T !InC Otr Otr Otr ! - : Pre :!CR !Brk Otr Otr Otr Reg Pic !Brk Pre L V T V T InC Otr Otr Otr ! - : L :!CR !Brk !Otr Otr Otr !Reg !Pic !Brk !Pre L V !T V T !InC Otr Otr Otr ! - : V :!CR !Brk !Otr Otr Otr !Reg !Pic !Brk !Pre !L V T !V !T !InC Otr Otr Otr ! 
- : T :!CR !Brk !Otr Otr Otr !Reg !Pic !Brk !Pre !L !V T !V !T !InC Otr Otr Otr ! - : Pic :!CR !Brk !Otr Pic Otr !Reg !Pic !Brk !Pre !L !V !T !V !T !InC PicZ Pic Pic ! - : PicZ:!CR !Brk !Otr Otr Otr !Reg Pic !Brk !Pre !L !V !T !V !T !InC Otr Otr Otr ! - : Reg :!CR !Brk !Otr Otr Otr Otr !Pic !Brk !Pre !L !V !T !V !T !InC Otr Otr Otr ! - : InC :!CR !Brk !Otr Otr Otr !Reg !Pic !Brk !Pre !L !V !T !V !T !InC InC InC InCL! - : InCL:!CR !Brk !Otr Otr Otr !Reg !Pic !Brk !Pre !L !V !T !V !T InC InCL InCL InCL! - : SoTN: CR Brk Otr Otr Otr Reg Pic Brk Pre L V T V T InC Otr Otr Otr - : SoT :!CR !Brk !Otr !Otr !Otr !Reg !Pic !Brk !Pre !L !V !T !V !T !InC !Otr !Otr !Otr - : CAny:!CR !Brk Otr CExt Otr CReg!Pic !Brk Pre L V T V T InC CZWJ CIE CIL - : CZWJ:!CR !Brk !Otr Otr Otr !Reg $LAZP!Brk !Pre !L !V !T !V !T $LAIC CZIE CZIE CZIL! - : CIE :!CR !Brk !Otr CExt Otr !Reg !Pic !Brk !Pre !L !V !T !V !T $LAIC CIEZ CIE CIL ! - : CIL :!CR !Brk !Otr CExt Otr !Reg !Pic !Brk !Pre !L !V !T !V !T $LAIL CILZ CIL CIL ! - : CIEZ:!CR !Brk !Otr Otr Otr !Reg $LAZP!Brk !Pre !L !V !T !V !T !InC CZIE CZIE CZIL! - : CILZ:!CR !Brk !Otr Otr Otr !Reg $LAZP!Brk !Pre !L !V !T !V !T $LAIL CZIL CZIL CZIL! - : CZIE:!CR !Brk !Otr Otr Otr !Reg !Pic !Brk !Pre !L !V !T !V !T $LAIC CZIE CZIE CZIL! - : CZIL:!CR !Brk !Otr Otr Otr !Reg !Pic !Brk !Pre !L !V !T !V !T $LAIL CZIL CZIL CZIL! - : CExt:!CR !Brk !Otr CExt Otr !Reg !Pic !Brk !Pre !L !V !T !V !T !InC CExZ CExt CExt! - : CExZ:!CR !Brk !Otr Otr Otr !Reg $LAZP!Brk !Pre !L !V !T !V !T !InC Otr Otr Otr ! - : CReg:!CR !Brk !Otr Otr Otr $LARe!Pic !Brk !Pre !L !V !T !V !T !InC Otr Otr Otr ! 
- : ''';

/// Builds the forward automaton table and writes it to [buffer] as source.
///
/// The table has one row of `automatonRowLength` cells per state, for states
/// `0` up to `stateLimit`. Each cell holds the target state plus flag bits.
/// The output written to [buffer] is `const _stateMachine = ` followed by the
/// table as a string literal, `;`, and then the text of `_moveMethod`.
///
/// If [verbose] is true, a readable rendering of the table is also printed
/// through `_writeForwardTable`.
void writeForwardAutomaton(StringSink buffer, {required bool verbose}) {
  assert(categories.length == categoryCount);
  // The row length must leave the flag bits of every state value clear and
  // must have room for every category.
  assert(
    automatonRowLength & maskFlags == 0 && automatonRowLength >= categoryCount,
  );
  var table = Uint16List(stateLimit);

  // Stores `targetState + flags` in the cell for (`state`, `category`).
  // A lookahead transition must target a state at or above
  // `stateLookaheadMin`.
  void transitionLA(int state, int category, int targetState, int flags) {
    assert(flags <= maskFlags);
    assert(
      flags != flagLookahead || targetState >= stateLookaheadMin,
      '${stateShortName(state)} x ${categoryNames[category]} -> '
      '${_targetStateName(targetState, flags)} | $flags',
    );
    table[state + category] = targetState + flags;
  }

  // Stores a plain transition, flagged as break or no-break before the
  // input character.
  void transition(int state, int category, int targetState, bool breakBefore) {
    assert(targetState < stateLimit, '$state + $category -> $targetState');
    transitionLA(
      state,
      category,
      targetState,
      breakBefore ? flagBreak : flagNoBreak,
    );
  }

  for (var state = 0; state < stateLimit; state += automatonRowLength) {
    // States that should always be broken after, unless something specifically
    // says otherwise. (And does so in GB1..GB5).
    var alwaysBreakBefore =
        state == stateSoT || state == stateBreak || state == stateCR;
    // States that should never be broken after, unless `alwaysBreakBefore`
    // says otherwise (for example the rules in GB1..GB5).
    var neverBreakBefore = state == stateSoTNoBreak ||
        state == stateCAny || // Break in this state never matters.
        state == statePrepend;

    // Other with InCB=None.
    // No rules apply specifically to Other, so break unless an
    // Any rule applies.
    transition(state, categoryOther, stateOther, !neverBreakBefore);

    // Other with InCB=Consonant.
    // GB9C. (Break unless Any rule applies, or preceded by indic sequence
    // with at least one Linked, `stateInCL`).
    // Remember having seen InCB=Consonant and no InCB=Linked yet.
    if (state == stateCZWJ || state == stateCIE || state == stateCZIE) {
      transitionLA(
        state,
        categoryOtherIndicConsonant,
        stateLookaheadInC,
        flagLookahead,
      );
    } else if (state == stateCIL || state == stateCILZ || state == stateCZIL) {
      transitionLA(
        state,
        categoryOtherIndicConsonant,
        stateLookaheadInCL,
        flagLookahead,
      );
    } else {
      transition(
        state,
        categoryOtherIndicConsonant,
        stateInC,
        !(neverBreakBefore || state == stateInCL || state == stateCAny),
      );
    }

    // CR.
    // GB4 + GB5. Always break before. Break after unless followed by LF,
    // so remember having seen CR (`stateCR`).
    transition(state, categoryCR, stateCR, state != stateSoTNoBreak);

    // LF.
    // GB3 + GB4 + GB5. Always break after. Break before unless following CR.
    transition(
      state,
      categoryLF,
      stateBreak,
      state != stateCR && state != stateSoTNoBreak,
    );

    // Control. (Like CR+LF, without their mutual exception.)
    // GB4 + GB5. Always break before, even after Prepend,
    // and always break after (`stateBreak`).
    transition(state, categoryControl, stateBreak, state != stateSoTNoBreak);

    // Ext + ZWJ (including InCB Extend and Linked).
    // GB9 + GB9c + GB11. Never break before Ext or ZWJ,
    // unless required by earlier rule (after Control, CR, LF, SoT).
    // Remember whether after Pic+Ext* or InCB=Consonant(Extend|Linked)*
    if (state == statePictographic) {
      // GB9 + GB11, after Pic+Ext*.
      // Extend with InCB=None.
      transition(state, categoryExtend, statePictographic, false);
      // Extend with InCB=Extend.
      transition(state, categoryExtendIndicExtend, statePictographic, false);
      // Extend with InCB=Linked.
      transition(state, categoryExtendIndicLinked, statePictographic, false);
      // ZWJ.
      transition(state, categoryZWJ, statePictographicZWJ, false);
    } else if (state == stateInC || state == stateInCL) {
      // GB9 + GB9c, after InCB Consonant + (Extend|Linked)*.
      // Extend with InCB=None.
      transition(state, categoryExtend, stateOther, false);
      // Extend with InCB=Extend.
      transition(state, categoryExtendIndicExtend, state, false);
      // ZWJ (which has InCB=Extend).
      transition(state, categoryZWJ, state, false);
      // Extend with InCB=Linked.
      transition(state, categoryExtendIndicLinked, stateInCL, false);
    } else if (state < stateMinContextUnaware || state == stateCReg) {
      // GB9 alone.
      // No special rules for breaking after,
      // break before only if required by GB1-GB5.
      transition(state, categoryExtend, stateOther, alwaysBreakBefore);
      transition(
        state,
        categoryExtendIndicExtend,
        stateOther,
        alwaysBreakBefore,
      );
      transition(
        state,
        categoryExtendIndicLinked,
        stateOther,
        alwaysBreakBefore,
      );
      transition(state, categoryZWJ, stateOther, alwaysBreakBefore);
    } else {
      // Remaining states (the `stateC…` rows other than `stateCReg`): the
      // target state records which of Extend / InCB=Extend / InCB=Linked /
      // ZWJ has been seen. Never break before any of these inputs.
      transition(
          state,
          categoryZWJ,
          switch (state) {
            stateCAny => stateCZWJ,
            stateCZWJ => stateCZIE,
            stateCIE => stateCIEZ,
            stateCIL => stateCILZ,
            stateCIEZ => stateCZIE,
            stateCILZ => stateCZIL,
            stateCZIE => stateCZIE,
            stateCZIL => stateCZIL,
            stateCExt => stateCExZ,
            _ => stateOther,
          },
          false);
      transition(
        state,
        categoryExtend,
        (state == stateCAny ||
                state == stateCIE ||
                state == stateCIL ||
                state == stateCExt)
            ? stateCExt
            : stateOther,
        false,
      );
      transition(
          state,
          categoryExtendIndicExtend,
          switch (state) {
            stateCAny => stateCIE,
            stateCZWJ => stateCZIE,
            stateCIE => stateCIE,
            stateCIL => stateCIL,
            stateCIEZ => stateCZIE,
            stateCILZ => stateCZIL,
            stateCZIE => stateCZIE,
            stateCZIL => stateCZIL,
            stateCExt => stateCExt,
            _ => stateOther,
          },
          false);
      transition(
          state,
          categoryExtendIndicLinked,
          switch (state) {
            stateCAny => stateCIL,
            stateCZWJ => stateCZIL,
            stateCIE => stateCIL,
            stateCIL => stateCIL,
            stateCIEZ => stateCZIL,
            stateCILZ => stateCZIL,
            stateCZIE => stateCZIL,
            stateCZIL => stateCZIL,
            stateCExt => stateCExt,
            _ => stateOther,
          },
          false);
    }

    // Regional indicator.
    // GB12 + GB13: Don't break if after an odd number of Reg.
    // Otherwise remember an odd number of Reg, and break before unless
    // prior state says not to.
    if (state == stateRegionalSingle) {
      transition(state, categoryRegionalIndicator, stateOther, false);
    } else if (state == stateCAny) {
      transition(state, categoryRegionalIndicator, stateCReg, false);
    } else if (state == stateCReg) {
      transitionLA(
        state,
        categoryRegionalIndicator,
        stateLookaheadRegionalEven,
        flagLookahead,
      );
    } else {
      // Break unless prior state says not to.
      transition(
        state,
        categoryRegionalIndicator,
        stateRegionalSingle,
        !neverBreakBefore,
      );
    }

    // Prepend.
    // GB9b: Never break after Prepend (unless required by next character
    // due to GB1..GB5).
    // Break before unless prior state says not to.
    transition(state, categoryPrepend, statePrepend, !neverBreakBefore);

    // Spacing mark. (Like Extend but doesn't interact with emojis).
    // GB9a. Don't break before, unless must always break after prior char.
    transition(state, categorySpacingMark, stateOther, alwaysBreakBefore);

    // Hangul.
    // GB6+GB7+GB8.
    // Don't break if T follows V and V follows L.
    transition(
      state,
      categoryL,
      stateL,
      !(neverBreakBefore || state == stateL),
    );
    transition(
      state,
      categoryLV,
      stateV,
      !(neverBreakBefore || state == stateL),
    );
    transition(
      state,
      categoryLVT,
      stateT,
      !(neverBreakBefore || state == stateL),
    );
    transition(
      state,
      categoryV,
      stateV,
      !(neverBreakBefore || state == stateL || state == stateV),
    );
    transition(
      state,
      categoryT,
      stateT,
      !(neverBreakBefore || state == stateV || state == stateT),
    );

    // Emoji
    // GB11.
    if (state == stateCZWJ ||
        state == stateCExZ ||
        state == stateCIEZ ||
        state == stateCILZ) {
      transitionLA(
        state,
        categoryPictographic,
        stateLookaheadZWJPictographic,
        flagLookahead,
      );
    } else {
      transition(
        state,
        categoryPictographic,
        statePictographic,
        state != statePrepend &&
            state != statePictographicZWJ &&
            state != stateSoTNoBreak,
      );
    }

    // End of input.
    // GB2.
    transition(
      state,
      categoryEoT,
      stateSoTNoBreak,
      state != stateSoT && state != stateSoTNoBreak && state != stateCAny,
    );

    // Pad table if necessary.
    for (var c = categoryCount; c < automatonRowLength; c++) {
      transition(state, c, stateSoTNoBreak, false);
    }
  }

  // Emit the declaration, the table as a string literal, and the accessor.
  const prefix = 'const _stateMachine = ';
  buffer.write(prefix);
  var stringWriter = StringLiteralWriter(buffer, padding: 4);
  stringWriter.start(prefix.length);
  for (var i = 0; i < table.length; i++) {
    stringWriter.add(table[i]);
  }
  stringWriter.end();
  buffer.write(';\n');
  buffer.write(_moveMethod);
  if (verbose) _writeForwardTable(table, automatonRowLength);
}

// Source text of the generated `move` function, written after the forward
// table. It masks the state with `maskState` and indexes `_stateMachine`.
const String _moveMethod = ''' $preferInline int move(int state, int inputCategory) => _stateMachine.codeUnitAt((state & $maskState) + inputCategory); ''';

// Source text of the generated `moveBack` function, written after the
// backward table. It indexes `_backStateMachine` the same way.
const String _moveBackMethod = ''' $preferInline int moveBack(int state, int inputCategory) => _backStateMachine.codeUnitAt((state & $maskState) + inputCategory); ''';

// The input categories. `writeForwardAutomaton` asserts that there are
// exactly `categoryCount` of them.
const categories = [
  categoryOther,
  categoryCR,
  categoryLF,
  categoryControl,
  categoryExtend,
  categoryRegionalIndicator,
  categoryPrepend,
  categorySpacingMark,
  categoryL,
  categoryV,
  categoryT,
  categoryLV,
  categoryLVT,
  categoryPictographic,
  categoryOtherIndicConsonant,
  categoryZWJ,
  categoryExtendIndicExtend,
  categoryExtendIndicLinked,
  categoryEoT,
];

//////////////////////////////////////////////////////////////////////////////
// Transition table for *reverse* grapheme cluster break automaton.
// For each previous state and each previous input character category,
// emit a new state and whether to break after that input character.
// The table uses `!` to mark a break before the input character,
// and then the output state.
// Some breaks cannot be determined without look-ahead. Those return
// specially marked states, with `$` in the name.
// Those states will trigger a special code path which will then update
// the state and/or index as necessary.
//
// Stored as string for comparison to actual generated automaton.
// (`_writeBackTable` compares it to the `_generateTable` output and reports
// a mismatch on stderr. `_generateTable` picks each cell's marker from the
// string ` !$#`, indexed by the flag bits of the table value.)
const expectedBackAutomatonDescription = r''' Stat: Cat : CR Ctl Otr Ext Spc Reg Pic LF Pre L V T LV LVT OInC ZWJ EInE EInL SoT : ----------------------------------------------------------------------------------------------------- Brk :!Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF !Otr !L !V !T !L !L !InC !Ext !Ext !Ext ! - : LF : Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF !Otr !L !V !T !L !L !InC !Ext !Ext !Ext ! - : Otr :!Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF Otr !L !V !T !L !L !InC !Ext !Ext !Ext ! - : Ext :!Brk !Brk Otr Ext Ext Reg Pic !LF Otr L V T L L InC Ext Ext Ext ! - : L :!Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF Otr L !V !T !L !L !InC !Ext !Ext !Ext ! - : V :!Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF Otr L V !T L !L !InC !Ext !Ext !Ext ! - : T :!Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF Otr !L V T L L !InC !Ext !Ext !Ext ! - : Pic :!Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF Otr !L !V !T !L !L !InC $LAZP!Ext !Ext ! - : RegO: - - - - - RegE - - - - - - - - - - - - - : Reg :!Brk !Brk !Otr !Ext !Ext $LARe!Pic !LF Otr !L !V !T !L !L !InC !Ext !Ext !Ext ! - : InC :!Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF Otr !L !V !T !L !L !InC $LAIC$LAIC$LAIL! - : RegE:!Brk !Brk !Otr !Ext !Ext !RegO!Pic !LF Otr !L !V !T !L !L !InC !Ext !Ext !Ext ! - : EoTN: Brk Brk Otr Ext Ext Reg Pic LF Otr L V T L L InC Ext Ext Ext - : EoT :!Brk !Brk !Otr !Ext !Ext !Reg !Pic !LF !Otr !L !V !T !L !L !InC !Ext !Ext !Ext - : LAZP:#Ext #Ext !Otr LAZP!Ext !Reg Pic #Ext !Otr !L !V !T !L !L !InC !Ext LAZP LAZP#Ext : LAIC:#Ext #Ext !Otr !Ext !Ext !Reg !Pic #Ext !Otr !L !V !T !L !L !InC LAIC LAIC LAIL#Ext : LAIL:#Ext #Ext !Otr !Ext !Ext !Reg !Pic #Ext !Otr !L !V !T !L !L InC LAIL LAIL LAIL#Ext : LARe: RegE RegE RegE RegE RegE LARo RegE RegE RegE RegE RegE RegE RegE RegE RegE RegE RegE RegE RegE: LARo:!RegO!RegO!RegO!RegO!RegO LARe!RegO!RegO!RegO!RegO!RegO!RegO!RegO!RegO!RegO!RegO!RegO!RegO!RegO: ''';

// The look-ahead part of the state machine is triggered by the `$`-transitions
// above.
// It is really a combination of three state machines, one for RI, one
// for ZWJ+Pic and one for InCB. The backwards automaton always knows
// which one it starts in.
// A state not in the LA-range means to end lookahead with that state.
// If starting with `stateLookaheadRegionalEven`,
// the result always resets the position to before the lookahead,
// and the output state only states whether to break before that position.
// (The output states are always one of `stateRegionalEven` or
// `stateRegionalOdd`+break-before.)
// Represented by ` ` for not breaking and `!` for breaking.
//
// For the other lookaheads, the marker before the target state means one of
// three things:
// - ' ': No break up to and including last seen character.
// - '!': Break before char before lookahead, none up to last seen character.
// - '#': Break before char before lookahead and before last seen character.
//        In this case, the output state is the state before that character.
//        (So move character position to before last lookahead step.)
//
// An example of `!` would be ZWJ + EXT + ZWJ + PIC, which does lookahead
// after seeing ZWJ+PIC. Seeing the second ZWJ, it knows it's not
// a PIC+EXT*+ZWJ+PIC sequence, so it must break before the second ZWJ.
// It also knows that it doesn't need to break again up to the first ZWJ,
// because it's all EXT characters. Its output state is `!Ext`.
// An example of `#` would be `CR + EXT + ZWJ + PIC`, which knows when it's
// seen the `CR` that it should break after CR and ZWJ.
// (Since it can only return one break at a time, it'll keep the position after
// CR with a state of Ext and return the position between ZWJ and PIC.)
// The look-ahead states are recognized and call out to code that looks
// ahead (backwards in the string) to see what the state should really be
// after the look-ahead.

// The states of the backward automaton, in the order their rows are built.
const backStates = [
  stateBreak,
  stateLF,
  stateOther,
  stateExtend,
  stateL,
  stateV,
  stateT,
  statePictographic,
  stateRegionalOdd, // Known disjoint look-ahead.
  stateRegionalSingle,
  stateInC,
  stateRegionalEven,
  stateEoTNoBreak,
  stateEoT,
  stateLookaheadRegionalEven,
  stateLookaheadRegionalOdd,
  stateLookaheadZWJPictographic,
  stateLookaheadInC,
  stateLookaheadInCL,
];

/// Builds the backward automaton table and writes it to [buffer] as source.
///
/// The table has `backStateLimit` cells, and a row is filled for each state
/// in [backStates]. Each cell holds the target state combined with flag bits.
/// The output written to [buffer] is `const _backStateMachine = ` followed by
/// the table as a string literal, `;`, and then the text of `_moveBackMethod`.
///
/// If [verbose] is true, a readable rendering of the table is also printed
/// through `_writeBackTable`.
void writeBackwardAutomaton(StringSink buffer, {required bool verbose}) {
  assert(categories.length <= automatonRowLength);
  var table = Uint16List(backStateLimit);

  // Stores `targetState | flags` in the cell for (`state`, `category`).
  // The allowed flags depend on whether the source and target states are
  // ordinary states or lookahead states.
  void transitionLA(int state, int category, int targetState, int flags) {
    assert(
      state < backStateLimit && targetState < backStateLimit,
      '$state + $category -> $targetState',
    );
    assert(
        switch ((state, targetState)) {
          (< stateLookaheadMin, < stateLookaheadMin) => flags < flagLookahead,
          // Entering lookahead. Always sets the flagLookahead bit.
          (< stateLookaheadMin, _) => flags == flagLookahead,
          // Exiting lookahead, can have any flag value.
          (_, < stateLookaheadMin) => flags <= maskFlags,
          // Inside lookahead, not done yet.
          (_, _) => flags == 0,
        },
        '$state + $category => $targetState | $flags');
    table[state + category] = targetState | flags;
  }

  // Stores a plain transition between two non-lookahead states, flagged as
  // break or no-break.
  void transition(int state, int category, int targetState, bool breakBefore) {
    assert(state < stateLookaheadMin && targetState < stateLookaheadMin);
    transitionLA(
      state,
      category,
      targetState,
      breakBefore ? flagBreak : flagNoBreak,
    );
  }

  for (var state in backStates) {
    if (state < stateLookaheadMin) {
      if (state == stateRegionalOdd) {
        // Special state where we know the previous character
        // to some degree, due to having done look-ahead.
        // Most inputs are unreachable. Use EoT-nobreak as unreachable marker.
        // NOTE(review): `<=` also writes the cell at index `categoryCount`,
        // presumably the single padding cell, since the `continue` below
        // skips the padding loop at the end of this iteration. Confirm
        // against `automatonRowLength`.
        for (var i = 0; i <= categoryCount; i++) {
          transition(state, i, stateEoTNoBreak, false);
        }
        transition(state, categoryRegionalIndicator, stateRegionalEven, false);
        // Remaining inputs are unreachable.
        continue;
      }

      // `stateRegionalOdd` was handled above, so it needs no checks below.
      // No break is emitted in the EoT-no-break state.
      var notAfterEoT = state != stateEoTNoBreak;
      // The default: break unless in the Extend state or the EoT-no-break
      // state.
      var breakByDefault = state != stateExtend && notAfterEoT;

      transition(state, categoryOther, stateOther, breakByDefault);
      transition(state, categoryOtherIndicConsonant, stateInC, breakByDefault);
      transition(state, categoryLF, stateLF, notAfterEoT);
      // No break between CR and a following LF.
      transition(
        state,
        categoryCR,
        stateBreak,
        state != stateLF && notAfterEoT,
      );
      transition(state, categoryControl, stateBreak, notAfterEoT);

      transition(state, categoryExtend, stateExtend, breakByDefault);
      if (state != stateInC) {
        transition(
          state,
          categoryExtendIndicExtend,
          stateExtend,
          breakByDefault,
        );
        transition(
          state,
          categoryExtendIndicLinked,
          stateExtend,
          breakByDefault,
        );
      } else {
        // If these come just before an InCB Consonant, look ahead.
        transitionLA(
          state,
          categoryExtendIndicExtend,
          stateLookaheadInC,
          flagLookahead,
        );
        transitionLA(
          state,
          categoryExtendIndicLinked,
          stateLookaheadInCL,
          flagLookahead,
        );
      }
      transition(state, categorySpacingMark, stateExtend, breakByDefault);

      if (state == statePictographic) {
        // Break-before value has no effect on lookahead states.
        transitionLA(
          state,
          categoryZWJ,
          stateLookaheadZWJPictographic,
          flagLookahead,
        );
      } else if (state == stateInC) {
        transitionLA(state, categoryZWJ, stateLookaheadInC, flagLookahead);
      } else {
        transition(state, categoryZWJ, stateExtend, breakByDefault);
      }

      if (state == stateRegionalEven) {
        transition(state, categoryRegionalIndicator, stateRegionalOdd, true);
      } else if (state == stateRegionalSingle) {
        transitionLA(
          state,
          categoryRegionalIndicator,
          stateLookaheadRegionalEven,
          flagLookahead,
        );
      } else {
        transition(
          state,
          categoryRegionalIndicator,
          stateRegionalSingle,
          breakByDefault,
        );
      }

      // Prepend: break only in the Break, LF and EoT states. This tests
      // `stateLF`, the backward state listed in `backStates`, not the
      // forward automaton's `stateCR`.
      transition(
        state,
        categoryPrepend,
        stateOther,
        state == stateBreak || state == stateLF || state == stateEoT,
      );

      // Hangul.
      transition(
        state,
        categoryL,
        stateL,
        breakByDefault && state != stateL && state != stateV,
      );
      transition(
        state,
        categoryLV,
        stateL,
        breakByDefault && state != stateV && state != stateT,
      );
      transition(
        state,
        categoryLVT,
        stateL,
        breakByDefault && state != stateT,
      );
      transition(
        state,
        categoryV,
        stateV,
        breakByDefault && state != stateT && state != stateV,
      );
      transition(
        state,
        categoryT,
        stateT,
        breakByDefault && state != stateT,
      );

      transition(
        state,
        categoryPictographic,
        statePictographic,
        breakByDefault,
      );

      // Use EoT-NoBreak as marker for unreachable.
      transition(
        state,
        categorySoT,
        stateEoTNoBreak,
        state != stateEoT && notAfterEoT,
      );
    } else {
      // Lookahead states.
      if (state == stateLookaheadRegionalEven) {
        // Another RI continues the lookahead; any other input ends it in
        // `stateRegionalEven` with no flags.
        transitionLA(
          state,
          categoryRegionalIndicator,
          stateLookaheadRegionalOdd,
          0,
        );
        for (var c = 0; c < categoryCount; c++) {
          if (c != categoryRegionalIndicator) {
            transitionLA(state, c, stateRegionalEven, 0);
          }
        }
        continue;
      }
      if (state == stateLookaheadRegionalOdd) {
        // Another RI continues the lookahead; any other input ends it in
        // `stateRegionalOdd` with the break flag.
        transitionLA(
          state,
          categoryRegionalIndicator,
          stateLookaheadRegionalEven,
          0,
        );
        for (var c = 0; c < categoryCount; c++) {
          if (c != categoryRegionalIndicator) {
            transitionLA(state, c, stateRegionalOdd, flagBreak);
          }
        }
        continue;
      }

      // ZWJ+Pic and InCB lookaheads.
      transitionLA(state, categoryControl, stateExtend, flagLookaheadBreakBoth);
      transitionLA(state, categoryCR, stateExtend, flagLookaheadBreakBoth);
      transitionLA(state, categoryLF, stateExtend, flagLookaheadBreakBoth);
      transitionLA(state, categoryOther, stateOther, flagLookaheadBreakEarly);
      transitionLA(
        state,
        categorySpacingMark,
        stateExtend,
        flagLookaheadBreakEarly,
      );
      transitionLA(
        state,
        categoryRegionalIndicator,
        stateRegionalSingle,
        flagLookaheadBreakEarly,
      );
      // A pictographic confirms the ZWJ+Pic lookahead: no break there.
      transitionLA(
        state,
        categoryPictographic,
        statePictographic,
        state == stateLookaheadZWJPictographic
            ? flagLookaheadBreakNone
            : flagLookaheadBreakEarly,
      );
      transitionLA(state, categoryPrepend, stateOther, flagLookaheadBreakEarly);
      transitionLA(state, categoryL, stateL, flagLookaheadBreakEarly);
      transitionLA(state, categoryLV, stateL, flagLookaheadBreakEarly);
      transitionLA(state, categoryLVT, stateL, flagLookaheadBreakEarly);
      transitionLA(state, categoryV, stateV, flagLookaheadBreakEarly);
      transitionLA(state, categoryT, stateT, flagLookaheadBreakEarly);
      // An InCB consonant confirms the lookahead only in the `InCL` state.
      transitionLA(
        state,
        categoryOtherIndicConsonant,
        stateInC,
        state == stateLookaheadInCL
            ? flagLookaheadBreakNone
            : flagLookaheadBreakEarly,
      );
      if (state == stateLookaheadZWJPictographic) {
        transitionLA(state, categoryExtend, state, 0);
        transitionLA(state, categoryZWJ, stateExtend, flagLookaheadBreakEarly);
        transitionLA(state, categoryExtendIndicLinked, state, 0);
      } else {
        transitionLA(
          state,
          categoryExtend,
          stateExtend,
          flagLookaheadBreakEarly,
        );
        transitionLA(state, categoryZWJ, state, 0);
        transitionLA(state, categoryExtendIndicLinked, stateLookaheadInCL, 0);
      }
      transitionLA(state, categoryExtendIndicExtend, state, 0);
      transitionLA(state, categorySoT, stateExtend, flagLookaheadBreakBoth);
    }
    // Pad the row up to `automatonRowLength`.
    for (var i = categoryCount; i < automatonRowLength; i++) {
      transitionLA(state, i, stateEoTNoBreak, 0);
    }
  }

  // Emit the declaration, the table as a string literal, and the accessor.
  const prefix = 'const _backStateMachine = ';
  var stringWriter = StringLiteralWriter(buffer, padding: 4);
  buffer.write(prefix);
  stringWriter.start(prefix.length);
  for (var i = 0; i < table.length; i++) {
    stringWriter.add(table[i]);
  }
  stringWriter.end();
  buffer.write(';\n');
  buffer.write(_moveBackMethod);
  if (verbose) _writeBackTable(table, automatonRowLength);
}

/// Prints the forward automaton [table] to stdout in readable form.
///
/// If the rendering differs from [expectedAutomatonDescription], writes a
/// notice and the expected text to stderr.
void _writeForwardTable(Uint16List table, int automatonRowLength) {
  var automaton = _generateTable(
    table,
    automatonRowLength,
    stateLimit,
    stateShortName,
    backStateShortName,
    categoryShortNames,
    stateSoTNoBreak,
  );
  stdout.write(automaton);
  if (automaton != expectedAutomatonDescription) {
    stderr
      ..writeln('DIFFERS FROM EXPECTATION:')
      ..write(expectedAutomatonDescription);
  }
}

/// Prints the backward automaton [table] to stdout in readable form.
///
/// Uses the forward category names, with the `categorySoT` entry replaced by
/// `SoT`. If the rendering differs from [expectedBackAutomatonDescription],
/// writes a notice and the expected text to stderr.
void _writeBackTable(Uint16List table, int automatonRowLength) {
  var backCategoryNames = [...categoryShortNames]..[categorySoT] = 'SoT';
  var backAutomaton = _generateTable(
    table,
    automatonRowLength,
    backStateLimit,
    backStateShortName,
    backStateShortName,
    backCategoryNames,
    stateEoTNoBreak,
  );
  stdout.write(backAutomaton);
  if (backAutomaton != expectedBackAutomatonDescription) {
    stderr
      ..writeln('DIFFERS FROM EXPECTATION:')
      ..write(expectedBackAutomatonDescription);
  }
}

/// Writes an automaton table to string, for debugging.
///
/// The table has size `stateLimit`, which is a multiple of
/// `automatonRowLength` and `automatonRowLength >= categoryCount`.
/// The [stateNames] provide the names of the states for this particular
/// automaton (differs between forward and backward automaton).
/// It has a name for every target state that occurs in the *table*.
/// The [lookaheadStateNames] are used instead for the target of a cell whose
/// flags equal `flagLookahead`.
/// The [categoryNames] provide the column headers, one per category.
/// The table contains states multiplied by `automatonRowLength`, possibly
/// combined with flag bits (the bits of `maskFlags`). The flags are shown as
/// a one-character marker before the target state name, taken from the
/// string ` !$#` indexed by the flag value.
/// The [stateLimit] is an upper limit of "real" states that occur in the table,
/// states above that, if any, are synthetic states that trigger non-
/// automaton based scanning.
/// The [ignoreState] is a single state that is not displayed.
String _generateTable(
  Uint16List table,
  int automatonRowLength,
  int stateLimit, // A multiple of automatonRowLength
  String Function(int) stateNames,
  String Function(int) lookaheadStateNames,
  List<String> categoryNames,
  int ignoreState,
) {
  assert(automatonRowLength >= categoryCount);
  assert(table.length == stateLimit);
  var buf = StringBuffer();
  buf.writeln('Stat: Cat');
  // Measure the header row so the separator line below can match its width.
  var preHeaderLength = buf.length;
  buf.write(' :');
  for (var i = 0; i < categoryCount; i++) {
    buf
      ..write(' ')
      ..write(categoryNames[i].padRight(4));
  }
  buf.writeln(':');
  var lineLength = buf.length - preHeaderLength;
  // `lineLength` includes the header row's newline, hence the `- 1`.
  buf.writeln('-' * (lineLength - 1));
  for (var si = 0; si < stateLimit; si += automatonRowLength) {
    var stateName = stateNames(si);
    buf
      ..write(stateName.padRight(4))
      ..write(':');
    for (var ci = 0; ci < categoryCount; ci++) {
      var value = table[si + ci];
      var targetState = value & maskState;
      var flags = value & maskFlags;
      // One marker character per flag value.
      var prefix = r' !$#'[flags];
      var targetStateName = (flags == flagLookahead)
          ? lookaheadStateNames(targetState)
          : stateNames(targetState);
      // EoT is marker for unreachable states.
      if (targetState == ignoreState) targetStateName = ' - ';
      buf
        ..write(prefix)
        ..write(targetStateName.padRight(4));
    }
    buf.writeln(':');
  }
  return buf.toString();
}

/// Target state name for forward automaton.
///
/// A target reached with `flagLookahead` is named with `backStateShortName`;
/// any other target is named with `stateShortName`.
String _targetStateName(int state, int flags) {
  if (flags == flagLookahead) return backStateShortName(state);
  return stateShortName(state);
}

// Inlining pragmas interpolated into the generated `move` and `moveBack`
// source text.
const preferInline = """ @pragma('dart2js:prefer-inline') @pragma('vm:prefer-inline') @pragma('wasm:prefer-inline')""";