/// This library has a parser for HTML5 documents, that lets you parse HTML
/// easily from a script or server side application:
///
///     import 'package:html/parser.dart' show parse;
///     import 'package:html/dom.dart';
///     main() {
///       var document = parse(
///         'Hello world! HTML5 rocks!');
///       print(document.outerHtml);
///     }
///
/// The resulting document you get back has a DOM-like API for easy tree
/// traversal and manipulation.
library;

import 'dart:collection';
import 'dart:math';

import 'package:source_span/source_span.dart';

import 'dom.dart';
import 'src/constants.dart';
import 'src/encoding_parser.dart';
import 'src/token.dart';
import 'src/tokenizer.dart';
import 'src/treebuilder.dart';
import 'src/utils.dart';

/// Parses [input] as a complete html5 document and returns its tree.
///
/// The [input] can be a `String`, a `List` of bytes, or an
/// [HtmlTokenizer].
///
/// When [input] is not an [HtmlTokenizer], [encoding] may name the file's
/// character encoding (it must be a string). If specified, that encoding is
/// used regardless of any BOM or later declaration (such as in a meta
/// element).
///
/// Set [generateSpans] to generate [SourceSpan]s; otherwise the
/// [Node.sourceSpan] property will be `null`. Together with [generateSpans],
/// [sourceUrl] can indicate where the [input] was extracted from.
Document parse(
  dynamic input, {
  String? encoding,
  bool generateSpans = false,
  String? sourceUrl,
}) {
  // All arguments are forwarded unchanged to a one-shot parser instance.
  return HtmlParser(
    input,
    encoding: encoding,
    generateSpans: generateSpans,
    sourceUrl: sourceUrl,
  ).parse();
}

/// Parses [input] as an html5 document fragment and returns its tree.
///
/// The [input] can be a `String`, a `List` of bytes, or an
/// [HtmlTokenizer].
///
/// The [container] names the element the fragment is parsed inside of; it
/// defaults to "div".
///
/// When [input] is not an [HtmlTokenizer], [encoding] may name the file's
/// character encoding (it must be a string). If specified, that encoding is
/// used regardless of any BOM or later declaration (such as in a meta
/// element).
///
/// Set [generateSpans] to generate [SourceSpan]s; otherwise the
/// [Node.sourceSpan] property will be `null`. Together with [generateSpans],
/// [sourceUrl] can indicate where the [input] was extracted from.
DocumentFragment parseFragment(
  dynamic input, {
  String container = 'div',
  String? encoding,
  bool generateSpans = false,
  String? sourceUrl,
}) {
  return HtmlParser(
    input,
    encoding: encoding,
    generateSpans: generateSpans,
    sourceUrl: sourceUrl,
  ).parseFragment(container);
}

/// Parser for HTML, which generates a tree structure from a stream of
/// (possibly malformed) characters.
class HtmlParser {
  /// Raise an exception on the first error encountered.
  final bool strict;

  /// True to generate [SourceSpan]s for the [Node.sourceSpan] property.
  final bool generateSpans;

  final HtmlTokenizer tokenizer;

  final TreeBuilder tree;

  final List errors = [];

  bool firstStartTag = false;

  // TODO(jmesserly): use enum?
  /// "quirks" / "limited quirks" / "no quirks"
  String compatMode = 'no quirks';

  /// innerHTML container when parsing document fragment.
  String? innerHTML;

  late Phase phase = _initialPhase;

  Phase? originalPhase;

  bool framesetOK = true;

  // These fields hold the different phase singletons. At any given time one
  // of them will be active.
late final _initialPhase = InitialPhase(this);

  late final _beforeHtmlPhase = BeforeHtmlPhase(this);

  late final _beforeHeadPhase = BeforeHeadPhase(this);

  late final _inHeadPhase = InHeadPhase(this);

  // TODO: html5lib did not implement the no script parsing mode
  // More information here:
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#scripting-flag
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inheadnoscript
  // late final _inHeadNoscript = InHeadNoScriptPhase(this);

  late final _afterHeadPhase = AfterHeadPhase(this);

  late final _inBodyPhase = InBodyPhase(this);

  late final _textPhase = TextPhase(this);

  late final _inTablePhase = InTablePhase(this);

  late final _inTableTextPhase = InTableTextPhase(this);

  late final _inCaptionPhase = InCaptionPhase(this);

  late final _inColumnGroupPhase = InColumnGroupPhase(this);

  late final _inTableBodyPhase = InTableBodyPhase(this);

  late final _inRowPhase = InRowPhase(this);

  late final _inCellPhase = InCellPhase(this);

  late final _inSelectPhase = InSelectPhase(this);

  late final _inSelectInTablePhase = InSelectInTablePhase(this);

  late final _inForeignContentPhase = InForeignContentPhase(this);

  late final _afterBodyPhase = AfterBodyPhase(this);

  late final _inFramesetPhase = InFramesetPhase(this);

  late final _afterFramesetPhase = AfterFramesetPhase(this);

  late final _afterAfterBodyPhase = AfterAfterBodyPhase(this);

  late final _afterAfterFramesetPhase = AfterAfterFramesetPhase(this);

  /// Create and configure an HtmlParser.
  ///
  /// The [input] can be a `String`, a `List` of bytes, or an
  /// [HtmlTokenizer].
  ///
  /// The [strict], [tree] builder, and [generateSpans] arguments configure
  /// behavior for any type of input.
  ///
  /// If [input] is not a [HtmlTokenizer], you can specify a few more
  /// arguments.
  ///
  /// The [encoding] must be a string that indicates the encoding. If
  /// specified, that encoding will be used, regardless of any BOM or later
  /// declaration (such as in a meta element).
  ///
  /// Set [parseMeta] to false if you want to disable parsing the meta element.
  ///
  /// Set [lowercaseElementName] or [lowercaseAttrName] to false to disable the
  /// automatic conversion of element and attribute names to lower case. Note
  /// that the standard way to parse HTML is to lowercase, which is what the
  /// browser DOM will do if you request `Element.outerHTML`, for example.
  HtmlParser(
    dynamic input, {
    TreeBuilder? tree,
    this.strict = false,
    this.generateSpans = false,
    String? encoding,
    bool parseMeta = true,
    bool lowercaseElementName = true,
    bool lowercaseAttrName = true,
    String? sourceUrl,
  })  : tree = tree ?? TreeBuilder(true),
        // An existing tokenizer is used as-is; the tokenizer-specific options
        // only apply when one has to be created here.
        tokenizer = input is HtmlTokenizer
            ? input
            : HtmlTokenizer(input,
                encoding: encoding,
                parseMeta: parseMeta,
                lowercaseElementName: lowercaseElementName,
                lowercaseAttrName: lowercaseAttrName,
                generateSpans: generateSpans,
                sourceUrl: sourceUrl) {
    tokenizer.parser = this;
  }

  /// Whether a document fragment is being parsed, i.e. [innerHTML] is set.
  bool get innerHTMLMode => innerHTML != null;

  /// Parse an html5 document into a tree.
  ///
  /// After parsing, [errors] will be populated with parse errors, if any.
  Document parse() {
    innerHTML = null;
    _parse();
    return tree.getDocument();
  }

  /// Parse an html5 document fragment into a tree.
  ///
  /// Pass a [container] to change the type of the containing element; it is
  /// lower-cased before use.
  /// After parsing, [errors] will be populated with parse errors, if any.
  DocumentFragment parseFragment([String container = 'div']) {
    // No runtime null check: `container` is a non-nullable `String`, so the
    // type system already rules out null here.
    innerHTML = container.toLowerCase();
    _parse();
    return tree.getFragment();
  }

  void _parse() {
    reset();

    while (true) {
      try {
        mainLoop();
        break;
      } on ReparseException catch (_) {
        // Note: this happens if we start parsing but the character encoding
        // changes. So we should only need to restart very early in the parse.
reset();
      }
    }
  }

  /// Puts the tokenizer, the tree builder and this parser back into their
  /// starting state, choosing the first phase from [innerHTMLMode].
  void reset() {
    tokenizer.reset();
    tree.reset();
    firstStartTag = false;
    errors.clear();
    // "quirks" / "limited quirks" / "no quirks"
    compatMode = 'no quirks';

    if (!innerHTMLMode) {
      phase = _initialPhase;
    } else {
      // Fragment parsing: the container element decides the tokenizer state.
      // NOTE(review): `cdataElements` selects the RCDATA state and
      // `rcdataElements` the RAWTEXT state; the lists are defined outside
      // this file, so confirm their contents before "fixing" the names.
      final container = innerHTML;
      if (cdataElements.contains(container)) {
        tokenizer.state = tokenizer.rcdataState;
      } else if (rcdataElements.contains(container)) {
        tokenizer.state = tokenizer.rawtextState;
      } else if (container == 'plaintext') {
        tokenizer.state = tokenizer.plaintextState;
      }
      // Otherwise: state already is data state
      // tokenizer.state = tokenizer.dataState;

      phase = _beforeHtmlPhase;
      _beforeHtmlPhase.insertHtmlElement();
      resetInsertionMode();
    }

    framesetOK = true;
  }

  /// Whether [element] is a MathML `annotation-xml` element whose `encoding`
  /// attribute is `text/html` or `application/xhtml+xml` (ASCII
  /// case-insensitive), or is listed in `htmlIntegrationPointElements`.
  bool isHTMLIntegrationPoint(Element element) {
    final isMathMLAnnotationXml = element.localName == 'annotation-xml' &&
        element.namespaceUri == Namespaces.mathml;
    if (!isMathMLAnnotationXml) {
      return htmlIntegrationPointElements
          .contains((element.namespaceUri, element.localName));
    }

    final enc = element.attributes['encoding']?.toAsciiLowerCase();
    return enc == 'text/html' || enc == 'application/xhtml+xml';
  }

  /// Whether [element] is listed in `mathmlTextIntegrationPointElements`.
  bool isMathMLTextIntegrationPoint(Element element) =>
      mathmlTextIntegrationPointElements
          .contains((element.namespaceUri, element.localName));

  /// Whether a token of kind [type] must be handled by the foreign-content
  /// phase, judged from the current (last) open element.
  ///
  /// [token] is only inspected, as a [StartTagToken], when [type] is
  /// [TokenKind.startTag].
  bool inForeignContent(Token token, int type) {
    if (tree.openElements.isEmpty) return false;

    final node = tree.openElements.last;
    if (node.namespaceUri == tree.defaultNamespace) return false;

    final isStartTag = type == TokenKind.startTag;
    final isText =
        type == TokenKind.characters || type == TokenKind.spaceCharacters;

    if (isMathMLTextIntegrationPoint(node)) {
      if (isStartTag &&
          (token as StartTagToken).name != 'mglyph' &&
          token.name != 'malignmark') {
        return false;
      }
      if (isText) return false;
    }

    if (node.localName == 'annotation-xml' &&
        isStartTag &&
        (token as StartTagToken).name == 'svg') {
      return false;
    }

    if (isHTMLIntegrationPoint(node) && (isStartTag || isText)) {
      return false;
    }

    return true;
  }

  /// Pulls tokens from the tokenizer and hands each one to the active phase
  /// until the input is exhausted, then runs end-of-file processing.
  void mainLoop() {
    while (tokenizer.moveNext()) {
      final token = tokenizer.current;

      // A phase hands back a token when it wants it processed again (usually
      // after switching phase); null means the token was consumed.
      Token? pending = token;
      while (pending != null) {
        final type = pending.kind;

        // Note: avoid "is" test here, see http://dartbug.com/4795
        if (type == TokenKind.parseError) {
          final error = pending as ParseErrorToken;
          parseError(error.span, error.data, error.messageParams);
          pending = null;
          continue;
        }

        final Phase localPhase =
            inForeignContent(token, type) ? _inForeignContentPhase : phase;

        switch (type) {
          case TokenKind.characters:
            pending = localPhase.processCharacters(pending as CharactersToken);
          case TokenKind.spaceCharacters:
            pending = localPhase
                .processSpaceCharacters(pending as SpaceCharactersToken);
          case TokenKind.startTag:
            pending = localPhase.processStartTag(pending as StartTagToken);
          case TokenKind.endTag:
            pending = localPhase.processEndTag(pending as EndTagToken);
          case TokenKind.comment:
            pending = localPhase.processComment(pending as CommentToken);
          case TokenKind.doctype:
            pending = localPhase.processDoctype(pending as DoctypeToken);
        }
      }

      // A self-closing start tag that no phase acknowledged is an error.
      if (token is StartTagToken &&
          token.selfClosing &&
          !token.selfClosingAcknowledged) {
        parseError(token.span, 'non-void-element-with-trailing-solidus',
            {'name': token.name});
      }
    }

    // When the loop finishes it's EOF
    final visitedPhases = [];
    bool reprocess;
    do {
      visitedPhases.add(phase);
      reprocess = phase.processEOF();
      // A phase asking for reprocessing must have moved to a phase that has
      // not handled EOF yet.
      assert(!reprocess || !visitedPhases.contains(phase));
    } while (reprocess);
  }

  /// The last span available. Used for EOF errors if we don't have something
  /// better.
  SourceSpan? get _lastSpan => tokenizer.stream.fileInfo
      ?.location(tokenizer.stream.position)
      .pointSpan();

  void parseError(SourceSpan? span, String errorcode,
      [Map? 
datavars = const {}]) {
    // Fall back to the tokenizer's current position when the caller has no
    // span and spans are not being generated.
    if (!generateSpans && span == null) {
      span = _lastSpan;
    }

    final err = ParseError(errorcode, span, datavars);
    errors.add(err);
    if (strict) throw err;
  }

  /// Renames the lower-cased `definitionurl` attribute of [token] to
  /// `definitionURL`, if present.
  void adjustMathMLAttributes(StartTagToken token) {
    final orig = token.data.remove('definitionurl');
    if (orig != null) {
      token.data['definitionURL'] = orig;
    }
  }

  /// Restores the mixed-case spelling of the SVG attribute names of [token]
  /// that are listed in the table below.
  ///
  /// Attribute keys that are not in the table are left untouched. That
  /// includes non-string keys such as the [AttributeName]s written by
  /// [adjustForeignAttributes], so this is safe on an already adjusted token.
  void adjustSVGAttributes(StartTagToken token) {
    const replacements = {
      'attributename': 'attributeName',
      'attributetype': 'attributeType',
      'basefrequency': 'baseFrequency',
      'baseprofile': 'baseProfile',
      'calcmode': 'calcMode',
      'clippathunits': 'clipPathUnits',
      'contentscripttype': 'contentScriptType',
      'contentstyletype': 'contentStyleType',
      'diffuseconstant': 'diffuseConstant',
      'edgemode': 'edgeMode',
      'externalresourcesrequired': 'externalResourcesRequired',
      'filterres': 'filterRes',
      'filterunits': 'filterUnits',
      'glyphref': 'glyphRef',
      'gradienttransform': 'gradientTransform',
      'gradientunits': 'gradientUnits',
      'kernelmatrix': 'kernelMatrix',
      'kernelunitlength': 'kernelUnitLength',
      'keypoints': 'keyPoints',
      'keysplines': 'keySplines',
      'keytimes': 'keyTimes',
      'lengthadjust': 'lengthAdjust',
      'limitingconeangle': 'limitingConeAngle',
      'markerheight': 'markerHeight',
      'markerunits': 'markerUnits',
      'markerwidth': 'markerWidth',
      'maskcontentunits': 'maskContentUnits',
      'maskunits': 'maskUnits',
      'numoctaves': 'numOctaves',
      'pathlength': 'pathLength',
      'patterncontentunits': 'patternContentUnits',
      'patterntransform': 'patternTransform',
      'patternunits': 'patternUnits',
      'pointsatx': 'pointsAtX',
      'pointsaty': 'pointsAtY',
      'pointsatz': 'pointsAtZ',
      'preservealpha': 'preserveAlpha',
      'preserveaspectratio': 'preserveAspectRatio',
      'primitiveunits': 'primitiveUnits',
      'refx': 'refX',
      'refy': 'refY',
      'repeatcount': 'repeatCount',
      'repeatdur': 'repeatDur',
      'requiredextensions': 'requiredExtensions',
      'requiredfeatures': 'requiredFeatures',
      'specularconstant': 'specularConstant',
      'specularexponent': 'specularExponent',
      'spreadmethod': 'spreadMethod',
      'startoffset': 'startOffset',
      'stddeviation': 'stdDeviation',
      'stitchtiles': 'stitchTiles',
      'surfacescale': 'surfaceScale',
      'systemlanguage': 'systemLanguage',
      'tablevalues': 'tableValues',
      'targetx': 'targetX',
      'targety': 'targetY',
      'textlength': 'textLength',
      'viewbox': 'viewBox',
      'viewtarget': 'viewTarget',
      'xchannelselector': 'xChannelSelector',
      'ychannelselector': 'yChannelSelector',
      'zoomandpan': 'zoomAndPan'
    };
    // Iterate over a snapshot of the keys: the map is modified in the loop.
    for (final originalName in token.data.keys.toList(growable: false)) {
      // Map lookup accepts any object, so no downcast is needed; a key that
      // is not a known string simply yields null and is skipped.
      final svgName = replacements[originalName];
      if (svgName == null) continue;
      token.data[svgName] = token.data.remove(originalName)!;
    }
  }

  /// Replaces the `xlink:*`, `xml:*` and `xmlns*` attribute names of [token]
  /// that are listed in the table below with namespaced [AttributeName] keys.
  ///
  /// Attribute keys that are not in the table, including keys that already
  /// are [AttributeName]s, are left untouched.
  void adjustForeignAttributes(StartTagToken token) {
    // TODO(jmesserly): I don't like mixing non-string objects with strings in
    // the Node.attributes Map. Is there another solution?
    const replacements = {
      'xlink:actuate': AttributeName('xlink', 'actuate', Namespaces.xlink),
      'xlink:arcrole': AttributeName('xlink', 'arcrole', Namespaces.xlink),
      'xlink:href': AttributeName('xlink', 'href', Namespaces.xlink),
      'xlink:role': AttributeName('xlink', 'role', Namespaces.xlink),
      'xlink:show': AttributeName('xlink', 'show', Namespaces.xlink),
      'xlink:title': AttributeName('xlink', 'title', Namespaces.xlink),
      'xlink:type': AttributeName('xlink', 'type', Namespaces.xlink),
      'xml:base': AttributeName('xml', 'base', Namespaces.xml),
      'xml:lang': AttributeName('xml', 'lang', Namespaces.xml),
      'xml:space': AttributeName('xml', 'space', Namespaces.xml),
      'xmlns': AttributeName(null, 'xmlns', Namespaces.xmlns),
      'xmlns:xlink': AttributeName('xmlns', 'xlink', Namespaces.xmlns)
    };

    // Iterate over a snapshot of the keys: the map is modified in the loop.
    for (final originalName in token.data.keys.toList(growable: false)) {
      final foreignName = replacements[originalName];
      if (foreignName == null) continue;
      token.data[foreignName] = token.data.remove(originalName)!;
    }
  }

  void resetInsertionMode() {
    // The name of this method is mostly historical. (It's also used in the
    // specification.)
for (final node in tree.openElements.reversed) {
      // Walk the open elements from the innermost outwards; the first one
      // that maps to a phase below decides the phase.
      final last = node == tree.openElements[0];
      var nodeName = node.localName;
      if (last) {
        // The outermost element stands in for the fragment's container.
        assert(innerHTMLMode);
        nodeName = innerHTML;
      }

      // Check for conditions that should only happen in the innerHTML
      // case
      switch (nodeName) {
        case 'select' || 'colgroup' || 'head' || 'html':
          assert(innerHTMLMode);
      }

      if (!last && node.namespaceUri != tree.defaultNamespace) {
        continue;
      }

      final Phase? selected = switch (nodeName) {
        'select' => _inSelectPhase,
        'td' || 'th' => _inCellPhase,
        'tr' => _inRowPhase,
        'tbody' || 'thead' || 'tfoot' => _inTableBodyPhase,
        'caption' => _inCaptionPhase,
        'colgroup' => _inColumnGroupPhase,
        'table' => _inTablePhase,
        'head' || 'body' => _inBodyPhase,
        'frameset' => _inFramesetPhase,
        'html' => _beforeHeadPhase,
        _ => null,
      };
      if (selected != null) {
        phase = selected;
        return;
      }
    }

    // Nothing on the stack decided the phase.
    phase = _inBodyPhase;
  }

  /// Inserts the element for [token], which must be a [StartTagToken], and
  /// switches the tokenizer and this parser over to reading its text content.
  ///
  /// [contentType] must be `'RAWTEXT'` or `'RCDATA'` and picks the tokenizer
  /// state. The current phase is remembered in [originalPhase].
  void parseRCDataRawtext(Token token, String contentType) {
    assert(contentType == 'RAWTEXT' || contentType == 'RCDATA');

    tree.insertElement(token as StartTagToken);

    tokenizer.state = contentType == 'RAWTEXT'
        ? tokenizer.rawtextState
        : tokenizer.rcdataState;

    originalPhase = phase;
    phase = _textPhase;
  }
}

/// Base class for helper object that implements each phase of processing.
class Phase {
  // Order should be (they can be omitted):
  // * EOF
  // * Comment
  // * Doctype
  // * SpaceCharacters
  // * Characters
  // * StartTag
  //   - startTag* methods
  // * EndTag
  //   - endTag* methods

  final HtmlParser parser;

  final TreeBuilder tree;

  Phase(this.parser) : tree = parser.tree;

  bool processEOF() {
    throw UnimplementedError();
  }

  Token? 
processComment(CommentToken token) {
    // This default suits most phases; the ones that need something else
    // override it.
    tree.insertComment(token, tree.openElements.last);
    return null;
  }

  /// Reports an `unexpected-doctype` parse error and consumes [token].
  Token? processDoctype(DoctypeToken token) {
    parser.parseError(token.span, 'unexpected-doctype');
    return null;
  }

  /// Inserts the text of [token] into the tree and consumes it.
  Token? processCharacters(CharactersToken token) {
    tree.insertText(token.data, token.span);
    return null;
  }

  /// Inserts the whitespace of [token] into the tree and consumes it.
  Token? processSpaceCharacters(SpaceCharactersToken token) {
    tree.insertText(token.data, token.span);
    return null;
  }

  /// Handles a start tag. The base implementation always throws
  /// [UnimplementedError].
  Token? processStartTag(StartTagToken token) {
    throw UnimplementedError();
  }

  /// Handles an `html` start tag by merging its attributes into the first
  /// open element; attributes that element already has are kept.
  Token? startTagHtml(StartTagToken token) {
    if (!parser.firstStartTag && token.name == 'html') {
      parser.parseError(token.span, 'non-html-root');
    }
    // XXX Need a check here to see if the first start tag token emitted is
    // this token... If it's not, invoke parser.parseError().
    final root = tree.openElements[0];
    root.sourceSpan = token.span;
    for (final entry in token.data.entries) {
      root.attributes.putIfAbsent(entry.key, () => entry.value);
    }
    parser.firstStartTag = false;
    return null;
  }

  /// Handles an end tag. The base implementation always throws
  /// [UnimplementedError].
  Token? processEndTag(EndTagToken token) {
    throw UnimplementedError();
  }

  /// Pops open elements up to and including the innermost one whose local
  /// name equals the name of [token], and records the token's span as that
  /// element's end span.
  void popOpenElementsUntil(EndTagToken token) {
    while (true) {
      final node = tree.openElements.removeLast();
      if (node.localName == token.name) {
        node.endSourceSpan = token.span;
        return;
      }
    }
  }
}

/// The phase a document parse starts in (see `HtmlParser.reset`); every
/// handler visible here either consumes the token or moves the parser on to
/// `_beforeHtmlPhase`.
class InitialPhase extends Phase {
  InitialPhase(super.parser);

  /// Drops whitespace: nothing is inserted into the tree.
  @override
  Token? processSpaceCharacters(SpaceCharactersToken token) => null;

  /// Attaches the comment to the document itself.
  @override
  Token? processComment(CommentToken token) {
    tree.insertComment(token, tree.document);
    return null;
  }

  @override
  Token? 
processDoctype(DoctypeToken token) {
    // Validates the doctype, inserts it into the tree, derives the parser's
    // compatibility mode from its identifiers and then moves on to the
    // before-html phase. The token is always consumed (null is returned).
    final name = token.name;
    // The public identifier is compared in ASCII lower case throughout.
    var publicId = token.publicId?.toAsciiLowerCase();
    final systemId = token.systemId;
    final correct = token.correct;

    // Anything other than `html` with no public id and either no system id
    // or `about:legacy-compat` is reported. Note that `&&` binds tighter than
    // `||` here.
    if (name != 'html' ||
        publicId != null ||
        systemId != null && systemId != 'about:legacy-compat') {
      parser.parseError(token.span, 'unknown-doctype');
    }

    // From here on a missing public id is treated as the empty string.
    publicId ??= '';

    tree.insertDoctype(token);

    // Quirks mode: incorrect token, wrong name, a public id that starts with
    // one of the listed prefixes or equals one of the listed ids, an HTML
    // 4.01 frameset/transitional public id without a system id, or the one
    // listed system id (compared in lower case).
    if (!correct ||
        token.name != 'html' ||
        startsWithAny(publicId, const [
          '+//silmaril//dtd html pro v0r11 19970101//',
          '-//advasoft ltd//dtd html 3.0 aswedit + extensions//',
          '-//as//dtd html 3.0 aswedit + extensions//',
          '-//ietf//dtd html 2.0 level 1//',
          '-//ietf//dtd html 2.0 level 2//',
          '-//ietf//dtd html 2.0 strict level 1//',
          '-//ietf//dtd html 2.0 strict level 2//',
          '-//ietf//dtd html 2.0 strict//',
          '-//ietf//dtd html 2.0//',
          '-//ietf//dtd html 2.1e//',
          '-//ietf//dtd html 3.0//',
          '-//ietf//dtd html 3.2 final//',
          '-//ietf//dtd html 3.2//',
          '-//ietf//dtd html 3//',
          '-//ietf//dtd html level 0//',
          '-//ietf//dtd html level 1//',
          '-//ietf//dtd html level 2//',
          '-//ietf//dtd html level 3//',
          '-//ietf//dtd html strict level 0//',
          '-//ietf//dtd html strict level 1//',
          '-//ietf//dtd html strict level 2//',
          '-//ietf//dtd html strict level 3//',
          '-//ietf//dtd html strict//',
          '-//ietf//dtd html//',
          '-//metrius//dtd metrius presentational//',
          '-//microsoft//dtd internet explorer 2.0 html strict//',
          '-//microsoft//dtd internet explorer 2.0 html//',
          '-//microsoft//dtd internet explorer 2.0 tables//',
          '-//microsoft//dtd internet explorer 3.0 html strict//',
          '-//microsoft//dtd internet explorer 3.0 html//',
          '-//microsoft//dtd internet explorer 3.0 tables//',
          '-//netscape comm. corp.//dtd html//',
          '-//netscape comm. corp.//dtd strict html//',
          "-//o'reilly and associates//dtd html 2.0//",
          "-//o'reilly and associates//dtd html extended 1.0//",
          "-//o'reilly and associates//dtd html extended relaxed 1.0//",
          '-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//',
          '-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//',
          '-//spyglass//dtd html 2.0 extended//',
          '-//sq//dtd html 2.0 hotmetal + extensions//',
          '-//sun microsystems corp.//dtd hotjava html//',
          '-//sun microsystems corp.//dtd hotjava strict html//',
          '-//w3c//dtd html 3 1995-03-24//',
          '-//w3c//dtd html 3.2 draft//',
          '-//w3c//dtd html 3.2 final//',
          '-//w3c//dtd html 3.2//',
          '-//w3c//dtd html 3.2s draft//',
          '-//w3c//dtd html 4.0 frameset//',
          '-//w3c//dtd html 4.0 transitional//',
          '-//w3c//dtd html experimental 19960712//',
          '-//w3c//dtd html experimental 970421//',
          '-//w3c//dtd w3 html//',
          '-//w3o//dtd w3 html 3.0//',
          '-//webtechs//dtd mozilla html 2.0//',
          '-//webtechs//dtd mozilla html//'
        ]) ||
        const [
          '-//w3o//dtd w3 html strict 3.0//en//',
          '-/w3c/dtd html 4.0 transitional/en',
          'html'
        ].contains(publicId) ||
        startsWithAny(publicId, const [
              '-//w3c//dtd html 4.01 frameset//',
              '-//w3c//dtd html 4.01 transitional//'
            ]) &&
            systemId == null ||
        systemId != null &&
            systemId.toLowerCase() ==
                'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd') {
      parser.compatMode = 'quirks';
    } else if (startsWithAny(publicId, const [
          '-//w3c//dtd xhtml 1.0 frameset//',
          '-//w3c//dtd xhtml 1.0 transitional//'
        ]) ||
        startsWithAny(publicId, const [
              '-//w3c//dtd html 4.01 frameset//',
              '-//w3c//dtd html 4.01 transitional//'
            ]) &&
            systemId != null) {
      // Limited quirks: XHTML 1.0 frameset/transitional, or HTML 4.01
      // frameset/transitional with a system id present.
      parser.compatMode = 'limited quirks';
    }
    // In every other case compatMode is left unchanged.

    parser.phase = parser._beforeHtmlPhase;
    return null;
  }

  // Shared fallback for tokens other than a doctype: switches to quirks mode
  // and moves on to the before-html phase.
  void anythingElse() {
    parser.compatMode = 'quirks';
    parser.phase = parser._beforeHtmlPhase;
  }

  // Text where a doctype was expected: report it, fall back, and return the
  // token so that it is processed again by the new phase.
  @override
  Token processCharacters(CharactersToken token) {
    parser.parseError(token.span, 'expected-doctype-but-got-chars');
    anythingElse();
    return token;
  }

  @override
  Token 
processStartTag(StartTagToken token) {
    // A start tag where a doctype was expected: report it, fall back, and
    // hand the token back so that the new phase processes it.
    parser.parseError(
        token.span, 'expected-doctype-but-got-start-tag', {'name': token.name});
    anythingElse();
    return token;
  }

  @override
  Token processEndTag(EndTagToken token) {
    parser.parseError(
        token.span, 'expected-doctype-but-got-end-tag', {'name': token.name});
    anythingElse();
    return token;
  }

  @override
  bool processEOF() {
    parser.parseError(parser._lastSpan, 'expected-doctype-but-got-eof');
    anythingElse();
    // True asks the caller to run EOF processing again in the new phase.
    return true;
  }
}

/// Phase in which the root `html` element is created.
///
/// Comments are attached to the document and whitespace is dropped; text and
/// start tags create the root element and are then handed back for
/// reprocessing.
class BeforeHtmlPhase extends Phase {
  BeforeHtmlPhase(super.parser);

  // helper methods

  /// Inserts a root element built from a synthetic `html` start tag with an
  /// empty attribute map, then switches the parser to `_beforeHeadPhase`.
  void insertHtmlElement() {
    tree.insertRoot(StartTagToken('html', data: LinkedHashMap()));
    parser.phase = parser._beforeHeadPhase;
  }

  // other

  @override
  bool processEOF() {
    insertHtmlElement();
    return true;
  }

  @override
  Token? processComment(CommentToken token) {
    tree.insertComment(token, tree.document);
    return null;
  }

  @override
  Token? processSpaceCharacters(SpaceCharactersToken token) {
    return null;
  }

  @override
  Token processCharacters(CharactersToken token) {
    insertHtmlElement();
    return token;
  }

  @override
  Token processStartTag(StartTagToken token) {
    // Remember that the document really started with an `html` tag.
    if (token.name == 'html') {
      parser.firstStartTag = true;
    }
    insertHtmlElement();
    return token;
  }

  @override
  Token? processEndTag(EndTagToken token) {
    switch (token.name) {
      case 'head':
      case 'body':
      case 'html':
      case 'br':
        // These end tags imply the root element; reprocess them afterwards.
        insertHtmlElement();
        return token;
      default:
        parser.parseError(
            token.span, 'unexpected-end-tag-before-html', {'name': token.name});
        return null;
    }
  }
}

class BeforeHeadPhase extends Phase {
  BeforeHeadPhase(super.parser);

  @override
  Token? processStartTag(StartTagToken token) {
    switch (token.name) {
      case 'html':
        return startTagHtml(token);
      case 'head':
        startTagHead(token);
        return null;
      default:
        return startTagOther(token);
    }
  }

  @override
  Token? 
processEndTag(EndTagToken token) {
    switch (token.name) {
      case 'head' || 'body' || 'html' || 'br':
        return endTagImplyHead(token);
      default:
        endTagOther(token);
        return null;
    }
  }

  /// Inserts a `head` element from a synthetic start tag with an empty
  /// attribute map.
  void _insertImpliedHead() =>
      startTagHead(StartTagToken('head', data: LinkedHashMap()));

  /// At end of input: inserts the implied `head` and asks for EOF processing
  /// to be run again.
  @override
  bool processEOF() {
    _insertImpliedHead();
    return true;
  }

  /// Drops whitespace: nothing is inserted into the tree.
  @override
  Token? processSpaceCharacters(SpaceCharactersToken token) => null;

  /// Text implies a `head`; [token] is handed back for reprocessing.
  @override
  Token processCharacters(CharactersToken token) {
    _insertImpliedHead();
    return token;
  }

  /// Lets the in-body phase deal with an `html` start tag.
  @override
  Token? startTagHtml(StartTagToken token) =>
      parser._inBodyPhase.processStartTag(token);

  /// Inserts the element for [token], records it as the tree's head pointer
  /// and switches the parser to `_inHeadPhase`.
  void startTagHead(StartTagToken token) {
    tree.insertElement(token);
    tree.headPointer = tree.openElements.last;
    parser.phase = parser._inHeadPhase;
  }

  /// Any other start tag implies a `head`; [token] is handed back for
  /// reprocessing.
  Token startTagOther(StartTagToken token) {
    _insertImpliedHead();
    return token;
  }

  /// These end tags imply a `head`; [token] is handed back for reprocessing.
  Token endTagImplyHead(EndTagToken token) {
    _insertImpliedHead();
    return token;
  }

  /// Reports an `end-tag-after-implied-root` parse error for [token].
  void endTagOther(EndTagToken token) {
    parser.parseError(
        token.span, 'end-tag-after-implied-root', {'name': token.name});
  }
}

/// Phase that dispatches start tags by name to the `startTag*` handlers of
/// this class.
class InHeadPhase extends Phase {
  InHeadPhase(super.parser);

  @override
  Token? processStartTag(StartTagToken token) {
    switch (token.name) {
      case 'html':
        return startTagHtml(token);
      case 'title':
        startTagTitle(token);
        return null;
      case 'noscript' || 'noframes' || 'style':
        startTagNoScriptNoFramesStyle(token);
        return null;
      case 'script':
        startTagScript(token);
        return null;
      case 'base' || 'basefont' || 'bgsound' || 'command' || 'link':
        startTagBaseLinkCommand(token);
        return null;
      case 'meta':
        startTagMeta(token);
        return null;
      case 'head':
        startTagHead(token);
        return null;
      default:
        return startTagOther(token);
    }
  }

  @override
  Token? 
processEndTag(EndTagToken token) {
    switch (token.name) {
      case 'head':
        endTagHead(token);
        return null;
      case 'br' || 'html' || 'body':
        return endTagHtmlBodyBr(token);
      default:
        endTagOther(token);
        return null;
    }
  }

  // the real thing

  /// At end of input: closes the head and asks for EOF processing to be run
  /// again.
  @override
  bool processEOF() {
    anythingElse();
    return true;
  }

  /// Text closes the head; [token] is handed back for reprocessing.
  @override
  Token processCharacters(CharactersToken token) {
    anythingElse();
    return token;
  }

  /// Lets the in-body phase deal with an `html` start tag.
  @override
  Token? startTagHtml(StartTagToken token) =>
      parser._inBodyPhase.processStartTag(token);

  /// Reports a `two-heads-are-not-better-than-one` parse error.
  void startTagHead(StartTagToken token) {
    parser.parseError(token.span, 'two-heads-are-not-better-than-one');
  }

  /// Inserts the element for [token] and immediately pops it off the stack
  /// of open elements, acknowledging a self-closing flag.
  void startTagBaseLinkCommand(StartTagToken token) {
    tree.insertElement(token);
    tree.openElements.removeLast();
    token.selfClosingAcknowledged = true;
  }

  /// Inserts a `meta` element like [startTagBaseLinkCommand] does and, while
  /// the stream's encoding is not certain, passes on the encoding named by
  /// the `charset` attribute or else the one parsed from `content`.
  void startTagMeta(StartTagToken token) {
    tree.insertElement(token);
    tree.openElements.removeLast();
    token.selfClosingAcknowledged = true;

    final stream = parser.tokenizer.stream;
    if (stream.charEncodingCertain) return;

    final attributes = token.data;
    final charset = attributes['charset'];
    final content = attributes['content'];
    if (charset != null) {
      stream.changeEncoding(charset);
    } else if (content != null) {
      final codec = ContentAttrParser(EncodingBytes(content)).parse();
      stream.changeEncoding(codec);
    }
  }

  /// Parses the element's content as RCDATA.
  void startTagTitle(StartTagToken token) {
    parser.parseRCDataRawtext(token, 'RCDATA');
  }

  /// Parses the element's content as RAWTEXT.
  void startTagNoScriptNoFramesStyle(StartTagToken token) {
    // Need to decide whether to implement the scripting-disabled case
    parser.parseRCDataRawtext(token, 'RAWTEXT');
  }

  /// Inserts the `script` element, puts the tokenizer into its script data
  /// state and switches to the text phase, remembering the current phase in
  /// `parser.originalPhase`.
  void startTagScript(StartTagToken token) {
    tree.insertElement(token);
    final tokenizer = parser.tokenizer;
    tokenizer.state = tokenizer.scriptDataState;
    parser.originalPhase = parser.phase;
    parser.phase = parser._textPhase;
  }

  /// Any other start tag closes the head; [token] is handed back for
  /// reprocessing.
  Token startTagOther(StartTagToken token) {
    anythingElse();
    return token;
  }

  /// Pops the current element (asserted to be `head`), records the span of
  /// [token] as its end span and switches to `_afterHeadPhase`.
  void endTagHead(EndTagToken token) {
    final head = parser.tree.openElements.removeLast();
    assert(head.localName == 'head');
    head.endSourceSpan = token.span;
    parser.phase = parser._afterHeadPhase;
  }

  /// These end tags close the head; [token] is handed back for reprocessing.
  Token endTagHtmlBodyBr(EndTagToken token) {
    anythingElse();
    return token;
  }

  /// Reports an `unexpected-end-tag` parse error for [token].
  void endTagOther(EndTagToken token) {
    parser.parseError(token.span, 'unexpected-end-tag', {'name': token.name});
  }

  /// Closes the head as if a `head` end tag had been seen.
  void anythingElse() => endTagHead(EndTagToken('head'));
}

// XXX If we implement a parser for which scripting is disabled we need to
// implement this phase.
//
// class InHeadNoScriptPhase extends Phase {

/// Phase entered once the `head` element has been closed (see
/// `InHeadPhase.endTagHead`).
class AfterHeadPhase extends Phase {
  AfterHeadPhase(super.parser);

  @override
  Token? processStartTag(StartTagToken token) {
    switch (token.name) {
      case 'html':
        return startTagHtml(token);
      case 'body':
        startTagBody(token);
        return null;
      case 'frameset':
        startTagFrameset(token);
        return null;
      case 'base' ||
            'basefont' ||
            'bgsound' ||
            'link' ||
            'meta' ||
            'noframes' ||
            'script' ||
            'style' ||
            'title':
        startTagFromHead(token);
        return null;
      case 'head':
        startTagHead(token);
        return null;
      default:
        return startTagOther(token);
    }
  }

  @override
  Token? processEndTag(EndTagToken token) {
    switch (token.name) {
      case 'body' || 'html' || 'br':
        return endTagHtmlBodyBr(token);
      default:
        endTagOther(token);
        return null;
    }
  }

  /// At end of input: inserts the implied `body` and asks for EOF processing
  /// to be run again.
  @override
  bool processEOF() {
    anythingElse();
    return true;
  }

  /// Text implies a `body`; [token] is handed back for reprocessing.
  @override
  Token processCharacters(CharactersToken token) {
    anythingElse();
    return token;
  }

  @override
  Token? 
startTagHtml(StartTagToken token) =>
      // An `html` start tag is left to the in-body phase.
      parser._inBodyPhase.processStartTag(token);

  /// Inserts the `body` element, clears `parser.framesetOK` and switches to
  /// the in-body phase.
  void startTagBody(StartTagToken token) {
    parser.framesetOK = false;
    tree.insertElement(token);
    parser.phase = parser._inBodyPhase;
  }

  /// Inserts the `frameset` element and switches to the in-frameset phase.
  void startTagFrameset(StartTagToken token) {
    tree.insertElement(token);
    parser.phase = parser._inFramesetPhase;
  }

  /// Handles a head-only start tag that shows up after the head was closed.
  ///
  /// Reports a parse error, temporarily puts the head element back on the
  /// stack of open elements so the in-head phase can process [token], then
  /// takes the innermost `head` element off the stack again.
  void startTagFromHead(StartTagToken token) {
    parser.parseError(token.span, 'unexpected-start-tag-out-of-my-head',
        {'name': token.name});
    tree.openElements.add(tree.headPointer as Element);
    parser._inHeadPhase.processStartTag(token);

    final open = tree.openElements;
    for (var i = open.length - 1; i >= 0; i--) {
      final candidate = open[i];
      if (candidate.localName == 'head') {
        open.remove(candidate);
        break;
      }
    }
  }

  /// Reports an `unexpected-start-tag` parse error for a stray `head` tag.
  void startTagHead(StartTagToken token) {
    parser.parseError(token.span, 'unexpected-start-tag', {'name': token.name});
  }

  /// Any other start tag implies a `body`; [token] is handed back for
  /// reprocessing.
  Token startTagOther(StartTagToken token) {
    anythingElse();
    return token;
  }

  /// These end tags imply a `body`; [token] is handed back for reprocessing.
  Token endTagHtmlBodyBr(EndTagToken token) {
    anythingElse();
    return token;
  }

  /// Reports an `unexpected-end-tag` parse error for [token].
  void endTagOther(EndTagToken token) {
    parser.parseError(token.span, 'unexpected-end-tag', {'name': token.name});
  }

  /// Inserts a `body` element from a synthetic start tag with an empty
  /// attribute map, switches to the in-body phase and sets
  /// `parser.framesetOK`.
  void anythingElse() {
    final impliedBody = StartTagToken('body', data: LinkedHashMap());
    tree.insertElement(impliedBody);
    parser.phase = parser._inBodyPhase;
    parser.framesetOK = true;
  }
}

typedef TokenProccessor = Token Function(Token token);

class InBodyPhase extends Phase {
  bool dropNewline = false;

  // http://www.whatwg.org/specs/web-apps/current-work///parsing-main-inbody
  // the really-really-really-very crazy mode
  InBodyPhase(super.parser);

  @override
  Token? 
processStartTag(StartTagToken token) { switch (token.name) { case 'html': return startTagHtml(token); case 'base': case 'basefont': case 'bgsound': case 'command': case 'link': case 'meta': case 'noframes': case 'script': case 'style': case 'title': return startTagProcessInHead(token); case 'body': startTagBody(token); return null; case 'frameset': startTagFrameset(token); return null; case 'address': case 'article': case 'aside': case 'blockquote': case 'center': case 'details': case 'dir': case 'div': case 'dl': case 'fieldset': case 'figcaption': case 'figure': case 'footer': case 'header': case 'hgroup': case 'menu': case 'nav': case 'ol': case 'p': case 'section': case 'summary': case 'ul': startTagCloseP(token); return null; // headingElements case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': startTagHeading(token); return null; case 'pre': case 'listing': startTagPreListing(token); return null; case 'form': startTagForm(token); return null; case 'li': case 'dd': case 'dt': startTagListItem(token); return null; case 'plaintext': startTagPlaintext(token); return null; case 'a': startTagA(token); return null; case 'b': case 'big': case 'code': case 'em': case 'font': case 'i': case 's': case 'small': case 'strike': case 'strong': case 'tt': case 'u': startTagFormatting(token); return null; case 'nobr': startTagNobr(token); return null; case 'button': return startTagButton(token); case 'applet': case 'marquee': case 'object': startTagAppletMarqueeObject(token); return null; case 'xmp': startTagXmp(token); return null; case 'table': startTagTable(token); return null; case 'area': case 'br': case 'embed': case 'img': case 'keygen': case 'wbr': startTagVoidFormatting(token); return null; case 'param': case 'source': case 'track': startTagParamSource(token); return null; case 'input': startTagInput(token); return null; case 'hr': startTagHr(token); return null; case 'image': startTagImage(token); return null; case 'isindex': startTagIsIndex(token); 
return null; case 'textarea': startTagTextarea(token); return null; case 'iframe': startTagIFrame(token); return null; case 'noembed': case 'noscript': startTagRawtext(token); return null; case 'select': startTagSelect(token); return null; case 'rp': case 'rt': startTagRpRt(token); return null; case 'option': case 'optgroup': startTagOpt(token); return null; case 'math': startTagMath(token); return null; case 'svg': startTagSvg(token); return null; case 'caption': case 'col': case 'colgroup': case 'frame': case 'head': case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr': startTagMisplaced(token); return null; default: return startTagOther(token); } } @override Token? processEndTag(EndTagToken token) { switch (token.name) { case 'body': endTagBody(token); return null; case 'html': return endTagHtml(token); case 'address': case 'article': case 'aside': case 'blockquote': case 'button': case 'center': case 'details': case 'dir': case 'div': case 'dl': case 'fieldset': case 'figcaption': case 'figure': case 'footer': case 'header': case 'hgroup': case 'listing': case 'menu': case 'nav': case 'ol': case 'pre': case 'section': case 'summary': case 'ul': endTagBlock(token); return null; case 'form': endTagForm(token); return null; case 'p': endTagP(token); return null; case 'dd': case 'dt': case 'li': endTagListItem(token); return null; // headingElements case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': endTagHeading(token); return null; case 'a': case 'b': case 'big': case 'code': case 'em': case 'font': case 'i': case 'nobr': case 's': case 'small': case 'strike': case 'strong': case 'tt': case 'u': endTagFormatting(token); return null; case 'applet': case 'marquee': case 'object': endTagAppletMarqueeObject(token); return null; case 'br': endTagBr(token); return null; default: endTagOther(token); return null; } } bool isMatchingFormattingElement(Element node1, Element node2) { if (node1.localName != node2.localName || 
node1.namespaceUri != node2.namespaceUri) { return false; } else if (node1.attributes.length != node2.attributes.length) { return false; } else { for (var key in node1.attributes.keys) { if (node1.attributes[key] != node2.attributes[key]) { return false; } } } return true; } // helper void addFormattingElement(StartTagToken token) { tree.insertElement(token); final element = tree.openElements.last; final matchingElements = []; for (Node? node in tree.activeFormattingElements.reversed) { if (node == null) { break; } else if (isMatchingFormattingElement(node as Element, element)) { matchingElements.add(node); } } assert(matchingElements.length <= 3); if (matchingElements.length == 3) { tree.activeFormattingElements.remove(matchingElements.last); } tree.activeFormattingElements.add(element); } // the real deal @override bool processEOF() { for (var node in tree.openElements.reversed) { switch (node.localName) { case 'dd': case 'dt': case 'li': case 'p': case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr': case 'body': case 'html': continue; } parser.parseError(node.sourceSpan, 'expected-closing-tag-but-got-eof'); break; } //Stop parsing return false; } void processSpaceCharactersDropNewline(StringToken token) { // Sometimes (start of
, , and