2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "HTMLTokenizer.h"
31 #include "HTMLEntityParser.h"
32 #include "HTMLToken.h"
33 #include "HTMLTreeBuilder.h"
34 #include "HTMLNames.h"
35 #include "MarkupTokenizerInlineMethods.h"
36 #include "NotImplemented.h"
37 #include <wtf/ASCIICType.h>
38 #include <wtf/CurrentTime.h>
39 #include <wtf/UnusedParam.h>
40 #include <wtf/text/AtomicString.h>
41 #include <wtf/text/CString.h>
42 #include <wtf/unicode/Unicode.h>
48 using namespace HTMLNames;
50 // This has to go in a .cpp file, as the linker doesn't like it being included more than once.
51 // We don't have an HTMLToken.cpp though, so this is the next best place.
// Builds the QualifiedName for one attribute of an HTML token. The middle
// argument is an AtomicString constructed from the attribute's m_name buffer
// (data pointer plus size); nullAtom is passed as the first and third
// arguments.
53 QualifiedName AtomicMarkupTokenBase<HTMLToken>::nameForAttribute(const AttributeBase& attribute) const
55 return QualifiedName(nullAtom, AtomicString(attribute.m_name.data(), attribute.m_name.size()), nullAtom);
// Returns true when m_type is StartTag, EndTag or DOCTYPE, and false for
// every other token type.
59 bool AtomicMarkupTokenBase<HTMLToken>::usesName() const
61 return m_type == HTMLTokenTypes::StartTag || m_type == HTMLTokenTypes::EndTag || m_type == HTMLTokenTypes::DOCTYPE;
// Returns true when m_type is StartTag or EndTag, and false for every other
// token type.
65 bool AtomicMarkupTokenBase<HTMLToken>::usesAttributes() const
67 return m_type == HTMLTokenTypes::StartTag || m_type == HTMLTokenTypes::EndTag;
// Maps an ASCII uppercase code unit to its lowercase counterpart by adding
// 0x20. The caller must pass an ASCII uppercase character; this is only
// checked by the ASSERT, so other inputs are simply offset by 0x20 as well.
72 inline UChar toLowerCase(UChar cc)
74 ASSERT(isASCIIUpper(cc));
// 0x20 is the distance between 'A' (0x41) and 'a' (0x61).
75 const int lowerCaseOffset = 0x20;
76 return cc + lowerCaseOffset;
// Compares the contents of a UChar vector with a String.
// The element counts are compared first; the statement executed on a
// mismatch is not visible in this excerpt (presumably an early
// "return false" -- confirm). Otherwise the two UChar buffers are compared
// bytewise with memcmp over vector.size() * sizeof(UChar) bytes, and the
// function returns true when memcmp reports no difference.
79 inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
81 if (vector.size() != string.length())
83 const UChar* stringData = string.characters();
84 const UChar* vectorData = vector.data();
85 // FIXME: Is there a higher-level function we should be calling here?
86 return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar));
// Classifies a tokenizer state. The cases listed below are the
// "EndTagOpen" and "EndTagName" states of the RCDATA, RAWTEXT, ScriptData
// and ScriptDataEscaped families.
// NOTE(review): the switch statement and the return statements are not
// visible in this excerpt; presumably these eight cases return true and
// every other state returns false -- confirm against the full file.
89 inline bool isEndTagBufferingState(HTMLTokenizerState::State state)
92 case HTMLTokenizerState::RCDATAEndTagOpenState:
93 case HTMLTokenizerState::RCDATAEndTagNameState:
94 case HTMLTokenizerState::RAWTEXTEndTagOpenState:
95 case HTMLTokenizerState::RAWTEXTEndTagNameState:
96 case HTMLTokenizerState::ScriptDataEndTagOpenState:
97 case HTMLTokenizerState::ScriptDataEndTagNameState:
98 case HTMLTokenizerState::ScriptDataEscapedEndTagOpenState:
99 case HTMLTokenizerState::ScriptDataEscapedEndTagNameState:
// Convenience wrappers around the BEGIN_STATE / RECONSUME_IN / ADVANCE_TO /
// SWITCH_TO macros (defined outside this excerpt) that always pass
// HTMLTokenizerState as the first argument, so the state machine below only
// has to spell out the state name.
108 #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizerState, stateName)
109 #define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizerState, stateName)
110 #define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizerState, stateName)
111 #define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizerState, stateName)
// Constructs a tokenizer and stores usePreHTML5ParserQuirks in
// m_usePreHTML5ParserQuirks. Several states of nextToken() test that flag
// together with cc == '<' before emitting the current token and
// reconsuming in the data state.
// NOTE(review): the constructor body is not visible in this excerpt.
113 HTMLTokenizer::HTMLTokenizer(bool usePreHTML5ParserQuirks)
114 : m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks)
// Destructor. NOTE(review): the body is not visible in this excerpt.
119 HTMLTokenizer::~HTMLTokenizer()
// Returns true only when m_forceNullCharacterReplacement is false AND the
// tokenizer is currently in one of the DataState, RCDATAState, RAWTEXTState
// or PLAINTEXTState states; returns false in every other case.
124 inline bool MarkupTokenizerBase<HTMLToken, HTMLTokenizerState>::shouldSkipNullCharacters() const
126 return !m_forceNullCharacterReplacement
127 && (m_state == HTMLTokenizerState::DataState
128 || m_state == HTMLTokenizerState::RCDATAState
129 || m_state == HTMLTokenizerState::RAWTEXTState
130 || m_state == HTMLTokenizerState::PLAINTEXTState);
// Puts the tokenizer back into its starting configuration: the state becomes
// DataState, the three boolean flags below are cleared, and
// m_additionalAllowedCharacter is set to '\0'.
// NOTE(review): two statements between the m_state assignment and the flag
// resets are not visible in this excerpt.
134 void HTMLTokenizer::reset()
136 m_state = HTMLTokenizerState::DataState;
139 m_skipLeadingNewLineForListing = false;
140 m_forceNullCharacterReplacement = false;
141 m_shouldAllowCDATA = false;
142 m_additionalAllowedCharacter = '\0';
// Consumes a character reference from `source` via consumeHTMLEntity() and
// forwards the result to bufferCharacter().
//
// The nextToken() callers in this file treat a false return as "stop now and
// return haveBufferedCharacterToken()" and a true return as "continue in the
// previous data state".
// NOTE(review): the branch bodies and return statements are not visible in
// this excerpt. Presumably the function returns false when
// notEnoughCharacters is set, buffers a literal '&' when consumeHTMLEntity()
// reports failure (decodedEntity is asserted empty on that path), and
// otherwise buffers each decoded code unit -- confirm against the full file.
145 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
147 bool notEnoughCharacters = false;
148 StringBuilder decodedEntity;
149 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
150 if (notEnoughCharacters)
153 ASSERT(decodedEntity.isEmpty());
154 bufferCharacter('&');
// Hand every decoded code unit to bufferCharacter(), in order.
156 for (unsigned i = 0; i < decodedEntity.length(); ++i)
157 bufferCharacter(decodedEntity[i]);
// Turns the end-tag name accumulated in m_bufferedEndTagName into the start
// of an end-tag token.
//
// Preconditions (asserted): the current token is either a Character token or
// still Uninitialized. The function first advances `source` by one character
// (passing m_lineNumber), then checks whether the token is a Character
// token; otherwise it begins an end tag from m_bufferedEndTagName and clears
// that buffer.
// NOTE(review): the statement guarded by the Character-type check and the
// final return are not visible in this excerpt. Presumably the function
// returns true when pending character data must be emitted first and false
// otherwise -- confirm against the full file.
162 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
164 ASSERT(m_token->type() == HTMLTokenTypes::Character || m_token->type() == HTMLTokenTypes::Uninitialized);
165 source.advance(m_lineNumber);
166 if (m_token->type() == HTMLTokenTypes::Character)
168 m_token->beginEndTag(m_bufferedEndTagName);
169 m_bufferedEndTagName.clear();
// Used inside nextToken() by the *EndTagNameState handlers once
// isAppropriateEndTag() has succeeded. The macro records the new state in
// m_state, calls flushBufferedEndTag(source), and then -- if the source is
// empty or the input stream preprocessor cannot peek a character -- returns
// haveBufferedCharacterToken() from the enclosing function; otherwise it
// loads the next input character into `cc`.
// NOTE(review): some lines of the macro (the statement taken when
// flushBufferedEndTag() returns true, and the final transfer of control to
// the new state) are not visible in this excerpt.
173 #define FLUSH_AND_ADVANCE_TO(stateName) \
175 m_state = HTMLTokenizerState::stateName; \
176 if (flushBufferedEndTag(source)) \
178 if (source.isEmpty() \
179 || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \
180 return haveBufferedCharacterToken(); \
181 cc = m_inputStreamPreprocessor.nextInputCharacter(); \
// Called from the *EndTagNameState handlers of nextToken() when a '>' closes
// an appropriate end tag; every visible call passes
// HTMLTokenizerState::DataState as `state`. The visible part of the body
// calls flushBufferedEndTag(source) and ignores its result.
// NOTE(review): the remaining statements (presumably storing `state` in
// m_state and returning true so the token is emitted) are not visible in
// this excerpt -- confirm against the full file.
185 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizerState::State state)
188 flushBufferedEndTag(source);
192 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
194 // If we have a token in progress, then we're supposed to be called back
195 // with the same token so we can finish it.
196 ASSERT(!m_token || m_token == &token || token.type() == HTMLTokenTypes::Uninitialized);
199 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
200 // FIXME: This should call flushBufferedEndTag().
201 // We started an end tag during our last iteration.
202 m_token->beginEndTag(m_bufferedEndTagName);
203 m_bufferedEndTagName.clear();
204 if (m_state == HTMLTokenizerState::DataState) {
205 // We're back in the data state, so we must be done with the tag.
210 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber))
211 return haveBufferedCharacterToken();
212 UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
214 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
215 // Note that this logic is different than the generic \r\n collapsing
216 // handled in the input stream preprocessor. This logic is here as an
217 // "authoring convenience" so folks can write:
224 // without getting an extra newline at the start of their <pre> element.
225 if (m_skipLeadingNewLineForListing) {
226 m_skipLeadingNewLineForListing = false;
228 if (m_state == HTMLTokenizerState::DataState)
229 HTML_ADVANCE_TO(DataState);
230 if (m_state == HTMLTokenizerState::RCDATAState)
231 HTML_ADVANCE_TO(RCDATAState);
232 // When parsing text/plain documents, we run the tokenizer in the
233 // PLAINTEXTState and ignore m_skipLeadingNewLineForListing.
234 ASSERT(m_state == HTMLTokenizerState::PLAINTEXTState);
238 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
240 HTML_BEGIN_STATE(DataState) {
242 HTML_ADVANCE_TO(CharacterReferenceInDataState);
243 else if (cc == '<') {
244 if (m_token->type() == HTMLTokenTypes::Character) {
245 // We have a bunch of character tokens queued up that we
246 // are emitting lazily here.
249 HTML_ADVANCE_TO(TagOpenState);
250 } else if (cc == InputStreamPreprocessor::endOfFileMarker)
251 return emitEndOfFile(source);
254 HTML_ADVANCE_TO(DataState);
259 HTML_BEGIN_STATE(CharacterReferenceInDataState) {
260 if (!processEntity(source))
261 return haveBufferedCharacterToken();
262 HTML_SWITCH_TO(DataState);
266 HTML_BEGIN_STATE(RCDATAState) {
268 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState);
270 HTML_ADVANCE_TO(RCDATALessThanSignState);
271 else if (cc == InputStreamPreprocessor::endOfFileMarker)
272 return emitEndOfFile(source);
275 HTML_ADVANCE_TO(RCDATAState);
280 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) {
281 if (!processEntity(source))
282 return haveBufferedCharacterToken();
283 HTML_SWITCH_TO(RCDATAState);
287 HTML_BEGIN_STATE(RAWTEXTState) {
289 HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
290 else if (cc == InputStreamPreprocessor::endOfFileMarker)
291 return emitEndOfFile(source);
294 HTML_ADVANCE_TO(RAWTEXTState);
299 HTML_BEGIN_STATE(ScriptDataState) {
301 HTML_ADVANCE_TO(ScriptDataLessThanSignState);
302 else if (cc == InputStreamPreprocessor::endOfFileMarker)
303 return emitEndOfFile(source);
306 HTML_ADVANCE_TO(ScriptDataState);
311 HTML_BEGIN_STATE(PLAINTEXTState) {
312 if (cc == InputStreamPreprocessor::endOfFileMarker)
313 return emitEndOfFile(source);
316 HTML_ADVANCE_TO(PLAINTEXTState);
320 HTML_BEGIN_STATE(TagOpenState) {
322 HTML_ADVANCE_TO(MarkupDeclarationOpenState);
324 HTML_ADVANCE_TO(EndTagOpenState);
325 else if (isASCIIUpper(cc)) {
326 m_token->beginStartTag(toLowerCase(cc));
327 HTML_ADVANCE_TO(TagNameState);
328 } else if (isASCIILower(cc)) {
329 m_token->beginStartTag(cc);
330 HTML_ADVANCE_TO(TagNameState);
331 } else if (cc == '?') {
333 // The spec consumes the current character before switching
334 // to the bogus comment state, but it's easier to implement
335 // if we reconsume the current character.
336 HTML_RECONSUME_IN(BogusCommentState);
339 bufferCharacter('<');
340 HTML_RECONSUME_IN(DataState);
345 HTML_BEGIN_STATE(EndTagOpenState) {
346 if (isASCIIUpper(cc)) {
347 m_token->beginEndTag(toLowerCase(cc));
348 HTML_ADVANCE_TO(TagNameState);
349 } else if (isASCIILower(cc)) {
350 m_token->beginEndTag(cc);
351 HTML_ADVANCE_TO(TagNameState);
352 } else if (cc == '>') {
354 HTML_ADVANCE_TO(DataState);
355 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
357 bufferCharacter('<');
358 bufferCharacter('/');
359 HTML_RECONSUME_IN(DataState);
362 HTML_RECONSUME_IN(BogusCommentState);
367 HTML_BEGIN_STATE(TagNameState) {
368 if (isTokenizerWhitespace(cc))
369 HTML_ADVANCE_TO(BeforeAttributeNameState);
371 HTML_ADVANCE_TO(SelfClosingStartTagState);
373 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
374 else if (m_usePreHTML5ParserQuirks && cc == '<')
375 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
376 else if (isASCIIUpper(cc)) {
377 m_token->appendToName(toLowerCase(cc));
378 HTML_ADVANCE_TO(TagNameState);
379 } if (cc == InputStreamPreprocessor::endOfFileMarker) {
381 HTML_RECONSUME_IN(DataState);
383 m_token->appendToName(cc);
384 HTML_ADVANCE_TO(TagNameState);
389 HTML_BEGIN_STATE(RCDATALessThanSignState) {
391 m_temporaryBuffer.clear();
392 ASSERT(m_bufferedEndTagName.isEmpty());
393 HTML_ADVANCE_TO(RCDATAEndTagOpenState);
395 bufferCharacter('<');
396 HTML_RECONSUME_IN(RCDATAState);
401 HTML_BEGIN_STATE(RCDATAEndTagOpenState) {
402 if (isASCIIUpper(cc)) {
403 m_temporaryBuffer.append(cc);
404 addToPossibleEndTag(toLowerCase(cc));
405 HTML_ADVANCE_TO(RCDATAEndTagNameState);
406 } else if (isASCIILower(cc)) {
407 m_temporaryBuffer.append(cc);
408 addToPossibleEndTag(cc);
409 HTML_ADVANCE_TO(RCDATAEndTagNameState);
411 bufferCharacter('<');
412 bufferCharacter('/');
413 HTML_RECONSUME_IN(RCDATAState);
418 HTML_BEGIN_STATE(RCDATAEndTagNameState) {
419 if (isASCIIUpper(cc)) {
420 m_temporaryBuffer.append(cc);
421 addToPossibleEndTag(toLowerCase(cc));
422 HTML_ADVANCE_TO(RCDATAEndTagNameState);
423 } else if (isASCIILower(cc)) {
424 m_temporaryBuffer.append(cc);
425 addToPossibleEndTag(cc);
426 HTML_ADVANCE_TO(RCDATAEndTagNameState);
428 if (isTokenizerWhitespace(cc)) {
429 if (isAppropriateEndTag())
430 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
431 } else if (cc == '/') {
432 if (isAppropriateEndTag())
433 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
434 } else if (cc == '>') {
435 if (isAppropriateEndTag())
436 return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
438 bufferCharacter('<');
439 bufferCharacter('/');
440 m_token->appendToCharacter(m_temporaryBuffer);
441 m_bufferedEndTagName.clear();
442 HTML_RECONSUME_IN(RCDATAState);
447 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
449 m_temporaryBuffer.clear();
450 ASSERT(m_bufferedEndTagName.isEmpty());
451 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
453 bufferCharacter('<');
454 HTML_RECONSUME_IN(RAWTEXTState);
459 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
460 if (isASCIIUpper(cc)) {
461 m_temporaryBuffer.append(cc);
462 addToPossibleEndTag(toLowerCase(cc));
463 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
464 } else if (isASCIILower(cc)) {
465 m_temporaryBuffer.append(cc);
466 addToPossibleEndTag(cc);
467 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
469 bufferCharacter('<');
470 bufferCharacter('/');
471 HTML_RECONSUME_IN(RAWTEXTState);
476 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
477 if (isASCIIUpper(cc)) {
478 m_temporaryBuffer.append(cc);
479 addToPossibleEndTag(toLowerCase(cc));
480 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
481 } else if (isASCIILower(cc)) {
482 m_temporaryBuffer.append(cc);
483 addToPossibleEndTag(cc);
484 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
486 if (isTokenizerWhitespace(cc)) {
487 if (isAppropriateEndTag())
488 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
489 } else if (cc == '/') {
490 if (isAppropriateEndTag())
491 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
492 } else if (cc == '>') {
493 if (isAppropriateEndTag())
494 return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
496 bufferCharacter('<');
497 bufferCharacter('/');
498 m_token->appendToCharacter(m_temporaryBuffer);
499 m_bufferedEndTagName.clear();
500 HTML_RECONSUME_IN(RAWTEXTState);
505 HTML_BEGIN_STATE(ScriptDataLessThanSignState) {
507 m_temporaryBuffer.clear();
508 ASSERT(m_bufferedEndTagName.isEmpty());
509 HTML_ADVANCE_TO(ScriptDataEndTagOpenState);
510 } else if (cc == '!') {
511 bufferCharacter('<');
512 bufferCharacter('!');
513 HTML_ADVANCE_TO(ScriptDataEscapeStartState);
515 bufferCharacter('<');
516 HTML_RECONSUME_IN(ScriptDataState);
521 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) {
522 if (isASCIIUpper(cc)) {
523 m_temporaryBuffer.append(cc);
524 addToPossibleEndTag(toLowerCase(cc));
525 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
526 } else if (isASCIILower(cc)) {
527 m_temporaryBuffer.append(cc);
528 addToPossibleEndTag(cc);
529 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
531 bufferCharacter('<');
532 bufferCharacter('/');
533 HTML_RECONSUME_IN(ScriptDataState);
538 HTML_BEGIN_STATE(ScriptDataEndTagNameState) {
539 if (isASCIIUpper(cc)) {
540 m_temporaryBuffer.append(cc);
541 addToPossibleEndTag(toLowerCase(cc));
542 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
543 } else if (isASCIILower(cc)) {
544 m_temporaryBuffer.append(cc);
545 addToPossibleEndTag(cc);
546 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
548 if (isTokenizerWhitespace(cc)) {
549 if (isAppropriateEndTag())
550 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
551 } else if (cc == '/') {
552 if (isAppropriateEndTag())
553 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
554 } else if (cc == '>') {
555 if (isAppropriateEndTag())
556 return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
558 bufferCharacter('<');
559 bufferCharacter('/');
560 m_token->appendToCharacter(m_temporaryBuffer);
561 m_bufferedEndTagName.clear();
562 HTML_RECONSUME_IN(ScriptDataState);
567 HTML_BEGIN_STATE(ScriptDataEscapeStartState) {
570 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState);
572 HTML_RECONSUME_IN(ScriptDataState);
576 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) {
579 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
581 HTML_RECONSUME_IN(ScriptDataState);
585 HTML_BEGIN_STATE(ScriptDataEscapedState) {
588 HTML_ADVANCE_TO(ScriptDataEscapedDashState);
589 } else if (cc == '<')
590 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
591 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
593 HTML_RECONSUME_IN(DataState);
596 HTML_ADVANCE_TO(ScriptDataEscapedState);
601 HTML_BEGIN_STATE(ScriptDataEscapedDashState) {
604 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
605 } else if (cc == '<')
606 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
607 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
609 HTML_RECONSUME_IN(DataState);
612 HTML_ADVANCE_TO(ScriptDataEscapedState);
617 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) {
620 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
621 } else if (cc == '<')
622 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
623 else if (cc == '>') {
625 HTML_ADVANCE_TO(ScriptDataState);
626 } if (cc == InputStreamPreprocessor::endOfFileMarker) {
628 HTML_RECONSUME_IN(DataState);
631 HTML_ADVANCE_TO(ScriptDataEscapedState);
636 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
638 m_temporaryBuffer.clear();
639 ASSERT(m_bufferedEndTagName.isEmpty());
640 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
641 } else if (isASCIIUpper(cc)) {
642 bufferCharacter('<');
644 m_temporaryBuffer.clear();
645 m_temporaryBuffer.append(toLowerCase(cc));
646 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
647 } else if (isASCIILower(cc)) {
648 bufferCharacter('<');
650 m_temporaryBuffer.clear();
651 m_temporaryBuffer.append(cc);
652 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
654 bufferCharacter('<');
655 HTML_RECONSUME_IN(ScriptDataEscapedState);
660 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
661 if (isASCIIUpper(cc)) {
662 m_temporaryBuffer.append(cc);
663 addToPossibleEndTag(toLowerCase(cc));
664 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
665 } else if (isASCIILower(cc)) {
666 m_temporaryBuffer.append(cc);
667 addToPossibleEndTag(cc);
668 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
670 bufferCharacter('<');
671 bufferCharacter('/');
672 HTML_RECONSUME_IN(ScriptDataEscapedState);
677 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
678 if (isASCIIUpper(cc)) {
679 m_temporaryBuffer.append(cc);
680 addToPossibleEndTag(toLowerCase(cc));
681 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
682 } else if (isASCIILower(cc)) {
683 m_temporaryBuffer.append(cc);
684 addToPossibleEndTag(cc);
685 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
687 if (isTokenizerWhitespace(cc)) {
688 if (isAppropriateEndTag())
689 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
690 } else if (cc == '/') {
691 if (isAppropriateEndTag())
692 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
693 } else if (cc == '>') {
694 if (isAppropriateEndTag())
695 return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
697 bufferCharacter('<');
698 bufferCharacter('/');
699 m_token->appendToCharacter(m_temporaryBuffer);
700 m_bufferedEndTagName.clear();
701 HTML_RECONSUME_IN(ScriptDataEscapedState);
706 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
707 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
709 if (temporaryBufferIs(scriptTag.localName()))
710 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
712 HTML_ADVANCE_TO(ScriptDataEscapedState);
713 } else if (isASCIIUpper(cc)) {
715 m_temporaryBuffer.append(toLowerCase(cc));
716 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
717 } else if (isASCIILower(cc)) {
719 m_temporaryBuffer.append(cc);
720 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
722 HTML_RECONSUME_IN(ScriptDataEscapedState);
726 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) {
729 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState);
730 } else if (cc == '<') {
732 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
733 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
735 HTML_RECONSUME_IN(DataState);
738 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
743 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
746 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
747 } else if (cc == '<') {
749 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
750 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
752 HTML_RECONSUME_IN(DataState);
755 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
760 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
763 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
764 } else if (cc == '<') {
766 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
767 } else if (cc == '>') {
769 HTML_ADVANCE_TO(ScriptDataState);
770 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
772 HTML_RECONSUME_IN(DataState);
775 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
780 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
783 m_temporaryBuffer.clear();
784 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
786 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
790 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
791 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
793 if (temporaryBufferIs(scriptTag.localName()))
794 HTML_ADVANCE_TO(ScriptDataEscapedState);
796 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
797 } else if (isASCIIUpper(cc)) {
799 m_temporaryBuffer.append(toLowerCase(cc));
800 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
801 } else if (isASCIILower(cc)) {
803 m_temporaryBuffer.append(cc);
804 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
806 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
810 HTML_BEGIN_STATE(BeforeAttributeNameState) {
811 if (isTokenizerWhitespace(cc))
812 HTML_ADVANCE_TO(BeforeAttributeNameState);
814 HTML_ADVANCE_TO(SelfClosingStartTagState);
816 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
817 else if (m_usePreHTML5ParserQuirks && cc == '<')
818 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
819 else if (isASCIIUpper(cc)) {
820 m_token->addNewAttribute();
821 m_token->beginAttributeName(source.numberOfCharactersConsumed());
822 m_token->appendToAttributeName(toLowerCase(cc));
823 HTML_ADVANCE_TO(AttributeNameState);
824 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
826 HTML_RECONSUME_IN(DataState);
828 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
830 m_token->addNewAttribute();
831 m_token->beginAttributeName(source.numberOfCharactersConsumed());
832 m_token->appendToAttributeName(cc);
833 HTML_ADVANCE_TO(AttributeNameState);
838 HTML_BEGIN_STATE(AttributeNameState) {
839 if (isTokenizerWhitespace(cc)) {
840 m_token->endAttributeName(source.numberOfCharactersConsumed());
841 HTML_ADVANCE_TO(AfterAttributeNameState);
842 } else if (cc == '/') {
843 m_token->endAttributeName(source.numberOfCharactersConsumed());
844 HTML_ADVANCE_TO(SelfClosingStartTagState);
845 } else if (cc == '=') {
846 m_token->endAttributeName(source.numberOfCharactersConsumed());
847 HTML_ADVANCE_TO(BeforeAttributeValueState);
848 } else if (cc == '>') {
849 m_token->endAttributeName(source.numberOfCharactersConsumed());
850 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
851 } else if (m_usePreHTML5ParserQuirks && cc == '<') {
852 m_token->endAttributeName(source.numberOfCharactersConsumed());
853 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
854 } else if (isASCIIUpper(cc)) {
855 m_token->appendToAttributeName(toLowerCase(cc));
856 HTML_ADVANCE_TO(AttributeNameState);
857 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
859 m_token->endAttributeName(source.numberOfCharactersConsumed());
860 HTML_RECONSUME_IN(DataState);
862 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
864 m_token->appendToAttributeName(cc);
865 HTML_ADVANCE_TO(AttributeNameState);
870 HTML_BEGIN_STATE(AfterAttributeNameState) {
871 if (isTokenizerWhitespace(cc))
872 HTML_ADVANCE_TO(AfterAttributeNameState);
874 HTML_ADVANCE_TO(SelfClosingStartTagState);
876 HTML_ADVANCE_TO(BeforeAttributeValueState);
878 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
879 else if (m_usePreHTML5ParserQuirks && cc == '<')
880 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
881 else if (isASCIIUpper(cc)) {
882 m_token->addNewAttribute();
883 m_token->beginAttributeName(source.numberOfCharactersConsumed());
884 m_token->appendToAttributeName(toLowerCase(cc));
885 HTML_ADVANCE_TO(AttributeNameState);
886 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
888 HTML_RECONSUME_IN(DataState);
890 if (cc == '"' || cc == '\'' || cc == '<')
892 m_token->addNewAttribute();
893 m_token->beginAttributeName(source.numberOfCharactersConsumed());
894 m_token->appendToAttributeName(cc);
895 HTML_ADVANCE_TO(AttributeNameState);
900 HTML_BEGIN_STATE(BeforeAttributeValueState) {
901 if (isTokenizerWhitespace(cc))
902 HTML_ADVANCE_TO(BeforeAttributeValueState);
903 else if (cc == '"') {
904 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
905 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
906 } else if (cc == '&') {
907 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
908 HTML_RECONSUME_IN(AttributeValueUnquotedState);
909 } else if (cc == '\'') {
910 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
911 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
912 } else if (cc == '>') {
914 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
915 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
917 HTML_RECONSUME_IN(DataState);
919 if (cc == '<' || cc == '=' || cc == '`')
921 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
922 m_token->appendToAttributeValue(cc);
923 HTML_ADVANCE_TO(AttributeValueUnquotedState);
928 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
930 m_token->endAttributeValue(source.numberOfCharactersConsumed());
931 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
932 } else if (cc == '&') {
933 m_additionalAllowedCharacter = '"';
934 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
935 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
937 m_token->endAttributeValue(source.numberOfCharactersConsumed());
938 HTML_RECONSUME_IN(DataState);
940 m_token->appendToAttributeValue(cc);
941 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
946 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
948 m_token->endAttributeValue(source.numberOfCharactersConsumed());
949 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
950 } else if (cc == '&') {
951 m_additionalAllowedCharacter = '\'';
952 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
953 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
955 m_token->endAttributeValue(source.numberOfCharactersConsumed());
956 HTML_RECONSUME_IN(DataState);
958 m_token->appendToAttributeValue(cc);
959 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
964 HTML_BEGIN_STATE(AttributeValueUnquotedState) {
965 if (isTokenizerWhitespace(cc)) {
966 m_token->endAttributeValue(source.numberOfCharactersConsumed());
967 HTML_ADVANCE_TO(BeforeAttributeNameState);
968 } else if (cc == '&') {
969 m_additionalAllowedCharacter = '>';
970 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
971 } else if (cc == '>') {
972 m_token->endAttributeValue(source.numberOfCharactersConsumed());
973 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
974 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
976 m_token->endAttributeValue(source.numberOfCharactersConsumed());
977 HTML_RECONSUME_IN(DataState);
979 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
981 m_token->appendToAttributeValue(cc);
982 HTML_ADVANCE_TO(AttributeValueUnquotedState);
987 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
988 bool notEnoughCharacters = false;
989 StringBuilder decodedEntity;
990 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
991 if (notEnoughCharacters)
992 return haveBufferedCharacterToken();
994 ASSERT(decodedEntity.isEmpty());
995 m_token->appendToAttributeValue('&');
997 for (unsigned i = 0; i < decodedEntity.length(); ++i)
998 m_token->appendToAttributeValue(decodedEntity[i]);
1000 // We're supposed to switch back to the attribute value state that
1001 // we were in when we were switched into this state. Rather than
1002 // keeping track of this explicitly, we observe that the previous
1003 // state can be determined by m_additionalAllowedCharacter.
1004 if (m_additionalAllowedCharacter == '"')
1005 HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
1006 else if (m_additionalAllowedCharacter == '\'')
1007 HTML_SWITCH_TO(AttributeValueSingleQuotedState);
1008 else if (m_additionalAllowedCharacter == '>')
1009 HTML_SWITCH_TO(AttributeValueUnquotedState);
1011 ASSERT_NOT_REACHED();
1015 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
1016 if (isTokenizerWhitespace(cc))
1017 HTML_ADVANCE_TO(BeforeAttributeNameState);
1019 HTML_ADVANCE_TO(SelfClosingStartTagState);
1021 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1022 else if (m_usePreHTML5ParserQuirks && cc == '<')
1023 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1024 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1026 HTML_RECONSUME_IN(DataState);
1029 HTML_RECONSUME_IN(BeforeAttributeNameState);
1034 HTML_BEGIN_STATE(SelfClosingStartTagState) {
1036 m_token->setSelfClosing();
1037 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1038 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1040 HTML_RECONSUME_IN(DataState);
1043 HTML_RECONSUME_IN(BeforeAttributeNameState);
1048 HTML_BEGIN_STATE(BogusCommentState) {
1049 m_token->beginComment();
1050 HTML_RECONSUME_IN(ContinueBogusCommentState);
1054 HTML_BEGIN_STATE(ContinueBogusCommentState) {
1056 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1057 else if (cc == InputStreamPreprocessor::endOfFileMarker)
1058 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1060 m_token->appendToComment(cc);
1061 HTML_ADVANCE_TO(ContinueBogusCommentState);
1066 HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
1067 DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
1068 DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype"));
1069 DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA["));
1071 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1072 if (result == SegmentedString::DidMatch) {
1073 source.advanceAndASSERT('-');
1074 source.advanceAndASSERT('-');
1075 m_token->beginComment();
1076 HTML_SWITCH_TO(CommentStartState);
1077 } else if (result == SegmentedString::NotEnoughCharacters)
1078 return haveBufferedCharacterToken();
1079 } else if (cc == 'D' || cc == 'd') {
1080 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1081 if (result == SegmentedString::DidMatch) {
1082 advanceStringAndASSERTIgnoringCase(source, "doctype");
1083 HTML_SWITCH_TO(DOCTYPEState);
1084 } else if (result == SegmentedString::NotEnoughCharacters)
1085 return haveBufferedCharacterToken();
1086 } else if (cc == '[' && shouldAllowCDATA()) {
1087 SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
1088 if (result == SegmentedString::DidMatch) {
1089 advanceStringAndASSERT(source, "[CDATA[");
1090 HTML_SWITCH_TO(CDATASectionState);
1091 } else if (result == SegmentedString::NotEnoughCharacters)
1092 return haveBufferedCharacterToken();
1095 HTML_RECONSUME_IN(BogusCommentState);
1099 HTML_BEGIN_STATE(CommentStartState) {
1101 HTML_ADVANCE_TO(CommentStartDashState);
1102 else if (cc == '>') {
1104 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1105 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1107 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1109 m_token->appendToComment(cc);
1110 HTML_ADVANCE_TO(CommentState);
1115 HTML_BEGIN_STATE(CommentStartDashState) {
1117 HTML_ADVANCE_TO(CommentEndState);
1118 else if (cc == '>') {
1120 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1121 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1123 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1125 m_token->appendToComment('-');
1126 m_token->appendToComment(cc);
1127 HTML_ADVANCE_TO(CommentState);
1132 HTML_BEGIN_STATE(CommentState) {
1134 HTML_ADVANCE_TO(CommentEndDashState);
1135 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1137 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1139 m_token->appendToComment(cc);
1140 HTML_ADVANCE_TO(CommentState);
1145 HTML_BEGIN_STATE(CommentEndDashState) {
1147 HTML_ADVANCE_TO(CommentEndState);
1148 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1150 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1152 m_token->appendToComment('-');
1153 m_token->appendToComment(cc);
1154 HTML_ADVANCE_TO(CommentState);
1159 HTML_BEGIN_STATE(CommentEndState) {
1161 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1162 else if (cc == '!') {
1164 HTML_ADVANCE_TO(CommentEndBangState);
1165 } else if (cc == '-') {
1167 m_token->appendToComment('-');
1168 HTML_ADVANCE_TO(CommentEndState);
1169 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1171 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1174 m_token->appendToComment('-');
1175 m_token->appendToComment('-');
1176 m_token->appendToComment(cc);
1177 HTML_ADVANCE_TO(CommentState);
1182 HTML_BEGIN_STATE(CommentEndBangState) {
1184 m_token->appendToComment('-');
1185 m_token->appendToComment('-');
1186 m_token->appendToComment('!');
1187 HTML_ADVANCE_TO(CommentEndDashState);
1188 } else if (cc == '>')
1189 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1190 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1192 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1194 m_token->appendToComment('-');
1195 m_token->appendToComment('-');
1196 m_token->appendToComment('!');
1197 m_token->appendToComment(cc);
1198 HTML_ADVANCE_TO(CommentState);
1203 HTML_BEGIN_STATE(DOCTYPEState) {
1204 if (isTokenizerWhitespace(cc))
1205 HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1206 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1208 m_token->beginDOCTYPE();
1209 m_token->setForceQuirks();
1210 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1213 HTML_RECONSUME_IN(BeforeDOCTYPENameState);
1218 HTML_BEGIN_STATE(BeforeDOCTYPENameState) {
1219 if (isTokenizerWhitespace(cc))
1220 HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1221 else if (isASCIIUpper(cc)) {
1222 m_token->beginDOCTYPE(toLowerCase(cc));
1223 HTML_ADVANCE_TO(DOCTYPENameState);
1224 } else if (cc == '>') {
1226 m_token->beginDOCTYPE();
1227 m_token->setForceQuirks();
1228 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1229 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1231 m_token->beginDOCTYPE();
1232 m_token->setForceQuirks();
1233 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1235 m_token->beginDOCTYPE(cc);
1236 HTML_ADVANCE_TO(DOCTYPENameState);
1241 HTML_BEGIN_STATE(DOCTYPENameState) {
1242 if (isTokenizerWhitespace(cc))
1243 HTML_ADVANCE_TO(AfterDOCTYPENameState);
1245 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1246 else if (isASCIIUpper(cc)) {
1247 m_token->appendToName(toLowerCase(cc));
1248 HTML_ADVANCE_TO(DOCTYPENameState);
1249 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1251 m_token->setForceQuirks();
1252 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1254 m_token->appendToName(cc);
1255 HTML_ADVANCE_TO(DOCTYPENameState);
1260 HTML_BEGIN_STATE(AfterDOCTYPENameState) {
1261 if (isTokenizerWhitespace(cc))
1262 HTML_ADVANCE_TO(AfterDOCTYPENameState);
1264 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1265 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1267 m_token->setForceQuirks();
1268 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1270 DEFINE_STATIC_LOCAL(String, publicString, ("public"));
1271 DEFINE_STATIC_LOCAL(String, systemString, ("system"));
1272 if (cc == 'P' || cc == 'p') {
1273 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1274 if (result == SegmentedString::DidMatch) {
1275 advanceStringAndASSERTIgnoringCase(source, "public");
1276 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1277 } else if (result == SegmentedString::NotEnoughCharacters)
1278 return haveBufferedCharacterToken();
1279 } else if (cc == 'S' || cc == 's') {
1280 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1281 if (result == SegmentedString::DidMatch) {
1282 advanceStringAndASSERTIgnoringCase(source, "system");
1283 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
1284 } else if (result == SegmentedString::NotEnoughCharacters)
1285 return haveBufferedCharacterToken();
1288 m_token->setForceQuirks();
1289 HTML_ADVANCE_TO(BogusDOCTYPEState);
1294 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1295 if (isTokenizerWhitespace(cc))
1296 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1297 else if (cc == '"') {
1299 m_token->setPublicIdentifierToEmptyString();
1300 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1301 } else if (cc == '\'') {
1303 m_token->setPublicIdentifierToEmptyString();
1304 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1305 } else if (cc == '>') {
1307 m_token->setForceQuirks();
1308 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1309 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1311 m_token->setForceQuirks();
1312 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1315 m_token->setForceQuirks();
1316 HTML_ADVANCE_TO(BogusDOCTYPEState);
1321 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1322 if (isTokenizerWhitespace(cc))
1323 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1324 else if (cc == '"') {
1325 m_token->setPublicIdentifierToEmptyString();
1326 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1327 } else if (cc == '\'') {
1328 m_token->setPublicIdentifierToEmptyString();
1329 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1330 } else if (cc == '>') {
1332 m_token->setForceQuirks();
1333 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1334 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1336 m_token->setForceQuirks();
1337 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1340 m_token->setForceQuirks();
1341 HTML_ADVANCE_TO(BogusDOCTYPEState);
1346 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1348 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1349 else if (cc == '>') {
1351 m_token->setForceQuirks();
1352 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1353 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1355 m_token->setForceQuirks();
1356 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1358 m_token->appendToPublicIdentifier(cc);
1359 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1364 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1366 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1367 else if (cc == '>') {
1369 m_token->setForceQuirks();
1370 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1371 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1373 m_token->setForceQuirks();
1374 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1376 m_token->appendToPublicIdentifier(cc);
1377 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1382 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1383 if (isTokenizerWhitespace(cc))
1384 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1386 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1387 else if (cc == '"') {
1389 m_token->setSystemIdentifierToEmptyString();
1390 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1391 } else if (cc == '\'') {
1393 m_token->setSystemIdentifierToEmptyString();
1394 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1395 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1397 m_token->setForceQuirks();
1398 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1401 m_token->setForceQuirks();
1402 HTML_ADVANCE_TO(BogusDOCTYPEState);
1407 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1408 if (isTokenizerWhitespace(cc))
1409 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1411 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1412 else if (cc == '"') {
1413 m_token->setSystemIdentifierToEmptyString();
1414 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1415 } else if (cc == '\'') {
1416 m_token->setSystemIdentifierToEmptyString();
1417 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1418 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1420 m_token->setForceQuirks();
1421 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1424 m_token->setForceQuirks();
1425 HTML_ADVANCE_TO(BogusDOCTYPEState);
1430 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1431 if (isTokenizerWhitespace(cc))
1432 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1433 else if (cc == '"') {
1435 m_token->setSystemIdentifierToEmptyString();
1436 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1437 } else if (cc == '\'') {
1439 m_token->setSystemIdentifierToEmptyString();
1440 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1441 } else if (cc == '>') {
1443 m_token->setForceQuirks();
1444 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1445 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1447 m_token->setForceQuirks();
1448 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1451 m_token->setForceQuirks();
1452 HTML_ADVANCE_TO(BogusDOCTYPEState);
1457 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1458 if (isTokenizerWhitespace(cc))
1459 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1461 m_token->setSystemIdentifierToEmptyString();
1462 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1463 } else if (cc == '\'') {
1464 m_token->setSystemIdentifierToEmptyString();
1465 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1466 } else if (cc == '>') {
1468 m_token->setForceQuirks();
1469 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1470 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1472 m_token->setForceQuirks();
1473 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1476 m_token->setForceQuirks();
1477 HTML_ADVANCE_TO(BogusDOCTYPEState);
1482 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1484 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1485 else if (cc == '>') {
1487 m_token->setForceQuirks();
1488 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1489 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1491 m_token->setForceQuirks();
1492 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1494 m_token->appendToSystemIdentifier(cc);
1495 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1500 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1502 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1503 else if (cc == '>') {
1505 m_token->setForceQuirks();
1506 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1507 } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1509 m_token->setForceQuirks();
1510 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1512 m_token->appendToSystemIdentifier(cc);
1513 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1518 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1519 if (isTokenizerWhitespace(cc))
1520 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1522 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1523 else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1525 m_token->setForceQuirks();
1526 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1529 HTML_ADVANCE_TO(BogusDOCTYPEState);
1534 HTML_BEGIN_STATE(BogusDOCTYPEState) {
1536 return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1537 else if (cc == InputStreamPreprocessor::endOfFileMarker)
1538 return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1539 HTML_ADVANCE_TO(BogusDOCTYPEState);
1543 HTML_BEGIN_STATE(CDATASectionState) {
1545 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState);
1546 else if (cc == InputStreamPreprocessor::endOfFileMarker)
1547 HTML_RECONSUME_IN(DataState);
1549 bufferCharacter(cc);
1550 HTML_ADVANCE_TO(CDATASectionState);
1555 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) {
1557 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1559 bufferCharacter(']');
1560 HTML_RECONSUME_IN(CDATASectionState);
1564 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1566 HTML_ADVANCE_TO(DataState);
1568 bufferCharacter(']');
1569 bufferCharacter(']');
1570 HTML_RECONSUME_IN(CDATASectionState);
1577 ASSERT_NOT_REACHED();
1581 void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame)
1583 if (tagName == textareaTag || tagName == titleTag)
1584 setState(HTMLTokenizerState::RCDATAState);
1585 else if (tagName == plaintextTag)
1586 setState(HTMLTokenizerState::PLAINTEXTState);
1587 else if (tagName == scriptTag)
1588 setState(HTMLTokenizerState::ScriptDataState);
1589 else if (tagName == styleTag
1590 || tagName == iframeTag
1591 || tagName == xmpTag
1592 || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame))
1593 || tagName == noframesTag
1594 || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame)))
1595 setState(HTMLTokenizerState::RAWTEXTState);
1598 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1600 return vectorEqualsString(m_temporaryBuffer, expectedString);
1603 inline void HTMLTokenizer::addToPossibleEndTag(UChar cc)
1605 ASSERT(isEndTagBufferingState(m_state));
1606 m_bufferedEndTagName.append(cc);
1609 inline bool HTMLTokenizer::isAppropriateEndTag()
1611 return m_bufferedEndTagName == m_appropriateEndTagName;
1614 inline void HTMLTokenizer::parseError()