7a269be190d8e4c826edc99dd8f6bebf2e665dc0
[vuplus_webkit] / Source / WebCore / html / parser / HTMLTokenizer.cpp
1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26  */
27
28 #include "config.h"
29 #include "HTMLTokenizer.h"
30
31 #include "HTMLEntityParser.h"
32 #include "HTMLToken.h"
33 #include "HTMLTreeBuilder.h"
34 #include "HTMLNames.h"
35 #include "MarkupTokenizerInlineMethods.h"
36 #include "NotImplemented.h"
37 #include <wtf/ASCIICType.h>
38 #include <wtf/CurrentTime.h>
39 #include <wtf/UnusedParam.h>
40 #include <wtf/text/AtomicString.h>
41 #include <wtf/text/CString.h>
42 #include <wtf/unicode/Unicode.h>
43
44 using namespace WTF;
45
46 namespace WebCore {
47
48 using namespace HTMLNames;
49
50 // This has to go in a .cpp file, as the linker doesn't like it being included more than once.
51 // We don't have an HTMLToken.cpp though, so this is the next best place.
52 template<>
53 QualifiedName AtomicMarkupTokenBase<HTMLToken>::nameForAttribute(const AttributeBase& attribute) const
54 {
55     return QualifiedName(nullAtom, AtomicString(attribute.m_name.data(), attribute.m_name.size()), nullAtom);
56 }
57
58 template<>
59 bool AtomicMarkupTokenBase<HTMLToken>::usesName() const
60 {
61     return m_type == HTMLTokenTypes::StartTag || m_type == HTMLTokenTypes::EndTag || m_type == HTMLTokenTypes::DOCTYPE;
62 }
63
64 template<>
65 bool AtomicMarkupTokenBase<HTMLToken>::usesAttributes() const
66 {
67     return m_type == HTMLTokenTypes::StartTag || m_type == HTMLTokenTypes::EndTag;
68 }
69
70 namespace {
71
72 inline UChar toLowerCase(UChar cc)
73 {
74     ASSERT(isASCIIUpper(cc));
75     const int lowerCaseOffset = 0x20;
76     return cc + lowerCaseOffset;
77 }
78
79 inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
80 {
81     if (vector.size() != string.length())
82         return false;
83     const UChar* stringData = string.characters();
84     const UChar* vectorData = vector.data();
85     // FIXME: Is there a higher-level function we should be calling here?
86     return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar));
87 }
88
89 inline bool isEndTagBufferingState(HTMLTokenizerState::State state)
90 {
91     switch (state) {
92     case HTMLTokenizerState::RCDATAEndTagOpenState:
93     case HTMLTokenizerState::RCDATAEndTagNameState:
94     case HTMLTokenizerState::RAWTEXTEndTagOpenState:
95     case HTMLTokenizerState::RAWTEXTEndTagNameState:
96     case HTMLTokenizerState::ScriptDataEndTagOpenState:
97     case HTMLTokenizerState::ScriptDataEndTagNameState:
98     case HTMLTokenizerState::ScriptDataEscapedEndTagOpenState:
99     case HTMLTokenizerState::ScriptDataEscapedEndTagNameState:
100         return true;
101     default:
102         return false;
103     }
104 }
105
106 }
107     
// Shorthands that bind the generic BEGIN_STATE / RECONSUME_IN / ADVANCE_TO / SWITCH_TO
// macros to HTMLTokenizerState, so the state machine below can name states unqualified.
#define HTML_BEGIN_STATE(name) BEGIN_STATE(HTMLTokenizerState, name)
#define HTML_RECONSUME_IN(name) RECONSUME_IN(HTMLTokenizerState, name)
#define HTML_ADVANCE_TO(name) ADVANCE_TO(HTMLTokenizerState, name)
#define HTML_SWITCH_TO(name) SWITCH_TO(HTMLTokenizerState, name)
112
113 HTMLTokenizer::HTMLTokenizer(bool usePreHTML5ParserQuirks)
114     : m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks)
115 {
116     reset();
117 }
118
119 HTMLTokenizer::~HTMLTokenizer()
120 {
121 }
122
123 template<>
124 inline bool MarkupTokenizerBase<HTMLToken, HTMLTokenizerState>::shouldSkipNullCharacters() const
125 {
126     return !m_forceNullCharacterReplacement
127         && (m_state == HTMLTokenizerState::DataState
128             || m_state == HTMLTokenizerState::RCDATAState
129             || m_state == HTMLTokenizerState::RAWTEXTState
130             || m_state == HTMLTokenizerState::PLAINTEXTState);
131 }
132
133
134 void HTMLTokenizer::reset()
135 {
136     m_state = HTMLTokenizerState::DataState;
137     m_token = 0;
138     m_lineNumber = 0;
139     m_skipLeadingNewLineForListing = false;
140     m_forceNullCharacterReplacement = false;
141     m_shouldAllowCDATA = false;
142     m_additionalAllowedCharacter = '\0';
143 }
144
145 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
146 {
147     bool notEnoughCharacters = false;
148     StringBuilder decodedEntity;
149     bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
150     if (notEnoughCharacters)
151         return false;
152     if (!success) {
153         ASSERT(decodedEntity.isEmpty());
154         bufferCharacter('&');
155     } else {
156         for (unsigned i = 0; i < decodedEntity.length(); ++i)
157             bufferCharacter(decodedEntity[i]);
158     }
159     return true;
160 }
161
162 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
163 {
164     ASSERT(m_token->type() == HTMLTokenTypes::Character || m_token->type() == HTMLTokenTypes::Uninitialized);
165     source.advance(m_lineNumber);
166     if (m_token->type() == HTMLTokenTypes::Character)
167         return true;
168     m_token->beginEndTag(m_bufferedEndTagName);
169     m_bufferedEndTagName.clear();
170     return false;
171 }
172
// Moves the state machine to |stateName| after flushing a buffered end tag.
//
// Steps, in order:
//   1. Record |stateName| in m_state.
//   2. Call flushBufferedEndTag(source); if it returns true, return true from the
//      enclosing function.
//   3. If |source| is empty or the input stream preprocessor cannot peek a character,
//      return haveBufferedCharacterToken() from the enclosing function.
//   4. Otherwise load the next input character into |cc| and jump to the label |stateName|.
//
// Expects |source| and |cc| to be in scope at the expansion site, and a label named
// |stateName| to exist in the enclosing function (presumably provided by HTML_BEGIN_STATE
// -- confirm against the BEGIN_STATE definition).
#define FLUSH_AND_ADVANCE_TO(stateName)                                    \
    do {                                                                   \
        m_state = HTMLTokenizerState::stateName;                           \
        if (flushBufferedEndTag(source))                                   \
            return true;                                                   \
        if (source.isEmpty()                                               \
            || !m_inputStreamPreprocessor.peek(source, m_lineNumber))      \
            return haveBufferedCharacterToken();                           \
        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
        goto stateName;                                                    \
    } while (false)
184
185 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizerState::State state)
186 {
187     m_state = state;
188     flushBufferedEndTag(source);
189     return true;
190 }
191
192 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
193 {
194     // If we have a token in progress, then we're supposed to be called back
195     // with the same token so we can finish it.
196     ASSERT(!m_token || m_token == &token || token.type() == HTMLTokenTypes::Uninitialized);
197     m_token = &token;
198
199     if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
200         // FIXME: This should call flushBufferedEndTag().
201         // We started an end tag during our last iteration.
202         m_token->beginEndTag(m_bufferedEndTagName);
203         m_bufferedEndTagName.clear();
204         if (m_state == HTMLTokenizerState::DataState) {
205             // We're back in the data state, so we must be done with the tag.
206             return true;
207         }
208     }
209
210     if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber))
211         return haveBufferedCharacterToken();
212     UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
213
214     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
215     // Note that this logic is different than the generic \r\n collapsing
216     // handled in the input stream preprocessor. This logic is here as an
217     // "authoring convenience" so folks can write:
218     //
219     // <pre>
220     // lorem ipsum
221     // lorem ipsum
222     // </pre>
223     //
224     // without getting an extra newline at the start of their <pre> element.
225     if (m_skipLeadingNewLineForListing) {
226         m_skipLeadingNewLineForListing = false;
227         if (cc == '\n') {
228             if (m_state == HTMLTokenizerState::DataState)
229                 HTML_ADVANCE_TO(DataState);
230             if (m_state == HTMLTokenizerState::RCDATAState)
231                 HTML_ADVANCE_TO(RCDATAState);
232             // When parsing text/plain documents, we run the tokenizer in the
233             // PLAINTEXTState and ignore m_skipLeadingNewLineForListing.
234             ASSERT(m_state == HTMLTokenizerState::PLAINTEXTState);
235         }
236     }
237
238     // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
239     switch (m_state) {
240     HTML_BEGIN_STATE(DataState) {
241         if (cc == '&')
242             HTML_ADVANCE_TO(CharacterReferenceInDataState);
243         else if (cc == '<') {
244             if (m_token->type() == HTMLTokenTypes::Character) {
245                 // We have a bunch of character tokens queued up that we
246                 // are emitting lazily here.
247                 return true;
248             }
249             HTML_ADVANCE_TO(TagOpenState);
250         } else if (cc == InputStreamPreprocessor::endOfFileMarker)
251             return emitEndOfFile(source);
252         else {
253             bufferCharacter(cc);
254             HTML_ADVANCE_TO(DataState);
255         }
256     }
257     END_STATE()
258
259     HTML_BEGIN_STATE(CharacterReferenceInDataState) {
260         if (!processEntity(source))
261             return haveBufferedCharacterToken();
262         HTML_SWITCH_TO(DataState);
263     }
264     END_STATE()
265
266     HTML_BEGIN_STATE(RCDATAState) {
267         if (cc == '&')
268             HTML_ADVANCE_TO(CharacterReferenceInRCDATAState);
269         else if (cc == '<')
270             HTML_ADVANCE_TO(RCDATALessThanSignState);
271         else if (cc == InputStreamPreprocessor::endOfFileMarker)
272             return emitEndOfFile(source);
273         else {
274             bufferCharacter(cc);
275             HTML_ADVANCE_TO(RCDATAState);
276         }
277     }
278     END_STATE()
279
280     HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) {
281         if (!processEntity(source))
282             return haveBufferedCharacterToken();
283         HTML_SWITCH_TO(RCDATAState);
284     }
285     END_STATE()
286
287     HTML_BEGIN_STATE(RAWTEXTState) {
288         if (cc == '<')
289             HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
290         else if (cc == InputStreamPreprocessor::endOfFileMarker)
291             return emitEndOfFile(source);
292         else {
293             bufferCharacter(cc);
294             HTML_ADVANCE_TO(RAWTEXTState);
295         }
296     }
297     END_STATE()
298
299     HTML_BEGIN_STATE(ScriptDataState) {
300         if (cc == '<')
301             HTML_ADVANCE_TO(ScriptDataLessThanSignState);
302         else if (cc == InputStreamPreprocessor::endOfFileMarker)
303             return emitEndOfFile(source);
304         else {
305             bufferCharacter(cc);
306             HTML_ADVANCE_TO(ScriptDataState);
307         }
308     }
309     END_STATE()
310
311     HTML_BEGIN_STATE(PLAINTEXTState) {
312         if (cc == InputStreamPreprocessor::endOfFileMarker)
313             return emitEndOfFile(source);
314         else
315             bufferCharacter(cc);
316         HTML_ADVANCE_TO(PLAINTEXTState);
317     }
318     END_STATE()
319
320     HTML_BEGIN_STATE(TagOpenState) {
321         if (cc == '!')
322             HTML_ADVANCE_TO(MarkupDeclarationOpenState);
323         else if (cc == '/')
324             HTML_ADVANCE_TO(EndTagOpenState);
325         else if (isASCIIUpper(cc)) {
326             m_token->beginStartTag(toLowerCase(cc));
327             HTML_ADVANCE_TO(TagNameState);
328         } else if (isASCIILower(cc)) {
329             m_token->beginStartTag(cc);
330             HTML_ADVANCE_TO(TagNameState);
331         } else if (cc == '?') {
332             parseError();
333             // The spec consumes the current character before switching
334             // to the bogus comment state, but it's easier to implement
335             // if we reconsume the current character.
336             HTML_RECONSUME_IN(BogusCommentState);
337         } else {
338             parseError();
339             bufferCharacter('<');
340             HTML_RECONSUME_IN(DataState);
341         }
342     }
343     END_STATE()
344
345     HTML_BEGIN_STATE(EndTagOpenState) {
346         if (isASCIIUpper(cc)) {
347             m_token->beginEndTag(toLowerCase(cc));
348             HTML_ADVANCE_TO(TagNameState);
349         } else if (isASCIILower(cc)) {
350             m_token->beginEndTag(cc);
351             HTML_ADVANCE_TO(TagNameState);
352         } else if (cc == '>') {
353             parseError();
354             HTML_ADVANCE_TO(DataState);
355         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
356             parseError();
357             bufferCharacter('<');
358             bufferCharacter('/');
359             HTML_RECONSUME_IN(DataState);
360         } else {
361             parseError();
362             HTML_RECONSUME_IN(BogusCommentState);
363         }
364     }
365     END_STATE()
366
367     HTML_BEGIN_STATE(TagNameState) {
368         if (isTokenizerWhitespace(cc))
369             HTML_ADVANCE_TO(BeforeAttributeNameState);
370         else if (cc == '/')
371             HTML_ADVANCE_TO(SelfClosingStartTagState);
372         else if (cc == '>')
373             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
374         else if (m_usePreHTML5ParserQuirks && cc == '<')
375             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
376         else if (isASCIIUpper(cc)) {
377             m_token->appendToName(toLowerCase(cc));
378             HTML_ADVANCE_TO(TagNameState);
379         } if (cc == InputStreamPreprocessor::endOfFileMarker) {
380             parseError();
381             HTML_RECONSUME_IN(DataState);
382         } else {
383             m_token->appendToName(cc);
384             HTML_ADVANCE_TO(TagNameState);
385         }
386     }
387     END_STATE()
388
389     HTML_BEGIN_STATE(RCDATALessThanSignState) {
390         if (cc == '/') {
391             m_temporaryBuffer.clear();
392             ASSERT(m_bufferedEndTagName.isEmpty());
393             HTML_ADVANCE_TO(RCDATAEndTagOpenState);
394         } else {
395             bufferCharacter('<');
396             HTML_RECONSUME_IN(RCDATAState);
397         }
398     }
399     END_STATE()
400
401     HTML_BEGIN_STATE(RCDATAEndTagOpenState) {
402         if (isASCIIUpper(cc)) {
403             m_temporaryBuffer.append(cc);
404             addToPossibleEndTag(toLowerCase(cc));
405             HTML_ADVANCE_TO(RCDATAEndTagNameState);
406         } else if (isASCIILower(cc)) {
407             m_temporaryBuffer.append(cc);
408             addToPossibleEndTag(cc);
409             HTML_ADVANCE_TO(RCDATAEndTagNameState);
410         } else {
411             bufferCharacter('<');
412             bufferCharacter('/');
413             HTML_RECONSUME_IN(RCDATAState);
414         }
415     }
416     END_STATE()
417
418     HTML_BEGIN_STATE(RCDATAEndTagNameState) {
419         if (isASCIIUpper(cc)) {
420             m_temporaryBuffer.append(cc);
421             addToPossibleEndTag(toLowerCase(cc));
422             HTML_ADVANCE_TO(RCDATAEndTagNameState);
423         } else if (isASCIILower(cc)) {
424             m_temporaryBuffer.append(cc);
425             addToPossibleEndTag(cc);
426             HTML_ADVANCE_TO(RCDATAEndTagNameState);
427         } else {
428             if (isTokenizerWhitespace(cc)) {
429                 if (isAppropriateEndTag())
430                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
431             } else if (cc == '/') {
432                 if (isAppropriateEndTag())
433                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
434             } else if (cc == '>') {
435                 if (isAppropriateEndTag())
436                     return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
437             }
438             bufferCharacter('<');
439             bufferCharacter('/');
440             m_token->appendToCharacter(m_temporaryBuffer);
441             m_bufferedEndTagName.clear();
442             HTML_RECONSUME_IN(RCDATAState);
443         }
444     }
445     END_STATE()
446
447     HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
448         if (cc == '/') {
449             m_temporaryBuffer.clear();
450             ASSERT(m_bufferedEndTagName.isEmpty());
451             HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
452         } else {
453             bufferCharacter('<');
454             HTML_RECONSUME_IN(RAWTEXTState);
455         }
456     }
457     END_STATE()
458
459     HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
460         if (isASCIIUpper(cc)) {
461             m_temporaryBuffer.append(cc);
462             addToPossibleEndTag(toLowerCase(cc));
463             HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
464         } else if (isASCIILower(cc)) {
465             m_temporaryBuffer.append(cc);
466             addToPossibleEndTag(cc);
467             HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
468         } else {
469             bufferCharacter('<');
470             bufferCharacter('/');
471             HTML_RECONSUME_IN(RAWTEXTState);
472         }
473     }
474     END_STATE()
475
476     HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
477         if (isASCIIUpper(cc)) {
478             m_temporaryBuffer.append(cc);
479             addToPossibleEndTag(toLowerCase(cc));
480             HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
481         } else if (isASCIILower(cc)) {
482             m_temporaryBuffer.append(cc);
483             addToPossibleEndTag(cc);
484             HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
485         } else {
486             if (isTokenizerWhitespace(cc)) {
487                 if (isAppropriateEndTag())
488                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
489             } else if (cc == '/') {
490                 if (isAppropriateEndTag())
491                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
492             } else if (cc == '>') {
493                 if (isAppropriateEndTag())
494                     return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
495             }
496             bufferCharacter('<');
497             bufferCharacter('/');
498             m_token->appendToCharacter(m_temporaryBuffer);
499             m_bufferedEndTagName.clear();
500             HTML_RECONSUME_IN(RAWTEXTState);
501         }
502     }
503     END_STATE()
504
505     HTML_BEGIN_STATE(ScriptDataLessThanSignState) {
506         if (cc == '/') {
507             m_temporaryBuffer.clear();
508             ASSERT(m_bufferedEndTagName.isEmpty());
509             HTML_ADVANCE_TO(ScriptDataEndTagOpenState);
510         } else if (cc == '!') {
511             bufferCharacter('<');
512             bufferCharacter('!');
513             HTML_ADVANCE_TO(ScriptDataEscapeStartState);
514         } else {
515             bufferCharacter('<');
516             HTML_RECONSUME_IN(ScriptDataState);
517         }
518     }
519     END_STATE()
520
521     HTML_BEGIN_STATE(ScriptDataEndTagOpenState) {
522         if (isASCIIUpper(cc)) {
523             m_temporaryBuffer.append(cc);
524             addToPossibleEndTag(toLowerCase(cc));
525             HTML_ADVANCE_TO(ScriptDataEndTagNameState);
526         } else if (isASCIILower(cc)) {
527             m_temporaryBuffer.append(cc);
528             addToPossibleEndTag(cc);
529             HTML_ADVANCE_TO(ScriptDataEndTagNameState);
530         } else {
531             bufferCharacter('<');
532             bufferCharacter('/');
533             HTML_RECONSUME_IN(ScriptDataState);
534         }
535     }
536     END_STATE()
537
538     HTML_BEGIN_STATE(ScriptDataEndTagNameState) {
539         if (isASCIIUpper(cc)) {
540             m_temporaryBuffer.append(cc);
541             addToPossibleEndTag(toLowerCase(cc));
542             HTML_ADVANCE_TO(ScriptDataEndTagNameState);
543         } else if (isASCIILower(cc)) {
544             m_temporaryBuffer.append(cc);
545             addToPossibleEndTag(cc);
546             HTML_ADVANCE_TO(ScriptDataEndTagNameState);
547         } else {
548             if (isTokenizerWhitespace(cc)) {
549                 if (isAppropriateEndTag())
550                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
551             } else if (cc == '/') {
552                 if (isAppropriateEndTag())
553                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
554             } else if (cc == '>') {
555                 if (isAppropriateEndTag())
556                     return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
557             }
558             bufferCharacter('<');
559             bufferCharacter('/');
560             m_token->appendToCharacter(m_temporaryBuffer);
561             m_bufferedEndTagName.clear();
562             HTML_RECONSUME_IN(ScriptDataState);
563         }
564     }
565     END_STATE()
566
567     HTML_BEGIN_STATE(ScriptDataEscapeStartState) {
568         if (cc == '-') {
569             bufferCharacter(cc);
570             HTML_ADVANCE_TO(ScriptDataEscapeStartDashState);
571         } else
572             HTML_RECONSUME_IN(ScriptDataState);
573     }
574     END_STATE()
575
576     HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) {
577         if (cc == '-') {
578             bufferCharacter(cc);
579             HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
580         } else
581             HTML_RECONSUME_IN(ScriptDataState);
582     }
583     END_STATE()
584
585     HTML_BEGIN_STATE(ScriptDataEscapedState) {
586         if (cc == '-') {
587             bufferCharacter(cc);
588             HTML_ADVANCE_TO(ScriptDataEscapedDashState);
589         } else if (cc == '<')
590             HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
591         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
592             parseError();
593             HTML_RECONSUME_IN(DataState);
594         } else {
595             bufferCharacter(cc);
596             HTML_ADVANCE_TO(ScriptDataEscapedState);
597         }
598     }
599     END_STATE()
600
601     HTML_BEGIN_STATE(ScriptDataEscapedDashState) {
602         if (cc == '-') {
603             bufferCharacter(cc);
604             HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
605         } else if (cc == '<')
606             HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
607         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
608             parseError();
609             HTML_RECONSUME_IN(DataState);
610         } else {
611             bufferCharacter(cc);
612             HTML_ADVANCE_TO(ScriptDataEscapedState);
613         }
614     }
615     END_STATE()
616
617     HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) {
618         if (cc == '-') {
619             bufferCharacter(cc);
620             HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
621         } else if (cc == '<')
622             HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
623         else if (cc == '>') {
624             bufferCharacter(cc);
625             HTML_ADVANCE_TO(ScriptDataState);
626         } if (cc == InputStreamPreprocessor::endOfFileMarker) {
627             parseError();
628             HTML_RECONSUME_IN(DataState);
629         } else {
630             bufferCharacter(cc);
631             HTML_ADVANCE_TO(ScriptDataEscapedState);
632         }
633     }
634     END_STATE()
635
636     HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
637         if (cc == '/') {
638             m_temporaryBuffer.clear();
639             ASSERT(m_bufferedEndTagName.isEmpty());
640             HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
641         } else if (isASCIIUpper(cc)) {
642             bufferCharacter('<');
643             bufferCharacter(cc);
644             m_temporaryBuffer.clear();
645             m_temporaryBuffer.append(toLowerCase(cc));
646             HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
647         } else if (isASCIILower(cc)) {
648             bufferCharacter('<');
649             bufferCharacter(cc);
650             m_temporaryBuffer.clear();
651             m_temporaryBuffer.append(cc);
652             HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
653         } else {
654             bufferCharacter('<');
655             HTML_RECONSUME_IN(ScriptDataEscapedState);
656         }
657     }
658     END_STATE()
659
660     HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
661         if (isASCIIUpper(cc)) {
662             m_temporaryBuffer.append(cc);
663             addToPossibleEndTag(toLowerCase(cc));
664             HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
665         } else if (isASCIILower(cc)) {
666             m_temporaryBuffer.append(cc);
667             addToPossibleEndTag(cc);
668             HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
669         } else {
670             bufferCharacter('<');
671             bufferCharacter('/');
672             HTML_RECONSUME_IN(ScriptDataEscapedState);
673         }
674     }
675     END_STATE()
676
677     HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
678         if (isASCIIUpper(cc)) {
679             m_temporaryBuffer.append(cc);
680             addToPossibleEndTag(toLowerCase(cc));
681             HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
682         } else if (isASCIILower(cc)) {
683             m_temporaryBuffer.append(cc);
684             addToPossibleEndTag(cc);
685             HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
686         } else {
687             if (isTokenizerWhitespace(cc)) {
688                 if (isAppropriateEndTag())
689                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
690             } else if (cc == '/') {
691                 if (isAppropriateEndTag())
692                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
693             } else if (cc == '>') {
694                 if (isAppropriateEndTag())
695                     return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
696             }
697             bufferCharacter('<');
698             bufferCharacter('/');
699             m_token->appendToCharacter(m_temporaryBuffer);
700             m_bufferedEndTagName.clear();
701             HTML_RECONSUME_IN(ScriptDataEscapedState);
702         }
703     }
704     END_STATE()
705
706     HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
707         if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
708             bufferCharacter(cc);
709             if (temporaryBufferIs(scriptTag.localName()))
710                 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
711             else
712                 HTML_ADVANCE_TO(ScriptDataEscapedState);
713         } else if (isASCIIUpper(cc)) {
714             bufferCharacter(cc);
715             m_temporaryBuffer.append(toLowerCase(cc));
716             HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
717         } else if (isASCIILower(cc)) {
718             bufferCharacter(cc);
719             m_temporaryBuffer.append(cc);
720             HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
721         } else
722             HTML_RECONSUME_IN(ScriptDataEscapedState);
723     }
724     END_STATE()
725
726     HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) {
727         if (cc == '-') {
728             bufferCharacter(cc);
729             HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState);
730         } else if (cc == '<') {
731             bufferCharacter(cc);
732             HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
733         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
734             parseError();
735             HTML_RECONSUME_IN(DataState);
736         } else {
737             bufferCharacter(cc);
738             HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
739         }
740     }
741     END_STATE()
742
743     HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
744         if (cc == '-') {
745             bufferCharacter(cc);
746             HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
747         } else if (cc == '<') {
748             bufferCharacter(cc);
749             HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
750         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
751             parseError();
752             HTML_RECONSUME_IN(DataState);
753         } else {
754             bufferCharacter(cc);
755             HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
756         }
757     }
758     END_STATE()
759
760     HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
761         if (cc == '-') {
762             bufferCharacter(cc);
763             HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
764         } else if (cc == '<') {
765             bufferCharacter(cc);
766             HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
767         } else if (cc == '>') {
768             bufferCharacter(cc);
769             HTML_ADVANCE_TO(ScriptDataState);
770         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
771             parseError();
772             HTML_RECONSUME_IN(DataState);
773         } else {
774             bufferCharacter(cc);
775             HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
776         }
777     }
778     END_STATE()
779
780     HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
781         if (cc == '/') {
782             bufferCharacter(cc);
783             m_temporaryBuffer.clear();
784             HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
785         } else
786             HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
787     }
788     END_STATE()
789
790     HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
            // ASCII letters are buffered as-is and also appended, lower-cased, to
            // m_temporaryBuffer. Whitespace, '/' or '>' ends the name: if the
            // collected name equals scriptTag.localName() the tokenizer goes to
            // ScriptDataEscapedState, otherwise it goes back to
            // ScriptDataDoubleEscapedState.
791         if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
792             bufferCharacter(cc);
793             if (temporaryBufferIs(scriptTag.localName()))
794                 HTML_ADVANCE_TO(ScriptDataEscapedState);
795             else
796                 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
797         } else if (isASCIIUpper(cc)) {
                // The output buffer keeps the original case; only the comparison
                // buffer is lower-cased.
798             bufferCharacter(cc);
799             m_temporaryBuffer.append(toLowerCase(cc));
800             HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
801         } else if (isASCIILower(cc)) {
802             bufferCharacter(cc);
803             m_temporaryBuffer.append(cc);
804             HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
805         } else
806             HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
807     }
808     END_STATE()
809
810     HTML_BEGIN_STATE(BeforeAttributeNameState) {
            // Skips whitespace before an attribute. '/' moves to
            // SelfClosingStartTagState and '>' emits the token. Any other non-EOF
            // character starts a new attribute whose name begins with cc
            // (upper-case ASCII is lower-cased first).
811         if (isTokenizerWhitespace(cc))
812             HTML_ADVANCE_TO(BeforeAttributeNameState);
813         else if (cc == '/')
814             HTML_ADVANCE_TO(SelfClosingStartTagState);
815         else if (cc == '>')
816             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
            // Only when m_usePreHTML5ParserQuirks is set: a '<' ends the current tag
            // via emitAndReconsumeIn(..., DataState).
817         else if (m_usePreHTML5ParserQuirks && cc == '<')
818             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
819         else if (isASCIIUpper(cc)) {
820             m_token->addNewAttribute();
821             m_token->beginAttributeName(source.numberOfCharactersConsumed());
822             m_token->appendToAttributeName(toLowerCase(cc));
823             HTML_ADVANCE_TO(AttributeNameState);
824         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
825             parseError();
826             HTML_RECONSUME_IN(DataState);
827         } else {
                // These characters are reported as parse errors but still become the
                // first character of the attribute name.
828             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
829                 parseError();
830             m_token->addNewAttribute();
831             m_token->beginAttributeName(source.numberOfCharactersConsumed());
832             m_token->appendToAttributeName(cc);
833             HTML_ADVANCE_TO(AttributeNameState);
834         }
835     }
836     END_STATE()
837
838     HTML_BEGIN_STATE(AttributeNameState) {
839         if (isTokenizerWhitespace(cc)) {
840             m_token->endAttributeName(source.numberOfCharactersConsumed());
841             HTML_ADVANCE_TO(AfterAttributeNameState);
842         } else if (cc == '/') {
843             m_token->endAttributeName(source.numberOfCharactersConsumed());
844             HTML_ADVANCE_TO(SelfClosingStartTagState);
845         } else if (cc == '=') {
846             m_token->endAttributeName(source.numberOfCharactersConsumed());
847             HTML_ADVANCE_TO(BeforeAttributeValueState);
848         } else if (cc == '>') {
849             m_token->endAttributeName(source.numberOfCharactersConsumed());
850             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
851         } else if (m_usePreHTML5ParserQuirks && cc == '<') {
852             m_token->endAttributeName(source.numberOfCharactersConsumed());
853             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
854         } else if (isASCIIUpper(cc)) {
855             m_token->appendToAttributeName(toLowerCase(cc));
856             HTML_ADVANCE_TO(AttributeNameState);
857         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
858             parseError();
859             m_token->endAttributeName(source.numberOfCharactersConsumed());
860             HTML_RECONSUME_IN(DataState);
861         } else {
862             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
863                 parseError();
864             m_token->appendToAttributeName(cc);
865             HTML_ADVANCE_TO(AttributeNameState);
866         }
867     }
868     END_STATE()
869
870     HTML_BEGIN_STATE(AfterAttributeNameState) {
            // Entered after whitespace following an attribute name. '=' leads to the
            // attribute value, '/' and '>' end the tag, and any other non-EOF
            // character starts a brand-new attribute.
871         if (isTokenizerWhitespace(cc))
872             HTML_ADVANCE_TO(AfterAttributeNameState);
873         else if (cc == '/')
874             HTML_ADVANCE_TO(SelfClosingStartTagState);
875         else if (cc == '=')
876             HTML_ADVANCE_TO(BeforeAttributeValueState);
877         else if (cc == '>')
878             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
            // Quirk path only (m_usePreHTML5ParserQuirks): '<' ends the current tag.
879         else if (m_usePreHTML5ParserQuirks && cc == '<')
880             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
881         else if (isASCIIUpper(cc)) {
882             m_token->addNewAttribute();
883             m_token->beginAttributeName(source.numberOfCharactersConsumed());
884             m_token->appendToAttributeName(toLowerCase(cc));
885             HTML_ADVANCE_TO(AttributeNameState);
886         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
887             parseError();
888             HTML_RECONSUME_IN(DataState);
889         } else {
                // Quotes and '<' are parse errors but still start the new name.
890             if (cc == '"' || cc == '\'' || cc == '<')
891                 parseError();
892             m_token->addNewAttribute();
893             m_token->beginAttributeName(source.numberOfCharactersConsumed());
894             m_token->appendToAttributeName(cc);
895             HTML_ADVANCE_TO(AttributeNameState);
896         }
897     }
898     END_STATE()
899
900     HTML_BEGIN_STATE(BeforeAttributeValueState) {
            // Decides which flavour of attribute value follows the '=': double
            // quoted, single quoted or unquoted. Whitespace is skipped.
901         if (isTokenizerWhitespace(cc))
902             HTML_ADVANCE_TO(BeforeAttributeValueState);
903         else if (cc == '"') {
                // The value start is recorded one past the current consumed count;
                // presumably this skips the opening quote -- confirm against
                // beginAttributeValue().
904             m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
905             HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
906         } else if (cc == '&') {
                // '&' is not appended here; it is reconsumed by the unquoted state.
907             m_token->beginAttributeValue(source.numberOfCharactersConsumed());
908             HTML_RECONSUME_IN(AttributeValueUnquotedState);
909         } else if (cc == '\'') {
910             m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
911             HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
912         } else if (cc == '>') {
                // '>' directly after '=' is a parse error; the token is still emitted.
913             parseError();
914             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
915         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
916             parseError();
917             HTML_RECONSUME_IN(DataState);
918         } else {
                // '<', '=' and '`' are parse errors but still begin an unquoted value.
919             if (cc == '<' || cc == '=' || cc == '`')
920                 parseError();
921             m_token->beginAttributeValue(source.numberOfCharactersConsumed());
922             m_token->appendToAttributeValue(cc);
923             HTML_ADVANCE_TO(AttributeValueUnquotedState);
924         }
925     }
926     END_STATE()
927
928     HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
            // Collects a double-quoted attribute value until the closing '"'.
929         if (cc == '"') {
930             m_token->endAttributeValue(source.numberOfCharactersConsumed());
931             HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
932         } else if (cc == '&') {
                // Remember the quote so CharacterReferenceInAttributeValueState can
                // return to this state afterwards.
933             m_additionalAllowedCharacter = '"';
934             HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
935         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
                // Unterminated value: parse error, close the value, back to DataState.
936             parseError();
937             m_token->endAttributeValue(source.numberOfCharactersConsumed());
938             HTML_RECONSUME_IN(DataState);
939         } else {
940             m_token->appendToAttributeValue(cc);
941             HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
942         }
943     }
944     END_STATE()
945
946     HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
            // Collects a single-quoted attribute value until the closing '\''.
947         if (cc == '\'') {
948             m_token->endAttributeValue(source.numberOfCharactersConsumed());
949             HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
950         } else if (cc == '&') {
                // Remember the quote so CharacterReferenceInAttributeValueState can
                // return to this state afterwards.
951             m_additionalAllowedCharacter = '\'';
952             HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
953         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
                // Unterminated value: parse error, close the value, back to DataState.
954             parseError();
955             m_token->endAttributeValue(source.numberOfCharactersConsumed());
956             HTML_RECONSUME_IN(DataState);
957         } else {
958             m_token->appendToAttributeValue(cc);
959             HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
960         }
961     }
962     END_STATE()
963
964     HTML_BEGIN_STATE(AttributeValueUnquotedState) {
            // Collects an unquoted attribute value. Whitespace ends the value and
            // returns to BeforeAttributeNameState; '>' ends the value and emits the
            // token.
965         if (isTokenizerWhitespace(cc)) {
966             m_token->endAttributeValue(source.numberOfCharactersConsumed());
967             HTML_ADVANCE_TO(BeforeAttributeNameState);
968         } else if (cc == '&') {
                // '>' marks "unquoted" for CharacterReferenceInAttributeValueState,
                // which uses it to return to this state.
969             m_additionalAllowedCharacter = '>';
970             HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
971         } else if (cc == '>') {
972             m_token->endAttributeValue(source.numberOfCharactersConsumed());
973             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
974         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
975             parseError();
976             m_token->endAttributeValue(source.numberOfCharactersConsumed());
977             HTML_RECONSUME_IN(DataState);
978         } else {
                // These characters are parse errors in an unquoted value but are
                // still appended to it.
979             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
980                 parseError();
981             m_token->appendToAttributeValue(cc);
982             HTML_ADVANCE_TO(AttributeValueUnquotedState);
983         }
984     }
985     END_STATE()
986
987     HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
            // Tries to decode a character reference that follows an '&' inside an
            // attribute value, appends the result (or a literal '&' on failure) to
            // the value, then returns to the attribute value state it came from.
988         bool notEnoughCharacters = false;
989         StringBuilder decodedEntity;
990         bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
            // consumeHTMLEntity reported that more input is needed: bail out with
            // whatever haveBufferedCharacterToken() returns.
991         if (notEnoughCharacters)
992             return haveBufferedCharacterToken();
993         if (!success) {
                // Not a character reference: keep the '&' literally.
994             ASSERT(decodedEntity.isEmpty());
995             m_token->appendToAttributeValue('&');
996         } else {
997             for (unsigned i = 0; i < decodedEntity.length(); ++i)
998                 m_token->appendToAttributeValue(decodedEntity[i]);
999         }
1000         // We're supposed to switch back to the attribute value state that
1001         // we were in when we were switched into this state. Rather than
1002         // keeping track of this explicitly, we observe that the previous
1003         // state can be determined by m_additionalAllowedCharacter.
1004         if (m_additionalAllowedCharacter == '"')
1005             HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
1006         else if (m_additionalAllowedCharacter == '\'')
1007             HTML_SWITCH_TO(AttributeValueSingleQuotedState);
1008         else if (m_additionalAllowedCharacter == '>')
1009             HTML_SWITCH_TO(AttributeValueUnquotedState);
1010         else
1011             ASSERT_NOT_REACHED();
1012     }
1013     END_STATE()
1014
1015     HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
             // Entered right after the closing quote of an attribute value. Only
             // whitespace, '/' and '>' are expected; any other character is a parse
             // error and is reconsumed as the start of the next attribute.
1016         if (isTokenizerWhitespace(cc))
1017             HTML_ADVANCE_TO(BeforeAttributeNameState);
1018         else if (cc == '/')
1019             HTML_ADVANCE_TO(SelfClosingStartTagState);
1020         else if (cc == '>')
1021             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
             // Quirk path only (m_usePreHTML5ParserQuirks): '<' ends the current tag.
1022         else if (m_usePreHTML5ParserQuirks && cc == '<')
1023             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1024         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1025             parseError();
1026             HTML_RECONSUME_IN(DataState);
1027         } else {
1028             parseError();
1029             HTML_RECONSUME_IN(BeforeAttributeNameState);
1030         }
1031     }
1032     END_STATE()
1033
1034     HTML_BEGIN_STATE(SelfClosingStartTagState) {
             // Entered after a '/' inside a tag. Only an immediately following '>'
             // marks the token self-closing; anything else is a parse error and the
             // character is reconsumed (DataState on EOF, otherwise as the start of
             // an attribute).
1035         if (cc == '>') {
1036             m_token->setSelfClosing();
1037             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1038         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1039             parseError();
1040             HTML_RECONSUME_IN(DataState);
1041         } else {
1042             parseError();
1043             HTML_RECONSUME_IN(BeforeAttributeNameState);
1044         }
1045     }
1046     END_STATE()
1047
1048     HTML_BEGIN_STATE(BogusCommentState) {
             // One-shot entry point: starts a comment token, then hands the current
             // character to ContinueBogusCommentState so beginComment() runs once.
1049         m_token->beginComment();
1050         HTML_RECONSUME_IN(ContinueBogusCommentState);
1051     }
1052     END_STATE()
1053
1054     HTML_BEGIN_STATE(ContinueBogusCommentState) {
             // Appends every character to the comment until '>' or end-of-file, at
             // which point the comment token is emitted. No parse error is raised
             // in this state.
1055         if (cc == '>')
1056             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1057         else if (cc == InputStreamPreprocessor::endOfFileMarker)
1058             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1059         else {
1060             m_token->appendToComment(cc);
1061             HTML_ADVANCE_TO(ContinueBogusCommentState);
1062         }
1063     }
1064     END_STATE()
1065
1066     HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
             // Uses look-ahead on the input to recognise one of three openers:
             //   "--"       -> comment
             //   "doctype"  -> DOCTYPE (matched ignoring case)
             //   "[CDATA["  -> CDATA section (only when shouldAllowCDATA() is true)
             // If the look-ahead reports NotEnoughCharacters the function returns
             // haveBufferedCharacterToken(). Every other outcome falls through to
             // the parse error at the bottom and is treated as a bogus comment.
1067         DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
1068         DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype"));
1069         DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA["));
1070         if (cc == '-') {
1071             SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
1072             if (result == SegmentedString::DidMatch) {
1073                 source.advanceAndASSERT('-');
1074                 source.advanceAndASSERT('-');
1075                 m_token->beginComment();
1076                 HTML_SWITCH_TO(CommentStartState);
1077             } else if (result == SegmentedString::NotEnoughCharacters)
1078                 return haveBufferedCharacterToken();
1079         } else if (cc == 'D' || cc == 'd') {
1080             SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
1081             if (result == SegmentedString::DidMatch) {
1082                 advanceStringAndASSERTIgnoringCase(source, "doctype");
1083                 HTML_SWITCH_TO(DOCTYPEState);
1084             } else if (result == SegmentedString::NotEnoughCharacters)
1085                 return haveBufferedCharacterToken();
1086         } else if (cc == '[' && shouldAllowCDATA()) {
1087             SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
1088             if (result == SegmentedString::DidMatch) {
1089                 advanceStringAndASSERT(source, "[CDATA[");
1090                 HTML_SWITCH_TO(CDATASectionState);
1091             } else if (result == SegmentedString::NotEnoughCharacters)
1092                 return haveBufferedCharacterToken();
1093         }
             // Reached when no opener matched. NOTE(review): this relies on
             // HTML_SWITCH_TO (defined outside this chunk) leaving the state body.
1094         parseError();
1095         HTML_RECONSUME_IN(BogusCommentState);
1096     }
1097     END_STATE()
1098
1099     HTML_BEGIN_STATE(CommentStartState) {
             // First character after the comment opener. '-' may begin an immediate
             // terminator; '>' and end-of-file end the comment early with a parse
             // error; any other character becomes the first comment character.
1100         if (cc == '-')
1101             HTML_ADVANCE_TO(CommentStartDashState);
1102         else if (cc == '>') {
1103             parseError();
1104             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1105         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1106             parseError();
1107             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1108         } else {
1109             m_token->appendToComment(cc);
1110             HTML_ADVANCE_TO(CommentState);
1111         }
1112     }
1113     END_STATE()
1114
1115     HTML_BEGIN_STATE(CommentStartDashState) {
             // A single '-' has been seen at the very start of the comment. A second
             // '-' moves to CommentEndState; '>' and end-of-file emit the comment
             // with a parse error.
1116         if (cc == '-')
1117             HTML_ADVANCE_TO(CommentEndState);
1118         else if (cc == '>') {
1119             parseError();
1120             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1121         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1122             parseError();
1123             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1124         } else {
                 // The pending '-' turned out to be ordinary comment text: emit it
                 // before the current character.
1125             m_token->appendToComment('-');
1126             m_token->appendToComment(cc);
1127             HTML_ADVANCE_TO(CommentState);
1128         }
1129     }
1130     END_STATE()
1131
1132     HTML_BEGIN_STATE(CommentState) {
             // Main comment body: characters are appended until a '-' (possible
             // terminator) or end-of-file (parse error, comment emitted as-is).
1133         if (cc == '-')
1134             HTML_ADVANCE_TO(CommentEndDashState);
1135         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1136             parseError();
1137             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1138         } else {
1139             m_token->appendToComment(cc);
1140             HTML_ADVANCE_TO(CommentState);
1141         }
1142     }
1143     END_STATE()
1144
1145     HTML_BEGIN_STATE(CommentEndDashState) {
             // One '-' has been seen inside the comment body. A second '-' moves to
             // CommentEndState; otherwise the held-back '-' is flushed into the
             // comment together with the current character.
1146         if (cc == '-')
1147             HTML_ADVANCE_TO(CommentEndState);
1148         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1149             parseError();
1150             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1151         } else {
1152             m_token->appendToComment('-');
1153             m_token->appendToComment(cc);
1154             HTML_ADVANCE_TO(CommentState);
1155         }
1156     }
1157     END_STATE()
1158
1159     HTML_BEGIN_STATE(CommentEndState) {
             // Two consecutive '-' have been seen. '>' completes the comment; every
             // other input is a parse error handled as described per branch.
1160         if (cc == '>')
1161             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1162         else if (cc == '!') {
1163             parseError();
1164             HTML_ADVANCE_TO(CommentEndBangState);
1165         } else if (cc == '-') {
                 // A third (or later) '-': one dash becomes comment text and the
                 // state still holds two pending dashes.
1166             parseError();
1167             m_token->appendToComment('-');
1168             HTML_ADVANCE_TO(CommentEndState);
1169         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1170             parseError();
1171             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1172         } else {
                 // Not a terminator after all: flush the two pending dashes plus the
                 // current character and resume the comment body.
1173             parseError();
1174             m_token->appendToComment('-');
1175             m_token->appendToComment('-');
1176             m_token->appendToComment(cc);
1177             HTML_ADVANCE_TO(CommentState);
1178         }
1179     }
1180     END_STATE()
1181
1182     HTML_BEGIN_STATE(CommentEndBangState) {
             // "--!" has been seen. '>' still closes the comment; otherwise the
             // three pending characters are flushed into the comment text.
1183         if (cc == '-') {
                 // "--!" becomes text and the new '-' is treated as a fresh first
                 // dash (CommentEndDashState), so it is not appended here.
1184             m_token->appendToComment('-');
1185             m_token->appendToComment('-');
1186             m_token->appendToComment('!');
1187             HTML_ADVANCE_TO(CommentEndDashState);
1188         } else if (cc == '>')
1189             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1190         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1191             parseError();
1192             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1193         } else {
1194             m_token->appendToComment('-');
1195             m_token->appendToComment('-');
1196             m_token->appendToComment('!');
1197             m_token->appendToComment(cc);
1198             HTML_ADVANCE_TO(CommentState);
1199         }
1200     }
1201     END_STATE()
1202
1203     HTML_BEGIN_STATE(DOCTYPEState) {
             // Entered right after the "doctype" keyword. Whitespace is expected;
             // its absence is a parse error but parsing continues with the same
             // character in BeforeDOCTYPENameState.
1204         if (isTokenizerWhitespace(cc))
1205             HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1206         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
                 // Input ended before any name: emit an empty DOCTYPE token with the
                 // force-quirks flag set.
1207             parseError();
1208             m_token->beginDOCTYPE();
1209             m_token->setForceQuirks();
1210             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1211         } else {
1212             parseError();
1213             HTML_RECONSUME_IN(BeforeDOCTYPENameState);
1214         }
1215     }
1216     END_STATE()
1217
1218     HTML_BEGIN_STATE(BeforeDOCTYPENameState) {
             // Skips whitespace, then starts the DOCTYPE token with the first name
             // character (upper-case ASCII is lower-cased). '>' or end-of-file
             // before any name yields an empty, force-quirks DOCTYPE token.
1219         if (isTokenizerWhitespace(cc))
1220             HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1221         else if (isASCIIUpper(cc)) {
1222             m_token->beginDOCTYPE(toLowerCase(cc));
1223             HTML_ADVANCE_TO(DOCTYPENameState);
1224         } else if (cc == '>') {
1225             parseError();
1226             m_token->beginDOCTYPE();
1227             m_token->setForceQuirks();
1228             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1229         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1230             parseError();
1231             m_token->beginDOCTYPE();
1232             m_token->setForceQuirks();
1233             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1234         } else {
1235             m_token->beginDOCTYPE(cc);
1236             HTML_ADVANCE_TO(DOCTYPENameState);
1237         }
1238     }
1239     END_STATE()
1240
1241     HTML_BEGIN_STATE(DOCTYPENameState) {
             // Accumulates the DOCTYPE name (lower-casing upper-case ASCII) until
             // whitespace or '>'. End-of-file sets force-quirks and emits the token.
1242         if (isTokenizerWhitespace(cc))
1243             HTML_ADVANCE_TO(AfterDOCTYPENameState);
1244         else if (cc == '>')
1245             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1246         else if (isASCIIUpper(cc)) {
1247             m_token->appendToName(toLowerCase(cc));
1248             HTML_ADVANCE_TO(DOCTYPENameState);
1249         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1250             parseError();
1251             m_token->setForceQuirks();
1252             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1253         } else {
1254             m_token->appendToName(cc);
1255             HTML_ADVANCE_TO(DOCTYPENameState);
1256         }
1257     }
1258     END_STATE()
1259
1260     HTML_BEGIN_STATE(AfterDOCTYPENameState) {
1261         if (isTokenizerWhitespace(cc))
1262             HTML_ADVANCE_TO(AfterDOCTYPENameState);
1263         if (cc == '>')
1264             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1265         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1266             parseError();
1267             m_token->setForceQuirks();
1268             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1269         } else {
1270             DEFINE_STATIC_LOCAL(String, publicString, ("public"));
1271             DEFINE_STATIC_LOCAL(String, systemString, ("system"));
1272             if (cc == 'P' || cc == 'p') {
1273                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
1274                 if (result == SegmentedString::DidMatch) {
1275                     advanceStringAndASSERTIgnoringCase(source, "public");
1276                     HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1277                 } else if (result == SegmentedString::NotEnoughCharacters)
1278                     return haveBufferedCharacterToken();
1279             } else if (cc == 'S' || cc == 's') {
1280                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
1281                 if (result == SegmentedString::DidMatch) {
1282                     advanceStringAndASSERTIgnoringCase(source, "system");
1283                     HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
1284                 } else if (result == SegmentedString::NotEnoughCharacters)
1285                     return haveBufferedCharacterToken();
1286             }
1287             parseError();
1288             m_token->setForceQuirks();
1289             HTML_ADVANCE_TO(BogusDOCTYPEState);
1290         }
1291     }
1292     END_STATE()
1293
1294     HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
             // Entered right after the "public" keyword. Whitespace is the expected
             // input; a quote without preceding whitespace is a parse error but
             // still starts the public identifier. Everything else sets
             // force-quirks.
1295         if (isTokenizerWhitespace(cc))
1296             HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1297         else if (cc == '"') {
1298             parseError();
1299             m_token->setPublicIdentifierToEmptyString();
1300             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1301         } else if (cc == '\'') {
1302             parseError();
1303             m_token->setPublicIdentifierToEmptyString();
1304             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1305         } else if (cc == '>') {
1306             parseError();
1307             m_token->setForceQuirks();
1308             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1309         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1310             parseError();
1311             m_token->setForceQuirks();
1312             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1313         } else {
1314             parseError();
1315             m_token->setForceQuirks();
1316             HTML_ADVANCE_TO(BogusDOCTYPEState);
1317         }
1318     }
1319     END_STATE()
1320
1321     HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
             // Skips whitespace before the public identifier. A quote starts an
             // (initially empty) public identifier without a parse error; any other
             // input is a parse error that sets force-quirks.
1322         if (isTokenizerWhitespace(cc))
1323             HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1324         else if (cc == '"') {
1325             m_token->setPublicIdentifierToEmptyString();
1326             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1327         } else if (cc == '\'') {
1328             m_token->setPublicIdentifierToEmptyString();
1329             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1330         } else if (cc == '>') {
1331             parseError();
1332             m_token->setForceQuirks();
1333             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1334         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1335             parseError();
1336             m_token->setForceQuirks();
1337             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1338         } else {
1339             parseError();
1340             m_token->setForceQuirks();
1341             HTML_ADVANCE_TO(BogusDOCTYPEState);
1342         }
1343     }
1344     END_STATE()
1345
1346     HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
             // Collects the public identifier up to the closing '"'. A '>' or
             // end-of-file inside the quotes is a parse error: force-quirks is set
             // and the token is emitted.
1347         if (cc == '"')
1348             HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1349         else if (cc == '>') {
1350             parseError();
1351             m_token->setForceQuirks();
1352             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1353         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1354             parseError();
1355             m_token->setForceQuirks();
1356             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1357         } else {
1358             m_token->appendToPublicIdentifier(cc);
1359             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1360         }
1361     }
1362     END_STATE()
1363
1364     HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
             // Collects the public identifier up to the closing '\''. A '>' or
             // end-of-file inside the quotes is a parse error: force-quirks is set
             // and the token is emitted.
1365         if (cc == '\'')
1366             HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1367         else if (cc == '>') {
1368             parseError();
1369             m_token->setForceQuirks();
1370             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1371         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1372             parseError();
1373             m_token->setForceQuirks();
1374             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1375         } else {
1376             m_token->appendToPublicIdentifier(cc);
1377             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1378         }
1379     }
1380     END_STATE()
1381
1382     HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
             // Entered right after the closing quote of the public identifier.
             // Whitespace or '>' are accepted silently; a quote that directly starts
             // the system identifier is accepted with a parse error.
1383         if (isTokenizerWhitespace(cc))
1384             HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1385         else if (cc == '>')
1386             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1387         else if (cc == '"') {
1388             parseError();
1389             m_token->setSystemIdentifierToEmptyString();
1390             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1391         } else if (cc == '\'') {
1392             parseError();
1393             m_token->setSystemIdentifierToEmptyString();
1394             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1395         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1396             parseError();
1397             m_token->setForceQuirks();
1398             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1399         } else {
1400             parseError();
1401             m_token->setForceQuirks();
1402             HTML_ADVANCE_TO(BogusDOCTYPEState);
1403         }
1404     }
1405     END_STATE()
1406
1407     HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
             // Whitespace between the public and system identifiers. '>' emits the
             // token; a quote starts an (initially empty) system identifier without
             // a parse error; any other input sets force-quirks.
1408         if (isTokenizerWhitespace(cc))
1409             HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1410         else if (cc == '>')
1411             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1412         else if (cc == '"') {
1413             m_token->setSystemIdentifierToEmptyString();
1414             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1415         } else if (cc == '\'') {
1416             m_token->setSystemIdentifierToEmptyString();
1417             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1418         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1419             parseError();
1420             m_token->setForceQuirks();
1421             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1422         } else {
1423             parseError();
1424             m_token->setForceQuirks();
1425             HTML_ADVANCE_TO(BogusDOCTYPEState);
1426         }
1427     }
1428     END_STATE()
1429
1430     HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
             // Entered right after the "system" keyword. Whitespace is the expected
             // input; a quote without preceding whitespace is a parse error but
             // still starts the system identifier. Everything else sets
             // force-quirks.
1431         if (isTokenizerWhitespace(cc))
1432             HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1433         else if (cc == '"') {
1434             parseError();
1435             m_token->setSystemIdentifierToEmptyString();
1436             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1437         } else if (cc == '\'') {
1438             parseError();
1439             m_token->setSystemIdentifierToEmptyString();
1440             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1441         } else if (cc == '>') {
1442             parseError();
1443             m_token->setForceQuirks();
1444             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1445         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1446             parseError();
1447             m_token->setForceQuirks();
1448             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1449         } else {
1450             parseError();
1451             m_token->setForceQuirks();
1452             HTML_ADVANCE_TO(BogusDOCTYPEState);
1453         }
1454     }
1455     END_STATE()
1456
1457     HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1458         if (isTokenizerWhitespace(cc))
1459             HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1460         if (cc == '"') {
1461             m_token->setSystemIdentifierToEmptyString();
1462             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1463         } else if (cc == '\'') {
1464             m_token->setSystemIdentifierToEmptyString();
1465             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1466         } else if (cc == '>') {
1467             parseError();
1468             m_token->setForceQuirks();
1469             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1470         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1471             parseError();
1472             m_token->setForceQuirks();
1473             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1474         } else {
1475             parseError();
1476             m_token->setForceQuirks();
1477             HTML_ADVANCE_TO(BogusDOCTYPEState);
1478         }
1479     }
1480     END_STATE()
1481
1482     HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
             // Collects the system identifier up to the closing '"'. A '>' or
             // end-of-file inside the quotes is a parse error: force-quirks is set
             // and the token is emitted.
1483         if (cc == '"')
1484             HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1485         else if (cc == '>') {
1486             parseError();
1487             m_token->setForceQuirks();
1488             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1489         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1490             parseError();
1491             m_token->setForceQuirks();
1492             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1493         } else {
1494             m_token->appendToSystemIdentifier(cc);
1495             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1496         }
1497     }
1498     END_STATE()
1499
1500     HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
             // Collects the system identifier up to the closing '\''. A '>' or
             // end-of-file inside the quotes is a parse error: force-quirks is set
             // and the token is emitted.
1501         if (cc == '\'')
1502             HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1503         else if (cc == '>') {
1504             parseError();
1505             m_token->setForceQuirks();
1506             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1507         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1508             parseError();
1509             m_token->setForceQuirks();
1510             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1511         } else {
1512             m_token->appendToSystemIdentifier(cc);
1513             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1514         }
1515     }
1516     END_STATE()
1517
1518     HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1519         if (isTokenizerWhitespace(cc))
1520             HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1521         else if (cc == '>')
1522             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1523         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
1524             parseError();
1525             m_token->setForceQuirks();
1526             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1527         } else {
1528             parseError();
1529             HTML_ADVANCE_TO(BogusDOCTYPEState);
1530         }
1531     }
1532     END_STATE()
1533
1534     HTML_BEGIN_STATE(BogusDOCTYPEState) {
1535         if (cc == '>')
1536             return emitAndResumeIn(source, HTMLTokenizerState::DataState);
1537         else if (cc == InputStreamPreprocessor::endOfFileMarker)
1538             return emitAndReconsumeIn(source, HTMLTokenizerState::DataState);
1539         HTML_ADVANCE_TO(BogusDOCTYPEState);
1540     }
1541     END_STATE()
1542
1543     HTML_BEGIN_STATE(CDATASectionState) {
1544         if (cc == ']')
1545             HTML_ADVANCE_TO(CDATASectionRightSquareBracketState);
1546         else if (cc == InputStreamPreprocessor::endOfFileMarker)
1547             HTML_RECONSUME_IN(DataState);
1548         else {
1549             bufferCharacter(cc);
1550             HTML_ADVANCE_TO(CDATASectionState);
1551         }
1552     }
1553     END_STATE()
1554
1555     HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) {
1556         if (cc == ']')
1557             HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1558         else {
1559             bufferCharacter(']');
1560             HTML_RECONSUME_IN(CDATASectionState);
1561         }
1562     }
1563
1564     HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1565         if (cc == '>')
1566             HTML_ADVANCE_TO(DataState);
1567         else {
1568             bufferCharacter(']');
1569             bufferCharacter(']');
1570             HTML_RECONSUME_IN(CDATASectionState);
1571         }
1572     }
1573     END_STATE()
1574
1575     }
1576
1577     ASSERT_NOT_REACHED();
1578     return false;
1579 }
1580
1581 void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame)
1582 {
1583     if (tagName == textareaTag || tagName == titleTag)
1584         setState(HTMLTokenizerState::RCDATAState);
1585     else if (tagName == plaintextTag)
1586         setState(HTMLTokenizerState::PLAINTEXTState);
1587     else if (tagName == scriptTag)
1588         setState(HTMLTokenizerState::ScriptDataState);
1589     else if (tagName == styleTag
1590         || tagName == iframeTag
1591         || tagName == xmpTag
1592         || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame))
1593         || tagName == noframesTag
1594         || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame)))
1595         setState(HTMLTokenizerState::RAWTEXTState);
1596 }
1597
1598 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1599 {
1600     return vectorEqualsString(m_temporaryBuffer, expectedString);
1601 }
1602
1603 inline void HTMLTokenizer::addToPossibleEndTag(UChar cc)
1604 {
1605     ASSERT(isEndTagBufferingState(m_state));
1606     m_bufferedEndTagName.append(cc);
1607 }
1608
1609 inline bool HTMLTokenizer::isAppropriateEndTag()
1610 {
1611     return m_bufferedEndTagName == m_appropriateEndTagName;
1612 }
1613
1614 inline void HTMLTokenizer::parseError()
1615 {
1616     notImplemented();
1617 }
1618
1619 }