2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #ifndef MarkupTokenizerBase_h
29 #define MarkupTokenizerBase_h
31 #include "SegmentedString.h"
32 #include <wtf/Noncopyable.h>
33 #include <wtf/PassOwnPtr.h>
34 #include <wtf/Vector.h>
35 #include <wtf/text/AtomicString.h>
// Shared base for markup tokenizers. |Token| is the token class that gets
// filled in through m_token; |State| supplies the nested State::State type
// (and State::DataState) used to track the tokenizer state.
39 // Never use this type for a variable, as it contains several non-virtual functions.
// (emitAndResumeIn() and emitAndReconsumeIn() below are documented as methods
// that can get hidden in subclasses.)
40 template<typename Token, typename State>
41 class MarkupTokenizerBase {
42 WTF_MAKE_NONCOPYABLE(MarkupTokenizerBase);
43 WTF_MAKE_FAST_ALLOCATED;
// Virtual destructor with an empty body.
45 virtual ~MarkupTokenizerBase() { }
// Returns m_lineNumber, the counter that the emit helpers pass to
// SegmentedString::advance().
47 int lineNumber() const { return m_lineNumber; }
// Always returns 1; no column position is tracked by this accessor.
48 int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior.
// Returns the current tokenizer state (m_state).
50 typename State::State state() const { return m_state; }
// Overwrites the current tokenizer state (m_state) with |state|.
51 void setState(typename State::State state) { m_state = state; }
// Returns the m_forceNullCharacterReplacement flag.
53 bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
// Sets the m_forceNullCharacterReplacement flag to |value|.
54 void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
56 // This method needs to be defined in a template specialization when subclassing this template
// Only declared here. InputStreamPreprocessor::peek() calls it when it meets
// a '\0' that is not treated as the end-of-file marker, to decide whether
// that character is skipped.
57 inline bool shouldSkipNullCharacters() const;
// Helper that looks at the head of a SegmentedString and exposes the
// preprocessed "next input character" ('\r' handling, '\0' handling) to the
// tokenizer that owns it.
60 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
61 class InputStreamPreprocessor {
62 WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
// Stores |tokenizer| (the pointer is only saved, not dereferenced, in the
// initializer list) and starts with '\0' as the next input character and no
// pending newline to skip.
64 InputStreamPreprocessor(MarkupTokenizerBase<Token, State>* tokenizer)
65 : m_tokenizer(tokenizer)
66 , m_nextInputCharacter('\0')
67 , m_skipNextNewLine(false)
// Returns the character most recently stored by peek().
71 UChar nextInputCharacter() const { return m_nextInputCharacter; }
73 // Returns whether we succeeded in peeking at the next character.
74 // The only way we can fail to peek is if there are no more
75 // characters in |source| (after collapsing \r\n, etc).
// The preprocessed character is stored in m_nextInputCharacter and read back
// through nextInputCharacter(). In the code below |lineNumber| is only
// forwarded to source.advancePastNewline() when a '\n' is consumed.
76 ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber)
79 m_nextInputCharacter = *source;
81 // Every branch in this function is expensive, so we have a
82 // fast-reject branch for characters that don't require special
83 // handling. Please run the parser benchmark whenever you touch
84 // this function. It's very hot.
// '\n' | '\r' | '\0' is 0x0A | 0x0D | 0x00 == 0x0F, so every character with a
// bit set above the low four bits takes the fast branch; only code units
// 0x00-0x0F (which include NUL, LF and CR) reach the checks further down.
85 static const UChar specialCharacterMask = '\n' | '\r' | '\0';
86 if (m_nextInputCharacter & ~specialCharacterMask) {
87 m_skipNextNewLine = false;
// A '\n' seen while m_skipNextNewLine is set (the flag is raised by the '\r'
// case below) is consumed and the character after it is read instead.
91 if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
92 m_skipNextNewLine = false;
93 source.advancePastNewline(lineNumber);
96 m_nextInputCharacter = *source;
// '\r' is reported as '\n'; the flag marks that a following '\n' belongs to
// the same line break.
98 if (m_nextInputCharacter == '\r') {
99 m_nextInputCharacter = '\n';
100 m_skipNextNewLine = true;
102 m_skipNextNewLine = false;
103 // FIXME: The spec indicates that the surrogate pair range as well as
104 // a number of specific character values are parse errors and should be replaced
105 // by the replacement character. We suspect this is a problem with the spec as doing
106 // that filtering breaks surrogate pair handling and causes us not to match Minefield.
// NUL handling: a '\0' that is not the end-of-file marker is either skipped
// (when the tokenizer's shouldSkipNullCharacters() says so, by advancing past
// it) or, presumably in the remaining case, replaced by 0xFFFD.
107 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
108 if (m_tokenizer->shouldSkipNullCharacters()) {
109 source.advancePastNonNewline();
110 if (source.isEmpty())
114 m_nextInputCharacter = 0xFFFD;
120 // Returns whether there are more characters in |source| after advancing.
// Consumes the current character with source.advance(lineNumber); when input
// remains, the result is whatever peek() reports for the new head of |source|.
121 bool advance(SegmentedString& source, int& lineNumber)
123 source.advance(lineNumber);
124 if (source.isEmpty())
126 return peek(source, lineNumber);
// Code unit 0 doubles as the end-of-file marker; bufferCharacter() asserts
// that this value is never buffered as character data.
129 static const UChar endOfFileMarker = 0;
// True when source.isClosed() holds and source.length() is exactly 1. peek()
// consults this only when the current character is '\0', so a true result
// means that NUL is the single remaining character of a closed source.
132 bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
134 return source.isClosed() && source.length() == 1;
// Tokenizer passed to the constructor; used by peek() to call
// shouldSkipNullCharacters().
137 MarkupTokenizerBase<Token, State>* m_tokenizer;
139 // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
140 UChar m_nextInputCharacter;
// Set after a '\r' has been reported as '\n'; cleared again by peek() once
// any other character (or the pending '\n') has been handled.
141 bool m_skipNextNewLine;
// Hands |this| to the preprocessor (whose constructor only stores the
// pointer) and then calls reset().
// NOTE(review): m_skipLeadingNewLineForListing, m_forceNullCharacterReplacement
// and m_additionalAllowedCharacter are not in this initializer list; confirm
// that reset() or the subclass sets them before they are read.
144 MarkupTokenizerBase() : m_inputStreamPreprocessor(this) { reset(); }
// Appends |character| to the token under construction. m_token is first
// switched to a character token via ensureIsCharacterToken(). The
// end-of-file marker (0) must never be passed in; this is only checked by
// the ASSERT.
146 inline void bufferCharacter(UChar character)
148 ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
149 m_token->ensureIsCharacterToken();
150 m_token->appendToCharacter(character);
// Declared here only; the definition is not part of this class body.
// NOTE(review): presumably the code-point counterpart of bufferCharacter() —
// confirm against the definition.
153 inline void bufferCodePoint(unsigned);
155 // This method can get hidden in subclasses
// Consumes the current character of |source| via source.advance(), which is
// given m_lineNumber as its line counter.
156 inline bool emitAndResumeIn(SegmentedString& source, typename State::State state)
159 source.advance(m_lineNumber);
163 // This method can get hidden in subclasses
// The SegmentedString parameter is unnamed, so unlike emitAndResumeIn() this
// variant cannot advance the input: the current character stays in place.
164 inline bool emitAndReconsumeIn(SegmentedString&, typename State::State state)
// Emits the end-of-file token: the state goes back to State::DataState,
// |source| is advanced (with m_lineNumber as line counter) and m_token is
// turned into an end-of-file token via makeEndOfFile().
// NOTE(review): the haveBufferedCharacterToken() check presumably lets
// pending character data be handled before the end-of-file token — confirm.
170 inline bool emitEndOfFile(SegmentedString& source)
172 if (haveBufferedCharacterToken())
174 m_state = State::DataState;
175 source.advance(m_lineNumber);
177 m_token->makeEndOfFile();
// NOTE(review): presumably part of reset(), which the constructor calls —
// confirm. Puts the tokenizer back into State::DataState.
183 m_state = State::DataState;
// True when the token under construction is a character token, i.e.
// m_token->type() equals Token::Type::Character. Dereferences m_token.
188 inline bool haveBufferedCharacterToken()
190 return m_token->type() == Token::Type::Character;
// Current tokenizer state; exposed through state()/setState() and put back
// to State::DataState by emitEndOfFile().
193 typename State::State m_state;
195 // m_token is owned by the caller. If nextToken is not on the stack,
196 // this member might be pointing to unallocated memory.
200 bool m_skipLeadingNewLineForListing;
// Exposed through forceNullCharacterReplacement() and
// setForceNullCharacterReplacement().
201 bool m_forceNullCharacterReplacement;
203 // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
204 UChar m_additionalAllowedCharacter;
206 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
// Constructed with |this| in the MarkupTokenizerBase constructor.
207 InputStreamPreprocessor m_inputStreamPreprocessor;
212 #endif // MarkupTokenizerBase_h