code.vuplus.com Git - vuplus_webkit/blob - Source/WebCore/xml/parser/MarkupTokenizerBase.h

   1 /*
   2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
   3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
   4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27
  28 #ifndef MarkupTokenizerBase_h
  29 #define MarkupTokenizerBase_h
  30
  31 #include "SegmentedString.h"
  32 #include <wtf/Noncopyable.h>
  33 #include <wtf/PassOwnPtr.h>
  34 #include <wtf/Vector.h>
  35 #include <wtf/text/AtomicString.h>
  36
  37 namespace WebCore {
  38
  39 // Never use this type for a variable, as it contains several non-virtual functions.
  40 template<typename Token, typename State>
  41 class MarkupTokenizerBase {
  42     WTF_MAKE_NONCOPYABLE(MarkupTokenizerBase);
  43     WTF_MAKE_FAST_ALLOCATED;
  44 public:
  45     virtual ~MarkupTokenizerBase() { }
  46
  47     int lineNumber() const { return m_lineNumber; }
  48     int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior.
  49
  50     typename State::State state() const { return m_state; }
  51     void setState(typename State::State state) { m_state = state; }
  52
  53     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
  54     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
  55
  56     // This method needs to be defined in a template specialization when subclassing this template
  57     inline bool shouldSkipNullCharacters() const;
  58
  59 protected:
  60     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
  61     class InputStreamPreprocessor {
  62         WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
  63     public:
  64         InputStreamPreprocessor(MarkupTokenizerBase<Token, State>* tokenizer)
  65             : m_tokenizer(tokenizer)
  66             , m_nextInputCharacter('\0')
  67             , m_skipNextNewLine(false)
  68         {
  69         }
  70
  71         UChar nextInputCharacter() const { return m_nextInputCharacter; }
  72
  73         // Returns whether we succeeded in peeking at the next character.
  74         // The only way we can fail to peek is if there are no more
  75         // characters in |source| (after collapsing \r\n, etc).
  76         ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber)
  77         {
  78         PeekAgain:
  79             m_nextInputCharacter = *source;
  80
  81             // Every branch in this function is expensive, so we have a
  82             // fast-reject branch for characters that don't require special
  83             // handling. Please run the parser benchmark whenever you touch
  84             // this function. It's very hot.
  85             static const UChar specialCharacterMask = '\n' | '\r' | '\0';
  86             if (m_nextInputCharacter & ~specialCharacterMask) {
  87                 m_skipNextNewLine = false;
  88                 return true;
  89             }
  90
  91             if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
  92                 m_skipNextNewLine = false;
  93                 source.advancePastNewline(lineNumber);
  94                 if (source.isEmpty())
  95                     return false;
  96                 m_nextInputCharacter = *source;
  97             }
  98             if (m_nextInputCharacter == '\r') {
  99                 m_nextInputCharacter = '\n';
 100                 m_skipNextNewLine = true;
 101             } else {
 102                 m_skipNextNewLine = false;
 103                 // FIXME: The spec indicates that the surrogate pair range as well as
 104                 // a number of specific character values are parse errors and should be replaced
 105                 // by the replacement character. We suspect this is a problem with the spec as doing
 106                 // that filtering breaks surrogate pair handling and causes us not to match Minefield.
 107                 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
 108                     if (m_tokenizer->shouldSkipNullCharacters()) {
 109                         source.advancePastNonNewline();
 110                         if (source.isEmpty())
 111                             return false;
 112                         goto PeekAgain;
 113                     }
 114                     m_nextInputCharacter = 0xFFFD;
 115                 }
 116             }
 117             return true;
 118         }
 119
 120         // Returns whether there are more characters in |source| after advancing.
 121         bool advance(SegmentedString& source, int& lineNumber)
 122         {
 123             source.advance(lineNumber);
 124             if (source.isEmpty())
 125                 return false;
 126             return peek(source, lineNumber);
 127         }
 128
 129         static const UChar endOfFileMarker = 0;
 130
 131     private:
 132         bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
 133         {
 134             return source.isClosed() && source.length() == 1;
 135         }
 136
 137         MarkupTokenizerBase<Token, State>* m_tokenizer;
 138
 139         // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
 140         UChar m_nextInputCharacter;
 141         bool m_skipNextNewLine;
 142     };
 143
 144     MarkupTokenizerBase() : m_inputStreamPreprocessor(this) { reset(); }
 145
 146     inline void bufferCharacter(UChar character)
 147     {
 148         ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
 149         m_token->ensureIsCharacterToken();
 150         m_token->appendToCharacter(character);
 151     }
 152
 153     inline void bufferCodePoint(unsigned);
 154
 155     // This method can get hidden in subclasses
 156     inline bool emitAndResumeIn(SegmentedString& source, typename State::State state)
 157     {
 158         m_state = state;
 159         source.advance(m_lineNumber);
 160         return true;
 161     }
 162
 163     // This method can get hidden in subclasses
 164     inline bool emitAndReconsumeIn(SegmentedString&, typename State::State state)
 165     {
 166         m_state = state;
 167         return true;
 168     }
 169
 170     inline bool emitEndOfFile(SegmentedString& source)
 171     {
 172         if (haveBufferedCharacterToken())
 173             return true;
 174         m_state = State::DataState;
 175         source.advance(m_lineNumber);
 176         m_token->clear();
 177         m_token->makeEndOfFile();
 178         return true;
 179     }
 180
 181     void reset()
 182     {
 183         m_state = State::DataState;
 184         m_token = 0;
 185         m_lineNumber = 0;
 186     }
 187
 188     inline bool haveBufferedCharacterToken()
 189     {
 190         return m_token->type() == Token::Type::Character;
 191     }
 192
 193     typename State::State m_state;
 194
 195     // m_token is owned by the caller. If nextToken is not on the stack,
 196     // this member might be pointing to unallocated memory.
 197     Token* m_token;
 198     int m_lineNumber;
 199
 200     bool m_skipLeadingNewLineForListing;
 201     bool m_forceNullCharacterReplacement;
 202
 203     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
 204     UChar m_additionalAllowedCharacter;
 205
 206     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
 207     InputStreamPreprocessor m_inputStreamPreprocessor;
 208 };
 209
 210 }
 211
 212 #endif // MarkupTokenizerBase_h