2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "HTMLEntityParser.h"
31 #include "CharacterReferenceParserInlineMethods.h"
32 #include "HTMLEntitySearch.h"
33 #include "HTMLEntityTable.h"
34 #include <wtf/text/StringBuilder.h>
42 static const UChar windowsLatin1ExtensionArray[32] = {
43 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
44 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
45 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
46 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
49 inline bool isAlphaNumeric(UChar cc)
51 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
// Helper class whose visible members are all static. consumeHTMLEntity() in
// this file passes it as the template argument to consumeCharacterReference<>.
// NOTE(review): the embedded line numbers in this listing skip values, so some
// lines (braces, some returns and assignments) are not shown. The comments
// below describe only the statements that are visible and hedge the rest.
54 class HTMLEntityParser {
// Remaps a code point in the range 0x80-0x9F to the matching entry of
// windowsLatin1ExtensionArray (index = value - 0x80). The mask test is true
// for every value outside that 32-entry range; presumably such values are
// returned unchanged -- TODO confirm.
56 inline static UChar adjustEntity(UChar32 value)
58 if ((value & ~0x1F) != 0x0080)
60 return windowsLatin1ExtensionArray[value - 0x80];
// Validates a numeric code point. The first condition singles out zero,
// values above 0x10FFFF, and the surrogate range 0xD800-0xDFFF; the value
// substituted for those is not shown here (presumably a replacement
// character -- TODO confirm). Other values are passed through adjustEntity()
// on at least one path.
63 inline static UChar32 legalEntityFor(UChar32 value)
65 // FIXME: A number of specific entity values generate parse errors.
66 if (!value || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
69 return adjustEntity(value);
// Appends `value` to decodedEntity as UTF-16 code units.
//   value         - code point to encode.
//   decodedEntity - builder that receives the code unit(s).
73 inline static bool convertToUTF16(UChar32 value, StringBuilder& decodedEntity)
// A BMP code point fits in a single UChar; the ASSERT checks that the
// narrowing cast lost nothing.
75 if (U_IS_BMP(value)) {
76 UChar character = static_cast<UChar>(value);
77 ASSERT(character == value);
78 decodedEntity.append(character);
// Lead surrogate followed by trail surrogate; presumably reached only for
// non-BMP values -- TODO confirm against the full source.
81 decodedEntity.append(U16_LEAD(value));
82 decodedEntity.append(U16_TRAIL(value));
// Always returns true.
86 inline static bool acceptMalformed() { return true; }
// Tries to match a named entity at the current position of `source`.
//   source                     - input; characters are advanced past while
//                                matching and handed to unconsumeCharacters()
//                                on the rejection paths.
//   decodedEntity              - receives the matched entity's value through
//                                convertToUTF16().
//   notEnoughCharacters        - set to source.isEmpty() once the scan loop
//                                has finished.
//   additionalAllowedCharacter - when non-zero, a match that does not end in
//                                ';' is rejected if the next character is
//                                alphanumeric or '='.
//   cc                         - in/out: the character currently being
//                                examined.
// On acceptance, returns the result of convertToUTF16(). The return
// statements on the rejection paths are not shown (presumably false).
88 inline static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
// Every character advanced past is recorded so it can be handed back later.
90 StringBuilder consumedCharacters;
91 HTMLEntitySearch entitySearch;
// Feed characters to the search for as long as input remains. cc is expected
// to hold the current character of `source` here; the check below tests
// whether the text seen so far is still a prefix of some entity name.
92 while (!source.isEmpty()) {
94 entitySearch.advance(cc);
95 if (!entitySearch.isEntityPrefix())
97 consumedCharacters.append(cc);
98 source.advanceAndASSERT(cc);
100 notEnoughCharacters = source.isEmpty();
101 if (notEnoughCharacters) {
102 // We can't commit to an entity yet because there might be a longer entity
103 // that we could match if we had more data.
104 unconsumeCharacters(source, consumedCharacters);
// Nothing matched at all: hand back everything that was consumed.
107 if (!entitySearch.mostRecentMatch()) {
108 ASSERT(!entitySearch.currentValue());
109 unconsumeCharacters(source, consumedCharacters);
112 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
113 // We've consumed too many characters. We need to walk the
114 // source back to the end of the most recent match.
// Done by un-consuming everything and then re-consuming exactly `length`
// characters, checking each one against the matched entity's name.
116 unconsumeCharacters(source, consumedCharacters);
117 consumedCharacters.clear();
118 const int length = entitySearch.mostRecentMatch()->length;
119 const UChar* reference = entitySearch.mostRecentMatch()->entity;
120 for (int i = 0; i < length; ++i) {
122 ASSERT_UNUSED(reference, cc == *reference++);
123 consumedCharacters.append(cc);
124 source.advanceAndASSERT(cc);
// notEnoughCharacters was false above, so input must remain after the match.
125 ASSERT(!source.isEmpty());
// Accept the match if it ends with ';', or no additional allowed character
// was supplied, or the character after the match is neither alphanumeric
// nor '='.
129 if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
130 || !additionalAllowedCharacter
131 || !(isAlphaNumeric(cc) || cc == '=')) {
132 return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
// Rejected: hand the consumed characters back to the source.
134 unconsumeCharacters(source, consumedCharacters);
142 bool consumeHTMLEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
144 return consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter);
// Looks up the entity named by the C string `name` and returns its value as a
// single UChar.
//   name - entity name; its characters are fed one at a time to
//          HTMLEntitySearch.
// NOTE(review): the embedded line numbers in this listing skip values, so the
// loop header and the early-return statements are not shown. Presumably 0 is
// returned when the name stops being an entity prefix -- TODO confirm.
147 UChar decodeNamedEntity(const char* name)
149 HTMLEntitySearch search;
// Advance the search by the next character of the name, then check that the
// text so far is still a prefix of some entity.
151 search.advance(*name++);
152 if (!search.isEntityPrefix())
156 UChar32 entityValue = search.currentValue();
// The return type can carry only one UTF-16 code unit, so a value that needs
// more than one is treated as a programming error.
157 if (U16_LENGTH(entityValue) != 1) {
158 // Callers need to move off this API if the entity table has values
159 // which do not fit in a 16 bit UChar!
160 ASSERT_NOT_REACHED();
163 return static_cast<UChar>(entityValue);
166 } // namespace WebCore