code.vuplus.com Git - vuplus_webkit/blob - Source/WebCore/platform/text/TextCodecUTF8.cpp

   1 /*
   2  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #include "config.h"
  27 #include "TextCodecUTF8.h"
  28
  29 #include "TextCodecASCIIFastPath.h"
  30 #include <wtf/text/CString.h>
  31 #include <wtf/text/StringBuffer.h>
  32 #include <wtf/unicode/CharacterNames.h>
  33
  34 using namespace WTF::Unicode;
  35 using namespace std;
  36
  37 namespace WebCore {
  38
  39 const int nonCharacter = -1;
  40
  41 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
  42 {
  43     return adoptPtr(new TextCodecUTF8);
  44 }
  45
  46 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
  47 {
  48     registrar("UTF-8", "UTF-8");
  49
  50     // Additional aliases that originally were present in the encoding
  51     // table in WebKit on Macintosh, and subsequently added by
  52     // TextCodecICU. Perhaps we can prove some are not used on the web
  53     // and remove them.
  54     registrar("unicode11utf8", "UTF-8");
  55     registrar("unicode20utf8", "UTF-8");
  56     registrar("utf8", "UTF-8");
  57     registrar("x-unicode20utf8", "UTF-8");
  58 }
  59
  60 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
  61 {
  62     registrar("UTF-8", create, 0);
  63 }
  64
  65 static inline int nonASCIISequenceLength(uint8_t firstByte)
  66 {
  67     static const uint8_t lengths[256] = {
  68         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  69         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  70         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  71         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  72         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  73         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  74         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  75         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  76         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  77         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  78         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  79         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  80         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  81         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  82         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  83         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  84     };
  85     return lengths[firstByte];
  86 }
  87
  88 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
  89 {
  90     ASSERT(!isASCII(sequence[0]));
  91     if (length == 2) {
  92         ASSERT(sequence[0] <= 0xDF);
  93         if (sequence[0] < 0xC2)
  94             return nonCharacter;
  95         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
  96             return nonCharacter;
  97         return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
  98     }
  99     if (length == 3) {
 100         ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
 101         switch (sequence[0]) {
 102         case 0xE0:
 103             if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
 104                 return nonCharacter;
 105             break;
 106         case 0xED:
 107             if (sequence[1] < 0x80 || sequence[1] > 0x9F)
 108                 return nonCharacter;
 109             break;
 110         default:
 111             if (sequence[1] < 0x80 || sequence[1] > 0xBF)
 112                 return nonCharacter;
 113         }
 114         if (sequence[2] < 0x80 || sequence[2] > 0xBF)
 115             return nonCharacter;
 116         return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
 117     }
 118     ASSERT(length == 4);
 119     ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
 120     switch (sequence[0]) {
 121     case 0xF0:
 122         if (sequence[1] < 0x90 || sequence[1] > 0xBF)
 123             return nonCharacter;
 124         break;
 125     case 0xF4:
 126         if (sequence[1] < 0x80 || sequence[1] > 0x8F)
 127             return nonCharacter;
 128         break;
 129     default:
 130         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
 131             return nonCharacter;
 132     }
 133     if (sequence[2] < 0x80 || sequence[2] > 0xBF)
 134         return nonCharacter;
 135     if (sequence[3] < 0x80 || sequence[3] > 0xBF)
 136         return nonCharacter;
 137     return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
 138 }
 139
 140 static inline UChar* appendCharacter(UChar* destination, int character)
 141 {
 142     ASSERT(character != nonCharacter);
 143     ASSERT(!U_IS_SURROGATE(character));
 144     if (U_IS_BMP(character))
 145         *destination++ = character;
 146     else {
 147         *destination++ = U16_LEAD(character);
 148         *destination++ = U16_TRAIL(character);
 149     }
 150     return destination;
 151 }
 152
 153 void TextCodecUTF8::consumePartialSequenceByte()
 154 {
 155     --m_partialSequenceSize;
 156     memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
 157 }
 158
 159 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
 160 {
 161     sawError = true;
 162     if (stopOnError)
 163         return;
 164     // Each error generates a replacement character and consumes one byte.
 165     *destination++ = replacementCharacter;
 166     consumePartialSequenceByte();
 167 }
 168
 169 void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
 170 {
 171     ASSERT(m_partialSequenceSize);
 172     do {
 173         if (isASCII(m_partialSequence[0])) {
 174             *destination++ = m_partialSequence[0];
 175             consumePartialSequenceByte();
 176             continue;
 177         }
 178         int count = nonASCIISequenceLength(m_partialSequence[0]);
 179         if (!count) {
 180             handleError(destination, stopOnError, sawError);
 181             if (stopOnError)
 182                 return;
 183             continue;
 184         }
 185         if (count > m_partialSequenceSize) {
 186             if (count - m_partialSequenceSize > end - source) {
 187                 if (!flush) {
 188                     // The new data is not enough to complete the sequence, so
 189                     // add it to the existing partial sequence.
 190                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
 191                     m_partialSequenceSize += end - source;
 192                     return;
 193                 }
 194                 // An incomplete partial sequence at the end is an error.
 195                 handleError(destination, stopOnError, sawError);
 196                 if (stopOnError)
 197                     return;
 198                 continue;
 199             }
 200             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
 201             source += count - m_partialSequenceSize;
 202             m_partialSequenceSize = count;
 203         }
 204         int character = decodeNonASCIISequence(m_partialSequence, count);
 205         if (character == nonCharacter) {
 206             handleError(destination, stopOnError, sawError);
 207             if (stopOnError)
 208                 return;
 209             continue;
 210         }
 211         m_partialSequenceSize -= count;
 212         destination = appendCharacter(destination, character);
 213     } while (m_partialSequenceSize);
 214 }
 215
 216 String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
 217 {
 218     // Each input byte might turn into a character.
 219     // That includes all bytes in the partial-sequence buffer because
 220     // each byte in an invalid sequence will turn into a replacement character.
 221     StringBuffer buffer(m_partialSequenceSize + length);
 222
 223     const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
 224     const uint8_t* end = source + length;
 225     const uint8_t* alignedEnd = alignToMachineWord(end);
 226     UChar* destination = buffer.characters();
 227
 228     do {
 229         if (m_partialSequenceSize) {
 230             // Explicitly copy destination and source pointers to avoid taking pointers to the
 231             // local variables, which may harm code generation by disabling some optimizations
 232             // in some compilers.
 233             UChar* destinationForHandlePartialSequence = destination;
 234             const uint8_t* sourceForHandlePartialSequence = source;
 235             handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
 236             destination = destinationForHandlePartialSequence;
 237             source = sourceForHandlePartialSequence;
 238             if (m_partialSequenceSize)
 239                 break;
 240         }
 241
 242         while (source < end) {
 243             if (isASCII(*source)) {
 244                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
 245                 if (isAlignedToMachineWord(source)) {
 246                     while (source < alignedEnd) {
 247                         MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
 248                         if (!isAllASCII(chunk))
 249                             break;
 250                         copyASCIIMachineWord(destination, source);
 251                         source += sizeof(MachineWord);
 252                         destination += sizeof(MachineWord);
 253                     }
 254                     if (source == end)
 255                         break;
 256                     if (!isASCII(*source))
 257                         continue;
 258                 }
 259                 *destination++ = *source++;
 260                 continue;
 261             }
 262             int count = nonASCIISequenceLength(*source);
 263             int character;
 264             if (!count)
 265                 character = nonCharacter;
 266             else {
 267                 if (count > end - source) {
 268                     ASSERT(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
 269                     ASSERT(!m_partialSequenceSize);
 270                     m_partialSequenceSize = end - source;
 271                     memcpy(m_partialSequence, source, m_partialSequenceSize);
 272                     source = end;
 273                     break;
 274                 }
 275                 character = decodeNonASCIISequence(source, count);
 276             }
 277             if (character == nonCharacter) {
 278                 sawError = true;
 279                 if (stopOnError)
 280                     break;
 281                 // Each error generates a replacement character and consumes one byte.
 282                 *destination++ = replacementCharacter;
 283                 ++source;
 284                 continue;
 285             }
 286             source += count;
 287             destination = appendCharacter(destination, character);
 288         }
 289     } while (flush && m_partialSequenceSize);
 290
 291     buffer.shrink(destination - buffer.characters());
 292
 293     return String::adopt(buffer);
 294 }
 295
 296 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
 297 {
 298     // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
 299     // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
 300     // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
 301     if (length > numeric_limits<size_t>::max() / 3)
 302         CRASH();
 303     Vector<uint8_t> bytes(length * 3);
 304
 305     size_t i = 0;
 306     size_t bytesWritten = 0;
 307     while (i < length) {
 308         UChar32 character;
 309         U16_NEXT(characters, i, length, character);
 310         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
 311     }
 312
 313     return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
 314 }
 315
 316 } // namespace WebCore