code.vuplus.com Git - vuplus_webkit/blob - Source/WebCore/platform/text/UnicodeRange.cpp

   1 /*
   2  * Copyright (C) 2007 Apple Computer, Inc.
   3  *
   4  * Portions are Copyright (C) 1998 Netscape Communications Corporation.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  19  *
  20  * Alternatively, the contents of this file may be used under the terms
  21  * of either the Mozilla Public License Version 1.1, found at
  22  * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
  23  * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
  24  * (the "GPL"), in which case the provisions of the MPL or the GPL are
  25  * applicable instead of those above.  If you wish to allow use of your
  26  * version of this file only under the terms of one of those two
  27  * licenses (the MPL or the GPL) and not to allow others to use your
  28  * version of this file under the LGPL, indicate your decision by
  29  * deletingthe provisions above and replace them with the notice and
  30  * other provisions required by the MPL or the GPL, as the case may be.
  31  * If you do not delete the provisions above, a recipient may use your
  32  * version of this file under any of the LGPL, the MPL or the GPL.
  33  */
  34
  35 #include "config.h"
  36 #include "UnicodeRange.h"
  37
  38 namespace WebCore {
  39
  40 // This table depends on unicode range definitions.
  41 // Each item's index must correspond to a unicode range value
  42 // eg. x-cyrillic = LangGroupTable[cRangeCyrillic]
  43 static const char* gUnicodeRangeToLangGroupTable[] =
  44 {
  45   "x-cyrillic",
  46   "el",
  47   "tr",
  48   "he",
  49   "ar",
  50   "x-baltic",
  51   "th",
  52   "ko",
  53   "ja",
  54   "zh-CN",
  55   "zh-TW",
  56   "x-devanagari",
  57   "x-tamil",
  58   "x-armn",
  59   "x-beng",
  60   "x-cans",
  61   "x-ethi",
  62   "x-geor",
  63   "x-gujr",
  64   "x-guru",
  65   "x-khmr",
  66   "x-mlym"
  67 };
  68
  69 /**********************************************************************
  70  * Unicode subranges as defined in unicode 3.0
  71  * x-western, x-central-euro, tr, x-baltic  -> latin
  72  *  0000 - 036f
  73  *  1e00 - 1eff
  74  *  2000 - 206f  (general punctuation)
  75  *  20a0 - 20cf  (currency symbols)
  76  *  2100 - 214f  (letterlike symbols)
  77  *  2150 - 218f  (Number Forms)
  78  * el         -> greek
  79  *  0370 - 03ff
  80  *  1f00 - 1fff
  81  * x-cyrillic -> cyrillic
  82  *  0400 - 04ff
  83  * he         -> hebrew
  84  *  0590 - 05ff
  85  * ar         -> arabic
  86  *  0600 - 06ff
  87  *  fb50 - fdff (arabic presentation forms)
  88  *  fe70 - feff (arabic presentation forms b)
  89  * th - thai
  90  *  0e00 - 0e7f
  91  * ko        -> korean
  92  *  ac00 - d7af  (hangul Syllables)
  93  *  1100 - 11ff    (jamo)
  94  *  3130 - 318f (hangul compatibility jamo)
  95  * ja
  96  *  3040 - 309f (hiragana)
  97  *  30a0 - 30ff (katakana)
  98  * zh-CN
  99  * zh-TW
 100  *
 101  * CJK
 102  *  3100 - 312f (bopomofo)
 103  *  31a0 - 31bf (bopomofo extended)
 104  *  3000 - 303f (CJK Symbols and Punctuation)
 105  *  2e80 - 2eff (CJK radicals supplement)
 106  *  2f00 - 2fdf (Kangxi Radicals)
 107  *  2ff0 - 2fff (Ideographic Description Characters)
 108  *  3190 - 319f (kanbun)
 109  *  3200 - 32ff (Enclosed CJK letters and Months)
 110  *  3300 - 33ff (CJK compatibility)
 111  *  3400 - 4dbf (CJK Unified Ideographs Extension A)
 112  *  4e00 - 9faf (CJK Unified Ideographs)
 113  *  f900 - fa5f (CJK Compatibility Ideographs)
 114  *  fe30 - fe4f (CJK compatibility Forms)
 115  *  ff00 - ffef (halfwidth and fullwidth forms)
 116  *
 117  * Armenian
 118  *  0530 - 058f
  119  * Syriac
 120  *  0700 - 074f
 121  * Thaana
 122  *  0780 - 07bf
 123  * Devanagari
 124  *  0900 - 097f
 125  * Bengali
 126  *  0980 - 09ff
 127  * Gurmukhi
 128  *  0a00 - 0a7f
 129  * Gujarati
 130  *  0a80 - 0aff
 131  * Oriya
 132  *  0b00 - 0b7f
 133  * Tamil
 134  *  0b80 - 0bff
 135  * Telugu
 136  *  0c00 - 0c7f
 137  * Kannada
 138  *  0c80 - 0cff
 139  * Malayalam
 140  *  0d00 - 0d7f
 141  * Sinhala
 142  *  0d80 - 0def
 143  * Lao
 144  *  0e80 - 0eff
 145  * Tibetan
 146  *  0f00 - 0fbf
 147  * Myanmar
 148  *  1000 - 109f
 149  * Georgian
 150  *  10a0 - 10ff
 151  * Ethiopic
 152  *  1200 - 137f
 153  * Cherokee
 154  *  13a0 - 13ff
 155  * Canadian Aboriginal Syllabics
 156  *  1400 - 167f
 157  * Ogham
 158  *  1680 - 169f
 159  * Runic
 160  *  16a0 - 16ff
 161  * Khmer
 162  *  1780 - 17ff
 163  * Mongolian
 164  *  1800 - 18af
 165  * Misc - superscripts and subscripts
 166  *  2070 - 209f
 167  * Misc - Combining Diacritical Marks for Symbols
 168  *  20d0 - 20ff
 169  * Misc - Arrows
 170  *  2190 - 21ff
 171  * Misc - Mathematical Operators
 172  *  2200 - 22ff
 173  * Misc - Miscellaneous Technical
 174  *  2300 - 23ff
 175  * Misc - Control picture
 176  *  2400 - 243f
 177  * Misc - Optical character recognition
 178  *  2440 - 2450
 179  * Misc - Enclose Alphanumerics
 180  *  2460 - 24ff
 181  * Misc - Box Drawing
 182  *  2500 - 257f
 183  * Misc - Block Elements
 184  *  2580 - 259f
 185  * Misc - Geometric Shapes
 186  *  25a0 - 25ff
 187  * Misc - Miscellaneous Symbols
 188  *  2600 - 267f
 189  * Misc - Dingbats
 190  *  2700 - 27bf
 191  * Misc - Braille Patterns
 192  *  2800 - 28ff
 193  * Yi Syllables
 194  *  a000 - a48f
 195  * Yi radicals
 196  *  a490 - a4cf
 197  * Alphabetic Presentation Forms
 198  *  fb00 - fb4f
 199  * Misc - Combining half Marks
 200  *  fe20 - fe2f
 201  * Misc - small form variants
 202  *  fe50 - fe6f
 203  * Misc - Specials
 204  *  fff0 - ffff
 205  *********************************************************************/
 206
 207 static const unsigned cNumSubTables = 9;
 208 static const unsigned cSubTableSize = 16;
 209
 210 static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] =
 211 {
 212   { // table for X---
 213     cRangeTableBase+1,  //u0xxx
 214     cRangeTableBase+2,  //u1xxx
 215     cRangeTableBase+3,  //u2xxx
 216     cRangeSetCJK,       //u3xxx
 217     cRangeSetCJK,       //u4xxx
 218     cRangeSetCJK,       //u5xxx
 219     cRangeSetCJK,       //u6xxx
 220     cRangeSetCJK,       //u7xxx
 221     cRangeSetCJK,       //u8xxx
 222     cRangeSetCJK,       //u9xxx
 223     cRangeTableBase+4,  //uaxxx
 224     cRangeKorean,       //ubxxx
 225     cRangeKorean,       //ucxxx
 226     cRangeTableBase+5,  //udxxx
 227     cRangePrivate,      //uexxx
 228     cRangeTableBase+6   //ufxxx
 229   },
 230   { //table for 0X--
 231     cRangeSetLatin,          //u00xx
 232     cRangeSetLatin,          //u01xx
 233     cRangeSetLatin,          //u02xx
 234     cRangeGreek,             //u03xx     XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks
 235     cRangeCyrillic,          //u04xx
 236     cRangeTableBase+7,       //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
 237     cRangeArabic,            //u06xx
 238     cRangeTertiaryTable,     //u07xx
 239     cRangeUnassigned,        //u08xx
 240     cRangeTertiaryTable,     //u09xx
 241     cRangeTertiaryTable,     //u0axx
 242     cRangeTertiaryTable,     //u0bxx
 243     cRangeTertiaryTable,     //u0cxx
 244     cRangeTertiaryTable,     //u0dxx
 245     cRangeTertiaryTable,     //u0exx
 246     cRangeTibetan,           //u0fxx
 247   },
 248   { //table for 1x--
 249     cRangeTertiaryTable,     //u10xx
 250     cRangeKorean,            //u11xx
 251     cRangeEthiopic,          //u12xx
 252     cRangeTertiaryTable,     //u13xx
 253     cRangeCanadian,          //u14xx
 254     cRangeCanadian,          //u15xx
 255     cRangeTertiaryTable,     //u16xx
 256     cRangeKhmer,             //u17xx
 257     cRangeMongolian,         //u18xx
 258     cRangeUnassigned,        //u19xx
 259     cRangeUnassigned,        //u1axx
 260     cRangeUnassigned,        //u1bxx
 261     cRangeUnassigned,        //u1cxx
 262     cRangeUnassigned,        //u1dxx
 263     cRangeSetLatin,          //u1exx
 264     cRangeGreek,             //u1fxx
 265   },
 266   { //table for 2x--
 267     cRangeSetLatin,          //u20xx
 268     cRangeSetLatin,          //u21xx
 269     cRangeMathOperators,     //u22xx
 270     cRangeMiscTechnical,     //u23xx
 271     cRangeControlOpticalEnclose, //u24xx
 272     cRangeBoxBlockGeometrics, //u25xx
 273     cRangeMiscSymbols,       //u26xx
 274     cRangeDingbats,          //u27xx
 275     cRangeBraillePattern,    //u28xx
 276     cRangeUnassigned,        //u29xx
 277     cRangeUnassigned,        //u2axx
 278     cRangeUnassigned,        //u2bxx
 279     cRangeUnassigned,        //u2cxx
 280     cRangeUnassigned,        //u2dxx
 281     cRangeSetCJK,            //u2exx
 282     cRangeSetCJK,            //u2fxx
 283   },
 284   {  //table for ax--
 285     cRangeYi,                //ua0xx
 286     cRangeYi,                //ua1xx
 287     cRangeYi,                //ua2xx
 288     cRangeYi,                //ua3xx
 289     cRangeYi,                //ua4xx
 290     cRangeUnassigned,        //ua5xx
 291     cRangeUnassigned,        //ua6xx
 292     cRangeUnassigned,        //ua7xx
 293     cRangeUnassigned,        //ua8xx
 294     cRangeUnassigned,        //ua9xx
 295     cRangeUnassigned,        //uaaxx
 296     cRangeUnassigned,        //uabxx
 297     cRangeKorean,            //uacxx
 298     cRangeKorean,            //uadxx
 299     cRangeKorean,            //uaexx
 300     cRangeKorean,            //uafxx
 301   },
 302   {  //table for dx--
 303     cRangeKorean,            //ud0xx
 304     cRangeKorean,            //ud1xx
 305     cRangeKorean,            //ud2xx
 306     cRangeKorean,            //ud3xx
 307     cRangeKorean,            //ud4xx
 308     cRangeKorean,            //ud5xx
 309     cRangeKorean,            //ud6xx
 310     cRangeKorean,            //ud7xx
 311     cRangeSurrogate,         //ud8xx
 312     cRangeSurrogate,         //ud9xx
 313     cRangeSurrogate,         //udaxx
 314     cRangeSurrogate,         //udbxx
 315     cRangeSurrogate,         //udcxx
 316     cRangeSurrogate,         //uddxx
 317     cRangeSurrogate,         //udexx
 318     cRangeSurrogate,         //udfxx
 319   },
 320   { // table for fx--
 321     cRangePrivate,           //uf0xx
 322     cRangePrivate,           //uf1xx
 323     cRangePrivate,           //uf2xx
 324     cRangePrivate,           //uf3xx
 325     cRangePrivate,           //uf4xx
 326     cRangePrivate,           //uf5xx
 327     cRangePrivate,           //uf6xx
 328     cRangePrivate,           //uf7xx
 329     cRangePrivate,           //uf8xx
 330     cRangeSetCJK,            //uf9xx
 331     cRangeSetCJK,            //ufaxx
 332     cRangeArabic,            //ufbxx, includes alphabic presentation form
 333     cRangeArabic,            //ufcxx
 334     cRangeArabic,            //ufdxx
 335     cRangeArabic,            //ufexx, includes Combining half marks,
 336                              //                CJK compatibility forms,
 337                              //                CJK compatibility forms,
 338                              //                small form variants
 339     cRangeTableBase+8,       //uffxx, halfwidth and fullwidth forms, includes Specials
 340   },
 341   { //table for 0x0500 - 0x05ff
 342     cRangeCyrillic,          //u050x
 343     cRangeCyrillic,          //u051x
 344     cRangeCyrillic,          //u052x
 345     cRangeArmenian,          //u053x
 346     cRangeArmenian,          //u054x
 347     cRangeArmenian,          //u055x
 348     cRangeArmenian,          //u056x
 349     cRangeArmenian,          //u057x
 350     cRangeArmenian,          //u058x
 351     cRangeHebrew,            //u059x
 352     cRangeHebrew,            //u05ax
 353     cRangeHebrew,            //u05bx
 354     cRangeHebrew,            //u05cx
 355     cRangeHebrew,            //u05dx
 356     cRangeHebrew,            //u05ex
 357     cRangeHebrew,            //u05fx
 358   },
 359   { //table for 0xff00 - 0xffff
 360     cRangeSetCJK,            //uff0x, fullwidth latin
 361     cRangeSetCJK,            //uff1x, fullwidth latin
 362     cRangeSetCJK,            //uff2x, fullwidth latin
 363     cRangeSetCJK,            //uff3x, fullwidth latin
 364     cRangeSetCJK,            //uff4x, fullwidth latin
 365     cRangeSetCJK,            //uff5x, fullwidth latin
 366     cRangeSetCJK,            //uff6x, halfwidth katakana
 367     cRangeSetCJK,            //uff7x, halfwidth katakana
 368     cRangeSetCJK,            //uff8x, halfwidth katakana
 369     cRangeSetCJK,            //uff9x, halfwidth katakana
 370     cRangeSetCJK,            //uffax, halfwidth hangul jamo
 371     cRangeSetCJK,            //uffbx, halfwidth hangul jamo
 372     cRangeSetCJK,            //uffcx, halfwidth hangul jamo
 373     cRangeSetCJK,            //uffdx, halfwidth hangul jamo
 374     cRangeSetCJK,            //uffex, fullwidth symbols
 375     cRangeSpecials,          //ufffx, Specials
 376   },
 377 };
 378
 379 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
 380 // code points so that the number of entries in the tertiary range
 381 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
 382 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
 383 // syllabaries take multiple chunks and Ogham and Runic share a single chunk.
 384 static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80);
 385
 386 static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] =
 387 { //table for 0x0700 - 0x1600
 388     cRangeSyriac,            //u070x
 389     cRangeThaana,            //u078x
 390     cRangeUnassigned,        //u080x  place holder(resolved in the 2ndary tab.)
 391     cRangeUnassigned,        //u088x  place holder(resolved in the 2ndary tab.)
 392     cRangeDevanagari,        //u090x
 393     cRangeBengali,           //u098x
 394     cRangeGurmukhi,          //u0a0x
 395     cRangeGujarati,          //u0a8x
 396     cRangeOriya,             //u0b0x
 397     cRangeTamil,             //u0b8x
 398     cRangeTelugu,            //u0c0x
 399     cRangeKannada,           //u0c8x
 400     cRangeMalayalam,         //u0d0x
 401     cRangeSinhala,           //u0d8x
 402     cRangeThai,              //u0e0x
 403     cRangeLao,               //u0e8x
 404     cRangeTibetan,           //u0f0x  place holder(resolved in the 2ndary tab.)
 405     cRangeTibetan,           //u0f8x  place holder(resolved in the 2ndary tab.)
 406     cRangeMyanmar,           //u100x
 407     cRangeGeorgian,          //u108x
 408     cRangeKorean,            //u110x  place holder(resolved in the 2ndary tab.)
 409     cRangeKorean,            //u118x  place holder(resolved in the 2ndary tab.)
 410     cRangeEthiopic,          //u120x  place holder(resolved in the 2ndary tab.)
 411     cRangeEthiopic,          //u128x  place holder(resolved in the 2ndary tab.)
 412     cRangeEthiopic,          //u130x
 413     cRangeCherokee,          //u138x
 414     cRangeCanadian,          //u140x  place holder(resolved in the 2ndary tab.)
 415     cRangeCanadian,          //u148x  place holder(resolved in the 2ndary tab.)
 416     cRangeCanadian,          //u150x  place holder(resolved in the 2ndary tab.)
 417     cRangeCanadian,          //u158x  place holder(resolved in the 2ndary tab.)
 418     cRangeCanadian,          //u160x
 419     cRangeOghamRunic,        //u168x  this contains two scripts, Ogham & Runic
 420 };
 421
 422 // A two level index is almost enough for locating a range, with the
 423 // exception of u03xx and u05xx. Since we don't really care about range for
 424 // combining diacritical marks in our font application, they are
 425 // not discriminated further.  Future adoption of this method for other use
 426 // should be aware of this limitation. The implementation can be extended if
 427 // there is such a need.
 428 // For Indic, Southeast Asian scripts and some other scripts between
 429 // U+0700 and U+16FF, it's extended to the third level.
 430 unsigned int findCharUnicodeRange(UChar32 ch)
 431 {
 432     if (ch >= 0xFFFF)
 433         return 0;
 434
 435     unsigned int range;
 436
 437     //search the first table
 438     range = gUnicodeSubrangeTable[0][ch >> 12];
 439
 440     if (range < cRangeTableBase)
 441         // we try to get a specific range
 442         return range;
 443
 444     // otherwise, we have one more table to look at
 445     range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8];
 446     if (range < cRangeTableBase)
 447         return range;
 448     if (range < cRangeTertiaryTable)
 449         return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4];
 450
 451     // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
 452     return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
 453 }
 454
 455 const char* langGroupFromUnicodeRange(unsigned char unicodeRange)
 456 {
 457     if (cRangeSpecificItemNum > unicodeRange)
 458         return gUnicodeRangeToLangGroupTable[unicodeRange];
 459     return 0;
 460 }
 461
 462 }