The Ecere SDK - sdk/blob - ecere/src/sys/unicode.ec

   1 namespace sys;
   2
   3 import "File"
   4
   5 public int UTF8toISO8859_1(char * source, char * dest, int max)
   6 {
   7    unichar ch;
   8    int nb;
   9    int c;
  10    int d = 0;
  11    for(c = 0; (ch = UTF8GetChar(source + c, &nb)) && d < max-1; c += nb)
  12    {
  13       if(ch < 256)
  14          dest[d] = (byte)ch;
  15       else
  16          dest[d] = '?';
  17       d++;
  18    }
  19    dest[d] = 0;
  20    return d;
  21 }
  22
  23 public uint16 * UTF8toUTF16Len(char * source, int byteCount, int * wordCount)
  24 {
  25    if(source)
  26    {
  27       uint16 * dest = new uint16[byteCount + 1];
  28       int c;
  29       int d = 0;
  30       bool error = false;
  31       for(c = 0; c<byteCount && source[c];)
  32       {
  33          byte ch = source[c];
  34          unichar codePoint = 0;
  35          int numBytes = 1;
  36          int i;
  37          byte mask = 0x7F;
  38          if(ch & 0x80 && ch & 0x40)
  39          {
  40             numBytes++;
  41             mask >>= 2;
  42             if(ch & 0x20)
  43             {
  44                numBytes++;
  45                mask >>= 1;
  46                if(ch & 0x10)
  47                {
  48                   if(ch & 0x08)
  49                      error = true;
  50                   else
  51                   {
  52                      numBytes++;
  53                      mask >>= 1;
  54                   }
  55                }
  56             }
  57          }
  58          for(i = 0; i<numBytes; i++, c++)
  59          {
  60             if(c >= byteCount || !(ch = source[c]))
  61             {
  62                error = true;
  63                break;
  64             }
  65             codePoint <<= 6;
  66             codePoint |= ch & mask;
  67             mask = 0x3F;
  68
  69             if(i > 1)
  70             {
  71                if(!(ch & 0x80) || (ch & 0x40))
  72                {
  73                   error = true;
  74                   break;
  75                }
  76             }
  77          }
  78          if(i < numBytes)
  79             error = true;
  80          if(codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF) ||
  81            (codePoint < 0x80 && numBytes > 1) ||
  82            (codePoint < 0x800 && numBytes > 2) ||
  83            (codePoint < 0x10000 && numBytes > 3))
  84             error = true;
  85          if(error)
  86          {
  87             c = byteCount;
  88             d = 0;
  89             break;
  90          }
  91
  92          if(codePoint > 0xFFFF)
  93          {
  94             uint16 lead = (uint16)(LEAD_OFFSET + (codePoint >> 10));
  95             uint16 trail = 0xDC00 + (uint16)(codePoint & 0x3FF);
  96
  97             dest[d++] = lead;
  98             dest[d++] = trail;
  99          }
 100          else
 101          {
 102             dest[d++] = (uint16)codePoint;
 103          }
 104       }
 105       *wordCount = d + byteCount - c;
 106       dest[d] = 0;
 107       return dest;
 108    }
 109    *wordCount = 0;
 110    return null;
 111 }
 112
 113 public int UTF8toUTF16BufferLen(char * source, uint16 * dest, int max, int len)
 114 {
 115    if(source)
 116    {
 117       int c;
 118       int d = 0;
 119       for(c = 0; c < len && source[c];)
 120       {
 121          byte ch = source[c];
 122          unichar codePoint = 0;
 123          int numBytes = 1;
 124          int i;
 125          byte mask = 0x7F;
 126          if(ch & 0x80 && ch & 0x40)
 127          {
 128             mask >>= 2;
 129             numBytes++;
 130             if(ch & 0x20)
 131             {
 132                numBytes++;
 133                mask >>= 1;
 134                if(ch & 0x10)
 135                {
 136                   numBytes++;
 137                   mask >>= 1;
 138                }
 139             }
 140          }
 141          for(i = 0; i<numBytes; i++)
 142          {
 143             codePoint <<= 6;
 144             codePoint |= source[c++] & mask;
 145             mask = 0x3F;
 146          }
 147
 148          if(codePoint > 0xFFFF)
 149          {
 150             uint16 lead = (uint16)(LEAD_OFFSET + (codePoint >> 10));
 151             uint16 trail = 0xDC00 + (uint16)(codePoint & 0x3FF);
 152             if(d >= max - 1) break;
 153             dest[d++] = lead;
 154             dest[d++] = trail;
 155          }
 156          else
 157          {
 158             if(d >= max) break;
 159             dest[d++] = (uint16)codePoint;
 160          }
 161       }
 162       dest[d] = 0;
 163       return d;
 164    }
 165    return 0;
 166 }
 167
 168 public int UTF16BEtoUTF8Buffer(uint16 * source, byte * dest, int max)
 169 {
 170    int c;
 171    int d = 0;
 172    uint16 u16;
 173    for(c = 0; (u16 = ((source[c] & 0xFF00) >> 8) | ((source[c] & 0x00FF) << 8)); c++)
 174    {
 175       unichar ch;
 176       if(u16 < 0xD800 || u16 > 0xDBFF)
 177       {
 178          // TOFIX: PRECOMP ERROR IF NO BRACKETS
 179          ch = (unichar)u16;
 180       }
 181       else
 182       {
 183          // TOFIX: PRECOMP ERROR IF NO BRACKETS
 184          ch = ((unichar)u16 << 10) + source[c++] + SURROGATE_OFFSET;
 185       }
 186
 187       if(ch < 0x80)
 188       {
 189          if(d + 1 >= max) break;
 190          dest[d++] = (char)ch;
 191       }
 192       else if(ch < 0x800)
 193       {
 194          if(d + 2 >= max) break;
 195          dest[d++] = (byte)(0xC0 | ((ch & 0x7C0) >> 6));
 196          dest[d++] = (byte)(0x80 | (ch & 0x03F));
 197       }
 198       else if(ch < 0x10000)
 199       {
 200          if(d + 3 >= max) break;
 201          dest[d++] = (byte)(0xE0 | ((ch & 0xF000) >> 12));
 202          dest[d++] = (byte)(0x80 | ((ch & 0xFC0) >> 6));
 203          dest[d++] = (byte)(0x80 | (ch & 0x03F));
 204       }
 205       else
 206       {
 207          if(d + 4 >= max) break;
 208          dest[d++] = (byte)(0xF0 | ((ch & 0x1C0000) >> 18));
 209          dest[d++] = (byte)(0x80 | ((ch & 0x3F000) >> 12));
 210          dest[d++] = (byte)(0x80 | ((ch & 0xFC0) >> 6));
 211          dest[d++] = (byte)(0x80 | (ch & 0x03F));
 212       }
 213    }
 214    dest[d] = 0;
 215    return d;
 216 }
 217
 218 enum LineBreakClass
 219 {
 220    OP, CL, QU, GL, NS, EX, SY, IS, PR, PO,
 221    NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM,
 222    WJ, H2, H3, JL, JV, JT, SA, SG, SP, CR,
 223    LF, BK
 224 };
 225
 226 public enum CharCategory
 227 {
 228    none = 0,
 229
 230    Mn = 1, markNonSpacing       = 1,
 231    Mc = 2, markSpacing          = 2,
 232    Me = 3, markEnclosing        = 3,
 233
 234    Nd = 4, numberDecimalDigit   = 4,
 235    Nl = 5, numberLetter         = 5,
 236    No = 6, numberOther          = 6,
 237
 238    Zs = 7, separatorSpace       = 7,
 239    Zl = 8, separatorLine        = 8,
 240    Zp = 9, separatorParagraph   = 9,
 241
 242    Cc = 10, otherControl         = 10,
 243    Cf = 11, otherFormat          = 11,
 244    Cs = 12, otherSurrogate       = 12,
 245    Co = 13, otherPrivateUse      = 13,
 246    Cn = 14, otherNotAssigned     = 14,
 247
 248    Lu = 15, letterUpperCase      = 15,
 249    Ll = 16, letterLowerCase      = 16,
 250    Lt = 17, letterTitleCase      = 17,
 251    Lm = 18, letterModifier       = 18,
 252    Lo = 19, letterOther          = 19,
 253
 254    Pc = 20, punctuationConnector = 20,
 255    Pd = 21, punctuationDash      = 21,
 256    Ps = 22, punctuationOpen      = 22,
 257    Pe = 23, punctuationClose     = 23,
 258    Pi = 24, punctuationInitial   = 24,
 259    Pf = 25, punctuationFinal     = 25,
 260    Po = 26, punctuationOther     = 26,
 261
 262    Sm = 27, symbolMath           = 27,
 263    Sc = 28, symbolCurrency       = 28,
 264    Sk = 29, symbolModifier       = 29,
 265    So = 30, symbolOther          = 30
 266 };
 267
 268 public class CharCategories : uint
 269 {
 270 public:
 271    bool none:1;
 272    bool markNonSpacing:1, markSpacing:1, markEnclosing:1;
 273    bool numberDecimalDigit:1, numberLetter:1, numberOther:1;
 274    bool separatorSpace:1,separatorLine:1,separatorParagraph:1;
 275    bool otherControl:1,otherFormat:1,otherSurrogate:1,otherPrivateUse:1,otherNotAssigned:1;
 276    bool letterUpperCase:1, letterLowerCase:1, letterTitleCase:1, letterModifier:1, letterOther:1;
 277    bool punctuationConnector:1, punctuationDash:1, punctuationOpen:1, punctuationClose:1, punctuationInitial:1, punctuationFinal:1, punctuationOther:1;
 278    bool symbolMath:1, symbolCurrency:1, symbolModifier:1, symbolOther:1;
 279 };
 280
 281 public enum PredefinedCharCategories : CharCategories
 282 {
 283    none = CharCategories { none = true },
 284    marks = CharCategories { markNonSpacing = true, markSpacing = true, markEnclosing = true },
 285    numbers = CharCategories { numberDecimalDigit = true, numberLetter = true, numberOther = true },
 286    separators = CharCategories { separatorSpace = true, separatorLine = true, separatorParagraph = true },
 287    others = CharCategories { otherControl = true, otherFormat = true, otherSurrogate = true, otherPrivateUse = true, otherNotAssigned = true },
 288    letters = CharCategories { letterUpperCase = true, letterLowerCase = true, letterTitleCase = true, letterModifier = true, letterOther = true },
 289    punctuation = CharCategories { punctiationConnector = true, punctuationDash = true, punctuationOpen = true, punctuationClose = true, punctuationInitial = true,
 290                      punctuationFinal = true, punctuationOther = true },
 291    symbols = CharCategories { symbolMath = true, symbolCurrency = true, symbolModifier = true, symbolOther = true },
 292    connector = CharCategories { punctuationConnector = true },
 293 };
 294
 295 public bool GetAlNum(char ** input, char * string, int max)
 296 {
 297    int c = 0;
 298    unichar ch;
 299    int nb = 1;
 300    bool result = true;
 301    char * buffer = *input;
 302    if(!buffer[0]) { string[0]=0; return false; }
 303
 304    // Eat all left spacing, leave last char in ch
 305    for(;;)
 306    {
 307 #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP)
 308       ch = *buffer;
 309 #else
 310       ch = UTF8GetChar(buffer, &nb);
 311 #endif
 312       if(!ch) { result = false; break; }
 313
 314 #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP)
 315       if(isalnum(ch))
 316 #else
 317       if(CharMatchCategories(ch, numbers|letters))
 318 #endif
 319          break;
 320       else
 321          buffer += nb;
 322    }
 323    if(result)
 324    {
 325       while(c < max-1)
 326       {
 327          int i;
 328          for(i = 0; i < nb && c < max-1; i++)
 329             string[c++] = *(buffer++);
 330
 331 #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP)
 332          ch = *buffer;
 333 #else
 334          ch = UTF8GetChar(buffer, &nb);
 335 #endif
 336          if(!ch) break; // End of input string
 337
 338 #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP)
 339          if(!isalnum(ch))
 340 #else
 341          if(!CharMatchCategories(ch, numbers|letters))
 342 #endif
 343             // End of this alpha numeric word
 344             break;
 345       }
 346       string[c]=0;
 347    }
 348    *input = buffer;
 349    return result;
 350 }
 351
 352 static struct Range
 353 {
 354    uint start, end;
 355    CharCategory category;
 356 };
 357
 358 static int CompareRange(BinaryTree tree, Range a, Range b)
 359 {
 360    if(a.start > b.end)
 361       return 1;
 362    else if(a.end < b.start)
 363       return -1;
 364    else
 365       return 0;
 366 }
 367
// Releases one Range key; installed as the FreeKey callback of the category tree.
static void FreeRange(Range range)
{
   delete range;
}
 372
 373 static CharCategory asciiCategories[] =
 374 {
 375    Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc,
 376    Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc,
 377    Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc,
 378    Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc,
 379    Zs, Po, Po, Po, Sc, Po, Po, Po,
 380    Ps, Pe, Po, Sm, Cs, Pd, Po, Po,
 381    Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd,
 382    Nd, Nd, Po, Po, Sm, Sm, Sm, Po,
 383    Po, Lu, Lu, Lu, Lu, Lu, Lu, Lu,
 384    Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu,
 385    Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu,
 386    Lu, Lu, Lu, Ps, Po, Pe, Sk, Pc,
 387    Sk, Ll, Ll, Ll, Ll, Ll, Ll, Ll,
 388    Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll,
 389    Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll,
 390    Ll, Ll, Ll, Ps, Sm, Pe, Sm, Cc
 391 };
 392
 393 static class UnicodeDatabase
 394 {
 395    BinaryTree categories
 396    {
 397       CompareKey = (void *)CompareRange;
 398       FreeKey = (void *)FreeRange;
 399    };
 400
 401    UnicodeDatabase()
 402    {
 403       File f = FileOpen("<:ecere>unicode/derivedGeneralCategoryStripped.txt", read);
 404       if(f)
 405       {
 406          char line[1024];
 407          while(f.GetLine(line, 1024))
 408          {
 409             if(line[0] && line[0] != '#')
 410             {
 411                char * endPtr;
 412                uint start = strtoul(line, &endPtr, 16);
 413                if(endPtr)
 414                {
 415                   uint end = (endPtr && *endPtr == '.') ? strtoul(endPtr + 2, &endPtr, 16) : start;
 416                   if(endPtr)
 417                   {
 418                      endPtr = strchr(endPtr, ';');
 419                      if(endPtr)
 420                      {
 421                         CharCategory category = none;
 422                         endPtr += 2;
 423                         switch(*endPtr)
 424                         {
 425                            case 'C':
 426                               switch(endPtr[1])
 427                               {
 428                                  case 'n': category = Cn; break;
 429                                  case 'c': category = Cc; break;
 430                                  case 'f': category = Cf; break;
 431                                  case 'o': category = Co; break;
 432                                  case 's': category = Cs; break;
 433                               }
 434                               break;
 435                            case 'L':
 436                               switch(endPtr[1])
 437                               {
 438                                  case 'u': category = Lu; break;
 439                                  case 'l': category = Ll; break;
 440                                  case 't': category = Lt; break;
 441                                  case 'm': category = Lm; break;
 442                                  case 'o': category = Lo; break;
 443                               }
 444                               break;
 445                            case 'M':
 446                               switch(endPtr[1])
 447                               {
 448                                  case 'n': category = Mn; break;
 449                                  case 'e': category = Me; break;
 450                                  case 'c': category = Mc; break;
 451                               }
 452                               break;
 453                            case 'Z':
 454                               switch(endPtr[1])
 455                               {
 456                                  case 's': category = Zs; break;
 457                                  case 'l': category = Zl; break;
 458                                  case 'p': category = Zp; break;
 459                               }
 460                               break;
 461                            case 'P':
 462                               switch(endPtr[1])
 463                               {
 464                                  case 'd': category = Pd; break;
 465                                  case 's': category = Ps; break;
 466                                  case 'e': category = Pe; break;
 467                                  case 'c': category = Pc; break;
 468                                  case 'o': category = Po; break;
 469                                  case 'i': category = Pi; break;
 470                                  case 'f': category = Pf; break;
 471                               }
 472                               break;
 473                            case 'S':
 474                               switch(endPtr[1])
 475                               {
 476                                  case 'm': category = Sm; break;
 477                                  case 'c': category = Sc; break;
 478                                  case 'k': category = Sk; break;
 479                                  case 'o': category = So; break;
 480                               }
 481                               break;
 482                            case 'N':
 483                               switch(endPtr[1])
 484                               {
 485                                  case 'd': category = Nd; break;
 486                                  case 'l': category = Nl; break;
 487                                  case 'o': category = No; break;
 488                               }
 489                               break;
 490                         }
 491                         if(category)
 492                         {
 493                            Range range { start, end, category };
 494                            BTNode node { key = (uintptr) &range };
 495                            if(categories.Add(node))
 496                            {
 497                               node.key = (uintptr)new Range[1];
 498                               *(Range *)node.key = range;
 499                            }
 500                            else
 501                               delete node;
 502                         }
 503                      }
 504                   }
 505                }
 506             }
 507          }
 508          delete f;
 509          /*
 510          f = FileOpen("DerivedGeneralCategoryStripped.txt", write);
 511          if(f)
 512          {
 513             BTNode node;
 514             for(node = categories.first; node; node = node.next)
 515             {
 516                Range * range = (Range *)node.key;
 517                char string[64];
 518                int len, c;
 519
 520                if(range->end > range->start)
 521                   sprintf(string, "%04x..%04x", range->start, range->end);
 522                else
 523                   sprintf(string, "%04x", range->start);
 524                len = strlen(string);
 525                for(c = len; c<14; c++)
 526                   string[len++] = ' ';
 527                string[len++] = ';';
 528                string[len++] = ' ';
 529                range->category.OnGetString(string + len, null, null);
 530                len += 2;
 531                string[len++] = '\n';
 532                string[len] = '\0';
 533                f.Puts(string);
 534             }
 535             delete f;
 536          }
 537          */
 538       }
 539    }
 540    ~UnicodeDatabase()
 541    {
 542       categories.Free();
 543    }
 544 };
 545
// File-scope instance whose category tree GetCharCategory() searches
static UnicodeDatabase dataBase { };
 547
 548 public CharCategory GetCharCategory(unichar ch)
 549 {
 550    if(ch < 128)
 551       return asciiCategories[ch];
 552    else
 553    {
 554       CharCategory category = none;
 555       Range range { ch, ch };
 556       BTNode node = dataBase.categories.Find((uintptr) &range);
 557       if(node)
 558          category = ((Range *)node.key)->category;
 559       return category;
 560    }
 561 }
 562
 563
 564 public bool CharMatchCategories(unichar ch, CharCategories categories)
 565 {
 566    bool result = false;
 567    CharCategory category = GetCharCategory(ch);
 568    switch(category)
 569    {
 570       case none:                 result = categories.none;                 break;
 571       case markNonSpacing:       result = categories.markNonSpacing;       break;
 572       case markSpacing:          result = categories.markSpacing;          break;
 573       case markEnclosing:        result = categories.markEnclosing;        break;
 574
 575       case numberDecimalDigit:   result = categories.numberDecimalDigit;   break;
 576       case numberLetter:         result = categories.numberLetter;         break;
 577       case numberOther:          result = categories.numberOther;          break;
 578
 579       case separatorSpace:       result = categories.separatorSpace;       break;
 580       case separatorLine:        result = categories.separatorLine;        break;
 581       case separatorParagraph:   result = categories.separatorParagraph;   break;
 582
 583       case otherControl:         result = categories.otherControl;         break;
 584       case otherFormat:          result = categories.otherFormat;          break;
 585       case otherSurrogate:       result = categories.otherSurrogate;       break;
 586       case otherPrivateUse:      result = categories.otherPrivateUse;      break;
 587       case otherNotAssigned:     result = categories.otherNotAssigned;     break;
 588
 589       case letterUpperCase:      result = categories.letterUpperCase;      break;
 590       case letterLowerCase:      result = categories.letterLowerCase;      break;
 591       case letterTitleCase:      result = categories.letterTitleCase;      break;
 592       case letterModifier:       result = categories.letterModifier;       break;
 593       case letterOther:          result = categories.letterOther;          break;
 594
 595       case punctuationConnector: result = categories.punctuationConnector; break;
 596       case punctuationDash:      result = categories.punctuationDash;      break;
 597       case punctuationOpen:      result = categories.punctuationOpen;      break;
 598       case punctuationClose:     result = categories.punctuationClose;     break;
 599       case punctuationInitial:   result = categories.punctuationInitial;   break;
 600       case punctuationFinal:     result = categories.punctuationFinal;     break;
 601       case punctuationOther:     result = categories.punctuationOther;     break;
 602
 603       case symbolMath:           result = categories.symbolMath;           break;
 604       case symbolCurrency:       result = categories.symbolCurrency;       break;
 605       case symbolModifier:       result = categories.symbolModifier;       break;
 606       case symbolOther:          result = categories.symbolOther;          break;
 607    }
 608    return result;
 609 }