// Converts the UTF-8 text in `source` into `dest`, which holds at most `max` bytes.
// Only the signature and the main loop header are visible here; the per-character
// conversion and the return value are not shown.
5 public int UTF8toISO8859_1(char * source, char * dest, int max)
// Decode one character per pass with UTF8GetChar (which fills `nb`; the loop then
// advances `c` by `nb`). Stops on a zero character or when `d` reaches max-1.
11 for(c = 0; (ch = UTF8GetChar(source + c, &nb)) && d < max-1; c += nb)
// Converts up to `byteCount` bytes of UTF-8 from `source` into a newly allocated
// UTF-16 buffer and reports a unit count through `wordCount`.
// Parts of the body (error branches, terminator, return) are not visible here.
23 public uint16 * UTF8toUTF16Len(char * source, int byteCount, int * wordCount)
// One UTF-16 unit per input byte plus one extra slot.
27 uint16 * dest = new uint16[byteCount + 1];
// Walk the input until `byteCount` bytes are consumed or a NUL byte is met.
31 for(c = 0; c<byteCount && source[c];)
34 unichar codePoint = 0;
// Both top bits set: lead byte of a multi-byte UTF-8 sequence.
38 if(ch & 0x80 && ch & 0x40)
// Consume the `numBytes` bytes of the current sequence.
58 for(i = 0; i<numBytes; i++, c++)
// Sequence runs past the byte budget or hits a NUL byte.
60 if(c >= byteCount || !(ch = source[c]))
// Merge the payload bits selected by `mask` into the code point.
66 codePoint |= ch & mask;
// True when `ch` is NOT of the 10xxxxxx continuation-byte form.
71 if(!(ch & 0x80) || (ch & 0x40))
// Detects values above U+10FFFF, values in the surrogate range D800..DFFF, and
// overlong encodings (a value that would have fit in fewer bytes than were used).
80 if(codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF) ||
81 (codePoint < 0x80 && numBytes > 1) ||
82 (codePoint < 0x800 && numBytes > 2) ||
83 (codePoint < 0x10000 && numBytes > 3))
// Code points above 0xFFFF need two UTF-16 units (a lead/trail surrogate pair).
92 if(codePoint > 0xFFFF)
94 uint16 lead = (uint16)(LEAD_OFFSET + (codePoint >> 10));
95 uint16 trail = 0xDC00 + (uint16)(codePoint & 0x3FF);
// Otherwise the code point is stored as a single unit.
102 dest[d++] = (uint16)codePoint;
// NOTE(review): the reported count is the units written (`d`) plus the bytes not
// consumed (`byteCount - c`); `wordCount` is dereferenced with no null check visible
// on this line -- confirm both are intended.
105 *wordCount = d + byteCount - c;
// Converts up to `len` bytes of UTF-8 from `source` into the caller-supplied UTF-16
// buffer `dest`, which holds `max` units. The return statement is not visible here.
113 public int UTF8toUTF16BufferLen(char * source, uint16 * dest, int max, int len)
// Walk the input until `len` bytes are consumed or a NUL byte is met.
119 for(c = 0; c < len && source[c];)
122 unichar codePoint = 0;
// Both top bits set: lead byte of a multi-byte UTF-8 sequence.
126 if(ch & 0x80 && ch & 0x40)
141 for(i = 0; i<numBytes; i++)
// NOTE(review): no check of `c` against `len` and no continuation-byte check is
// visible on this line -- confirm whether truncated sequences are handled elsewhere.
144 codePoint |= source[c++] & mask;
// Code points above 0xFFFF need two UTF-16 units (a lead/trail surrogate pair).
148 if(codePoint > 0xFFFF)
150 uint16 lead = (uint16)(LEAD_OFFSET + (codePoint >> 10));
151 uint16 trail = 0xDC00 + (uint16)(codePoint & 0x3FF);
// Stop when at most one slot is left in `dest`.
152 if(d >= max - 1) break;
159 dest[d++] = (uint16)codePoint;
// Converts zero-terminated UTF-16 text in `source` into UTF-8 bytes written to `dest`,
// which holds `max` bytes. The name indicates big-endian input; each unit read in the
// loop header is byte-swapped before use. The return statement is not visible here.
168 public int UTF16BEtoUTF8Buffer(uint16 * source, byte * dest, int max)
// Swap the two bytes of each unit; a swapped value of zero ends the loop.
173 for(c = 0; (u16 = ((source[c] & 0xFF00) >> 8) | ((source[c] & 0x00FF) << 8)); c++)
// Units outside D800..DBFF are not lead surrogates.
176 if(u16 < 0xD800 || u16 > 0xDBFF)
178 // TOFIX: PRECOMP ERROR IF NO BRACKETS
183 // TOFIX: PRECOMP ERROR IF NO BRACKETS
// NOTE(review): the trail unit is read as source[c++] without the byte swap applied
// to the lead unit in the loop header, and whether `c` already points past the lead
// unit at this point is not visible here -- confirm.
184 ch = ((unichar)u16 << 10) + source[c++] + SURROGATE_OFFSET;
// One-byte form; each size check below breaks unless there is room for the bytes
// about to be written plus one spare slot.
189 if(d + 1 >= max) break;
190 dest[d++] = (char)ch;
// Two-byte form: 110xxxxx 10xxxxxx.
194 if(d + 2 >= max) break;
195 dest[d++] = (byte)(0xC0 | ((ch & 0x7C0) >> 6));
196 dest[d++] = (byte)(0x80 | (ch & 0x03F));
// Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
198 else if(ch < 0x10000)
200 if(d + 3 >= max) break;
201 dest[d++] = (byte)(0xE0 | ((ch & 0xF000) >> 12));
202 dest[d++] = (byte)(0x80 | ((ch & 0xFC0) >> 6));
203 dest[d++] = (byte)(0x80 | (ch & 0x03F));
// Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
207 if(d + 4 >= max) break;
208 dest[d++] = (byte)(0xF0 | ((ch & 0x1C0000) >> 18));
209 dest[d++] = (byte)(0x80 | ((ch & 0x3F000) >> 12));
210 dest[d++] = (byte)(0x80 | ((ch & 0xFC0) >> 6));
211 dest[d++] = (byte)(0x80 | (ch & 0x03F));
// NOTE(review): enumerator list whose enclosing declaration is not visible here; the
// two-letter names look like Unicode line-breaking class abbreviations -- confirm
// against the declaration header.
220 OP, CL, QU, GL, NS, EX, SY, IS, PR, PO,
221 NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM,
222 WJ, H2, H3, JL, JV, JT, SA, SG, SP, CR,
// Character category of a code point. Every value is declared twice with the same
// number: a two-letter abbreviation and a descriptive long name.
226 public enum CharCategory
// Marks
230 Mn = 1, markNonSpacing = 1,
231 Mc = 2, markSpacing = 2,
232 Me = 3, markEnclosing = 3,
// Numbers
234 Nd = 4, numberDecimalDigit = 4,
235 Nl = 5, numberLetter = 5,
236 No = 6, numberOther = 6,
// Separators
238 Zs = 7, separatorSpace = 7,
239 Zl = 8, separatorLine = 8,
240 Zp = 9, separatorParagraph = 9,
// Others
242 Cc = 10, otherControl = 10,
243 Cf = 11, otherFormat = 11,
244 Cs = 12, otherSurrogate = 12,
245 Co = 13, otherPrivateUse = 13,
246 Cn = 14, otherNotAssigned = 14,
// Letters
248 Lu = 15, letterUpperCase = 15,
249 Ll = 16, letterLowerCase = 16,
250 Lt = 17, letterTitleCase = 17,
251 Lm = 18, letterModifier = 18,
252 Lo = 19, letterOther = 19,
// Punctuation
254 Pc = 20, punctuationConnector = 20,
255 Pd = 21, punctuationDash = 21,
256 Ps = 22, punctuationOpen = 22,
257 Pe = 23, punctuationClose = 23,
258 Pi = 24, punctuationInitial = 24,
259 Pf = 25, punctuationFinal = 25,
260 Po = 26, punctuationOther = 26,
// Symbols
262 Sm = 27, symbolMath = 27,
263 Sc = 28, symbolCurrency = 28,
264 Sk = 29, symbolModifier = 29,
265 So = 30, symbolOther = 30
// Set of character categories stored in a uint, one single-bit boolean flag per
// category. The flag names match the long names of the CharCategory values.
268 public class CharCategories : uint
// Marks
272 bool markNonSpacing:1, markSpacing:1, markEnclosing:1;
// Numbers
273 bool numberDecimalDigit:1, numberLetter:1, numberOther:1;
// Separators
274 bool separatorSpace:1,separatorLine:1,separatorParagraph:1;
// Others
275 bool otherControl:1,otherFormat:1,otherSurrogate:1,otherPrivateUse:1,otherNotAssigned:1;
// Letters
276 bool letterUpperCase:1, letterLowerCase:1, letterTitleCase:1, letterModifier:1, letterOther:1;
// Punctuation
277 bool punctuationConnector:1, punctuationDash:1, punctuationOpen:1, punctuationClose:1, punctuationInitial:1, punctuationFinal:1, punctuationOther:1;
// Symbols
278 bool symbolMath:1, symbolCurrency:1, symbolModifier:1, symbolOther:1;
// Ready-made CharCategories masks, one per family of categories (marks, numbers,
// separators, others, letters, punctuation, symbols), plus `none` and a mask holding
// only the connector punctuation flag. Each mask sets the flags named in its initializer.
281 public enum PredefinedCharCategories : CharCategories
283 none = CharCategories { none = true },
284 marks = CharCategories { markNonSpacing = true, markSpacing = true, markEnclosing = true },
285 numbers = CharCategories { numberDecimalDigit = true, numberLetter = true, numberOther = true },
286 separators = CharCategories { separatorSpace = true, separatorLine = true, separatorParagraph = true },
287 others = CharCategories { otherControl = true, otherFormat = true, otherSurrogate = true, otherPrivateUse = true, otherNotAssigned = true },
288 letters = CharCategories { letterUpperCase = true, letterLowerCase = true, letterTitleCase = true, letterModifier = true, letterOther = true },
// All seven punctuation flags.
289 punctuation = CharCategories { punctuationConnector = true, punctuationDash = true, punctuationOpen = true, punctuationClose = true, punctuationInitial = true,
290 punctuationFinal = true, punctuationOther = true },
291 symbols = CharCategories { symbolMath = true, symbolCurrency = true, symbolModifier = true, symbolOther = true },
292 connector = CharCategories { punctuationConnector = true },
// Reads text from `*input` and copies characters matching the `numbers|letters`
// categories into `string`, which holds `max` bytes. Returns false immediately, with
// `string` emptied, when the input is already at its terminating NUL.
// How `*input` is updated and the final return are not visible here.
295 public bool GetAlNum(char ** input, char * string, int max)
301 char * buffer = *input;
302 if(!buffer[0]) { string[0]=0; return false; }
304 // Eat all left spacing, leave last char in ch
// Builds with ECERE_NOUNICODE or ECERE_BOOTSTRAP defined take an alternate path at
// each #if below (those branches are not visible here).
307 #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP)
310 ch = UTF8GetChar(buffer, &nb);
// A zero character ends the scan with a false result.
312 if(!ch) { result = false; break; }
314 #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP)
317 if(CharMatchCategories(ch, numbers|letters))
// Copy up to `nb` bytes of the current character, stopping once `c` reaches max-1.
328 for(i = 0; i < nb && c < max-1; i++)
329 string[c++] = *(buffer++);
331 #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP)
334 ch = UTF8GetChar(buffer, &nb);
336 if(!ch) break; // End of input string
338 #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP)
341 if(!CharMatchCategories(ch, numbers|letters))
343 // End of this alpha numeric word
// Category member of the Range record (its header and other members are not visible
// here; Range is initialized elsewhere in this file as { start, end, category }).
355 CharCategory category;
// Ordering callback for two Range keys; installed as the CompareKey of the
// UnicodeDatabase `categories` tree. Only one of its branches is visible here.
358 static int CompareRange(BinaryTree tree, Range a, Range b)
// Range `a` lies entirely before range `b`.
362 else if(a.end < b.start)
// Key-release callback for Range keys; installed as the FreeKey of the
// UnicodeDatabase `categories` tree. The body is not visible here.
368 static void FreeRange(Range range)
// Category of each of the 128 ASCII code points, indexed by code point; eight entries
// per row. GetCharCategory indexes this table directly.
373 static CharCategory asciiCategories[] =
// 0x00..0x1F: control characters
375 Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc,
376 Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc,
377 Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc,
378 Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc,
// 0x20..0x27: space ! " # $ % & '
379 Zs, Po, Po, Po, Sc, Po, Po, Po,
// 0x28..0x2F: ( ) * + , - . /   (the comma is other punctuation, not a surrogate)
380 Ps, Pe, Po, Sm, Po, Pd, Po, Po,
// 0x30..0x3F: digits, then : ; < = > ?
381 Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd,
382 Nd, Nd, Po, Po, Sm, Sm, Sm, Po,
// 0x40..0x5F: @, upper-case letters, then [ \ ] ^ _
383 Po, Lu, Lu, Lu, Lu, Lu, Lu, Lu,
384 Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu,
385 Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu,
386 Lu, Lu, Lu, Ps, Po, Pe, Sk, Pc,
// 0x60..0x7F: `, lower-case letters, then { | } ~ and DEL
387 Sk, Ll, Ll, Ll, Ll, Ll, Ll, Ll,
388 Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll,
389 Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll,
390 Ll, Ll, Ll, Ps, Sm, Pe, Sm, Cc
// Holds the code-point-range -> category table. The visible code fills `categories`
// from a bundled text file and also contains code that writes the table back out as
// text. Which of these parts run, and when, is not visible here.
393 static class UnicodeDatabase
// Tree of Range keys, ordered by CompareRange and released through FreeRange.
395 BinaryTree categories
397 CompareKey = (void *)CompareRange;
398 FreeKey = (void *)FreeRange;
// Read the category table one text line at a time.
403 File f = FileOpen("<:ecere>unicode/derivedGeneralCategoryStripped.txt", read);
407 while(f.GetLine(line, 1024))
// Skip empty lines and lines starting with '#'.
409 if(line[0] && line[0] != '#')
// The line starts with a hexadecimal code point ...
412 uint start = strtoul(line, &endPtr, 16);
// ... optionally followed by ".." and the hexadecimal end of the range; a single
// code point is stored as a range whose end equals its start.
415 uint end = (endPtr && *endPtr == '.') ? strtoul(endPtr + 2, &endPtr, 16) : start;
// The category field follows the ';' separator.
418 endPtr = strchr(endPtr, ';');
421 CharCategory category = none;
// The case groups below assign a category from a single letter; judging by the values
// assigned, each group handles one family (C, L, M, Z, P, S, N) -- the enclosing
// switch statements are not visible here.
// C* (other)
428 case 'n': category = Cn; break;
429 case 'c': category = Cc; break;
430 case 'f': category = Cf; break;
431 case 'o': category = Co; break;
432 case 's': category = Cs; break;
// L* (letters)
438 case 'u': category = Lu; break;
439 case 'l': category = Ll; break;
440 case 't': category = Lt; break;
441 case 'm': category = Lm; break;
442 case 'o': category = Lo; break;
// M* (marks)
448 case 'n': category = Mn; break;
449 case 'e': category = Me; break;
450 case 'c': category = Mc; break;
// Z* (separators)
456 case 's': category = Zs; break;
457 case 'l': category = Zl; break;
458 case 'p': category = Zp; break;
// P* (punctuation)
464 case 'd': category = Pd; break;
465 case 's': category = Ps; break;
466 case 'e': category = Pe; break;
467 case 'c': category = Pc; break;
468 case 'o': category = Po; break;
469 case 'i': category = Pi; break;
470 case 'f': category = Pf; break;
// S* (symbols)
476 case 'm': category = Sm; break;
477 case 'c': category = Sc; break;
478 case 'k': category = Sk; break;
479 case 'o': category = So; break;
// N* (numbers)
485 case 'd': category = Nd; break;
486 case 'l': category = Nl; break;
487 case 'o': category = No; break;
// Add the node keyed on the local `range` first; only when Add succeeds is the key
// replaced by a heap-allocated copy of the range.
493 Range range { start, end, category };
494 BTNode node { key = (uintptr) &range };
495 if(categories.Add(node))
497 node.key = (uintptr)new Range[1];
498 *(Range *)node.key = range;
// Write the table out as text, one line per stored range.
510 f = FileOpen("DerivedGeneralCategoryStripped.txt", write);
514 for(node = categories.first; node; node = node.next)
516 Range * range = (Range *)node.key;
// "start..end" for a multi-code-point range, a single value otherwise; four or more
// lower-case hexadecimal digits each.
520 if(range->end > range->start)
521 sprintf(string, "%04x..%04x", range->start, range->end);
523 sprintf(string, "%04x", range->start);
524 len = strlen(string);
// Loop from the current length up to column 14 (the loop body is not visible here;
// presumably it pads the line -- confirm).
525 for(c = len; c<14; c++)
// Append the category name at offset `len`, then end the line.
529 range->category.OnGetString(string + len, null, null);
531 string[len++] = '\n';
// File-level UnicodeDatabase instance; GetCharCategory looks ranges up in its
// `categories` tree.
546 static UnicodeDatabase dataBase { };
// Returns the category of code point `ch`. One path indexes the asciiCategories table
// directly; the other searches the database tree. The condition choosing between the
// two paths is not visible here.
548 public CharCategory GetCharCategory(unichar ch)
551 return asciiCategories[ch];
// Default when no stored range is found.
554 CharCategory category = none;
// Search with a one-code-point range as the key.
555 Range range { ch, ch };
556 BTNode node = dataBase.categories.Find((uintptr) &range);
558 category = ((Range *)node.key)->category;
// Tells whether the category of code point `ch` is among the flags set in
// `categories`: the category is looked up with GetCharCategory and the flag of the
// same name is read from `categories`. The return statement is not visible here.
564 public bool CharMatchCategories(unichar ch, CharCategories categories)
567 CharCategory category = GetCharCategory(ch);
570 case none: result = categories.none; break;
// Marks
571 case markNonSpacing: result = categories.markNonSpacing; break;
572 case markSpacing: result = categories.markSpacing; break;
573 case markEnclosing: result = categories.markEnclosing; break;
// Numbers
575 case numberDecimalDigit: result = categories.numberDecimalDigit; break;
576 case numberLetter: result = categories.numberLetter; break;
577 case numberOther: result = categories.numberOther; break;
// Separators
579 case separatorSpace: result = categories.separatorSpace; break;
580 case separatorLine: result = categories.separatorLine; break;
581 case separatorParagraph: result = categories.separatorParagraph; break;
// Others
583 case otherControl: result = categories.otherControl; break;
584 case otherFormat: result = categories.otherFormat; break;
585 case otherSurrogate: result = categories.otherSurrogate; break;
586 case otherPrivateUse: result = categories.otherPrivateUse; break;
587 case otherNotAssigned: result = categories.otherNotAssigned; break;
// Letters
589 case letterUpperCase: result = categories.letterUpperCase; break;
590 case letterLowerCase: result = categories.letterLowerCase; break;
591 case letterTitleCase: result = categories.letterTitleCase; break;
592 case letterModifier: result = categories.letterModifier; break;
593 case letterOther: result = categories.letterOther; break;
// Punctuation
595 case punctuationConnector: result = categories.punctuationConnector; break;
596 case punctuationDash: result = categories.punctuationDash; break;
597 case punctuationOpen: result = categories.punctuationOpen; break;
598 case punctuationClose: result = categories.punctuationClose; break;
599 case punctuationInitial: result = categories.punctuationInitial; break;
600 case punctuationFinal: result = categories.punctuationFinal; break;
601 case punctuationOther: result = categories.punctuationOther; break;
// Symbols
603 case symbolMath: result = categories.symbolMath; break;
604 case symbolCurrency: result = categories.symbolCurrency; break;
605 case symbolModifier: result = categories.symbolModifier; break;
606 case symbolOther: result = categories.symbolOther; break;