/*
 * Unicode helpers for the `sys` namespace: UTF-8 / UTF-16 / ISO-8859-1
 * conversion, general-category lookup and alphanumeric tokenizing.
 *
 * NOTE(review): in this copy many statements share one physical line, so
 * every line comment (the TOFIX notes, "Eat all left spacing", "End of input
 * string") and every #if, #else and #endif directive is followed by code on
 * the same line. Three loop headers are also incomplete as written:
 * "for(c = 0; c>= 2;" and "for(i = 0; i= byteCount" in UTF8toUTF16Len, and
 * "for(i = 0; i 0xFFFF)" in UTF8toUTF16BufferLen. Presumably line breaks and
 * the text between a '<' and a later '>' were lost; restore from a pristine
 * copy before building - TODO confirm.
 * LEAD_OFFSET and SURROGATE_OFFSET are used below but are not defined in the
 * text visible here.
 *
 * UTF8toISO8859_1(source, dest, max)
 *    Reads code points with UTF8GetChar until it returns 0 or until max-1
 *    bytes have been written. Code points below 256 are stored as one byte,
 *    all others as '?'. dest is NUL-terminated; the return value is the
 *    number of bytes written, terminator excluded.
 *
 * UTF8toUTF16Len(source, byteCount, wordCount)
 *    For a non-null source, allocates byteCount + 1 uint16 units with `new`
 *    and returns that buffer zero-terminated; *wordCount receives
 *    d + byteCount - c. A null source returns null with *wordCount = 0.
 *    The visible checks flag an error when a byte at position i > 1 of a
 *    sequence is not of the form 10xxxxxx, when the sequence ends early, or
 *    when the code point is above 0x10FFFF, lies in 0xD800..0xDFFF, or uses
 *    more bytes than its value needs. On error the loop stops with d = 0 and
 *    c = byteCount, so the (still allocated) buffer is returned empty and
 *    *wordCount is 0. Code points above 0xFFFF are stored as a lead/trail
 *    surrogate pair.
 *
 * UTF8toUTF16BufferLen(source, dest, max, len)
 *    Converts up to len bytes of source (stopping at a NUL byte) into the
 *    caller's dest buffer. Conversion stops once d >= max for a single unit,
 *    or d >= max - 1 for a surrogate pair. Writes dest[d] = 0 and returns d;
 *    a null source returns 0 without touching dest.
 *    NOTE(review): d can equal max when the terminator is written, so dest
 *    presumably has to hold max + 1 units - confirm against callers.
 *    NOTE(review): no validity checks are visible here, but part of the
 *    decoding loop is missing from this copy, so this cannot be confirmed.
 *
 * UTF16BEtoUTF8Buffer(source, dest, max)
 *    Byte-swaps each 16-bit unit of source and stops at the first unit that
 *    is zero after swapping. Units outside 0xD800..0xDBFF are used directly
 *    as the code point; otherwise the code point is computed as
 *    (u16 << 10) + source[c++] + SURROGATE_OFFSET. A 1- to 4-byte UTF-8
 *    sequence is emitted only while d + length < max, which keeps room for
 *    the final NUL. Returns the number of bytes written, NUL excluded.
 *    NOTE(review): source[c++] reads index c before the increment, i.e. the
 *    same unit that was just loaded into u16, and does not byte-swap it; it
 *    looks like the following (swapped) unit was intended - confirm.
 *
 * LineBreakClass
 *    Enumeration of two-letter class codes; not referenced elsewhere in the
 *    visible text.
 *
 * CharCategory
 *    General categories numbered none = 0 through So = 30. Every value has a
 *    two-letter name and a long name (for example Lu / letterUpperCase).
 *
 * CharCategories
 *    Bit class over uint with one 1-bit bool member per long CharCategory
 *    name, declared in the same order as the enum values.
 *
 * PredefinedCharCategories
 *    Named CharCategories combinations: none, marks, numbers, separators,
 *    others, letters, punctuation, symbols and connector.
 *
 * GetAlNum(input, string, max)
 *    Extracts the next alphanumeric word from *input into string.
 *    - If (*input)[0] is 0: string[0] is set to 0, false is returned and
 *      *input is left unchanged.
 *    - Otherwise characters are skipped until one matches numbers|letters
 *      via CharMatchCategories (isalnum when ECERE_NOUNICODE or
 *      ECERE_BOOTSTRAP is defined). If the terminator is reached first,
 *      false is returned, string is not written and *input is moved to the
 *      terminator.
 *    - Otherwise the bytes of each matching character are copied while
 *      c < max-1, stopping at the terminator or the first non-matching
 *      character; string is NUL-terminated, *input is moved past the copied
 *      bytes and true is returned.
 *    NOTE(review): the inner copy loop also stops at max-1, so a multi-byte
 *    character can be cut in the middle when the buffer fills up.
 *
 * Range, CompareRange, FreeRange
 *    Range is an inclusive [start, end] interval with its category.
 *    CompareRange returns 1 when a lies entirely after b, -1 when entirely
 *    before, and 0 when the two overlap. FreeRange deletes a Range.
 *
 * asciiCategories
 *    Category table used by GetCharCategory for ch < 128.
 *    NOTE(review): reading the table as indexed by ASCII code, the entry
 *    between Sm ('+') and Pd ('-') is Cs (otherSurrogate), which would be
 *    the slot for ','; Po may have been intended - confirm.
 *
 * UnicodeDatabase, dataBase
 *    The constructor opens
 *    "<:ecere>unicode/derivedGeneralCategoryStripped.txt" for reading and
 *    handles every line that is non-empty and does not start with '#':
 *    it parses a hexadecimal start, an optional end (read from endPtr + 2
 *    when the character after start is '.'; otherwise end = start), then
 *    locates ';' and reads the two-letter category code beginning two
 *    characters after it. Lines whose code is not recognised leave category
 *    at none and are skipped. Each range is added to the `categories`
 *    BinaryTree using a stack Range as a temporary key; when Add succeeds
 *    the key is replaced by a heap copy (new Range[1]), otherwise the node
 *    is deleted. The file object is deleted after reading. The block that
 *    is commented out would write the tree back to
 *    "DerivedGeneralCategoryStripped.txt". The destructor calls
 *    categories.Free(). dataBase is the single static instance.
 *
 * GetCharCategory(ch)
 *    Returns asciiCategories[ch] for ch < 128. Otherwise looks up
 *    Range { ch, ch } in dataBase.categories (CompareRange treats
 *    overlapping ranges as equal) and returns the stored category, or none
 *    when no node is found.
 *
 * CharMatchCategories(ch, categories)
 *    Returns the bit of `categories` that corresponds to
 *    GetCharCategory(ch).
 */
namespace sys; import "File" public int UTF8toISO8859_1(const char * source, char * dest, int max) { unichar ch; int nb; int c; int d = 0; for(c = 0; (ch = UTF8GetChar(source + c, &nb)) && d < max-1; c += nb) { if(ch < 256) dest[d] = (byte)ch; else dest[d] = '?'; d++; } dest[d] = 0; return d; } public uint16 * UTF8toUTF16Len(const char * source, int byteCount, int * wordCount) { if(source) { uint16 * dest = new uint16[byteCount + 1]; int c; int d = 0; bool error = false; for(c = 0; c>= 2; if(ch & 0x20) { numBytes++; mask >>= 1; if(ch & 0x10) { if(ch & 0x08) error = true; else { numBytes++; mask >>= 1; } } } } for(i = 0; i= byteCount || !(ch = source[c])) { error = true; break; } codePoint <<= 6; codePoint |= ch & mask; mask = 0x3F; if(i > 1) { if(!(ch & 0x80) || (ch & 0x40)) { error = true; break; } } } if(i < numBytes) error = true; if(codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF) || (codePoint < 0x80 && numBytes > 1) || (codePoint < 0x800 && numBytes > 2) || (codePoint < 0x10000 && numBytes > 3)) error = true; if(error) { c = byteCount; d = 0; break; } if(codePoint > 0xFFFF) { uint16 lead = (uint16)(LEAD_OFFSET + (codePoint >> 10)); uint16 trail = (uint16)(0xDC00 | (codePoint & 0x3FF)); dest[d++] = lead; dest[d++] = trail; } else { dest[d++] = (uint16)codePoint; } } *wordCount = d + byteCount - c; dest[d] = 0; return dest; } *wordCount = 0; return null; } public int UTF8toUTF16BufferLen(const char * source, uint16 * dest, int max, int len) { if(source) { int c; int d = 0; for(c = 0; c < len && source[c];) { byte ch = source[c]; unichar codePoint = 0; int numBytes = 1; int i; byte mask = 0x7F; if(ch & 0x80 && ch & 0x40) { mask >>= 2; numBytes++; if(ch & 0x20) { numBytes++; mask >>= 1; if(ch & 0x10) { numBytes++; mask >>= 1; } } } for(i = 0; i 0xFFFF) { uint16 lead = (uint16)(LEAD_OFFSET + (codePoint >> 10)); uint16 trail = (uint16)(0xDC00 | (codePoint & 0x3FF)); if(d >= max - 1) break; dest[d++] = lead; dest[d++] = trail; } else { if(d >= 
max) break; dest[d++] = (uint16)codePoint; } } dest[d] = 0; return d; } return 0; } public int UTF16BEtoUTF8Buffer(const uint16 * source, byte * dest, int max) { int c; int d = 0; uint16 u16; for(c = 0; (u16 = ((source[c] & 0xFF00) >> 8) | ((source[c] & 0x00FF) << 8)); c++) { unichar ch; if(u16 < 0xD800 || u16 > 0xDBFF) { // TOFIX: PRECOMP ERROR IF NO BRACKETS ch = (unichar)u16; } else { // TOFIX: PRECOMP ERROR IF NO BRACKETS ch = ((unichar)u16 << 10) + source[c++] + SURROGATE_OFFSET; } if(ch < 0x80) { if(d + 1 >= max) break; dest[d++] = (char)ch; } else if(ch < 0x800) { if(d + 2 >= max) break; dest[d++] = (byte)(0xC0 | ((ch & 0x7C0) >> 6)); dest[d++] = (byte)(0x80 | (ch & 0x03F)); } else if(ch < 0x10000) { if(d + 3 >= max) break; dest[d++] = (byte)(0xE0 | ((ch & 0xF000) >> 12)); dest[d++] = (byte)(0x80 | ((ch & 0xFC0) >> 6)); dest[d++] = (byte)(0x80 | (ch & 0x03F)); } else { if(d + 4 >= max) break; dest[d++] = (byte)(0xF0 | ((ch & 0x1C0000) >> 18)); dest[d++] = (byte)(0x80 | ((ch & 0x3F000) >> 12)); dest[d++] = (byte)(0x80 | ((ch & 0xFC0) >> 6)); dest[d++] = (byte)(0x80 | (ch & 0x03F)); } } dest[d] = 0; return d; } enum LineBreakClass { OP, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, SA, SG, SP, CR, LF, BK }; public enum CharCategory { none = 0, Mn = 1, markNonSpacing = 1, Mc = 2, markSpacing = 2, Me = 3, markEnclosing = 3, Nd = 4, numberDecimalDigit = 4, Nl = 5, numberLetter = 5, No = 6, numberOther = 6, Zs = 7, separatorSpace = 7, Zl = 8, separatorLine = 8, Zp = 9, separatorParagraph = 9, Cc = 10, otherControl = 10, Cf = 11, otherFormat = 11, Cs = 12, otherSurrogate = 12, Co = 13, otherPrivateUse = 13, Cn = 14, otherNotAssigned = 14, Lu = 15, letterUpperCase = 15, Ll = 16, letterLowerCase = 16, Lt = 17, letterTitleCase = 17, Lm = 18, letterModifier = 18, Lo = 19, letterOther = 19, Pc = 20, punctuationConnector = 20, Pd = 21, punctuationDash = 21, Ps = 22, punctuationOpen = 22, Pe = 23, punctuationClose = 
23, Pi = 24, punctuationInitial = 24, Pf = 25, punctuationFinal = 25, Po = 26, punctuationOther = 26, Sm = 27, symbolMath = 27, Sc = 28, symbolCurrency = 28, Sk = 29, symbolModifier = 29, So = 30, symbolOther = 30 }; public class CharCategories : uint { public: bool none:1; bool markNonSpacing:1, markSpacing:1, markEnclosing:1; bool numberDecimalDigit:1, numberLetter:1, numberOther:1; bool separatorSpace:1,separatorLine:1,separatorParagraph:1; bool otherControl:1,otherFormat:1,otherSurrogate:1,otherPrivateUse:1,otherNotAssigned:1; bool letterUpperCase:1, letterLowerCase:1, letterTitleCase:1, letterModifier:1, letterOther:1; bool punctuationConnector:1, punctuationDash:1, punctuationOpen:1, punctuationClose:1, punctuationInitial:1, punctuationFinal:1, punctuationOther:1; bool symbolMath:1, symbolCurrency:1, symbolModifier:1, symbolOther:1; }; public enum PredefinedCharCategories : CharCategories { none = CharCategories { none = true }, marks = CharCategories { markNonSpacing = true, markSpacing = true, markEnclosing = true }, numbers = CharCategories { numberDecimalDigit = true, numberLetter = true, numberOther = true }, separators = CharCategories { separatorSpace = true, separatorLine = true, separatorParagraph = true }, others = CharCategories { otherControl = true, otherFormat = true, otherSurrogate = true, otherPrivateUse = true, otherNotAssigned = true }, letters = CharCategories { letterUpperCase = true, letterLowerCase = true, letterTitleCase = true, letterModifier = true, letterOther = true }, punctuation = CharCategories { punctuationConnector = true, punctuationDash = true, punctuationOpen = true, punctuationClose = true, punctuationInitial = true, punctuationFinal = true, punctuationOther = true }, symbols = CharCategories { symbolMath = true, symbolCurrency = true, symbolModifier = true, symbolOther = true }, connector = CharCategories { punctuationConnector = true } }; public bool GetAlNum(const char ** input, char * string, int max) { int c = 0; 
unichar ch; int nb = 1; bool result = true; const char * buffer = *input; if(!buffer[0]) { string[0]=0; return false; } // Eat all left spacing, leave last char in ch for(;;) { #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP) ch = *buffer; #else ch = UTF8GetChar(buffer, &nb); #endif if(!ch) { result = false; break; } #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP) if(isalnum(ch)) #else if(CharMatchCategories(ch, numbers|letters)) #endif break; else buffer += nb; } if(result) { while(c < max-1) { int i; for(i = 0; i < nb && c < max-1; i++) string[c++] = *(buffer++); #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP) ch = *buffer; #else ch = UTF8GetChar(buffer, &nb); #endif if(!ch) break; // End of input string #if defined(ECERE_NOUNICODE) || defined(ECERE_BOOTSTRAP) if(!isalnum(ch)) #else if(!CharMatchCategories(ch, numbers|letters)) #endif // End of this alpha numeric word break; } string[c]=0; } *input = buffer; return result; } static struct Range { uint start, end; CharCategory category; }; static int CompareRange(BinaryTree tree, Range a, Range b) { if(a.start > b.end) return 1; else if(a.end < b.start) return -1; else return 0; } static void FreeRange(Range range) { delete range; } static CharCategory asciiCategories[] = { Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Cc, Zs, Po, Po, Po, Sc, Po, Po, Po, Ps, Pe, Po, Sm, Cs, Pd, Po, Po, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Nd, Po, Po, Sm, Sm, Sm, Po, Po, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Lu, Ps, Po, Pe, Sk, Pc, Sk, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ll, Ps, Sm, Pe, Sm, Cc }; static class UnicodeDatabase { BinaryTree categories { CompareKey = (void *)CompareRange; FreeKey = (void *)FreeRange; }; UnicodeDatabase() { File f = FileOpen("<:ecere>unicode/derivedGeneralCategoryStripped.txt", 
read); if(f) { char line[1024]; while(f.GetLine(line, 1024)) { if(line[0] && line[0] != '#') { char * endPtr; uint start = (uint)strtoul(line, &endPtr, 16); if(endPtr) { uint end = (endPtr && *endPtr == '.') ? (uint)strtoul(endPtr + 2, &endPtr, 16) : start; if(endPtr) { endPtr = strchr(endPtr, ';'); if(endPtr) { CharCategory category = none; endPtr += 2; switch(*endPtr) { case 'C': switch(endPtr[1]) { case 'n': category = Cn; break; case 'c': category = Cc; break; case 'f': category = Cf; break; case 'o': category = Co; break; case 's': category = Cs; break; } break; case 'L': switch(endPtr[1]) { case 'u': category = Lu; break; case 'l': category = Ll; break; case 't': category = Lt; break; case 'm': category = Lm; break; case 'o': category = Lo; break; } break; case 'M': switch(endPtr[1]) { case 'n': category = Mn; break; case 'e': category = Me; break; case 'c': category = Mc; break; } break; case 'Z': switch(endPtr[1]) { case 's': category = Zs; break; case 'l': category = Zl; break; case 'p': category = Zp; break; } break; case 'P': switch(endPtr[1]) { case 'd': category = Pd; break; case 's': category = Ps; break; case 'e': category = Pe; break; case 'c': category = Pc; break; case 'o': category = Po; break; case 'i': category = Pi; break; case 'f': category = Pf; break; } break; case 'S': switch(endPtr[1]) { case 'm': category = Sm; break; case 'c': category = Sc; break; case 'k': category = Sk; break; case 'o': category = So; break; } break; case 'N': switch(endPtr[1]) { case 'd': category = Nd; break; case 'l': category = Nl; break; case 'o': category = No; break; } break; } if(category) { Range range { start, end, category }; BTNode node { key = (uintptr) &range }; if(categories.Add(node)) { node.key = (uintptr)new Range[1]; *(Range *)node.key = range; } else delete node; } } } } } } delete f; /* f = FileOpen("DerivedGeneralCategoryStripped.txt", write); if(f) { BTNode node; for(node = categories.first; node; node = node.next) { Range * range = (Range 
*)node.key; char string[64]; int len, c; if(range->end > range->start) sprintf(string, "%04x..%04x", range->start, range->end); else sprintf(string, "%04x", range->start); len = strlen(string); for(c = len; c<14; c++) string[len++] = ' '; string[len++] = ';'; string[len++] = ' '; range->category.OnGetString(string + len, null, null); len += 2; string[len++] = '\n'; string[len] = '\0'; f.Puts(string); } delete f; } */ } } ~UnicodeDatabase() { categories.Free(); } }; static UnicodeDatabase dataBase { }; public CharCategory GetCharCategory(unichar ch) { if(ch < 128) return asciiCategories[ch]; else { CharCategory category = none; Range range { ch, ch }; BTNode node = dataBase.categories.Find((uintptr) &range); if(node) category = ((Range *)node.key)->category; return category; } } public bool CharMatchCategories(unichar ch, CharCategories categories) { bool result = false; CharCategory category = GetCharCategory(ch); switch(category) { case none: result = categories.none; break; case markNonSpacing: result = categories.markNonSpacing; break; case markSpacing: result = categories.markSpacing; break; case markEnclosing: result = categories.markEnclosing; break; case numberDecimalDigit: result = categories.numberDecimalDigit; break; case numberLetter: result = categories.numberLetter; break; case numberOther: result = categories.numberOther; break; case separatorSpace: result = categories.separatorSpace; break; case separatorLine: result = categories.separatorLine; break; case separatorParagraph: result = categories.separatorParagraph; break; case otherControl: result = categories.otherControl; break; case otherFormat: result = categories.otherFormat; break; case otherSurrogate: result = categories.otherSurrogate; break; case otherPrivateUse: result = categories.otherPrivateUse; break; case otherNotAssigned: result = categories.otherNotAssigned; break; case letterUpperCase: result = categories.letterUpperCase; break; case letterLowerCase: result = 
categories.letterLowerCase; break; case letterTitleCase: result = categories.letterTitleCase; break; case letterModifier: result = categories.letterModifier; break; case letterOther: result = categories.letterOther; break; case punctuationConnector: result = categories.punctuationConnector; break; case punctuationDash: result = categories.punctuationDash; break; case punctuationOpen: result = categories.punctuationOpen; break; case punctuationClose: result = categories.punctuationClose; break; case punctuationInitial: result = categories.punctuationInitial; break; case punctuationFinal: result = categories.punctuationFinal; break; case punctuationOther: result = categories.punctuationOther; break; case symbolMath: result = categories.symbolMath; break; case symbolCurrency: result = categories.symbolCurrency; break; case symbolModifier: result = categories.symbolModifier; break; case symbolOther: result = categories.symbolOther; break; } return result; }