00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "OVR_UTF8Util.h"
00019
00020 namespace OVR { namespace UTF8Util {
00021
00022 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)
00023 {
00024 const char* p = buf;
00025 SPInt length = 0;
00026
00027 if (buflen != -1)
00028 {
00029 while (p - buf < buflen)
00030 {
00031
00032 UTF8Util::DecodeNextChar_Advance0(&p);
00033 length++;
00034 }
00035 }
00036 else
00037 {
00038 while (UTF8Util::DecodeNextChar_Advance0(&p))
00039 length++;
00040 }
00041
00042 return length;
00043 }
00044
00045 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)
00046 {
00047 const char* buf = putf8str;
00048 UInt32 c = 0;
00049
00050 if (length != -1)
00051 {
00052 while (buf - putf8str < length)
00053 {
00054 c = UTF8Util::DecodeNextChar_Advance0(&buf);
00055 if (index == 0)
00056 return c;
00057 index--;
00058 }
00059
00060 return c;
00061 }
00062
00063 do
00064 {
00065 c = UTF8Util::DecodeNextChar_Advance0(&buf);
00066 index--;
00067
00068 if (c == 0)
00069 {
00070
00071 OVR_ASSERT(index == 0);
00072 return c;
00073 }
00074 } while (index >= 0);
00075
00076 return c;
00077 }
00078
00079 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
00080 {
00081 const char* buf = putf8str;
00082
00083 if (length != -1)
00084 {
00085 while ((buf - putf8str) < length && index > 0)
00086 {
00087 UTF8Util::DecodeNextChar_Advance0(&buf);
00088 index--;
00089 }
00090
00091 return buf-putf8str;
00092 }
00093
00094 while (index > 0)
00095 {
00096 UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf);
00097 index--;
00098
00099 if (c == 0)
00100 return buf-putf8str;
00101 };
00102
00103 return buf-putf8str;
00104 }
00105
00106 int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)
00107 {
00108 if (ucs_character <= 0x7F)
00109 return 1;
00110 else if (ucs_character <= 0x7FF)
00111 return 2;
00112 else if (ucs_character <= 0xFFFF)
00113 return 3;
00114 else if (ucs_character <= 0x1FFFFF)
00115 return 4;
00116 else if (ucs_character <= 0x3FFFFFF)
00117 return 5;
00118 else if (ucs_character <= 0x7FFFFFFF)
00119 return 6;
00120 else
00121 return 0;
00122 }
00123
00124 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
00125 {
00126 UInt32 uc;
00127 char c;
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146 #define INVALID_CHAR 0x0FFFD
00147
00148 #define FIRST_BYTE(mask, shift) \
00149 uc = (c & (mask)) << (shift);
00150
00151 #define NEXT_BYTE(shift) \
00152 c = **putf8Buffer; \
00153 if (c == 0) return 0; \
00154 if ((c & 0xC0) != 0x80) return INVALID_CHAR; \
00155 (*putf8Buffer)++; \
00156 uc |= (c & 0x3F) << shift;
00157
00158 c = **putf8Buffer;
00159 (*putf8Buffer)++;
00160 if (c == 0)
00161 return 0;
00162
00163 if ((c & 0x80) == 0) return (UInt32) c;
00164
00165
00166 if ((c & 0xE0) == 0xC0)
00167 {
00168
00169 FIRST_BYTE(0x1F, 6);
00170 NEXT_BYTE(0);
00171 if (uc < 0x80) return INVALID_CHAR;
00172 return uc;
00173 }
00174 else if ((c & 0xF0) == 0xE0)
00175 {
00176
00177 FIRST_BYTE(0x0F, 12);
00178 NEXT_BYTE(6);
00179 NEXT_BYTE(0);
00180 if (uc < 0x800) return INVALID_CHAR;
00181
00182
00183
00184
00185 return uc;
00186 }
00187 else if ((c & 0xF8) == 0xF0)
00188 {
00189
00190 FIRST_BYTE(0x07, 18);
00191 NEXT_BYTE(12);
00192 NEXT_BYTE(6);
00193 NEXT_BYTE(0);
00194 if (uc < 0x010000) return INVALID_CHAR;
00195 return uc;
00196 }
00197 else if ((c & 0xFC) == 0xF8)
00198 {
00199
00200 FIRST_BYTE(0x03, 24);
00201 NEXT_BYTE(18);
00202 NEXT_BYTE(12);
00203 NEXT_BYTE(6);
00204 NEXT_BYTE(0);
00205 if (uc < 0x0200000) return INVALID_CHAR;
00206 return uc;
00207 }
00208 else if ((c & 0xFE) == 0xFC)
00209 {
00210
00211 FIRST_BYTE(0x01, 30);
00212 NEXT_BYTE(24);
00213 NEXT_BYTE(18);
00214 NEXT_BYTE(12);
00215 NEXT_BYTE(6);
00216 NEXT_BYTE(0);
00217 if (uc < 0x04000000) return INVALID_CHAR;
00218 return uc;
00219 }
00220 else
00221 {
00222
00223 return INVALID_CHAR;
00224 }
00225 }
00226
00227
00228 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)
00229 {
00230 if (ucs_character <= 0x7F)
00231 {
00232
00233 pbuffer[(*pindex)++] = (char) ucs_character;
00234 }
00235 else if (ucs_character <= 0x7FF)
00236 {
00237
00238 pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);
00239 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00240 }
00241 else if (ucs_character <= 0xFFFF)
00242 {
00243
00244 pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);
00245 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
00246 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00247 }
00248 else if (ucs_character <= 0x1FFFFF)
00249 {
00250
00251 pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);
00252 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
00253 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
00254 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00255 }
00256 else if (ucs_character <= 0x3FFFFFF)
00257 {
00258
00259 pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);
00260 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
00261 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
00262 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
00263 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00264 }
00265 else if (ucs_character <= 0x7FFFFFFF)
00266 {
00267
00268 pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);
00269 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);
00270 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
00271 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
00272 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
00273 pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00274 }
00275 else
00276 {
00277
00278 }
00279 }
00280
00281 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)
00282 {
00283 SPInt len = 0;
00284 if (length != -1)
00285 for (int i = 0; i < length; i++)
00286 {
00287 len += GetEncodeCharSize(pchar[i]);
00288 }
00289 else
00290 for (int i = 0;; i++)
00291 {
00292 if (pchar[i] == 0)
00293 return len;
00294 len += GetEncodeCharSize(pchar[i]);
00295 }
00296 return len;
00297 }
00298
00299 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)
00300 {
00301 SPInt ofs = 0;
00302 if (length != -1)
00303 {
00304 for (int i = 0; i < length; i++)
00305 {
00306 EncodeChar(pbuff, &ofs, pchar[i]);
00307 }
00308 }
00309 else
00310 {
00311 for (int i = 0;; i++)
00312 {
00313 if (pchar[i] == 0)
00314 break;
00315 EncodeChar(pbuff, &ofs, pchar[i]);
00316 }
00317 }
00318 pbuff[ofs] = 0;
00319 }
00320
00321 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)
00322 {
00323 wchar_t *pbegin = pbuff;
00324 if (bytesLen == -1)
00325 {
00326 while (1)
00327 {
00328 UInt32 ch = DecodeNextChar_Advance0(&putf8str);
00329 if (ch == 0)
00330 break;
00331 else if (ch >= 0xFFFF)
00332 ch = 0xFFFD;
00333 *pbuff++ = wchar_t(ch);
00334 }
00335 }
00336 else
00337 {
00338 const char* p = putf8str;
00339 while ((p - putf8str) < bytesLen)
00340 {
00341 UInt32 ch = DecodeNextChar_Advance0(&p);
00342 if (ch >= 0xFFFF)
00343 ch = 0xFFFD;
00344 *pbuff++ = wchar_t(ch);
00345 }
00346 }
00347
00348 *pbuff = 0;
00349 return pbuff - pbegin;
00350 }
00351
00352
00353 #ifdef UTF8_UNIT_TEST
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372 #include "base/utility.h"
00373 #include <stdio.h>
00374
00375
00376 bool check_equal(const char* utf8_in, const UInt32* ucs_in)
00377 {
00378 for (;;)
00379 {
00380 UInt32 next_ucs = *ucs_in++;
00381 UInt32 next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
00382 if (next_ucs != next_ucs_from_utf8)
00383 {
00384 return false;
00385 }
00386 if (next_ucs == 0)
00387 {
00388 OVR_ASSERT(next_ucs_from_utf8 == 0);
00389 break;
00390 }
00391 }
00392
00393 return true;
00394 }
00395
00396
// Test helper: prints a zero-terminated byte string to stdout. Newline and
// bytes in the range 32..127 are printed as-is; every other byte is printed
// as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* p = line; *p != 0; ++p)
    {
        const unsigned char c = (unsigned char) *p;
        const bool printAsIs  = (c == '\n') || (c >= 32 && c <= 127);

        if (printAsIs)
            printf("%c", c);
        else
            printf("<0x%02X>", (int) c);    // Show the raw byte value in hex.
    }
}
00419
00420
00421 void log_ucs(const UInt32* line)
00422 {
00423 for (;;)
00424 {
00425 UInt32 uc = *line++;
00426 if (uc == 0)
00427 {
00428
00429 return;
00430 }
00431 else if (uc != '\n'
00432 && (uc < 32 || uc > 127))
00433 {
00434
00435 printf("<U-%04X>", uc);
00436 }
00437 else
00438 {
00439 printf("%c", (char) uc);
00440 }
00441 }
00442 }
00443
00444
00445
00446 int main(int argc, const char* argv[])
00447 {
00448 {
00449 const char* test8 = "Ignacio CastaƱo";
00450 const UInt32 test32[] =
00451 {
00452 0x49, 0x67, 0x6E, 0x61, 0x63,
00453 0x69, 0x6F, 0x20, 0x43, 0x61,
00454 0x73, 0x74, 0x61, 0xF1, 0x6F,
00455 0x00
00456 };
00457
00458 OVR_ASSERT(check_equal(test8, test32));
00459 }
00460
00461
00462 if (argc > 1)
00463 {
00464 const char* filename = argv[1];
00465 FILE* fp = fopen(filename, "rb");
00466 if (fp == NULL)
00467 {
00468 printf("Can't open file '%s'\n", filename);
00469 return 1;
00470 }
00471
00472
00473 const int LINE_SIZE = 200;
00474 char line_buffer_utf8[LINE_SIZE];
00475 char reencoded_utf8[6 * LINE_SIZE];
00476 UInt32 line_buffer_ucs[LINE_SIZE];
00477
00478 int byte_counter = 0;
00479 for (;;)
00480 {
00481 int c = fgetc(fp);
00482 if (c == EOF)
00483 {
00484
00485 break;
00486 }
00487 line_buffer_utf8[byte_counter++] = c;
00488 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
00489 {
00490
00491 line_buffer_utf8[byte_counter++] = 0;
00492
00493
00494 const char* p = line_buffer_utf8;
00495 UInt32* q = line_buffer_ucs;
00496 for (;;)
00497 {
00498 UInt32 uc = UTF8Util::DecodeNextChar(&p);
00499 *q++ = uc;
00500
00501 OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
00502 OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
00503
00504 if (uc == 0) break;
00505 }
00506
00507
00508 q = line_buffer_ucs;
00509 int index = 0;
00510 for (;;)
00511 {
00512 UInt32 uc = *q++;
00513 OVR_ASSERT(index < LINE_SIZE * 6 - 6);
00514 int last_index = index;
00515 UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
00516 OVR_ASSERT(index <= last_index + 6);
00517 if (uc == 0) break;
00518 }
00519
00520
00521 #if 0
00522
00523 log_ucs(line_buffer_ucs);
00524 log_ascii(reencoded_utf8);
00525 #endif // 0
00526
00527 OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
00528 OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
00529
00530
00531 byte_counter = 0;
00532 }
00533 }
00534
00535 fclose(fp);
00536 }
00537
00538 return 0;
00539 }
00540
00541
00542 #endif // UTF8_UNIT_TEST
00543
00544 }}
00545