00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042 #include "ConvertUTF.h"
00043 #ifdef CVTUTF_DEBUG
00044 #include <stdio.h>
00045 #endif
00046
00047 static const int halfShift = 10;
00048
00049 static const UTF32 halfBase = 0x0010000UL;
00050 static const UTF32 halfMask = 0x3FFUL;
00051
00052 #define UNI_SUR_HIGH_START (UTF32)0xD800
00053 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
00054 #define UNI_SUR_LOW_START (UTF32)0xDC00
00055 #define UNI_SUR_LOW_END (UTF32)0xDFFF
00056 #define false 0
00057 #define true 1
00058
00059
00060
00061 ConversionResult ConvertUTF32toUTF16 (
00062 const UTF32** sourceStart, const UTF32* sourceEnd,
00063 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
00064 ConversionResult result = conversionOK;
00065 const UTF32* source = *sourceStart;
00066 UTF16* target = *targetStart;
00067 while (source < sourceEnd) {
00068 UTF32 ch;
00069 if (target >= targetEnd) {
00070 result = targetExhausted; break;
00071 }
00072 ch = *source++;
00073 if (ch <= UNI_MAX_BMP) {
00074
00075 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00076 if (flags == strictConversion) {
00077 --source;
00078 result = sourceIllegal;
00079 break;
00080 } else {
00081 *target++ = UNI_REPLACEMENT_CHAR;
00082 }
00083 } else {
00084 *target++ = (UTF16)ch;
00085 }
00086 } else if (ch > UNI_MAX_LEGAL_UTF32) {
00087 if (flags == strictConversion) {
00088 result = sourceIllegal;
00089 } else {
00090 *target++ = UNI_REPLACEMENT_CHAR;
00091 }
00092 } else {
00093
00094 if (target + 1 >= targetEnd) {
00095 --source;
00096 result = targetExhausted; break;
00097 }
00098 ch -= halfBase;
00099 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00100 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
00101 }
00102 }
00103 *sourceStart = source;
00104 *targetStart = target;
00105 return result;
00106 }
00107
00108
00109
00110 ConversionResult ConvertUTF16toUTF32 (
00111 const UTF16** sourceStart, const UTF16* sourceEnd,
00112 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
00113 ConversionResult result = conversionOK;
00114 const UTF16* source = *sourceStart;
00115 UTF32* target = *targetStart;
00116 UTF32 ch, ch2;
00117 while (source < sourceEnd) {
00118 const UTF16* oldSource = source;
00119 ch = *source++;
00120
00121 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
00122
00123 if (source < sourceEnd) {
00124 ch2 = *source;
00125
00126 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
00127 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00128 + (ch2 - UNI_SUR_LOW_START) + halfBase;
00129 ++source;
00130 } else if (flags == strictConversion) {
00131 --source;
00132 result = sourceIllegal;
00133 break;
00134 }
00135 } else {
00136 --source;
00137 result = sourceExhausted;
00138 break;
00139 }
00140 } else if (flags == strictConversion) {
00141
00142 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
00143 --source;
00144 result = sourceIllegal;
00145 break;
00146 }
00147 }
00148 if (target >= targetEnd) {
00149 source = oldSource;
00150 result = targetExhausted; break;
00151 }
00152 *target++ = ch;
00153 }
00154 *sourceStart = source;
00155 *targetStart = target;
00156 #ifdef CVTUTF_DEBUG
00157 if (result == sourceIllegal) {
00158 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
00159 fflush(stderr);
00160 }
00161 #endif
00162 return result;
00163 }
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
/*
 * Indexed by the first byte of a UTF-8 sequence; gives the number of bytes
 * that follow that first byte.  Laid out as sixteen entries per row, with
 * the index range of each row noted on the right.
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x00 - 0x0F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x10 - 0x1F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x20 - 0x2F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x30 - 0x3F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x40 - 0x4F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x50 - 0x5F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x60 - 0x6F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x70 - 0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x80 - 0x8F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0x90 - 0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0xA0 - 0xAF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    /* 0xB0 - 0xBF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0xC0 - 0xCF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0xD0 - 0xDF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    /* 0xE0 - 0xEF */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5     /* 0xF0 - 0xFF */
};
00184
00185
00186
00187
00188
00189
00190 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
00191 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
00192
00193
00194
00195
00196
00197
00198
00199
00200 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214 ConversionResult ConvertUTF16toUTF8 (
00215 const UTF16** sourceStart, const UTF16* sourceEnd,
00216 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
00217 ConversionResult result = conversionOK;
00218 const UTF16* source = *sourceStart;
00219 UTF8* target = *targetStart;
00220 while (source < sourceEnd) {
00221 UTF32 ch;
00222 unsigned short bytesToWrite = 0;
00223 const UTF32 byteMask = 0xBF;
00224 const UTF32 byteMark = 0x80;
00225 const UTF16* oldSource = source;
00226 ch = *source++;
00227
00228 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
00229
00230 if (source < sourceEnd) {
00231 UTF32 ch2 = *source;
00232
00233 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
00234 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00235 + (ch2 - UNI_SUR_LOW_START) + halfBase;
00236 ++source;
00237 } else if (flags == strictConversion) {
00238 --source;
00239 result = sourceIllegal;
00240 break;
00241 }
00242 } else {
00243 --source;
00244 result = sourceExhausted;
00245 break;
00246 }
00247 } else if (flags == strictConversion) {
00248
00249 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
00250 --source;
00251 result = sourceIllegal;
00252 break;
00253 }
00254 }
00255
00256 if (ch < (UTF32)0x80) { bytesToWrite = 1;
00257 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
00258 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
00259 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
00260 } else { bytesToWrite = 3;
00261 ch = UNI_REPLACEMENT_CHAR;
00262 }
00263
00264 target += bytesToWrite;
00265 if (target > targetEnd) {
00266 source = oldSource;
00267 target -= bytesToWrite; result = targetExhausted; break;
00268 }
00269 switch (bytesToWrite) {
00270 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00271 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00272 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00273 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
00274 }
00275 target += bytesToWrite;
00276 }
00277 *sourceStart = source;
00278 *targetStart = target;
00279 return result;
00280 }
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295 static Boolean isLegalUTF8(const UTF8 *source, int length) {
00296 UTF8 a;
00297 const UTF8 *srcptr = source+length;
00298 switch (length) {
00299 default: return false;
00300
00301 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00302 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00303 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
00304
00305 switch (*source) {
00306
00307 case 0xE0: if (a < 0xA0) return false; break;
00308 case 0xED: if (a > 0x9F) return false; break;
00309 case 0xF0: if (a < 0x90) return false; break;
00310 case 0xF4: if (a > 0x8F) return false; break;
00311 default: if (a < 0x80) return false;
00312 }
00313
00314 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
00315 }
00316 if (*source > 0xF4) return false;
00317 return true;
00318 }
00319
00320
00321
00322
00323
00324
00325
00326 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
00327 int length = trailingBytesForUTF8[*source]+1;
00328 if (source+length > sourceEnd) {
00329 return false;
00330 }
00331 return isLegalUTF8(source, length);
00332 }
00333
00334
00335
00336 ConversionResult ConvertUTF8toUTF16 (
00337 const UTF8** sourceStart, const UTF8* sourceEnd,
00338 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
00339 ConversionResult result = conversionOK;
00340 const UTF8* source = *sourceStart;
00341 UTF16* target = *targetStart;
00342 while (source < sourceEnd) {
00343 UTF32 ch = 0;
00344 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00345 if (source + extraBytesToRead >= sourceEnd) {
00346 result = sourceExhausted; break;
00347 }
00348
00349 if (! isLegalUTF8(source, extraBytesToRead+1)) {
00350 result = sourceIllegal;
00351 break;
00352 }
00353
00354
00355
00356 switch (extraBytesToRead) {
00357 case 5: ch += *source++; ch <<= 6;
00358 case 4: ch += *source++; ch <<= 6;
00359 case 3: ch += *source++; ch <<= 6;
00360 case 2: ch += *source++; ch <<= 6;
00361 case 1: ch += *source++; ch <<= 6;
00362 case 0: ch += *source++;
00363 }
00364 ch -= offsetsFromUTF8[extraBytesToRead];
00365
00366 if (target >= targetEnd) {
00367 source -= (extraBytesToRead+1);
00368 result = targetExhausted; break;
00369 }
00370 if (ch <= UNI_MAX_BMP) {
00371
00372 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00373 if (flags == strictConversion) {
00374 source -= (extraBytesToRead+1);
00375 result = sourceIllegal;
00376 break;
00377 } else {
00378 *target++ = UNI_REPLACEMENT_CHAR;
00379 }
00380 } else {
00381 *target++ = (UTF16)ch;
00382 }
00383 } else if (ch > UNI_MAX_UTF16) {
00384 if (flags == strictConversion) {
00385 result = sourceIllegal;
00386 source -= (extraBytesToRead+1);
00387 break;
00388 } else {
00389 *target++ = UNI_REPLACEMENT_CHAR;
00390 }
00391 } else {
00392
00393 if (target + 1 >= targetEnd) {
00394 source -= (extraBytesToRead+1);
00395 result = targetExhausted; break;
00396 }
00397 ch -= halfBase;
00398 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00399 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
00400 }
00401 }
00402 *sourceStart = source;
00403 *targetStart = target;
00404 return result;
00405 }
00406
00407
00408
00409 ConversionResult ConvertUTF32toUTF8 (
00410 const UTF32** sourceStart, const UTF32* sourceEnd,
00411 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
00412 ConversionResult result = conversionOK;
00413 const UTF32* source = *sourceStart;
00414 UTF8* target = *targetStart;
00415 while (source < sourceEnd) {
00416 UTF32 ch;
00417 unsigned short bytesToWrite = 0;
00418 const UTF32 byteMask = 0xBF;
00419 const UTF32 byteMark = 0x80;
00420 ch = *source++;
00421 if (flags == strictConversion ) {
00422
00423 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00424 --source;
00425 result = sourceIllegal;
00426 break;
00427 }
00428 }
00429
00430
00431
00432
00433 if (ch < (UTF32)0x80) { bytesToWrite = 1;
00434 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
00435 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
00436 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
00437 } else { bytesToWrite = 3;
00438 ch = UNI_REPLACEMENT_CHAR;
00439 result = sourceIllegal;
00440 }
00441
00442 target += bytesToWrite;
00443 if (target > targetEnd) {
00444 --source;
00445 target -= bytesToWrite; result = targetExhausted; break;
00446 }
00447 switch (bytesToWrite) {
00448 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00449 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00450 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00451 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
00452 }
00453 target += bytesToWrite;
00454 }
00455 *sourceStart = source;
00456 *targetStart = target;
00457 return result;
00458 }
00459
00460
00461
00462 ConversionResult ConvertUTF8toUTF32 (
00463 const UTF8** sourceStart, const UTF8* sourceEnd,
00464 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
00465 ConversionResult result = conversionOK;
00466 const UTF8* source = *sourceStart;
00467 UTF32* target = *targetStart;
00468 while (source < sourceEnd) {
00469 UTF32 ch = 0;
00470 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00471 if (source + extraBytesToRead >= sourceEnd) {
00472 result = sourceExhausted; break;
00473 }
00474
00475 if (! isLegalUTF8(source, extraBytesToRead+1)) {
00476 result = sourceIllegal;
00477 break;
00478 }
00479
00480
00481
00482 switch (extraBytesToRead) {
00483 case 5: ch += *source++; ch <<= 6;
00484 case 4: ch += *source++; ch <<= 6;
00485 case 3: ch += *source++; ch <<= 6;
00486 case 2: ch += *source++; ch <<= 6;
00487 case 1: ch += *source++; ch <<= 6;
00488 case 0: ch += *source++;
00489 }
00490 ch -= offsetsFromUTF8[extraBytesToRead];
00491
00492 if (target >= targetEnd) {
00493 source -= (extraBytesToRead+1);
00494 result = targetExhausted; break;
00495 }
00496 if (ch <= UNI_MAX_LEGAL_UTF32) {
00497
00498
00499
00500
00501 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00502 if (flags == strictConversion) {
00503 source -= (extraBytesToRead+1);
00504 result = sourceIllegal;
00505 break;
00506 } else {
00507 *target++ = UNI_REPLACEMENT_CHAR;
00508 }
00509 } else {
00510 *target++ = ch;
00511 }
00512 } else {
00513 result = sourceIllegal;
00514 *target++ = UNI_REPLACEMENT_CHAR;
00515 }
00516 }
00517 *sourceStart = source;
00518 *targetStart = target;
00519 return result;
00520 }
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539