oculus_sdk: OVR_UTF8Util.cpp Source File

Go to the documentation of this file.
00001 /**************************************************************************
00002 
00003 Filename    :   OVR_UTF8Util.cpp
00004 Content     :   UTF8 Unicode character encoding/decoding support
00005 Created     :   September 19, 2012
00006 Notes       : 
00007 Notes       :   Much useful info at "UTF-8 and Unicode FAQ"
00008                 http://www.cl.cam.ac.uk/~mgk25/unicode.html
00009 
00010 Copyright   :   Copyright 2012 Oculus VR, Inc. All Rights reserved.
00011 
00012 Use of this software is subject to the terms of the Oculus license
00013 agreement provided at the time of installation or download, or which
00014 otherwise accompanies this software in either electronic or hard copy form.
00015 
00016 ************************************************************************************/
00017 
00018 #include "OVR_UTF8Util.h"
00019 
00020 namespace OVR { namespace UTF8Util {
00021 
00022 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)
00023 {
00024     const char* p = buf;
00025     SPInt length = 0;
00026 
00027     if (buflen != -1)
00028     {
00029         while (p - buf < buflen)
00030         {
00031             // We should be able to have ASStrings with 0 in the middle.
00032             UTF8Util::DecodeNextChar_Advance0(&p);
00033             length++;
00034         }
00035     }
00036     else
00037     {
00038         while (UTF8Util::DecodeNextChar_Advance0(&p))
00039             length++;
00040     }
00041     
00042     return length;
00043 }
00044 
00045 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)
00046 {
00047     const char* buf = putf8str;
00048     UInt32  c = 0;
00049 
00050     if (length != -1)
00051     {
00052         while (buf - putf8str < length)
00053         {           
00054             c = UTF8Util::DecodeNextChar_Advance0(&buf);
00055             if (index == 0)
00056                 return c;
00057             index--;
00058         }
00059 
00060         return c;
00061     }
00062 
00063     do 
00064     {
00065         c = UTF8Util::DecodeNextChar_Advance0(&buf);
00066         index--;
00067 
00068         if (c == 0)
00069         {
00070             // We've hit the end of the string; don't go further.
00071             OVR_ASSERT(index == 0);
00072             return c;
00073         }
00074     } while (index >= 0);
00075 
00076     return c;
00077 }
00078 
00079 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
00080 {
00081     const char* buf = putf8str;
00082 
00083     if (length != -1)
00084     {
00085         while ((buf - putf8str) < length && index > 0)
00086         {
00087             UTF8Util::DecodeNextChar_Advance0(&buf);
00088             index--;
00089         }
00090 
00091         return buf-putf8str;
00092     }
00093 
00094     while (index > 0) 
00095     {
00096         UInt32 c = UTF8Util::DecodeNextChar_Advance0(&buf);
00097         index--;
00098 
00099         if (c == 0)
00100             return buf-putf8str;
00101     };
00102 
00103     return buf-putf8str;
00104 }
00105 
00106 int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)
00107 {
00108     if (ucs_character <= 0x7F)
00109         return 1;
00110     else if (ucs_character <= 0x7FF)
00111         return 2;
00112     else if (ucs_character <= 0xFFFF)
00113         return 3;
00114     else if (ucs_character <= 0x1FFFFF)
00115         return 4;
00116     else if (ucs_character <= 0x3FFFFFF)
00117         return 5;
00118     else if (ucs_character <= 0x7FFFFFFF)
00119         return 6;
00120     else
00121         return 0;
00122 }
00123 
00124 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
00125 {
00126     UInt32  uc;
00127     char    c;
00128     
00129     // Security considerations:
00130     //
00131     // Changed, this is now only the case for DecodeNextChar:
00132     //  - If we hit a zero byte, we want to return 0 without stepping
00133     //    the buffer pointer past the 0. th
00134     //
00135     // If we hit an "overlong sequence"; i.e. a character encoded
00136     // in a longer multibyte string than is necessary, then we
00137     // need to discard the character.  This is so attackers can't
00138     // disguise dangerous characters or character sequences --
00139     // there is only one valid encoding for each character.
00140     //
00141     // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
00142     // 0xFFFF } then we ignore them; they are not valid in UTF-8.
00143     
00144     // This isn't actually an invalid character; it's a valid char that
00145     // looks like an inverted question mark.
00146 #define INVALID_CHAR 0x0FFFD
00147     
00148 #define FIRST_BYTE(mask, shift)     \
00149     uc = (c & (mask)) << (shift);
00150     
00151 #define NEXT_BYTE(shift) \
00152     c = **putf8Buffer;   \
00153     if (c == 0) return 0; /* end of buffer, do not advance */   \
00154     if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */  \
00155     (*putf8Buffer)++;    \
00156     uc |= (c & 0x3F) << shift;
00157     
00158     c = **putf8Buffer;
00159     (*putf8Buffer)++;
00160     if (c == 0)
00161         return 0;   // End of buffer.
00162     
00163     if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII.
00164     
00165     // Multi-byte sequences.
00166     if ((c & 0xE0) == 0xC0)
00167     {
00168         // Two-byte sequence.
00169         FIRST_BYTE(0x1F, 6);
00170         NEXT_BYTE(0);
00171         if (uc < 0x80) return INVALID_CHAR;  // overlong
00172         return uc;
00173     }
00174     else if ((c & 0xF0) == 0xE0)
00175     {
00176         // Three-byte sequence.
00177         FIRST_BYTE(0x0F, 12);
00178         NEXT_BYTE(6);
00179         NEXT_BYTE(0);
00180         if (uc < 0x800) return INVALID_CHAR; // overlong
00181         // Not valid ISO 10646, but Flash requires these to work
00182         // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)
00183         // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR;
00184         // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646
00185         return uc;
00186     }
00187     else if ((c & 0xF8) == 0xF0)
00188     {
00189         // Four-byte sequence.
00190         FIRST_BYTE(0x07, 18);
00191         NEXT_BYTE(12);
00192         NEXT_BYTE(6);
00193         NEXT_BYTE(0);
00194         if (uc < 0x010000) return INVALID_CHAR;  // overlong
00195         return uc;
00196     }
00197     else if ((c & 0xFC) == 0xF8)
00198     {
00199         // Five-byte sequence.
00200         FIRST_BYTE(0x03, 24);
00201         NEXT_BYTE(18);
00202         NEXT_BYTE(12);
00203         NEXT_BYTE(6);
00204         NEXT_BYTE(0);
00205         if (uc < 0x0200000) return INVALID_CHAR; // overlong
00206         return uc;
00207     }
00208     else if ((c & 0xFE) == 0xFC)
00209     {
00210         // Six-byte sequence.
00211         FIRST_BYTE(0x01, 30);
00212         NEXT_BYTE(24);
00213         NEXT_BYTE(18);
00214         NEXT_BYTE(12);
00215         NEXT_BYTE(6);
00216         NEXT_BYTE(0);
00217         if (uc < 0x04000000) return INVALID_CHAR;    // overlong
00218         return uc;
00219     }
00220     else
00221     {
00222         // Invalid.
00223         return INVALID_CHAR;
00224     }
00225 }
00226 
00227 
00228 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)
00229 {
00230     if (ucs_character <= 0x7F)
00231     {
00232         // Plain single-byte ASCII.
00233         pbuffer[(*pindex)++] = (char) ucs_character;
00234     }
00235     else if (ucs_character <= 0x7FF)
00236     {
00237         // Two bytes.
00238         pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);
00239         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00240     }
00241     else if (ucs_character <= 0xFFFF)
00242     {
00243         // Three bytes.
00244         pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);
00245         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
00246         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00247     }
00248     else if (ucs_character <= 0x1FFFFF)
00249     {
00250         // Four bytes.
00251         pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);
00252         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
00253         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
00254         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00255     }
00256     else if (ucs_character <= 0x3FFFFFF)
00257     {
00258         // Five bytes.
00259         pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);
00260         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
00261         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
00262         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
00263         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00264     }
00265     else if (ucs_character <= 0x7FFFFFFF)
00266     {
00267         // Six bytes.
00268         pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);
00269         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);
00270         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
00271         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
00272         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
00273         pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
00274     }
00275     else
00276     {
00277         // Invalid char; don't encode anything.
00278     }
00279 }
00280 
00281 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)
00282 {
00283     SPInt len = 0;
00284     if (length != -1)
00285         for (int i = 0; i < length; i++)
00286         {
00287             len += GetEncodeCharSize(pchar[i]);
00288         }
00289     else
00290         for (int i = 0;; i++)
00291         {
00292             if (pchar[i] == 0)
00293                 return len;
00294             len += GetEncodeCharSize(pchar[i]);
00295         }
00296     return len;
00297 }
00298 
00299 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)
00300 {
00301     SPInt ofs = 0;
00302     if (length != -1)
00303     {
00304         for (int i = 0; i < length; i++)
00305         {            
00306             EncodeChar(pbuff, &ofs, pchar[i]);
00307         }
00308     }
00309     else
00310     {
00311         for (int i = 0;; i++)
00312         {
00313             if (pchar[i] == 0)
00314                 break;
00315             EncodeChar(pbuff, &ofs, pchar[i]);
00316         }
00317     }
00318     pbuff[ofs] = 0;
00319 }
00320 
00321 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)
00322 {
00323     wchar_t *pbegin = pbuff;
00324     if (bytesLen == -1)
00325     {
00326         while (1)
00327         {
00328             UInt32 ch = DecodeNextChar_Advance0(&putf8str);
00329             if (ch == 0)
00330                 break;
00331             else if (ch >= 0xFFFF)
00332                 ch = 0xFFFD;
00333             *pbuff++ = wchar_t(ch);
00334         }
00335     }
00336     else
00337     {
00338         const char* p = putf8str;
00339         while ((p - putf8str) < bytesLen)
00340         {
00341             UInt32 ch = DecodeNextChar_Advance0(&p);
00342             if (ch >= 0xFFFF)
00343                 ch = 0xFFFD;
00344             *pbuff++ = wchar_t(ch);
00345         }
00346     }
00347 
00348     *pbuff = 0;
00349     return pbuff - pbegin;
00350 }
00351 
00352 
00353 #ifdef UTF8_UNIT_TEST
00354 
00355 // Compile this test case with something like:
00356 //
00357 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
00358 //
00359 //    or
00360 //
00361 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
00362 //
00363 // If possible, try running the test program with the first arg
00364 // pointing at the file:
00365 //
00366 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
00367 // 
00368 // and examine the results by eye to make sure they are acceptable to
00369 // you.
00370 
00371 
00372 #include "base/utility.h"
00373 #include <stdio.h>
00374 
00375 
00376 bool    check_equal(const char* utf8_in, const UInt32* ucs_in)
00377 {
00378     for (;;)
00379     {
00380         UInt32  next_ucs = *ucs_in++;
00381         UInt32  next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
00382         if (next_ucs != next_ucs_from_utf8)
00383         {
00384             return false;
00385         }
00386         if (next_ucs == 0)
00387         {
00388             OVR_ASSERT(next_ucs_from_utf8 == 0);
00389             break;
00390         }
00391     }
00392     
00393     return true;
00394 }
00395 
00396 
// Test helper: prints the zero-terminated byte string 'line' to stdout.
// Newlines and bytes in the range 32..127 are printed as-is; every other
// byte is printed as "<0xNN>".
void    log_ascii(const char* line)
{
    for (const char* p = line; *p != 0; ++p)
    {
        const unsigned char byte = (unsigned char) *p;
        const bool printAsIs = (byte == '\n') || (byte >= 32 && byte <= 127);

        if (printAsIs)
        {
            printf("%c", byte);
        }
        else
        {
            // Non-printable as plain ASCII.
            printf("<0x%02X>", (int) byte);
        }
    }
}
00419 
00420 
00421 void    log_ucs(const UInt32* line)
00422 {
00423     for (;;)
00424     {
00425         UInt32  uc = *line++;
00426         if (uc == 0)
00427         {
00428             // End of line.
00429             return;
00430         }
00431         else if (uc != '\n'
00432             && (uc < 32 || uc > 127))
00433         {
00434             // Non-printable as plain ASCII.
00435             printf("<U-%04X>", uc);
00436         }
00437         else
00438         {
00439             printf("%c", (char) uc);
00440         }
00441     }
00442 }
00443 
00444 
00445 // Simple canned test.
00446 int main(int argc, const char* argv[])
00447 {
00448     {
00449         const char* test8 = "Ignacio Castaño";
00450         const UInt32    test32[] =
00451         {
00452             0x49, 0x67, 0x6E, 0x61, 0x63,
00453                 0x69, 0x6F, 0x20, 0x43, 0x61,
00454                 0x73, 0x74, 0x61, 0xF1, 0x6F,
00455                 0x00
00456         };
00457         
00458         OVR_ASSERT(check_equal(test8, test32));
00459     }
00460         
00461         // If user passed an arg, try reading the file as UTF-8 encoded text.
00462         if (argc > 1)
00463         {
00464             const char* filename = argv[1];
00465             FILE*   fp = fopen(filename, "rb");
00466             if (fp == NULL)
00467             {
00468                 printf("Can't open file '%s'\n", filename);
00469                 return 1;
00470             }
00471             
00472             // Read lines from the file, encode/decode them, and highlight discrepancies.
00473             const int LINE_SIZE = 200;  // max line size
00474             char    line_buffer_utf8[LINE_SIZE];
00475             char    reencoded_utf8[6 * LINE_SIZE];
00476             UInt32  line_buffer_ucs[LINE_SIZE];
00477             
00478             int byte_counter = 0;
00479             for (;;)
00480             {
00481                 int c = fgetc(fp);
00482                 if (c == EOF)
00483                 {
00484                     // Done.
00485                     break;
00486                 }
00487                 line_buffer_utf8[byte_counter++] = c;
00488                 if (c == '\n' || byte_counter >= LINE_SIZE - 2)
00489                 {
00490                     // End of line.  Process the line.
00491                     line_buffer_utf8[byte_counter++] = 0;   // terminate.
00492                     
00493                     // Decode into UCS.
00494                     const char* p = line_buffer_utf8;
00495                     UInt32* q = line_buffer_ucs;
00496                     for (;;)
00497                     {
00498                         UInt32  uc = UTF8Util::DecodeNextChar(&p);
00499                         *q++ = uc;
00500                         
00501                         OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
00502                         OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
00503                         
00504                         if (uc == 0) break;
00505                     }
00506                     
00507                     // Encode back into UTF-8.
00508                     q = line_buffer_ucs;
00509                     int index = 0;
00510                     for (;;)
00511                     {
00512                         UInt32  uc = *q++;
00513                         OVR_ASSERT(index < LINE_SIZE * 6 - 6);
00514                         int last_index = index;
00515                         UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
00516                         OVR_ASSERT(index <= last_index + 6);
00517                         if (uc == 0) break;
00518                     }
00519                     
00520                     // This can be useful for debugging.
00521 #if 0
00522                     // Show the UCS and the re-encoded UTF-8.
00523                     log_ucs(line_buffer_ucs);
00524                     log_ascii(reencoded_utf8);
00525 #endif // 0
00526                     
00527                     OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
00528                     OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
00529                     
00530                     // Start next line.
00531                     byte_counter = 0;
00532                 }
00533             }
00534             
00535             fclose(fp);
00536         }
00537         
00538         return 0;
00539 }
00540 
00541 
00542 #endif // UTF8_UNIT_TEST
00543 
00544 }} // namespace UTF8Util::OVR
00545