00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include "Poco/UTF8Encoding.h"
00038 #include "Poco/String.h"
00039
00040
00041 namespace Poco {
00042
00043
00044 const char* UTF8Encoding::_names[] =
00045 {
00046 "UTF-8",
00047 "UTF8",
00048 NULL
00049 };
00050
00051
00052 const TextEncoding::CharacterMap UTF8Encoding::_charMap =
00053 {
00054 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
00055 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
00056 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
00057 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
00058 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
00059 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
00060 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
00061 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
00062 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00063 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00064 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00065 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00066 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
00067 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
00068 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
00069 -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -1, -1,
00070 };
00071
00072
00073 UTF8Encoding::UTF8Encoding()
00074 {
00075 }
00076
00077
00078 UTF8Encoding::~UTF8Encoding()
00079 {
00080 }
00081
00082
00083 const char* UTF8Encoding::canonicalName() const
00084 {
00085 return _names[0];
00086 }
00087
00088
00089 bool UTF8Encoding::isA(const std::string& encodingName) const
00090 {
00091 for (const char** name = _names; *name; ++name)
00092 {
00093 if (Poco::icompare(encodingName, *name) == 0)
00094 return true;
00095 }
00096 return false;
00097 }
00098
00099
00100 const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
00101 {
00102 return _charMap;
00103 }
00104
00105
00106 bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
00107 {
00108
00109
00110
00111
00112 if (0 == bytes || 0 == length) return false;
00113
00114 unsigned char a;
00115 const unsigned char* srcptr = bytes + length;
00116 switch (length)
00117 {
00118 default:
00119 return false;
00120
00121 case 4:
00122 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00123 case 3:
00124 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00125 case 2:
00126 if ((a = (*--srcptr)) > 0xBF) return false;
00127 switch (*bytes)
00128 {
00129 case 0xE0:
00130 if (a < 0xA0) return false;
00131 break;
00132 case 0xED:
00133 if (a > 0x9F) return false;
00134 break;
00135 case 0xF0:
00136 if (a < 0x90) return false;
00137 break;
00138 case 0xF4:
00139 if (a > 0x8F) return false;
00140 break;
00141 default:
00142 if (a < 0x80) return false;
00143 }
00144 case 1:
00145 if (*bytes >= 0x80 && *bytes < 0xC2) return false;
00146 }
00147 return *bytes <= 0xF4;
00148 }
00149
00150
00151 int UTF8Encoding::convert(const unsigned char* bytes) const
00152 {
00153 int n = _charMap[*bytes];
00154 int uc;
00155
00156 switch (n)
00157 {
00158 case -6:
00159 case -5:
00160 case -1:
00161 return -1;
00162 case -4:
00163 case -3:
00164 case -2:
00165 if (!isLegal(bytes, -n)) return -1;
00166 uc = *bytes & ((0x07 << (n + 4)) | 0x03);
00167 break;
00168 default:
00169 return n;
00170 }
00171
00172 while (n++ < -1)
00173 {
00174 uc <<= 6;
00175 uc |= (*++bytes & 0x3F);
00176 }
00177 return uc;
00178 }
00179
00180
00181 int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
00182 {
00183 #ifdef _DEBUG
00184 unsigned char* lb = bytes;
00185 #endif
00186
00187 if (ch <= 0x7F)
00188 {
00189 if (bytes && length >= 1)
00190 *bytes = (unsigned char) ch;
00191 return 1;
00192 }
00193 else if (ch <= 0x7FF)
00194 {
00195 if (bytes && length >= 2)
00196 {
00197 *bytes++ = (unsigned char) (((ch >> 6) & 0x1F) | 0xC0);
00198 *bytes = (unsigned char) ((ch & 0x3F) | 0x80);
00199 }
00200 poco_assert_dbg (isLegal(lb, 2));
00201 return 2;
00202 }
00203 else if (ch <= 0xFFFF)
00204 {
00205 if (bytes && length >= 3)
00206 {
00207 *bytes++ = (unsigned char) (((ch >> 12) & 0x0F) | 0xE0);
00208 *bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80);
00209 *bytes = (unsigned char) ((ch & 0x3F) | 0x80);
00210 }
00211 poco_assert_dbg (isLegal(lb, 3));
00212 return 3;
00213 }
00214 else if (ch <= 0x10FFFF)
00215 {
00216 if (bytes && length >= 4)
00217 {
00218 *bytes++ = (unsigned char) (((ch >> 18) & 0x07) | 0xF0);
00219 *bytes++ = (unsigned char) (((ch >> 12) & 0x3F) | 0x80);
00220 *bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80);
00221 *bytes = (unsigned char) ((ch & 0x3F) | 0x80);
00222 }
00223 poco_assert_dbg (isLegal(lb, 4));
00224 return 4;
00225 }
00226 else return 0;
00227 }
00228
00229
00230 }