Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00029 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00030
00031 #include <iterator>
00032
namespace utf8
{
    // Unsigned integer aliases used for octets, UTF-16 code units and code points.
    // NOTE(review): the names suggest 8/16/32-bit widths, but these are plain built-in
    // types and the widths are not enforced here - confirm for unusual targets.
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

// Helper code. Everything in utf8::internal is an implementation detail of the
// functions declared directly in namespace utf8.
namespace internal
{
    // Inclusive bounds of the UTF-16 lead (high) and trail (low) surrogate ranges.
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Precomputed offsets derived from the surrogate bounds above.
    // NOTE(review): presumably consumed by the UTF-16 <-> UTF-32 conversion code that
    // lives outside this file; nothing in this header reads them.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Largest value accepted as a code point by is_code_point_valid().
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet. Going through this helper keeps
    // the arithmetic below independent of whether the iterator's value type is signed.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True when the octet has the bit pattern 10xxxxxx, i.e. it is a continuation
    // (trail) octet of a multi-octet sequence.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((utf8::internal::mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    // True when cp lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp lies anywhere in the surrogate range
    // [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
    }

    // Determines the length of a UTF-8 sequence from its lead octet.
    //
    // lead_it must be dereferenceable; only *lead_it is read.
    // Returns 1, 2, 3 or 4 according to the lead octet's bit pattern
    // (0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx), or 0 when the octet matches none of
    // them and therefore cannot start a sequence.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = utf8::internal::mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    // Reports whether cp was encoded with more octets than necessary.
    //
    // cp     - the decoded code point.
    // length - the number of octets the sequence actually used.
    // Returns true when cp falls in the 1-, 2- or 3-octet range but length differs from
    // that minimal size. Values of 0x10000 and above are never reported as overlong.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    // Result codes produced by the decoding helpers below.
    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // Advances it by one position and checks that it now refers to a trail octet.
    //
    // Returns NOT_ENOUGH_ROOM when the increment reaches end, INCOMPLETE_SEQUENCE when
    // the octet at the new position is not a trail octet, and UTF8_OK otherwise.
    // it is left advanced in every case.
    template <typename octet_iterator>
    utf_error increase_safely(octet_iterator& it, octet_iterator end)
    {
        if (++it == end)
            return NOT_ENOUGH_ROOM;

        if (!utf8::internal::is_trail(*it))
            return INCOMPLETE_SEQUENCE;

        return UTF8_OK;
    }

    // Steps to the next octet and leaves the enclosing function with the error code
    // if that step fails. Only used by the get_sequence_N helpers; undefined again below.
    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}

    // get_sequence_N functions decode a sequence of exactly N octets starting at it.
    //
    // On success they store the decoded value in code_point, leave it on the LAST octet
    // of the sequence (not one past it) and return UTF8_OK. On failure they return the
    // error code; it may already have been advanced and code_point may hold a partial
    // value. They do not check for overlong forms or invalid code points.
    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        return UTF8_OK;
    }

    // Decodes a 2-octet sequence; see the get_sequence_N notes above.
    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // Keep the 5 payload bits of the lead octet and append the 6 bits of the trail.
        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);

        return UTF8_OK;
    }

    // Decodes a 3-octet sequence; see the get_sequence_N notes above.
    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // 4 payload bits from the lead octet, then 6 from the first trail octet.
        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    // Decodes a 4-octet sequence; see the get_sequence_N notes above.
    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // 3 payload bits from the lead octet, then 6 from each of the three trail octets.
        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR

    // Decodes and validates the UTF-8 sequence that starts at it.
    //
    // it         - in/out: start of the sequence. On success it is moved one past the
    //              last octet of the sequence; on any error it is restored to its
    //              value on entry.
    // end        - one past the last readable octet.
    // code_point - out: receives the decoded code point; written only on success.
    //
    // Returns UTF8_OK on success, otherwise:
    //   NOT_ENOUGH_ROOM     - the range is empty or ends inside the sequence,
    //   INVALID_LEAD        - the first octet cannot start a sequence,
    //   INCOMPLETE_SEQUENCE - an expected trail octet is missing,
    //   OVERLONG_SEQUENCE   - the value was encoded with more octets than needed,
    //   INVALID_CODE_POINT  - the value is a surrogate or exceeds CODE_POINT_MAX.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        // sequence_length() below reads *it, so an empty range has to be rejected
        // before anything is dereferenced.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Remember where we started so the iterator can be put back on failure.
        // (This relies on copies of the iterator staying usable after it is advanced.)
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = utf8::internal::sequence_length(it);

        // Get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 0:
                return INVALID_LEAD;
            case 1:
                err = utf8::internal::get_sequence_1(it, end, cp);
                break;
            case 2:
                err = utf8::internal::get_sequence_2(it, end, cp);
                break;
            case 3:
                err = utf8::internal::get_sequence_3(it, end, cp);
                break;
            case 4:
                err = utf8::internal::get_sequence_4(it, end, cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (utf8::internal::is_code_point_valid(cp)) {
                if (!utf8::internal::is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    code_point = cp;
                    // get_sequence_N leaves it on the last octet; step past it.
                    ++it;
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    // Same as the three-argument validate_next(), for callers that only need the
    // status and the iterator movement; the decoded code point is discarded.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        uint32_t ignored;
        return utf8::internal::validate_next(it, end, ignored);
    }

} // namespace internal

    // The three octets of the UTF-8 encoded byte order mark.
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    // Scans [start, end) sequence by sequence and returns an iterator to the first
    // octet of the first sequence that fails validation, or end when every sequence
    // in the range validates.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            // On failure validate_next() leaves result at the start of the bad sequence.
            utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    // True when find_invalid() finds no invalid sequence in [start, end).
    // An empty range is reported as valid.
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (utf8::find_invalid(start, end) == end);
    }

    // True when [it, end) holds at least three octets and the first three equal bom.
    // Each octet is read only after checking that end has not been reached.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
           );
    }

    // True when the three octets starting at it equal bom.
    // Unlike starts_with_bom() there is no end iterator, so no bounds check is possible:
    // the caller must guarantee that octets are readable up to the first mismatch
    // (three octets in the worst case). Prefer starts_with_bom() when a range end is known.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (utf8::internal::mask8(*it++)) == bom[0] &&
            (utf8::internal::mask8(*it++)) == bom[1] &&
            (utf8::internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8
00326
00327 #endif // header guard
00328
00329