00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00029 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00030
#include "core.h"
#include <cstddef>
#include <iterator>
#include <stdexcept>
00033
00034 namespace utf8
00035 {
00036
// Common base class of the exception types declared in this header
// (invalid_code_point, invalid_utf8, invalid_utf16, not_enough_room).
// It adds nothing to std::exception beyond a distinct type to catch.
class exception : public ::std::exception
{
};
00039
00040
00041 class invalid_code_point : public exception {
00042 uint32_t cp;
00043 public:
00044 invalid_code_point(uint32_t cp) : cp(cp) {}
00045 virtual const char* what() const throw() { return "Invalid code point"; }
00046 uint32_t code_point() const {return cp;}
00047 };
00048
00049 class invalid_utf8 : public exception {
00050 uint8_t u8;
00051 public:
00052 invalid_utf8 (uint8_t u) : u8(u) {}
00053 virtual const char* what() const throw() { return "Invalid UTF-8"; }
00054 uint8_t utf8_octet() const {return u8;}
00055 };
00056
00057 class invalid_utf16 : public exception {
00058 uint16_t u16;
00059 public:
00060 invalid_utf16 (uint16_t u) : u16(u) {}
00061 virtual const char* what() const throw() { return "Invalid UTF-16"; }
00062 uint16_t utf16_word() const {return u16;}
00063 };
00064
00065 class not_enough_room : public exception {
00066 public:
00067 virtual const char* what() const throw() { return "Not enough space"; }
00068 };
00069
00071
00072 template <typename octet_iterator>
00073 octet_iterator append(uint32_t cp, octet_iterator result)
00074 {
00075 if (!utf8::internal::is_code_point_valid(cp))
00076 throw invalid_code_point(cp);
00077
00078 if (cp < 0x80)
00079 *(result++) = static_cast<uint8_t>(cp);
00080 else if (cp < 0x800) {
00081 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
00082 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00083 }
00084 else if (cp < 0x10000) {
00085 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
00086 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
00087 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00088 }
00089 else {
00090 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
00091 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
00092 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
00093 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00094 }
00095 return result;
00096 }
00097
00098 template <typename octet_iterator, typename output_iterator>
00099 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
00100 {
00101 while (start != end) {
00102 octet_iterator sequence_start = start;
00103 internal::utf_error err_code = utf8::internal::validate_next(start, end);
00104 switch (err_code) {
00105 case internal::UTF8_OK :
00106 for (octet_iterator it = sequence_start; it != start; ++it)
00107 *out++ = *it;
00108 break;
00109 case internal::NOT_ENOUGH_ROOM:
00110 throw not_enough_room();
00111 case internal::INVALID_LEAD:
00112 out = utf8::append (replacement, out);
00113 ++start;
00114 break;
00115 case internal::INCOMPLETE_SEQUENCE:
00116 case internal::OVERLONG_SEQUENCE:
00117 case internal::INVALID_CODE_POINT:
00118 out = utf8::append (replacement, out);
00119 ++start;
00120
00121 while (start != end && utf8::internal::is_trail(*start))
00122 ++start;
00123 break;
00124 }
00125 }
00126 return out;
00127 }
00128
00129 template <typename octet_iterator, typename output_iterator>
00130 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
00131 {
00132 static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
00133 return utf8::replace_invalid(start, end, out, replacement_marker);
00134 }
00135
00136 template <typename octet_iterator>
00137 uint32_t next(octet_iterator& it, octet_iterator end)
00138 {
00139 uint32_t cp = 0;
00140 internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
00141 switch (err_code) {
00142 case internal::UTF8_OK :
00143 break;
00144 case internal::NOT_ENOUGH_ROOM :
00145 throw not_enough_room();
00146 case internal::INVALID_LEAD :
00147 case internal::INCOMPLETE_SEQUENCE :
00148 case internal::OVERLONG_SEQUENCE :
00149 throw invalid_utf8(*it);
00150 case internal::INVALID_CODE_POINT :
00151 throw invalid_code_point(cp);
00152 }
00153 return cp;
00154 }
00155
00156 template <typename octet_iterator>
00157 uint32_t peek_next(octet_iterator it, octet_iterator end)
00158 {
00159 return utf8::next(it, end);
00160 }
00161
00162 template <typename octet_iterator>
00163 uint32_t prior(octet_iterator& it, octet_iterator start)
00164 {
00165
00166 if (it == start)
00167 throw not_enough_room();
00168
00169 octet_iterator end = it;
00170
00171 while (utf8::internal::is_trail(*(--it)))
00172 if (it == start)
00173 throw invalid_utf8(*it);
00174 return utf8::peek_next(it, end);
00175 }
00176
00178 template <typename octet_iterator>
00179 uint32_t previous(octet_iterator& it, octet_iterator pass_start)
00180 {
00181 octet_iterator end = it;
00182 while (utf8::internal::is_trail(*(--it)))
00183 if (it == pass_start)
00184 throw invalid_utf8(*it);
00185 octet_iterator temp = it;
00186 return utf8::next(temp, end);
00187 }
00188
00189 template <typename octet_iterator, typename distance_type>
00190 void advance (octet_iterator& it, distance_type n, octet_iterator end)
00191 {
00192 for (distance_type i = 0; i < n; ++i)
00193 utf8::next(it, end);
00194 }
00195
00196 template <typename octet_iterator>
00197 typename std::iterator_traits<octet_iterator>::difference_type
00198 distance (octet_iterator first, octet_iterator last)
00199 {
00200 typename std::iterator_traits<octet_iterator>::difference_type dist;
00201 for (dist = 0; first < last; ++dist)
00202 utf8::next(first, last);
00203 return dist;
00204 }
00205
00206 template <typename u16bit_iterator, typename octet_iterator>
00207 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00208 {
00209 while (start != end) {
00210 uint32_t cp = utf8::internal::mask16(*start++);
00211
00212 if (utf8::internal::is_lead_surrogate(cp)) {
00213 if (start != end) {
00214 uint32_t trail_surrogate = utf8::internal::mask16(*start++);
00215 if (utf8::internal::is_trail_surrogate(trail_surrogate))
00216 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
00217 else
00218 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
00219 }
00220 else
00221 throw invalid_utf16(static_cast<uint16_t>(cp));
00222
00223 }
00224
00225 else if (utf8::internal::is_trail_surrogate(cp))
00226 throw invalid_utf16(static_cast<uint16_t>(cp));
00227
00228 result = utf8::append(cp, result);
00229 }
00230 return result;
00231 }
00232
00233 template <typename u16bit_iterator, typename octet_iterator>
00234 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00235 {
00236 while (start != end) {
00237 uint32_t cp = utf8::next(start, end);
00238 if (cp > 0xffff) {
00239 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
00240 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00241 }
00242 else
00243 *result++ = static_cast<uint16_t>(cp);
00244 }
00245 return result;
00246 }
00247
00248 template <typename octet_iterator, typename u32bit_iterator>
00249 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
00250 {
00251 while (start != end)
00252 result = utf8::append(*(start++), result);
00253
00254 return result;
00255 }
00256
00257 template <typename octet_iterator, typename u32bit_iterator>
00258 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
00259 {
00260 while (start != end)
00261 (*result++) = utf8::next(start, end);
00262
00263 return result;
00264 }
00265
00266
00267 template <typename octet_iterator>
00268 class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
00269 octet_iterator it;
00270 octet_iterator range_start;
00271 octet_iterator range_end;
00272 public:
00273 iterator () {}
00274 explicit iterator (const octet_iterator& octet_it,
00275 const octet_iterator& range_start,
00276 const octet_iterator& range_end) :
00277 it(octet_it), range_start(range_start), range_end(range_end)
00278 {
00279 if (it < range_start || it > range_end)
00280 throw std::out_of_range("Invalid utf-8 iterator position");
00281 }
00282
00283 octet_iterator base () const { return it; }
00284 uint32_t operator * () const
00285 {
00286 octet_iterator temp = it;
00287 return utf8::next(temp, range_end);
00288 }
00289 bool operator == (const iterator& rhs) const
00290 {
00291 if (range_start != rhs.range_start || range_end != rhs.range_end)
00292 throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
00293 return (it == rhs.it);
00294 }
00295 bool operator != (const iterator& rhs) const
00296 {
00297 return !(operator == (rhs));
00298 }
00299 iterator& operator ++ ()
00300 {
00301 utf8::next(it, range_end);
00302 return *this;
00303 }
00304 iterator operator ++ (int)
00305 {
00306 iterator temp = *this;
00307 utf8::next(it, range_end);
00308 return temp;
00309 }
00310 iterator& operator -- ()
00311 {
00312 utf8::prior(it, range_start);
00313 return *this;
00314 }
00315 iterator operator -- (int)
00316 {
00317 iterator temp = *this;
00318 utf8::prior(it, range_start);
00319 return temp;
00320 }
00321 };
00322
00323 }
00324
00325 #endif //header guard
00326
00327