00001 #include "stream.h"
00002 #include <iostream>
00003 #include "exp.h"
00004
// Size, in bytes, of the raw-byte buffer that Stream allocates and refills
// from the underlying streambuf. Can be overridden by defining it before
// this point (e.g. on the compiler command line).
#ifndef YAML_PREFETCH_SIZE
#define YAML_PREFETCH_SIZE 2048
#endif

// Element count of a true array (must not be used on a pointer).
#define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
// One-past-the-end pointer of a true array.
#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))

// U+FFFD REPLACEMENT CHARACTER: queued in place of input that cannot be decoded.
#define CP_REPLACEMENT_CHARACTER (0xFFFD)
00013
00014 namespace YAML_PM
00015 {
// States of the small state machine the Stream constructor runs over the
// first few input bytes to pick a character set. The enumerator order is
// significant: the values index s_introFinalState, s_introTransitions and
// s_introUngetCount.
enum UtfIntroState {
    uis_start,          // nothing read yet
    uis_utfbe_b1,       // read 00
    uis_utf32be_b2,     // read 00 00
    uis_utf32be_bom3,   // read 00 00 FE
    uis_utf32be,        // final: UTF-32 big-endian
    uis_utf16be,        // final: UTF-16 big-endian
    uis_utf16be_bom1,   // read FE
    uis_utfle_bom1,     // read FF
    uis_utf16le_bom2,   // read FF FE
    uis_utf32le_bom3,   // read FF FE 00
    uis_utf16le,        // final: UTF-16 little-endian
    uis_utf32le,        // final: UTF-32 little-endian
    uis_utf8_imp,       // read one "ascii"-class byte, no BOM
    uis_utf16le_imp,    // read "ascii"-class byte, then 00
    uis_utf32le_imp3,   // read "ascii"-class byte, then 00 00
    uis_utf8_bom1,      // read EF
    uis_utf8_bom2,      // read EF BB
    uis_utf8,           // final: UTF-8
    uis_error           // final: marked final in s_introFinalState; no transition targets it
};
00037
// Classification of one intro byte, as produced by IntroCharTypeOf().
// The values are the column indices of s_introTransitions and
// s_introUngetCount; uictMax is the number of columns.
enum UtfIntroCharType {
    uict00,      // byte 0x00
    uictBB,      // byte 0xBB
    uictBF,      // byte 0xBF
    uictEF,      // byte 0xEF
    uictFE,      // byte 0xFE
    uictFF,      // byte 0xFF
    uictAscii,   // any other byte strictly between 0x00 and 0xFF
    uictOther,   // end of input, or a value outside the byte range
    uictMax      // column count, not a real class
};
00049
// Indexed by UtfIntroState: true when the state ends character-set
// detection. The table is read-only, so it is declared const.
static const bool s_introFinalState[] = {
    false, // uis_start
    false, // uis_utfbe_b1
    false, // uis_utf32be_b2
    false, // uis_utf32be_bom3
    true,  // uis_utf32be
    true,  // uis_utf16be
    false, // uis_utf16be_bom1
    false, // uis_utfle_bom1
    false, // uis_utf16le_bom2
    false, // uis_utf32le_bom3
    true,  // uis_utf16le
    true,  // uis_utf32le
    false, // uis_utf8_imp
    false, // uis_utf16le_imp
    false, // uis_utf32le_imp3
    false, // uis_utf8_bom1
    false, // uis_utf8_bom2
    true,  // uis_utf8
    true,  // uis_error
};
00071
00072 static UtfIntroState s_introTransitions[][uictMax] = {
00073
00074 {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1, uis_utfle_bom1, uis_utf8_imp, uis_utf8},
00075 {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8},
00076 {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8, uis_utf8, uis_utf8},
00077 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8, uis_utf8},
00078 {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be},
00079 {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be},
00080 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8, uis_utf8},
00081 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8, uis_utf8, uis_utf8},
00082 {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
00083 {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
00084 {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
00085 {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le},
00086 {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
00087 {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
00088 {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
00089 {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
00090 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
00091 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
00092 };
00093
00094 static char s_introUngetCount[][uictMax] = {
00095
00096 {0, 1, 1, 0, 0, 0, 0, 1},
00097 {0, 2, 2, 2, 2, 2, 2, 2},
00098 {3, 3, 3, 3, 0, 3, 3, 3},
00099 {4, 4, 4, 4, 4, 0, 4, 4},
00100 {1, 1, 1, 1, 1, 1, 1, 1},
00101 {1, 1, 1, 1, 1, 1, 1, 1},
00102 {2, 2, 2, 2, 2, 0, 2, 2},
00103 {2, 2, 2, 2, 0, 2, 2, 2},
00104 {0, 1, 1, 1, 1, 1, 1, 1},
00105 {0, 2, 2, 2, 2, 2, 2, 2},
00106 {1, 1, 1, 1, 1, 1, 1, 1},
00107 {1, 1, 1, 1, 1, 1, 1, 1},
00108 {0, 2, 2, 2, 2, 2, 2, 2},
00109 {0, 3, 3, 3, 3, 3, 3, 3},
00110 {4, 4, 4, 4, 4, 4, 4, 4},
00111 {2, 0, 2, 2, 2, 2, 2, 2},
00112 {3, 3, 0, 3, 3, 3, 3, 3},
00113 {1, 1, 1, 1, 1, 1, 1, 1},
00114 };
00115
00116 inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
00117 {
00118 if (std::istream::traits_type::eof() == ch) {
00119 return uictOther;
00120 }
00121
00122 switch (ch) {
00123 case 0: return uict00;
00124 case 0xBB: return uictBB;
00125 case 0xBF: return uictBF;
00126 case 0xEF: return uictEF;
00127 case 0xFE: return uictFE;
00128 case 0xFF: return uictFF;
00129 }
00130
00131 if ((ch > 0) && (ch < 0xFF)) {
00132 return uictAscii;
00133 }
00134
00135 return uictOther;
00136 }
00137
// Builds a single UTF-8 byte for code point `ch`.
//   lead_bits: number of leading 1 bits in the byte (0 for a one-byte
//              sequence, 1 for a continuation byte, 2..4 for a lead byte).
//   rshift:    how far to shift `ch` right to bring this byte's payload
//              bits into the low positions.
// Returns the marker bits OR'ed with the masked payload.
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
{
    // lead_bits ones in the high positions, e.g. 2 -> 0xC0.
    const unsigned char marker =
        static_cast<unsigned char>(((1 << lead_bits) - 1) << (8 - lead_bits));
    // Payload occupies everything below the marker and its terminating 0 bit.
    const unsigned char payloadMask =
        static_cast<unsigned char>(0xFF >> (lead_bits + 1));
    const unsigned long payload = (ch >> rshift) & payloadMask;

    return static_cast<char>(static_cast<unsigned char>(marker | payload));
}
00146
00147 inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
00148 {
00149
00150
00151 if (static_cast<unsigned long>(Stream::eof()) == ch)
00152 {
00153 ch = CP_REPLACEMENT_CHARACTER;
00154 }
00155
00156 if (ch < 0x80)
00157 {
00158 q.push_back(Utf8Adjust(ch, 0, 0));
00159 }
00160 else if (ch < 0x800)
00161 {
00162 q.push_back(Utf8Adjust(ch, 2, 6));
00163 q.push_back(Utf8Adjust(ch, 1, 0));
00164 }
00165 else if (ch < 0x10000)
00166 {
00167 q.push_back(Utf8Adjust(ch, 3, 12));
00168 q.push_back(Utf8Adjust(ch, 1, 6));
00169 q.push_back(Utf8Adjust(ch, 1, 0));
00170 }
00171 else
00172 {
00173 q.push_back(Utf8Adjust(ch, 4, 18));
00174 q.push_back(Utf8Adjust(ch, 1, 12));
00175 q.push_back(Utf8Adjust(ch, 1, 6));
00176 q.push_back(Utf8Adjust(ch, 1, 0));
00177 }
00178 }
00179
00180 Stream::Stream(std::istream& input)
00181 : m_input(input),
00182 m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
00183 m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
00184 {
00185 typedef std::istream::traits_type char_traits;
00186
00187 if(!input)
00188 return;
00189
00190
00191
00192 char_traits::int_type intro[4];
00193 int nIntroUsed = 0;
00194 UtfIntroState state = uis_start;
00195 for(; !s_introFinalState[state]; ) {
00196 std::istream::int_type ch = input.get();
00197 intro[nIntroUsed++] = ch;
00198 UtfIntroCharType charType = IntroCharTypeOf(ch);
00199 UtfIntroState newState = s_introTransitions[state][charType];
00200 int nUngets = s_introUngetCount[state][charType];
00201 if(nUngets > 0) {
00202 input.clear();
00203 for(; nUngets > 0; --nUngets) {
00204 if(char_traits::eof() != intro[--nIntroUsed])
00205 input.putback(char_traits::to_char_type(intro[nIntroUsed]));
00206 }
00207 }
00208 state = newState;
00209 }
00210
00211 switch (state) {
00212 case uis_utf8: m_charSet = utf8; break;
00213 case uis_utf16le: m_charSet = utf16le; break;
00214 case uis_utf16be: m_charSet = utf16be; break;
00215 case uis_utf32le: m_charSet = utf32le; break;
00216 case uis_utf32be: m_charSet = utf32be; break;
00217 default: m_charSet = utf8; break;
00218 }
00219
00220 ReadAheadTo(0);
00221 }
00222
00223 Stream::~Stream()
00224 {
00225 delete[] m_pPrefetched;
00226 }
00227
00228 char Stream::peek() const
00229 {
00230 if (m_readahead.empty())
00231 {
00232 return Stream::eof();
00233 }
00234
00235 return m_readahead[0];
00236 }
00237
00238 Stream::operator bool() const
00239 {
00240 return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
00241 }
00242
00243
00244
00245 char Stream::get()
00246 {
00247 char ch = peek();
00248 AdvanceCurrent();
00249 m_mark.column++;
00250
00251 if(ch == '\n') {
00252 m_mark.column = 0;
00253 m_mark.line++;
00254 }
00255
00256 return ch;
00257 }
00258
00259
00260
00261 std::string Stream::get(int n)
00262 {
00263 std::string ret;
00264 ret.reserve(n);
00265 for(int i=0;i<n;i++)
00266 ret += get();
00267 return ret;
00268 }
00269
00270
00271
00272 void Stream::eat(int n)
00273 {
00274 for(int i=0;i<n;i++)
00275 get();
00276 }
00277
00278 void Stream::AdvanceCurrent()
00279 {
00280 if (!m_readahead.empty())
00281 {
00282 m_readahead.pop_front();
00283 m_mark.pos++;
00284 }
00285
00286 ReadAheadTo(0);
00287 }
00288
00289 bool Stream::_ReadAheadTo(size_t i) const
00290 {
00291 while (m_input.good() && (m_readahead.size() <= i))
00292 {
00293 switch (m_charSet)
00294 {
00295 case utf8: StreamInUtf8(); break;
00296 case utf16le: StreamInUtf16(); break;
00297 case utf16be: StreamInUtf16(); break;
00298 case utf32le: StreamInUtf32(); break;
00299 case utf32be: StreamInUtf32(); break;
00300 }
00301 }
00302
00303
00304 if(!m_input.good())
00305 m_readahead.push_back(Stream::eof());
00306
00307 return m_readahead.size() > i;
00308 }
00309
00310 void Stream::StreamInUtf8() const
00311 {
00312 unsigned char b = GetNextByte();
00313 if (m_input.good())
00314 {
00315 m_readahead.push_back(b);
00316 }
00317 }
00318
00319 void Stream::StreamInUtf16() const
00320 {
00321 unsigned long ch = 0;
00322 unsigned char bytes[2];
00323 int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
00324
00325 bytes[0] = GetNextByte();
00326 bytes[1] = GetNextByte();
00327 if (!m_input.good())
00328 {
00329 return;
00330 }
00331 ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
00332 static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
00333
00334 if (ch >= 0xDC00 && ch < 0xE000)
00335 {
00336
00337 QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
00338 return;
00339 }
00340 else if (ch >= 0xD800 && ch < 0xDC00)
00341 {
00342
00343
00344
00345
00346
00347 for (;;)
00348 {
00349 bytes[0] = GetNextByte();
00350 bytes[1] = GetNextByte();
00351 if (!m_input.good())
00352 {
00353 QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
00354 return;
00355 }
00356 unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
00357 static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
00358 if (chLow < 0xDC00 || ch >= 0xE000)
00359 {
00360
00361 QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
00362
00363
00364 if (chLow < 0xD800 || ch >= 0xE000)
00365 {
00366
00367 QueueUnicodeCodepoint(m_readahead, ch);
00368 return;
00369 }
00370 else
00371 {
00372
00373 ch = chLow;
00374 continue;
00375 }
00376 }
00377
00378
00379 ch &= 0x3FF;
00380 ch <<= 10;
00381
00382
00383 ch |= (chLow & 0x3FF);
00384
00385
00386 ch += 0x10000;
00387 }
00388 }
00389
00390 QueueUnicodeCodepoint(m_readahead, ch);
00391 }
00392
// Views an unsigned-char buffer as the char* that streambuf read calls take.
// The returned pointer addresses the same storage as pBuffer.
inline char* ReadBuffer(unsigned char* pBuffer)
{
    char* const asChars = reinterpret_cast<char*>(pBuffer);
    return asChars;
}
00397
00398 unsigned char Stream::GetNextByte() const
00399 {
00400 if (m_nPrefetchedUsed >= m_nPrefetchedAvailable)
00401 {
00402 std::streambuf *pBuf = m_input.rdbuf();
00403 m_nPrefetchedAvailable = pBuf->sgetn(ReadBuffer(m_pPrefetched),
00404 YAML_PREFETCH_SIZE);
00405 m_nPrefetchedUsed = 0;
00406 if (!m_nPrefetchedAvailable)
00407 {
00408 m_input.setstate(std::ios_base::eofbit);
00409 }
00410
00411 if (0 == m_nPrefetchedAvailable)
00412 {
00413 return 0;
00414 }
00415 }
00416
00417 return m_pPrefetched[m_nPrefetchedUsed++];
00418 }
00419
00420 void Stream::StreamInUtf32() const
00421 {
00422 static int indexes[2][4] = {
00423 {3, 2, 1, 0},
00424 {0, 1, 2, 3}
00425 };
00426
00427 unsigned long ch = 0;
00428 unsigned char bytes[4];
00429 int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
00430
00431 bytes[0] = GetNextByte();
00432 bytes[1] = GetNextByte();
00433 bytes[2] = GetNextByte();
00434 bytes[3] = GetNextByte();
00435 if (!m_input.good())
00436 {
00437 return;
00438 }
00439
00440 for (int i = 0; i < 4; ++i)
00441 {
00442 ch <<= 8;
00443 ch |= bytes[pIndexes[i]];
00444 }
00445
00446 QueueUnicodeCodepoint(m_readahead, ch);
00447 }
00448 }