stream.cpp
Go to the documentation of this file.
00001 #include "stream.h"
00002 #include <iostream>
00003 #include "exp.h"
00004 
00005 #ifndef YAML_PREFETCH_SIZE
00006 #define YAML_PREFETCH_SIZE 2048
00007 #endif
00008 
00009 #define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
00010 #define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))
00011 
00012 #define CP_REPLACEMENT_CHARACTER (0xFFFD)
00013 
00014 namespace YAML_PM
00015 {
        // States of the state machine that the Stream constructor runs over the
        // first bytes of the input in order to choose a character set.
        //
        // The numeric value of each enumerator is used as the row index into
        // s_introFinalState, s_introTransitions and s_introUngetCount, so the
        // declaration order here must stay in sync with the row order of those
        // tables.
        enum UtfIntroState {
                uis_start,
                uis_utfbe_b1,
                uis_utf32be_b2,
                uis_utf32be_bom3,
                uis_utf32be,
                uis_utf16be,
                uis_utf16be_bom1,
                uis_utfle_bom1,
                uis_utf16le_bom2,
                uis_utf32le_bom3,
                uis_utf16le,
                uis_utf32le,
                uis_utf8_imp,
                uis_utf16le_imp,
                uis_utf32le_imp3,
                uis_utf8_bom1,
                uis_utf8_bom2,
                uis_utf8,
                uis_error
        };
00037 
        // Category assigned by IntroCharTypeOf() to each value read while
        // scanning the start of the input.  The enumerator value is the column
        // index into s_introTransitions and s_introUngetCount; uictMax is not a
        // category but the number of columns.
        enum UtfIntroCharType {
                uict00,
                uictBB,
                uictBF,
                uictEF,
                uictFE,
                uictFF,
                uictAscii,
                uictOther,
                uictMax
        };
00049 
        // Indexed by UtfIntroState: true for the states in which the intro scan
        // loop of the Stream constructor stops reading.
        static bool s_introFinalState[] = {
                false, //uis_start
                false, //uis_utfbe_b1
                false, //uis_utf32be_b2
                false, //uis_utf32be_bom3
                true,  //uis_utf32be
                true,  //uis_utf16be
                false, //uis_utf16be_bom1
                false, //uis_utfle_bom1
                false, //uis_utf16le_bom2
                false, //uis_utf32le_bom3
                true,  //uis_utf16le
                true,  //uis_utf32le
                false, //uis_utf8_imp
                false, //uis_utf16le_imp
                false, //uis_utf32le_imp3
                false, //uis_utf8_bom1
                false, //uis_utf8_bom2
                true,  //uis_utf8
                true,  //uis_error
        };
00071 
        // Transition table of the intro state machine:
        //   next state = s_introTransitions[current state][character type].
        // Rows follow the declaration order of UtfIntroState (there is no row
        // for uis_error, which is never the current state of a lookup because
        // it is marked final); columns follow UtfIntroCharType.
        static UtfIntroState s_introTransitions[][uictMax] = {
                // uict00,           uictBB,           uictBF,           uictEF,           uictFE,           uictFF,           uictAscii,        uictOther
                  {uis_utfbe_b1,     uis_utf8,         uis_utf8,         uis_utf8_bom1,    uis_utf16be_bom1, uis_utfle_bom1,   uis_utf8_imp,     uis_utf8}, // uis_start
                  {uis_utf32be_b2,   uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16be,      uis_utf8}, // uis_utfbe_b1
                  {uis_utf32be,      uis_utf8,         uis_utf8,         uis_utf8,         uis_utf32be_bom3, uis_utf8,         uis_utf8,         uis_utf8}, // uis_utf32be_b2
                  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf32be,      uis_utf8,         uis_utf8}, // uis_utf32be_bom3
                  {uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be}, // uis_utf32be
                  {uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be}, // uis_utf16be
                  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16be,      uis_utf8,         uis_utf8}, // uis_utf16be_bom1
                  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16le_bom2, uis_utf8,         uis_utf8,         uis_utf8}, // uis_utfle_bom1
                  {uis_utf32le_bom3, uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le}, // uis_utf16le_bom2
                  {uis_utf32le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le}, // uis_utf32le_bom3
                  {uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le}, // uis_utf16le
                  {uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le}, // uis_utf32le
                  {uis_utf16le_imp,  uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8}, // uis_utf8_imp
                  {uis_utf32le_imp3, uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le}, // uis_utf16le_imp
                  {uis_utf32le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le}, // uis_utf32le_imp3
                  {uis_utf8,         uis_utf8_bom2,    uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8}, // uis_utf8_bom1
                  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8}, // uis_utf8_bom2
                  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8}, // uis_utf8
        };
00093 
        // Companion to s_introTransitions, indexed the same way
        // ([current state][character type]).  Each entry is the number of intro
        // values read so far -- counting the one just read -- that the Stream
        // constructor hands back to the input stream with putback() when it
        // takes that transition.  0 means everything read so far is consumed.
        static char s_introUngetCount[][uictMax] = {
                // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
                  {0,      1,      1,      0,      0,      0,      0,         1}, // uis_start
                  {0,      2,      2,      2,      2,      2,      2,         2}, // uis_utfbe_b1
                  {3,      3,      3,      3,      0,      3,      3,         3}, // uis_utf32be_b2
                  {4,      4,      4,      4,      4,      0,      4,         4}, // uis_utf32be_bom3
                  {1,      1,      1,      1,      1,      1,      1,         1}, // uis_utf32be
                  {1,      1,      1,      1,      1,      1,      1,         1}, // uis_utf16be
                  {2,      2,      2,      2,      2,      0,      2,         2}, // uis_utf16be_bom1
                  {2,      2,      2,      2,      0,      2,      2,         2}, // uis_utfle_bom1
                  {0,      1,      1,      1,      1,      1,      1,         1}, // uis_utf16le_bom2
                  {0,      2,      2,      2,      2,      2,      2,         2}, // uis_utf32le_bom3
                  {1,      1,      1,      1,      1,      1,      1,         1}, // uis_utf16le
                  {1,      1,      1,      1,      1,      1,      1,         1}, // uis_utf32le
                  {0,      2,      2,      2,      2,      2,      2,         2}, // uis_utf8_imp
                  {0,      3,      3,      3,      3,      3,      3,         3}, // uis_utf16le_imp
                  {4,      4,      4,      4,      4,      4,      4,         4}, // uis_utf32le_imp3
                  {2,      0,      2,      2,      2,      2,      2,         2}, // uis_utf8_bom1
                  {3,      3,      0,      3,      3,      3,      3,         3}, // uis_utf8_bom2
                  {1,      1,      1,      1,      1,      1,      1,         1}, // uis_utf8
        };
00115 
00116         inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
00117         {
00118                 if (std::istream::traits_type::eof() == ch) {
00119                         return uictOther;
00120                 }
00121 
00122                 switch (ch) {
00123                 case 0: return uict00;
00124                 case 0xBB: return uictBB;
00125                 case 0xBF: return uictBF;
00126                 case 0xEF: return uictEF;
00127                 case 0xFE: return uictFE;
00128                 case 0xFF: return uictFF;
00129                 }
00130 
00131                 if ((ch > 0) && (ch < 0xFF)) {
00132                         return uictAscii;
00133                 }
00134 
00135                 return uictOther;
00136         }
00137 
00138         inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
00139         {
00140                 const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits);
00141                 const unsigned char mask = (0xFF >> (lead_bits + 1));
00142                 return static_cast<char>(static_cast<unsigned char>(
00143                         header | ((ch >> rshift) & mask)
00144                         ));
00145         }
00146 
00147         inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
00148         {
00149                 // We are not allowed to queue the Stream::eof() codepoint, so
00150                 // replace it with CP_REPLACEMENT_CHARACTER
00151                 if (static_cast<unsigned long>(Stream::eof()) == ch)
00152                 {
00153                         ch = CP_REPLACEMENT_CHARACTER;
00154                 }
00155 
00156                 if (ch < 0x80)
00157                 {
00158                         q.push_back(Utf8Adjust(ch, 0, 0));
00159                 }
00160                 else if (ch < 0x800)
00161                 {
00162                         q.push_back(Utf8Adjust(ch, 2, 6));
00163                         q.push_back(Utf8Adjust(ch, 1, 0));
00164                 }
00165                 else if (ch < 0x10000)
00166                 {
00167                         q.push_back(Utf8Adjust(ch, 3, 12));
00168                         q.push_back(Utf8Adjust(ch, 1, 6));
00169                         q.push_back(Utf8Adjust(ch, 1, 0));
00170                 }
00171                 else
00172                 {
00173                         q.push_back(Utf8Adjust(ch, 4, 18));
00174                         q.push_back(Utf8Adjust(ch, 1, 12));
00175                         q.push_back(Utf8Adjust(ch, 1, 6));
00176                         q.push_back(Utf8Adjust(ch, 1, 0));
00177                 }
00178         }
00179 
00180         Stream::Stream(std::istream& input)
00181                 : m_input(input),
00182                 m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]), 
00183                 m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
00184         {
00185                 typedef std::istream::traits_type char_traits;
00186 
00187                 if(!input)
00188                         return;
00189 
00190                 // Determine (or guess) the character-set by reading the BOM, if any.  See
00191                 // the YAML specification for the determination algorithm.
00192                 char_traits::int_type intro[4];
00193                 int nIntroUsed = 0;
00194                 UtfIntroState state = uis_start;
00195                 for(; !s_introFinalState[state]; ) {
00196                         std::istream::int_type ch = input.get();
00197                         intro[nIntroUsed++] = ch;
00198                         UtfIntroCharType charType = IntroCharTypeOf(ch);
00199                         UtfIntroState newState = s_introTransitions[state][charType];
00200                         int nUngets = s_introUngetCount[state][charType];
00201                         if(nUngets > 0) {
00202                                 input.clear();
00203                                 for(; nUngets > 0; --nUngets) {
00204                                         if(char_traits::eof() != intro[--nIntroUsed])
00205                                                 input.putback(char_traits::to_char_type(intro[nIntroUsed]));
00206                                 }
00207                         }
00208                         state = newState;
00209                 }
00210 
00211                 switch (state) {
00212                 case uis_utf8: m_charSet = utf8; break;
00213                 case uis_utf16le: m_charSet = utf16le; break;
00214                 case uis_utf16be: m_charSet = utf16be; break;
00215                 case uis_utf32le: m_charSet = utf32le; break;
00216                 case uis_utf32be: m_charSet = utf32be; break;
00217                 default: m_charSet = utf8; break;
00218                 }
00219 
00220                 ReadAheadTo(0);
00221         }
00222 
        // Releases the prefetch buffer that the constructor allocated with new[].
        Stream::~Stream()
        {
                delete[] m_pPrefetched;
        }
00227 
00228         char Stream::peek() const
00229         {
00230                 if (m_readahead.empty())
00231                 {
00232                         return Stream::eof();
00233                 }
00234 
00235                 return m_readahead[0];
00236         }
00237         
00238         Stream::operator bool() const
00239         {
00240                 return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
00241         }
00242 
00243         // get
00244         // . Extracts a character from the stream and updates our position
00245         char Stream::get()
00246         {
00247                 char ch = peek();
00248                 AdvanceCurrent();
00249                 m_mark.column++;
00250                 
00251                 if(ch == '\n') {
00252                         m_mark.column = 0;
00253                         m_mark.line++;
00254                 }
00255                 
00256                 return ch;
00257         }
00258 
00259         // get
00260         // . Extracts 'n' characters from the stream and updates our position
00261         std::string Stream::get(int n)
00262         {
00263                 std::string ret;
00264                 ret.reserve(n);
00265                 for(int i=0;i<n;i++)
00266                         ret += get();
00267                 return ret;
00268         }
00269 
00270         // eat
00271         // . Eats 'n' characters and updates our position.
00272         void Stream::eat(int n)
00273         {
00274                 for(int i=0;i<n;i++)
00275                         get();
00276         }
00277 
00278         void Stream::AdvanceCurrent()
00279         {
00280                 if (!m_readahead.empty())
00281                 {
00282                         m_readahead.pop_front();
00283                         m_mark.pos++;
00284                 }
00285 
00286                 ReadAheadTo(0);
00287         }
00288 
00289         bool Stream::_ReadAheadTo(size_t i) const
00290         {
00291                 while (m_input.good() && (m_readahead.size() <= i))
00292                 {
00293                         switch (m_charSet)
00294                         {
00295                         case utf8: StreamInUtf8(); break;
00296                         case utf16le: StreamInUtf16(); break;
00297                         case utf16be: StreamInUtf16(); break;
00298                         case utf32le: StreamInUtf32(); break;
00299                         case utf32be: StreamInUtf32(); break;
00300                         }
00301                 }
00302                 
00303                 // signal end of stream
00304                 if(!m_input.good())
00305                         m_readahead.push_back(Stream::eof());
00306 
00307                 return m_readahead.size() > i;
00308         }
00309 
00310         void Stream::StreamInUtf8() const
00311         {
00312                 unsigned char b = GetNextByte();
00313                 if (m_input.good())
00314                 {
00315                         m_readahead.push_back(b);
00316                 }
00317         }
00318 
00319         void Stream::StreamInUtf16() const
00320         {
00321                 unsigned long ch = 0;
00322                 unsigned char bytes[2];
00323                 int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
00324 
00325                 bytes[0] = GetNextByte();
00326                 bytes[1] = GetNextByte();
00327                 if (!m_input.good())
00328                 {
00329                         return;
00330                 }
00331                 ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
00332                         static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
00333 
00334                 if (ch >= 0xDC00 && ch < 0xE000)
00335                 {
00336                         // Trailing (low) surrogate...ugh, wrong order
00337                         QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
00338                         return;
00339                 }
00340                 else if (ch >= 0xD800 && ch < 0xDC00)
00341                 {
00342                         // ch is a leading (high) surrogate
00343 
00344                         // Four byte UTF-8 code point
00345 
00346                         // Read the trailing (low) surrogate
00347                         for (;;)
00348                         {
00349                                 bytes[0] = GetNextByte();
00350                                 bytes[1] = GetNextByte();
00351                                 if (!m_input.good())
00352                                 {
00353                                         QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
00354                                         return;
00355                                 }
00356                                 unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
00357                                         static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
00358                                 if (chLow < 0xDC00 || ch >= 0xE000)
00359                                 {
00360                                         // Trouble...not a low surrogate.  Dump a REPLACEMENT CHARACTER into the stream.
00361                                         QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
00362 
00363                                         // Deal with the next UTF-16 unit
00364                                         if (chLow < 0xD800 || ch >= 0xE000)
00365                                         {
00366                                                 // Easiest case: queue the codepoint and return
00367                                                 QueueUnicodeCodepoint(m_readahead, ch);
00368                                                 return;
00369                                         }
00370                                         else
00371                                         {
00372                                                 // Start the loop over with the new high surrogate
00373                                                 ch = chLow;
00374                                                 continue;
00375                                         }
00376                                 }
00377 
00378                                 // Select the payload bits from the high surrogate
00379                                 ch &= 0x3FF;
00380                                 ch <<= 10;
00381 
00382                                 // Include bits from low surrogate
00383                                 ch |= (chLow & 0x3FF);
00384 
00385                                 // Add the surrogacy offset
00386                                 ch += 0x10000;
00387                         }
00388                 }
00389 
00390                 QueueUnicodeCodepoint(m_readahead, ch);
00391         }
00392 
00393         inline char* ReadBuffer(unsigned char* pBuffer)
00394         {
00395                 return reinterpret_cast<char*>(pBuffer);
00396         }
00397 
00398         unsigned char Stream::GetNextByte() const
00399         {
00400                 if (m_nPrefetchedUsed >= m_nPrefetchedAvailable)
00401                 {
00402                         std::streambuf *pBuf = m_input.rdbuf();
00403                         m_nPrefetchedAvailable = pBuf->sgetn(ReadBuffer(m_pPrefetched), 
00404                                 YAML_PREFETCH_SIZE);
00405                         m_nPrefetchedUsed = 0;
00406                         if (!m_nPrefetchedAvailable)
00407                         {
00408                                 m_input.setstate(std::ios_base::eofbit);
00409                         }
00410 
00411                         if (0 == m_nPrefetchedAvailable)
00412                         {
00413                                 return 0;
00414                         }
00415                 }
00416 
00417                 return m_pPrefetched[m_nPrefetchedUsed++];
00418         }
00419 
00420         void Stream::StreamInUtf32() const
00421         {
00422                 static int indexes[2][4] = {
00423                         {3, 2, 1, 0},
00424                         {0, 1, 2, 3}
00425                 };
00426 
00427                 unsigned long ch = 0;
00428                 unsigned char bytes[4];
00429                 int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
00430 
00431                 bytes[0] = GetNextByte();
00432                 bytes[1] = GetNextByte();
00433                 bytes[2] = GetNextByte();
00434                 bytes[3] = GetNextByte();
00435                 if (!m_input.good())
00436                 {
00437                         return;
00438                 }
00439 
00440                 for (int i = 0; i < 4; ++i)
00441                 {
00442                         ch <<= 8;
00443                         ch |= bytes[pIndexes[i]];
00444                 }
00445 
00446                 QueueUnicodeCodepoint(m_readahead, ch);
00447         }
00448 }


libpointmatcher
Author(s):
autogenerated on Thu Jun 20 2019 19:51:32