stream.cpp
Go to the documentation of this file.
1 #include "stream.h"
2 #include <iostream>
3 #include "exp.h"
4 
// Size, in bytes, of the prefetch buffer each Stream allocates.
// Can be overridden by defining it before this point (e.g. on the command line).
#if !defined(YAML_PREFETCH_SIZE)
#define YAML_PREFETCH_SIZE 2048
#endif

// Element count of a true array (not a pointer), and the address one past
// its last element.
#define S_ARRAY_SIZE( A ) (sizeof(A) / sizeof((A)[0]))
#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))

// Code point of the Unicode REPLACEMENT CHARACTER (U+FFFD).
#define CP_REPLACEMENT_CHARACTER (0xFFFD)
13 
14 namespace YAML_PM
15 {
36  };
37 
48  };
49 
// Indexed by UtfIntroState. An entry is true when the encoding-detection
// loop in the Stream constructor must stop in that state.
static bool s_introFinalState[] = {
    /* uis_start         */ false,
    /* uis_utfbe_b1      */ false,
    /* uis_utf32be_b2    */ false,
    /* uis_utf32be_bom3  */ false,
    /* uis_utf32be       */ true,
    /* uis_utf16be       */ true,
    /* uis_utf16be_bom1  */ false,
    /* uis_utfle_bom1    */ false,
    /* uis_utf16le_bom2  */ false,
    /* uis_utf32le_bom3  */ false,
    /* uis_utf16le       */ true,
    /* uis_utf32le       */ true,
    /* uis_utf8_imp      */ false,
    /* uis_utf16le_imp   */ false,
    /* uis_utf32le_imp3  */ false,
    /* uis_utf8_bom1     */ false,
    /* uis_utf8_bom2     */ false,
    /* uis_utf8          */ true,
    /* uis_error         */ true,
};
71 
73  // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
92  };
93 
94  static char s_introUngetCount[][uictMax] = {
95  // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
96  {0, 1, 1, 0, 0, 0, 0, 1},
97  {0, 2, 2, 2, 2, 2, 2, 2},
98  {3, 3, 3, 3, 0, 3, 3, 3},
99  {4, 4, 4, 4, 4, 0, 4, 4},
100  {1, 1, 1, 1, 1, 1, 1, 1},
101  {1, 1, 1, 1, 1, 1, 1, 1},
102  {2, 2, 2, 2, 2, 0, 2, 2},
103  {2, 2, 2, 2, 0, 2, 2, 2},
104  {0, 1, 1, 1, 1, 1, 1, 1},
105  {0, 2, 2, 2, 2, 2, 2, 2},
106  {1, 1, 1, 1, 1, 1, 1, 1},
107  {1, 1, 1, 1, 1, 1, 1, 1},
108  {0, 2, 2, 2, 2, 2, 2, 2},
109  {0, 3, 3, 3, 3, 3, 3, 3},
110  {4, 4, 4, 4, 4, 4, 4, 4},
111  {2, 0, 2, 2, 2, 2, 2, 2},
112  {3, 3, 0, 3, 3, 3, 3, 3},
113  {1, 1, 1, 1, 1, 1, 1, 1},
114  };
115 
116  inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
117  {
118  if (std::istream::traits_type::eof() == ch) {
119  return uictOther;
120  }
121 
122  switch (ch) {
123  case 0: return uict00;
124  case 0xBB: return uictBB;
125  case 0xBF: return uictBF;
126  case 0xEF: return uictEF;
127  case 0xFE: return uictFE;
128  case 0xFF: return uictFF;
129  }
130 
131  if ((ch > 0) && (ch < 0xFF)) {
132  return uictAscii;
133  }
134 
135  return uictOther;
136  }
137 
// Builds one byte of a UTF-8 sequence.
//   ch        - code point being encoded
//   lead_bits - number of leading 1 bits in the byte's marker
//               (0 for a single-byte char, 1 for a continuation byte,
//                2/3/4 for the first byte of a 2/3/4-byte sequence)
//   rshift    - how far to shift ch right to bring this byte's payload
//               into the low bits
// The marker occupies the top lead_bits bits, followed by a 0 bit; the
// remaining low bits carry the payload.
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
{
    const unsigned int leadOnes = (1u << lead_bits) - 1u;
    const unsigned char marker = static_cast<unsigned char>(leadOnes << (8 - lead_bits));
    const unsigned char payloadMask = static_cast<unsigned char>(0xFFu >> (lead_bits + 1));

    const unsigned long payload = (ch >> rshift) & payloadMask;
    const unsigned char encoded = static_cast<unsigned char>(marker | payload);
    return static_cast<char>(encoded);
}
146 
147  inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
148  {
149  // We are not allowed to queue the Stream::eof() codepoint, so
150  // replace it with CP_REPLACEMENT_CHARACTER
151  if (static_cast<unsigned long>(Stream::eof()) == ch)
152  {
154  }
155 
156  if (ch < 0x80)
157  {
158  q.push_back(Utf8Adjust(ch, 0, 0));
159  }
160  else if (ch < 0x800)
161  {
162  q.push_back(Utf8Adjust(ch, 2, 6));
163  q.push_back(Utf8Adjust(ch, 1, 0));
164  }
165  else if (ch < 0x10000)
166  {
167  q.push_back(Utf8Adjust(ch, 3, 12));
168  q.push_back(Utf8Adjust(ch, 1, 6));
169  q.push_back(Utf8Adjust(ch, 1, 0));
170  }
171  else
172  {
173  q.push_back(Utf8Adjust(ch, 4, 18));
174  q.push_back(Utf8Adjust(ch, 1, 12));
175  q.push_back(Utf8Adjust(ch, 1, 6));
176  q.push_back(Utf8Adjust(ch, 1, 0));
177  }
178  }
179 
180  Stream::Stream(std::istream& input)
181  : m_input(input),
182  m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
183  m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
184  {
185  typedef std::istream::traits_type char_traits;
186 
187  if(!input)
188  return;
189 
190  // Determine (or guess) the character-set by reading the BOM, if any. See
191  // the YAML specification for the determination algorithm.
192  char_traits::int_type intro[4];
193  int nIntroUsed = 0;
194  UtfIntroState state = uis_start;
195  for(; !s_introFinalState[state]; ) {
196  std::istream::int_type ch = input.get();
197  intro[nIntroUsed++] = ch;
198  UtfIntroCharType charType = IntroCharTypeOf(ch);
199  UtfIntroState newState = s_introTransitions[state][charType];
200  int nUngets = s_introUngetCount[state][charType];
201  if(nUngets > 0) {
202  input.clear();
203  for(; nUngets > 0; --nUngets) {
204  if(char_traits::eof() != intro[--nIntroUsed])
205  input.putback(char_traits::to_char_type(intro[nIntroUsed]));
206  }
207  }
208  state = newState;
209  }
210 
211  switch (state) {
212  case uis_utf8: m_charSet = utf8; break;
213  case uis_utf16le: m_charSet = utf16le; break;
214  case uis_utf16be: m_charSet = utf16be; break;
215  case uis_utf32le: m_charSet = utf32le; break;
216  case uis_utf32be: m_charSet = utf32be; break;
217  default: m_charSet = utf8; break;
218  }
219 
220  ReadAheadTo(0);
221  }
222 
224  {
225  delete[] m_pPrefetched;
226  }
227 
228  char Stream::peek() const
229  {
230  if (m_readahead.empty())
231  {
232  return Stream::eof();
233  }
234 
235  return m_readahead[0];
236  }
237 
238  Stream::operator bool() const
239  {
240  return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
241  }
242 
243  // get
244  // . Extracts a character from the stream and updates our position
245  char Stream::get()
246  {
247  char ch = peek();
248  AdvanceCurrent();
249  m_mark.column++;
250 
251  if(ch == '\n') {
252  m_mark.column = 0;
253  m_mark.line++;
254  }
255 
256  return ch;
257  }
258 
259  // get
260  // . Extracts 'n' characters from the stream and updates our position
262  {
263  std::string ret;
264  ret.reserve(n);
265  for(int i=0;i<n;i++)
266  ret += get();
267  return ret;
268  }
269 
270  // eat
271  // . Eats 'n' characters and updates our position.
272  void Stream::eat(int n)
273  {
274  for(int i=0;i<n;i++)
275  get();
276  }
277 
279  {
280  if (!m_readahead.empty())
281  {
282  m_readahead.pop_front();
283  m_mark.pos++;
284  }
285 
286  ReadAheadTo(0);
287  }
288 
289  bool Stream::_ReadAheadTo(size_t i) const
290  {
291  while (m_input.good() && (m_readahead.size() <= i))
292  {
293  switch (m_charSet)
294  {
295  case utf8: StreamInUtf8(); break;
296  case utf16le: StreamInUtf16(); break;
297  case utf16be: StreamInUtf16(); break;
298  case utf32le: StreamInUtf32(); break;
299  case utf32be: StreamInUtf32(); break;
300  }
301  }
302 
303  // signal end of stream
304  if(!m_input.good())
305  m_readahead.push_back(Stream::eof());
306 
307  return m_readahead.size() > i;
308  }
309 
310  void Stream::StreamInUtf8() const
311  {
312  unsigned char b = GetNextByte();
313  if (m_input.good())
314  {
315  m_readahead.push_back(b);
316  }
317  }
318 
320  {
321  unsigned long ch = 0;
322  unsigned char bytes[2];
323  int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
324 
325  bytes[0] = GetNextByte();
326  bytes[1] = GetNextByte();
327  if (!m_input.good())
328  {
329  return;
330  }
331  ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
332  static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
333 
334  if (ch >= 0xDC00 && ch < 0xE000)
335  {
336  // Trailing (low) surrogate...ugh, wrong order
338  return;
339  }
340  else if (ch >= 0xD800 && ch < 0xDC00)
341  {
342  // ch is a leading (high) surrogate
343 
344  // Four byte UTF-8 code point
345 
346  // Read the trailing (low) surrogate
347  for (;;)
348  {
349  bytes[0] = GetNextByte();
350  bytes[1] = GetNextByte();
351  if (!m_input.good())
352  {
354  return;
355  }
356  unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
357  static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
358  if (chLow < 0xDC00 || ch >= 0xE000)
359  {
360  // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the stream.
362 
363  // Deal with the next UTF-16 unit
364  if (chLow < 0xD800 || ch >= 0xE000)
365  {
366  // Easiest case: queue the codepoint and return
368  return;
369  }
370  else
371  {
372  // Start the loop over with the new high surrogate
373  ch = chLow;
374  continue;
375  }
376  }
377 
378  // Select the payload bits from the high surrogate
379  ch &= 0x3FF;
380  ch <<= 10;
381 
382  // Include bits from low surrogate
383  ch |= (chLow & 0x3FF);
384 
385  // Add the surrogacy offset
386  ch += 0x10000;
387  }
388  }
389 
391  }
392 
// Views an unsigned-char buffer as a plain char buffer (same address), for
// APIs that take char*.
inline char* ReadBuffer(unsigned char* pBuffer)
{
    void* const rawAddress = pBuffer;
    return static_cast<char*>(rawAddress);
}
397 
398  unsigned char Stream::GetNextByte() const
399  {
401  {
402  std::streambuf *pBuf = m_input.rdbuf();
405  m_nPrefetchedUsed = 0;
407  {
408  m_input.setstate(std::ios_base::eofbit);
409  }
410 
411  if (0 == m_nPrefetchedAvailable)
412  {
413  return 0;
414  }
415  }
416 
418  }
419 
421  {
422  static int indexes[2][4] = {
423  {3, 2, 1, 0},
424  {0, 1, 2, 3}
425  };
426 
427  unsigned long ch = 0;
428  unsigned char bytes[4];
429  int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
430 
431  bytes[0] = GetNextByte();
432  bytes[1] = GetNextByte();
433  bytes[2] = GetNextByte();
434  bytes[3] = GetNextByte();
435  if (!m_input.good())
436  {
437  return;
438  }
439 
440  for (int i = 0; i < 4; ++i)
441  {
442  ch <<= 8;
443  ch |= bytes[pIndexes[i]];
444  }
445 
447  }
448 }
UtfIntroCharType
Definition: stream.cpp:38
std::istream & m_input
Definition: stream.h:47
unsigned char *const m_pPrefetched
Definition: stream.h:52
bool ReadAheadTo(size_t i) const
Definition: stream.h:72
void AdvanceCurrent()
Definition: stream.cpp:278
int line
Definition: mark.h:19
void StreamInUtf8() const
Definition: stream.cpp:310
std::deque< char > m_readahead
Definition: stream.h:51
unsigned char GetNextByte() const
Definition: stream.cpp:398
::std::string string
Definition: gtest.h:1979
bool _ReadAheadTo(size_t i) const
Definition: stream.cpp:289
char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
Definition: stream.cpp:138
static UtfIntroState s_introTransitions[][uictMax]
Definition: stream.cpp:72
static bool s_introFinalState[]
Definition: stream.cpp:50
char * ReadBuffer(unsigned char *pBuffer)
Definition: stream.cpp:393
size_t m_nPrefetchedAvailable
Definition: stream.h:53
size_t m_nPrefetchedUsed
Definition: stream.h:54
Mark m_mark
Definition: stream.h:48
int column
Definition: mark.h:19
void eat(int n=1)
Definition: stream.cpp:272
Stream(std::istream &input)
Definition: stream.cpp:180
UtfIntroState
Definition: stream.cpp:16
#define YAML_PREFETCH_SIZE
Definition: stream.cpp:6
static char s_introUngetCount[][uictMax]
Definition: stream.cpp:94
#define CP_REPLACEMENT_CHARACTER
Definition: stream.cpp:12
static char eof()
Definition: stream.h:36
int pos
Definition: mark.h:18
UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
Definition: stream.cpp:116
void StreamInUtf16() const
Definition: stream.cpp:319
char peek() const
Definition: stream.cpp:228
void StreamInUtf32() const
Definition: stream.cpp:420
CharacterSet m_charSet
Definition: stream.h:50
void QueueUnicodeCodepoint(std::deque< char > &q, unsigned long ch)
Definition: stream.cpp:147


libpointmatcher
Author(s):
autogenerated on Sat May 27 2023 02:38:03