escaping.cc
Go to the documentation of this file.
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/escaping.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstdint>
20 #include <cstring>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24 
31 #include "absl/strings/str_cat.h"
32 #include "absl/strings/str_join.h"
34 
35 namespace absl {
36 namespace {
37 
38 // Digit conversion.
// Lowercase hexadecimal digit characters indexed by digit value (0-15).
// CEscapeInternal() also indexes this table with values 0-7 to emit the
// digits of octal escapes.
39 constexpr char kHexChar[] = "0123456789abcdef";
40 
// Two-character lowercase hexadecimal spellings of every byte value
// 0x00-0xff, concatenated in ascending order, so the text for byte value b
// starts at offset 2 * b. The array size of 513 is 256 * 2 characters plus
// the terminating NUL of the string literal.
41 constexpr char kHexTable[513] =
42  "000102030405060708090a0b0c0d0e0f"
43  "101112131415161718191a1b1c1d1e1f"
44  "202122232425262728292a2b2c2d2e2f"
45  "303132333435363738393a3b3c3d3e3f"
46  "404142434445464748494a4b4c4d4e4f"
47  "505152535455565758595a5b5c5d5e5f"
48  "606162636465666768696a6b6c6d6e6f"
49  "707172737475767778797a7b7c7d7e7f"
50  "808182838485868788898a8b8c8d8e8f"
51  "909192939495969798999a9b9c9d9e9f"
52  "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
53  "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
54  "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
55  "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
56  "e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
57  "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";
58 
59 // These are used for the leave_nulls_escaped argument to CUnescapeInternal().
// Passing kUnescapeNulls (false) makes CUnescapeInternal() decode escape
// sequences whose value is zero into real NUL bytes instead of copying the
// escape text through unchanged.
60 constexpr bool kUnescapeNulls = false;
61 
// Reports whether `c` is one of the ASCII octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  return c >= '0' && c <= '7';
}
63 
64 inline int hex_digit_to_int(char c) {
65  static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
66  "Character set must be ASCII.");
67  assert(absl::ascii_isxdigit(c));
68  int x = static_cast<unsigned char>(c);
69  if (x > '9') {
70  x += 9;
71  }
72  return x & 0xf;
73 }
74 
75 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
76  if (c >= 0xD800 && c <= 0xDFFF) {
77  if (error) {
78  *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
79  src);
80  }
81  return true;
82  }
83  return false;
84 }
85 
86 // ----------------------------------------------------------------------
87 // CUnescapeInternal()
88 // Implements both CUnescape() and CUnescapeForNullTerminatedString().
89 //
90 // Unescapes C escape sequences and is the reverse of CEscape().
91 //
92 // If 'source' is valid, stores the unescaped string and its size in
93 // 'dest' and 'dest_len' respectively, and returns true. Otherwise
94 // returns false and optionally stores the error description in
95 // 'error'. Set 'error' to nullptr to disable error reporting.
96 //
97 // 'dest' should point to a buffer that is at least as big as 'source'.
98 // 'source' and 'dest' may be the same.
99 //
100 // NOTE: any changes to this function must also be reflected in the older
101 // UnescapeCEscapeSequences().
102 // ----------------------------------------------------------------------
103 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
104  char* dest, ptrdiff_t* dest_len, std::string* error) {
105  char* d = dest;
106  const char* p = source.data();
107  const char* end = p + source.size();
108  const char* last_byte = end - 1;
109 
110  // Small optimization for case where source = dest and there's no escaping
111  while (p == d && p < end && *p != '\\') p++, d++;
112 
113  while (p < end) {
114  if (*p != '\\') {
115  *d++ = *p++;
116  } else {
117  if (++p > last_byte) { // skip past the '\\'
118  if (error) *error = "String cannot end with \\";
119  return false;
120  }
121  switch (*p) {
122  case 'a': *d++ = '\a'; break;
123  case 'b': *d++ = '\b'; break;
124  case 'f': *d++ = '\f'; break;
125  case 'n': *d++ = '\n'; break;
126  case 'r': *d++ = '\r'; break;
127  case 't': *d++ = '\t'; break;
128  case 'v': *d++ = '\v'; break;
129  case '\\': *d++ = '\\'; break;
130  case '?': *d++ = '\?'; break; // \? Who knew?
131  case '\'': *d++ = '\''; break;
132  case '"': *d++ = '\"'; break;
133  case '0':
134  case '1':
135  case '2':
136  case '3':
137  case '4':
138  case '5':
139  case '6':
140  case '7': {
141  // octal digit: 1 to 3 digits
142  const char* octal_start = p;
143  unsigned int ch = *p - '0';
144  if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0';
145  if (p < last_byte && is_octal_digit(p[1]))
146  ch = ch * 8 + *++p - '0'; // now points at last digit
147  if (ch > 0xff) {
148  if (error) {
149  *error = "Value of \\" +
150  std::string(octal_start, p + 1 - octal_start) +
151  " exceeds 0xff";
152  }
153  return false;
154  }
155  if ((ch == 0) && leave_nulls_escaped) {
156  // Copy the escape sequence for the null character
157  const ptrdiff_t octal_size = p + 1 - octal_start;
158  *d++ = '\\';
159  memcpy(d, octal_start, octal_size);
160  d += octal_size;
161  break;
162  }
163  *d++ = ch;
164  break;
165  }
166  case 'x':
167  case 'X': {
168  if (p >= last_byte) {
169  if (error) *error = "String cannot end with \\x";
170  return false;
171  } else if (!absl::ascii_isxdigit(p[1])) {
172  if (error) *error = "\\x cannot be followed by a non-hex digit";
173  return false;
174  }
175  unsigned int ch = 0;
176  const char* hex_start = p;
177  while (p < last_byte && absl::ascii_isxdigit(p[1]))
178  // Arbitrarily many hex digits
179  ch = (ch << 4) + hex_digit_to_int(*++p);
180  if (ch > 0xFF) {
181  if (error) {
182  *error = "Value of \\" +
183  std::string(hex_start, p + 1 - hex_start) +
184  " exceeds 0xff";
185  }
186  return false;
187  }
188  if ((ch == 0) && leave_nulls_escaped) {
189  // Copy the escape sequence for the null character
190  const ptrdiff_t hex_size = p + 1 - hex_start;
191  *d++ = '\\';
192  memcpy(d, hex_start, hex_size);
193  d += hex_size;
194  break;
195  }
196  *d++ = ch;
197  break;
198  }
199  case 'u': {
200  // \uhhhh => convert 4 hex digits to UTF-8
201  char32_t rune = 0;
202  const char* hex_start = p;
203  if (p + 4 >= end) {
204  if (error) {
205  *error = "\\u must be followed by 4 hex digits: \\" +
206  std::string(hex_start, p + 1 - hex_start);
207  }
208  return false;
209  }
210  for (int i = 0; i < 4; ++i) {
211  // Look one char ahead.
212  if (absl::ascii_isxdigit(p[1])) {
213  rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
214  } else {
215  if (error) {
216  *error = "\\u must be followed by 4 hex digits: \\" +
217  std::string(hex_start, p + 1 - hex_start);
218  }
219  return false;
220  }
221  }
222  if ((rune == 0) && leave_nulls_escaped) {
223  // Copy the escape sequence for the null character
224  *d++ = '\\';
225  memcpy(d, hex_start, 5); // u0000
226  d += 5;
227  break;
228  }
229  if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
230  return false;
231  }
232  d += strings_internal::EncodeUTF8Char(d, rune);
233  break;
234  }
235  case 'U': {
236  // \Uhhhhhhhh => convert 8 hex digits to UTF-8
237  char32_t rune = 0;
238  const char* hex_start = p;
239  if (p + 8 >= end) {
240  if (error) {
241  *error = "\\U must be followed by 8 hex digits: \\" +
242  std::string(hex_start, p + 1 - hex_start);
243  }
244  return false;
245  }
246  for (int i = 0; i < 8; ++i) {
247  // Look one char ahead.
248  if (absl::ascii_isxdigit(p[1])) {
249  // Don't change rune until we're sure this
250  // is within the Unicode limit, but do advance p.
251  uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
252  if (newrune > 0x10FFFF) {
253  if (error) {
254  *error = "Value of \\" +
255  std::string(hex_start, p + 1 - hex_start) +
256  " exceeds Unicode limit (0x10FFFF)";
257  }
258  return false;
259  } else {
260  rune = newrune;
261  }
262  } else {
263  if (error) {
264  *error = "\\U must be followed by 8 hex digits: \\" +
265  std::string(hex_start, p + 1 - hex_start);
266  }
267  return false;
268  }
269  }
270  if ((rune == 0) && leave_nulls_escaped) {
271  // Copy the escape sequence for the null character
272  *d++ = '\\';
273  memcpy(d, hex_start, 9); // U00000000
274  d += 9;
275  break;
276  }
277  if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
278  return false;
279  }
280  d += strings_internal::EncodeUTF8Char(d, rune);
281  break;
282  }
283  default: {
284  if (error) *error = std::string("Unknown escape sequence: \\") + *p;
285  return false;
286  }
287  }
288  p++; // read past letter we escaped
289  }
290  }
291  *dest_len = d - dest;
292  return true;
293 }
294 
295 // ----------------------------------------------------------------------
296 // CUnescapeInternal()
297 //
298 // Same as above but uses a std::string for output. 'source' and 'dest'
299 // may be the same.
300 // ----------------------------------------------------------------------
301 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
302  std::string* dest, std::string* error) {
304 
305  ptrdiff_t dest_size;
306  if (!CUnescapeInternal(source,
307  leave_nulls_escaped,
308  &(*dest)[0],
309  &dest_size,
310  error)) {
311  return false;
312  }
313  dest->erase(dest_size);
314  return true;
315 }
316 
317 // ----------------------------------------------------------------------
318 // CEscape()
319 // CHexEscape()
320 // Utf8SafeCEscape()
321 // Utf8SafeCHexEscape()
322 // Escapes 'src' using C-style escape sequences. This is useful for
323 // preparing query flags. The 'Hex' version uses hexadecimal rather than
324 // octal sequences. The 'Utf8Safe' version does not touch UTF-8 bytes.
325 //
326 // Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
327 // ----------------------------------------------------------------------
328 std::string CEscapeInternal(absl::string_view src, bool use_hex,
329  bool utf8_safe) {
330  std::string dest;
331  bool last_hex_escape = false; // true if last output char was \xNN.
332 
333  for (unsigned char c : src) {
334  bool is_hex_escape = false;
335  switch (c) {
336  case '\n': dest.append("\\" "n"); break;
337  case '\r': dest.append("\\" "r"); break;
338  case '\t': dest.append("\\" "t"); break;
339  case '\"': dest.append("\\" "\""); break;
340  case '\'': dest.append("\\" "'"); break;
341  case '\\': dest.append("\\" "\\"); break;
342  default:
343  // Note that if we emit \xNN and the src character after that is a hex
344  // digit then that digit must be escaped too to prevent it being
345  // interpreted as part of the character code by C.
346  if ((!utf8_safe || c < 0x80) &&
347  (!absl::ascii_isprint(c) ||
348  (last_hex_escape && absl::ascii_isxdigit(c)))) {
349  if (use_hex) {
350  dest.append("\\" "x");
351  dest.push_back(kHexChar[c / 16]);
352  dest.push_back(kHexChar[c % 16]);
353  is_hex_escape = true;
354  } else {
355  dest.append("\\");
356  dest.push_back(kHexChar[c / 64]);
357  dest.push_back(kHexChar[(c % 64) / 8]);
358  dest.push_back(kHexChar[c % 8]);
359  }
360  } else {
361  dest.push_back(c);
362  break;
363  }
364  }
365  last_hex_escape = is_hex_escape;
366  }
367 
368  return dest;
369 }
370 
// Lookup table indexed by unsigned byte value giving how many output
// characters that byte occupies once C-escaped with octal sequences:
//   1 - the byte is copied unchanged,
//   2 - a backslash followed by one character (\t, \n, \r, \", \', \\),
//   4 - a backslash followed by three octal digits.
// CEscapedLength() sums these values and CEscapeAndAppendInternal() uses them
// to choose the form it writes.
371 /* clang-format off */
372 constexpr char c_escaped_len[256] = {
373  4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r
374  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
375  1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", '
376  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9'
377  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O'
378  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\'
379  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o'
380  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL
381  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
382  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
383  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
384  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
385  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
386  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
387  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
388  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
389 };
390 /* clang-format on */
391 
392 // Calculates the length of the C-style escaped version of 'src'.
393 // Assumes that non-printable characters are escaped using octal sequences, and
394 // that UTF-8 bytes are not handled specially.
395 inline size_t CEscapedLength(absl::string_view src) {
396  size_t escaped_len = 0;
397  for (unsigned char c : src) escaped_len += c_escaped_len[c];
398  return escaped_len;
399 }
400 
401 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
402  size_t escaped_len = CEscapedLength(src);
403  if (escaped_len == src.size()) {
404  dest->append(src.data(), src.size());
405  return;
406  }
407 
408  size_t cur_dest_len = dest->size();
410  cur_dest_len + escaped_len);
411  char* append_ptr = &(*dest)[cur_dest_len];
412 
413  for (unsigned char c : src) {
414  int char_len = c_escaped_len[c];
415  if (char_len == 1) {
416  *append_ptr++ = c;
417  } else if (char_len == 2) {
418  switch (c) {
419  case '\n':
420  *append_ptr++ = '\\';
421  *append_ptr++ = 'n';
422  break;
423  case '\r':
424  *append_ptr++ = '\\';
425  *append_ptr++ = 'r';
426  break;
427  case '\t':
428  *append_ptr++ = '\\';
429  *append_ptr++ = 't';
430  break;
431  case '\"':
432  *append_ptr++ = '\\';
433  *append_ptr++ = '\"';
434  break;
435  case '\'':
436  *append_ptr++ = '\\';
437  *append_ptr++ = '\'';
438  break;
439  case '\\':
440  *append_ptr++ = '\\';
441  *append_ptr++ = '\\';
442  break;
443  }
444  } else {
445  *append_ptr++ = '\\';
446  *append_ptr++ = '0' + c / 64;
447  *append_ptr++ = '0' + (c % 64) / 8;
448  *append_ptr++ = '0' + c % 8;
449  }
450  }
451 }
452 
// Decodes the 'szsrc' bytes of base64 text at 'src_param'.
//
// 'unbase64' is a lookup table indexed by unsigned byte value that yields the
// 6-bit value of a data character and a negative number for anything else.
// If 'dest' is non-null, the decoded bytes are written there ('szdest' is its
// capacity); if 'dest' is null the input is only validated and measured.
//
// ASCII whitespace in the input is skipped. The data may be followed by '='
// or '.' pad characters, in which case their count must match what the amount
// of data requires; having no pad characters at all is also accepted.
//
// On success returns true and stores the decoded length in '*len'. Returns
// false, without writing '*len', if the input contains an illegal character,
// ends with a single dangling data character, carries the wrong number of pad
// characters, or if the output does not fit in 'szdest' bytes.
453 bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
454  size_t szdest, const signed char* unbase64,
455  size_t* len) {
456  static const char kPad64Equals = '=';
457  static const char kPad64Dot = '.';
458 
459  size_t destidx = 0;
// 'decode' holds the table value of the last character read, 'state' counts
// the data characters of the current group already folded into 'temp'.
460  int decode = 0;
461  int state = 0;
462  unsigned int ch = 0;
463  unsigned int temp = 0;
464 
465  // If "char" is signed by default, using *src as an array index results in
466  // accessing negative array elements. Treat the input as a pointer to
467  // unsigned char to avoid this.
468  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);
469 
470  // The GET_INPUT macro gets the next input character, skipping
471  // over any whitespace, and stopping when we reach the end of the
472  // input or when we read any non-data character. The arguments are
473  // an arbitrary identifier (used as a label for goto) and the number
474  // of data bytes that must remain in the input to avoid aborting the
475  // loop.
476 #define GET_INPUT(label, remain) \
477  label: \
478  --szsrc; \
479  ch = *src++; \
480  decode = unbase64[ch]; \
481  if (decode < 0) { \
482  if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
483  state = 4 - remain; \
484  break; \
485  }
486 
487  // if dest is null, we're just checking to see if it's legal input
488  // rather than producing output. (I suspect this could just be done
489  // with a regexp...). We duplicate the loop so this test can be
490  // outside it instead of in every iteration.
491 
492  if (dest) {
493  // This loop consumes 4 input bytes and produces 3 output bytes
494  // per iteration. We can't know at the start that there is enough
495  // data left in the input for a full iteration, so the loop may
496  // break out in the middle; if so 'state' will be set to the
497  // number of input bytes read.
498 
499  while (szsrc >= 4) {
500  // We'll start by optimistically assuming that the next four
501  // bytes of the input (src[0..3]) are four good data bytes
502  // (that is, no nulls, whitespace, padding chars, or illegal
503  // chars). We need to test src[0..2] for nulls individually
504  // before constructing temp to preserve the property that we
505  // never read past a null in the input (no matter how long
506  // szsrc claims the input is).
507 
508  if (!src[0] || !src[1] || !src[2] ||
509  ((temp = ((unsigned(unbase64[src[0]]) << 18) |
510  (unsigned(unbase64[src[1]]) << 12) |
511  (unsigned(unbase64[src[2]]) << 6) |
512  (unsigned(unbase64[src[3]])))) &
513  0x80000000)) {
514  // Iff any of those four characters was bad (null, illegal,
515  // whitespace, padding), then temp's high bit will be set
516  // (because unbase64[] is -1 for all bad characters).
517  //
518  // We'll back up and resort to the slower decoder, which knows
519  // how to handle those cases.
520 
521  GET_INPUT(first, 4);
522  temp = decode;
523  GET_INPUT(second, 3);
524  temp = (temp << 6) | decode;
525  GET_INPUT(third, 2);
526  temp = (temp << 6) | decode;
527  GET_INPUT(fourth, 1);
528  temp = (temp << 6) | decode;
529  } else {
530  // We really did have four good data bytes, so advance four
531  // characters in the input.
532 
533  szsrc -= 4;
534  src += 4;
535  }
536 
537  // temp has 24 bits of input, so write that out as three bytes.
538 
539  if (destidx + 3 > szdest) return false;
540  dest[destidx + 2] = temp;
541  temp >>= 8;
542  dest[destidx + 1] = temp;
543  temp >>= 8;
544  dest[destidx] = temp;
545  destidx += 3;
546  }
547  } else {
// Validation-only copy of the loop above: same input handling, but the
// decoded bytes are only counted, never stored.
548  while (szsrc >= 4) {
549  if (!src[0] || !src[1] || !src[2] ||
550  ((temp = ((unsigned(unbase64[src[0]]) << 18) |
551  (unsigned(unbase64[src[1]]) << 12) |
552  (unsigned(unbase64[src[2]]) << 6) |
553  (unsigned(unbase64[src[3]])))) &
554  0x80000000)) {
555  GET_INPUT(first_no_dest, 4);
556  GET_INPUT(second_no_dest, 3);
557  GET_INPUT(third_no_dest, 2);
558  GET_INPUT(fourth_no_dest, 1);
559  } else {
560  szsrc -= 4;
561  src += 4;
562  }
563  destidx += 3;
564  }
565  }
566 
567 #undef GET_INPUT
568 
569  // if the loop terminated because we read a bad character, return
570  // now.
571  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
572  !absl::ascii_isspace(ch))
573  return false;
574 
575  if (ch == kPad64Equals || ch == kPad64Dot) {
576  // if we stopped by hitting an '=' or '.', un-read that character -- we'll
577  // look at it again when we count to check for the proper number of
578  // equals signs at the end.
579  ++szsrc;
580  --src;
581  } else {
582  // This loop consumes 1 input byte per iteration. It's used to
583  // clean up the 0-3 input bytes remaining when the first, faster
584  // loop finishes. 'temp' contains the data from 'state' input
585  // characters read by the first loop.
586  while (szsrc > 0) {
587  --szsrc;
588  ch = *src++;
589  decode = unbase64[ch];
590  if (decode < 0) {
591  if (absl::ascii_isspace(ch)) {
592  continue;
593  } else if (ch == kPad64Equals || ch == kPad64Dot) {
594  // back up one character; we'll read it again when we check
595  // for the correct number of pad characters at the end.
596  ++szsrc;
597  --src;
598  break;
599  } else {
600  return false;
601  }
602  }
603 
604  // Each input character gives us six bits of output.
605  temp = (temp << 6) | decode;
606  ++state;
607  if (state == 4) {
608  // If we've accumulated 24 bits of output, write that out as
609  // three bytes.
610  if (dest) {
611  if (destidx + 3 > szdest) return false;
612  dest[destidx + 2] = temp;
613  temp >>= 8;
614  dest[destidx + 1] = temp;
615  temp >>= 8;
616  dest[destidx] = temp;
617  }
618  destidx += 3;
619  state = 0;
620  temp = 0;
621  }
622  }
623  }
624 
625  // Process the leftover data contained in 'temp' at the end of the input.
626  int expected_equals = 0;
627  switch (state) {
628  case 0:
629  // Nothing left over; output is a multiple of 3 bytes.
630  break;
631 
632  case 1:
633  // Bad input; we have 6 bits left over.
634  return false;
635 
636  case 2:
637  // Produce one more output byte from the 12 input bits we have left.
638  if (dest) {
639  if (destidx + 1 > szdest) return false;
640  temp >>= 4;
641  dest[destidx] = temp;
642  }
643  ++destidx;
644  expected_equals = 2;
645  break;
646 
647  case 3:
648  // Produce two more output bytes from the 18 input bits we have left.
649  if (dest) {
650  if (destidx + 2 > szdest) return false;
651  temp >>= 2;
652  dest[destidx + 1] = temp;
653  temp >>= 8;
654  dest[destidx] = temp;
655  }
656  destidx += 2;
657  expected_equals = 1;
658  break;
659 
660  default:
661  // state should have no other values at this point.
662  ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
663  state);
664  }
665 
666  // The remainder of the input should be all whitespace, mixed with
667  // exactly 0 equals signs, or exactly 'expected_equals' equals
668  // signs. (Always accepting 0 equals signs is an Abseil extension
669  // not covered in the RFC, as is accepting dot as the pad character.)
670 
671  int equals = 0;
672  while (szsrc > 0) {
673  if (*src == kPad64Equals || *src == kPad64Dot)
674  ++equals;
675  else if (!absl::ascii_isspace(*src))
676  return false;
677  --szsrc;
678  ++src;
679  }
680 
// '*len' is only written when the padding count is acceptable.
681  const bool ok = (equals == 0 || equals == expected_equals);
682  if (ok) *len = destidx;
683  return ok;
684 }
685 
686 // The arrays below were generated by the following code
687 // #include <sys/time.h>
688 // #include <stdlib.h>
689 // #include <string.h>
690 // main()
691 // {
692 // static const char Base64[] =
693 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
694 // char* pos;
695 // int idx, i, j;
696 // printf(" ");
697 // for (i = 0; i < 255; i += 8) {
698 // for (j = i; j < i + 8; j++) {
699 // pos = strchr(Base64, j);
700 // if ((pos == nullptr) || (j == 0))
701 // idx = -1;
702 // else
703 // idx = pos - Base64;
704 // if (idx == -1)
705 // printf(" %2d, ", idx);
706 // else
707 // printf(" %2d/*%c*/,", idx, j);
708 // }
709 // printf("\n ");
710 // }
711 // }
712 //
713 // where the value of "Base64[]" was replaced by one of the base-64 conversion
714 // tables from the functions below.
715 /* clang-format off */
// Decoding table for the standard base64 alphabet (A-Z, a-z, 0-9, '+', '/').
// It has 256 entries indexed by unsigned byte value: each alphabet character
// maps to its 6-bit value (0-63) and every other byte maps to -1.
716 constexpr signed char kUnBase64[] = {
717  -1, -1, -1, -1, -1, -1, -1, -1,
718  -1, -1, -1, -1, -1, -1, -1, -1,
719  -1, -1, -1, -1, -1, -1, -1, -1,
720  -1, -1, -1, -1, -1, -1, -1, -1,
721  -1, -1, -1, -1, -1, -1, -1, -1,
722  -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
723  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
724  60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
725  -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
726  07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
727  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
728  23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
729  -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
730  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
731  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
732  49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
733  -1, -1, -1, -1, -1, -1, -1, -1,
734  -1, -1, -1, -1, -1, -1, -1, -1,
735  -1, -1, -1, -1, -1, -1, -1, -1,
736  -1, -1, -1, -1, -1, -1, -1, -1,
737  -1, -1, -1, -1, -1, -1, -1, -1,
738  -1, -1, -1, -1, -1, -1, -1, -1,
739  -1, -1, -1, -1, -1, -1, -1, -1,
740  -1, -1, -1, -1, -1, -1, -1, -1,
741  -1, -1, -1, -1, -1, -1, -1, -1,
742  -1, -1, -1, -1, -1, -1, -1, -1,
743  -1, -1, -1, -1, -1, -1, -1, -1,
744  -1, -1, -1, -1, -1, -1, -1, -1,
745  -1, -1, -1, -1, -1, -1, -1, -1,
746  -1, -1, -1, -1, -1, -1, -1, -1,
747  -1, -1, -1, -1, -1, -1, -1, -1,
748  -1, -1, -1, -1, -1, -1, -1, -1
749 };
750 
// Decoding table for the web-safe base64 alphabet, which uses '-' and '_' in
// place of '+' and '/'. It has 256 entries indexed by unsigned byte value:
// each alphabet character maps to its 6-bit value (0-63) and every other byte
// maps to -1.
751 constexpr signed char kUnWebSafeBase64[] = {
752  -1, -1, -1, -1, -1, -1, -1, -1,
753  -1, -1, -1, -1, -1, -1, -1, -1,
754  -1, -1, -1, -1, -1, -1, -1, -1,
755  -1, -1, -1, -1, -1, -1, -1, -1,
756  -1, -1, -1, -1, -1, -1, -1, -1,
757  -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
758  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
759  60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
760  -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
761  07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
762  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
763  23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
764  -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
765  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
766  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
767  49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
768  -1, -1, -1, -1, -1, -1, -1, -1,
769  -1, -1, -1, -1, -1, -1, -1, -1,
770  -1, -1, -1, -1, -1, -1, -1, -1,
771  -1, -1, -1, -1, -1, -1, -1, -1,
772  -1, -1, -1, -1, -1, -1, -1, -1,
773  -1, -1, -1, -1, -1, -1, -1, -1,
774  -1, -1, -1, -1, -1, -1, -1, -1,
775  -1, -1, -1, -1, -1, -1, -1, -1,
776  -1, -1, -1, -1, -1, -1, -1, -1,
777  -1, -1, -1, -1, -1, -1, -1, -1,
778  -1, -1, -1, -1, -1, -1, -1, -1,
779  -1, -1, -1, -1, -1, -1, -1, -1,
780  -1, -1, -1, -1, -1, -1, -1, -1,
781  -1, -1, -1, -1, -1, -1, -1, -1,
782  -1, -1, -1, -1, -1, -1, -1, -1,
783  -1, -1, -1, -1, -1, -1, -1, -1
784 };
785 /* clang-format on */
786 
// Returns the number of characters needed to base64-encode 'input_len' bytes,
// counting trailing '=' padding only when 'do_padding' is true.
//
// Base64 turns every three input bytes into four output characters. Per
// https://tools.ietf.org/html/rfc3548, when fewer than 24 bits remain at the
// end of the input, zero bits are appended to form whole 6-bit groups and the
// final quantum is completed with '=' padding, which leaves three cases:
//   (1) no bytes left over: no extra characters and no padding;
//   (2) one byte (8 bits) left over: two characters, plus two '=';
//   (3) two bytes (16 bits) left over: three characters, plus one '='.
size_t CalculateBase64EscapedLenInternal(size_t input_len, bool do_padding) {
  const size_t full_groups = input_len / 3;
  const size_t leftover_bytes = input_len % 3;

  size_t len = full_groups * 4;
  switch (leftover_bytes) {
    case 0:
      // Case (1): nothing to add.
      break;
    case 1:
      // Case (2): two data characters, optionally two pad characters.
      len += do_padding ? 4 : 2;
      break;
    default:
      // Case (3): three data characters, optionally one pad character.
      len += do_padding ? 4 : 3;
      break;
  }

  assert(len >= input_len);  // make sure we didn't overflow
  return len;
}
832 
833 size_t Base64EscapeInternal(const unsigned char* src, size_t szsrc, char* dest,
834  size_t szdest, const char* base64,
835  bool do_padding) {
836  static const char kPad64 = '=';
837 
838  if (szsrc * 4 > szdest * 3) return 0;
839 
840  char* cur_dest = dest;
841  const unsigned char* cur_src = src;
842 
843  char* const limit_dest = dest + szdest;
844  const unsigned char* const limit_src = src + szsrc;
845 
846  // Three bytes of data encodes to four characters of cyphertext.
847  // So we can pump through three-byte chunks atomically.
848  if (szsrc >= 3) { // "limit_src - 3" is UB if szsrc < 3.
849  while (cur_src < limit_src - 3) { // While we have >= 32 bits.
850  uint32_t in = absl::big_endian::Load32(cur_src) >> 8;
851 
852  cur_dest[0] = base64[in >> 18];
853  in &= 0x3FFFF;
854  cur_dest[1] = base64[in >> 12];
855  in &= 0xFFF;
856  cur_dest[2] = base64[in >> 6];
857  in &= 0x3F;
858  cur_dest[3] = base64[in];
859 
860  cur_dest += 4;
861  cur_src += 3;
862  }
863  }
864  // To save time, we didn't update szdest or szsrc in the loop. So do it now.
865  szdest = limit_dest - cur_dest;
866  szsrc = limit_src - cur_src;
867 
868  /* now deal with the tail (<=3 bytes) */
869  switch (szsrc) {
870  case 0:
871  // Nothing left; nothing more to do.
872  break;
873  case 1: {
874  // One byte left: this encodes to two characters, and (optionally)
875  // two pad characters to round out the four-character cypherblock.
876  if (szdest < 2) return 0;
877  uint32_t in = cur_src[0];
878  cur_dest[0] = base64[in >> 2];
879  in &= 0x3;
880  cur_dest[1] = base64[in << 4];
881  cur_dest += 2;
882  szdest -= 2;
883  if (do_padding) {
884  if (szdest < 2) return 0;
885  cur_dest[0] = kPad64;
886  cur_dest[1] = kPad64;
887  cur_dest += 2;
888  szdest -= 2;
889  }
890  break;
891  }
892  case 2: {
893  // Two bytes left: this encodes to three characters, and (optionally)
894  // one pad character to round out the four-character cypherblock.
895  if (szdest < 3) return 0;
896  uint32_t in = absl::big_endian::Load16(cur_src);
897  cur_dest[0] = base64[in >> 10];
898  in &= 0x3FF;
899  cur_dest[1] = base64[in >> 4];
900  in &= 0x00F;
901  cur_dest[2] = base64[in << 2];
902  cur_dest += 3;
903  szdest -= 3;
904  if (do_padding) {
905  if (szdest < 1) return 0;
906  cur_dest[0] = kPad64;
907  cur_dest += 1;
908  szdest -= 1;
909  }
910  break;
911  }
912  case 3: {
913  // Three bytes left: same as in the big loop above. We can't do this in
914  // the loop because the loop above always reads 4 bytes, and the fourth
915  // byte is past the end of the input.
916  if (szdest < 4) return 0;
917  uint32_t in = (cur_src[0] << 16) + absl::big_endian::Load16(cur_src + 1);
918  cur_dest[0] = base64[in >> 18];
919  in &= 0x3FFFF;
920  cur_dest[1] = base64[in >> 12];
921  in &= 0xFFF;
922  cur_dest[2] = base64[in >> 6];
923  in &= 0x3F;
924  cur_dest[3] = base64[in];
925  cur_dest += 4;
926  szdest -= 4;
927  break;
928  }
929  default:
930  // Should not be reached: blocks of 4 bytes are handled
931  // in the while loop before this switch statement.
932  ABSL_RAW_LOG(FATAL, "Logic problem? szsrc = %zu", szsrc);
933  break;
934  }
935  return (cur_dest - dest);
936 }
937 
// The standard base64 alphabet; the character at index i encodes the 6-bit
// value i. Uses '+' and '/' for values 62 and 63.
938 constexpr char kBase64Chars[] =
939  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
940 
// The web-safe base64 alphabet; the character at index i encodes the 6-bit
// value i. Identical to the standard alphabet except that values 62 and 63
// are written as '-' and '_'.
941 constexpr char kWebSafeBase64Chars[] =
942  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
943 
944 template <typename String>
945 void Base64EscapeInternal(const unsigned char* src, size_t szsrc, String* dest,
946  bool do_padding, const char* base64_chars) {
947  const size_t calc_escaped_size =
948  CalculateBase64EscapedLenInternal(szsrc, do_padding);
949  strings_internal::STLStringResizeUninitialized(dest, calc_escaped_size);
950 
951  const size_t escaped_len = Base64EscapeInternal(
952  src, szsrc, &(*dest)[0], dest->size(), base64_chars, do_padding);
953  assert(calc_escaped_size == escaped_len);
954  dest->erase(escaped_len);
955 }
956 
// Decodes the 'slen' bytes of base64 text at 'src' into '*dest', replacing
// its previous contents. 'unbase64' is the decoding table for the alphabet in
// use. Returns true on success; on failure returns false and leaves '*dest'
// empty.
template <typename String>
bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
                            const signed char* unbase64) {
  // Determine the size of the output string.  Base64 encodes every 3 bytes
  // into 4 characters.  Any leftover chars are added directly for good
  // measure.  This is documented in the base64 RFC:
  // http://tools.ietf.org/html/rfc3548
  const size_t dest_len = 3 * (slen / 4) + (slen % 4);

  // The raw decoder writes through a pointer into the string, so the string
  // must own 'dest_len' bytes before it is called.
  dest->resize(dest_len);

  // We are getting the destination buffer by getting the beginning of the
  // string and converting it into a char *.
  size_t len;
  const bool ok =
      Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
  if (!ok) {
    dest->clear();
    return false;
  }

  // could be shorter if there was padding
  assert(len <= dest_len);
  dest->erase(len);

  return true;
}
983 
// Lookup table indexed by byte value. Entries for '0'..'9', 'A'..'F' and
// 'a'..'f' hold the digit's numeric value (0..15); every other entry is 0, so
// a non-hex byte cannot be told apart from '0' through this table.
/* clang-format off */
constexpr char kHexValue[256] = {
    /* 0x00 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0,  // '0'..'9'
    /* 0x40 */ 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'A'..'F'
    /* 0x50 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'a'..'f'
    /* 0x70 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xD0 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xE0 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xF0 */ 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* clang-format on */
1004 
1005 // This is a templated function so that T can be either a char*
1006 // or a string. This works because we use the [] operator to access
1007 // individual characters at a time.
1008 template <typename T>
1009 void HexStringToBytesInternal(const char* from, T to, ptrdiff_t num) {
1010  for (int i = 0; i < num; i++) {
1011  to[i] = (kHexValue[from[i * 2] & 0xFF] << 4) +
1012  (kHexValue[from[i * 2 + 1] & 0xFF]);
1013  }
1014 }
1015 
1016 // This is a templated function so that T can be either a char* or a
1017 // std::string.
1018 template <typename T>
1019 void BytesToHexStringInternal(const unsigned char* src, T dest, ptrdiff_t num) {
1020  auto dest_ptr = &dest[0];
1021  for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
1022  const char* hex_p = &kHexTable[*src_ptr * 2];
1023  std::copy(hex_p, hex_p + 2, dest_ptr);
1024  }
1025 }
1026 
1027 } // namespace
1028 
1029 // ----------------------------------------------------------------------
1030 // CUnescape()
1031 //
1032 // See CUnescapeInternal() for implementation details.
1033 // ----------------------------------------------------------------------
1034 bool CUnescape(absl::string_view source, std::string* dest,
1035  std::string* error) {
1036  return CUnescapeInternal(source, kUnescapeNulls, dest, error);
1037 }
1038 
1039 std::string CEscape(absl::string_view src) {
1040  std::string dest;
1041  CEscapeAndAppendInternal(src, &dest);
1042  return dest;
1043 }
1044 
1045 std::string CHexEscape(absl::string_view src) {
1046  return CEscapeInternal(src, true, false);
1047 }
1048 
1050  return CEscapeInternal(src, false, true);
1051 }
1052 
1054  return CEscapeInternal(src, true, true);
1055 }
1056 
1057 // ----------------------------------------------------------------------
1058 // Base64Unescape() - base64 decoder
1059 // Base64Escape() - base64 encoder
1060 // WebSafeBase64Unescape() - Google's variation of base64 decoder
1061 // WebSafeBase64Escape() - Google's variation of base64 encoder
1062 //
1063 // Check out
1064 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1065 // care about is that...
1066 // Take the encoded stuff in groups of 4 characters and turn each
1067 // character into a code 0 to 63 thus:
1068 // A-Z map to 0 to 25
1069 // a-z map to 26 to 51
1070 // 0-9 map to 52 to 61
1071 // +(- for WebSafe) maps to 62
1072 // /(_ for WebSafe) maps to 63
1073 // There will be four numbers, all less than 64 which can be represented
1074 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1075 // Arrange the 6 digit binary numbers into three bytes as such:
1076 // aaaaaabb bbbbcccc ccdddddd
1077 // Equals signs (one or two) are used at the end of the encoded block to
1078 // indicate that the text was not an integer multiple of three bytes long.
1079 // ----------------------------------------------------------------------
1080 
1081 bool Base64Unescape(absl::string_view src, std::string* dest) {
1082  return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
1083 }
1084 
1085 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
1086  return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
1087 }
1088 
1089 void Base64Escape(absl::string_view src, std::string* dest) {
1090  Base64EscapeInternal(reinterpret_cast<const unsigned char*>(src.data()),
1091  src.size(), dest, true, kBase64Chars);
1092 }
1093 
1094 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
1095  Base64EscapeInternal(reinterpret_cast<const unsigned char*>(src.data()),
1096  src.size(), dest, false, kWebSafeBase64Chars);
1097 }
1098 
1100  std::string dest;
1101  Base64EscapeInternal(reinterpret_cast<const unsigned char*>(src.data()),
1102  src.size(), &dest, true, kBase64Chars);
1103  return dest;
1104 }
1105 
1107  std::string dest;
1108  Base64EscapeInternal(reinterpret_cast<const unsigned char*>(src.data()),
1109  src.size(), &dest, false, kWebSafeBase64Chars);
1110  return dest;
1111 }
1112 
1114  std::string result;
1115  const auto num = from.size() / 2;
1117  absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
1118  return result;
1119 }
1120 
1122  std::string result;
1124  absl::BytesToHexStringInternal<std::string&>(
1125  reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
1126  return result;
1127 }
1128 
1129 } // namespace absl
std::string Utf8SafeCEscape(absl::string_view src)
Definition: escaping.cc:1049
std::string CEscape(absl::string_view src)
Definition: escaping.cc:1039
#define ABSL_RAW_LOG(severity,...)
Definition: raw_logging.h:42
uint16_t Load16(const void *p)
Definition: endian.h:244
std::string StrCat(const AlphaNum &a, const AlphaNum &b)
Definition: str_cat.cc:98
std::string Utf8SafeCHexEscape(absl::string_view src)
Definition: escaping.cc:1053
bool ascii_isprint(unsigned char c)
Definition: ascii.h:135
bool ascii_isspace(unsigned char c)
Definition: ascii.h:93
std::string BytesToHexString(absl::string_view from)
Definition: escaping.cc:1121
char * end
bool WebSafeBase64Unescape(absl::string_view src, std::string *dest)
Definition: escaping.cc:1085
Definition: algorithm.h:29
size_t EncodeUTF8Char(char *buffer, char32_t utf8_char)
Definition: utf8.cc:22
constexpr size_type size() const noexcept
Definition: string_view.h:260
std::string CHexEscape(absl::string_view src)
Definition: escaping.cc:1045
void WebSafeBase64Escape(absl::string_view src, std::string *dest)
Definition: escaping.cc:1094
size_t to
uint32_t Load32(const void *p)
Definition: endian.h:252
void STLStringResizeUninitialized(string_type *s, size_t new_size)
void Base64Escape(absl::string_view src, std::string *dest)
Definition: escaping.cc:1089
std::string HexStringToBytes(absl::string_view from)
Definition: escaping.cc:1113
bool Base64Unescape(absl::string_view src, std::string *dest)
Definition: escaping.cc:1081
bool CUnescape(absl::string_view source, std::string *dest, std::string *error)
Definition: escaping.cc:1034
bool ascii_isxdigit(unsigned char c)
Definition: ascii.h:122
constexpr const_pointer data() const noexcept
Definition: string_view.h:302
#define GET_INPUT(label, remain)
size_t from
const char * in
Definition: parser_test.cc:350


abseil_cpp
Author(s):
autogenerated on Wed Jun 19 2019 19:19:56