protobuf/src/google/protobuf/util/internal/json_escaping.cc
Go to the documentation of this file.
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #include <google/protobuf/util/internal/json_escaping.h>
32 
33 #include <cstdint>
34 
35 #include <google/protobuf/stubs/logging.h>
36 #include <google/protobuf/stubs/common.h>
37 
38 namespace google {
39 namespace protobuf {
40 namespace util {
41 namespace converter {
42 
43 namespace {
44 
// Lower-case hexadecimal digits, indexed by nibble value (0..15).
static constexpr char kHex[] = "0123456789abcdef";
47 
// Escape lookup table for the code points 0x00..0x9f, which are by far the
// most common inputs.
//
// For a unicode code point ch < 0xa0, kCommonEscapes[ch] is
//   - the escaped spelling of ch, when ch has to be escaped; or
//   - the empty string, when ch can be emitted unchanged.
//
// The table is laid out eight entries per row; the trailing comment on each
// row is the code point of the row's first entry.
static const char kCommonEscapes[160][7] = {
    // C0 (ASCII and derivatives) control characters.
    "\\u0000", "\\u0001", "\\u0002", "\\u0003",
    "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x00
    "\\b", "\\t", "\\n", "\\u000b",
    "\\f", "\\r", "\\u000e", "\\u000f",  // 0x08
    "\\u0010", "\\u0011", "\\u0012", "\\u0013",
    "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x10
    "\\u0018", "\\u0019", "\\u001a", "\\u001b",
    "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x18
    // Printable ASCII. The www.json.org string definition requires " and \
    // to be escaped; < and > are escaped for HTML security.
    "", "", "\\\"", "", "", "", "", "",            // 0x20  (0x22 is ")
    "", "", "", "", "", "", "", "",                // 0x28
    "", "", "", "", "", "", "", "",                // 0x30
    "", "", "", "", "\\u003c", "", "\\u003e", "",  // 0x38  (0x3c is <, 0x3e is >)
    "", "", "", "", "", "", "", "",                // 0x40
    "", "", "", "", "", "", "", "",                // 0x48
    "", "", "", "", "", "", "", "",                // 0x50
    "", "", "", "", "\\\\", "", "", "",            // 0x58  (0x5c is \)
    "", "", "", "", "", "", "", "",                // 0x60
    "", "", "", "", "", "", "", "",                // 0x68
    "", "", "", "", "", "", "", "",                // 0x70
    "", "", "", "", "", "", "", "\\u007f",         // 0x78  (0x7f is DEL)
    // C1 (ISO 8859 and Unicode) extended control characters.
    "\\u0080", "\\u0081", "\\u0082", "\\u0083",
    "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x80
    "\\u0088", "\\u0089", "\\u008a", "\\u008b",
    "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x88
    "\\u0090", "\\u0091", "\\u0092", "\\u0093",
    "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x90
    "\\u0098", "\\u0099", "\\u009a", "\\u009b",
    "\\u009c", "\\u009d", "\\u009e", "\\u009f",  // 0x98
};
79 
80 // Determines if the given char value is a unicode surrogate code unit (either
81 // high-surrogate or low-surrogate).
82 inline bool IsSurrogate(uint32_t c) {
83  // Optimized form of:
84  // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate;
85  // (Reduced from 3 ALU instructions to 2 ALU instructions)
86  return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate;
87 }
88 
89 // Returns true if the given unicode code point cp is a valid
90 // unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint).
91 inline bool IsValidCodePoint(uint32_t cp) {
92  return cp <= JsonEscaping::kMaxCodePoint;
93 }
94 
95 // Returns the low surrogate for the given unicode code point. The result is
96 // meaningless if the given code point is not a supplementary character.
97 inline uint16_t ToLowSurrogate(uint32_t cp) {
98  return (cp &
101 }
102 
103 // Returns the high surrogate for the given unicode code point. The result is
104 // meaningless if the given code point is not a supplementary character.
105 inline uint16_t ToHighSurrogate(uint32_t cp) {
106  return (cp >> 10) + (JsonEscaping::kMinHighSurrogate -
108 }
109 
110 // Input str is encoded in UTF-8. A unicode code point could be encoded in
111 // UTF-8 using anywhere from 1 to 4 characters, and it could span multiple
112 // reads of the ByteSource.
113 //
114 // This function reads the next unicode code point from the input (str) at
115 // the given position (index), taking into account any left-over partial
116 // code point from the previous iteration (cp), together with the number
117 // of characters left to read to complete this code point (num_left).
118 //
119 // This function assumes that the input (str) is valid at the given position
120 // (index). In order words, at least one character could be read successfully.
121 //
122 // The code point read (partial or complete) is stored in (cp). Upon return,
123 // (num_left) stores the number of characters that has yet to be read in
124 // order to complete the current unicode code point. If the read is complete,
125 // then (num_left) is 0. Also, (num_read) is the number of characters read.
126 //
127 // Returns false if we encounter an invalid UTF-8 string. Returns true
128 // otherwise, including the case when we reach the end of the input (str)
129 // before a complete unicode code point is read.
130 bool ReadCodePoint(StringPiece str, int index, uint32_t* cp,
131  int* num_left, int* num_read) {
132  if (*num_left == 0) {
133  // Last read was complete. Start reading a new unicode code point.
134  *cp = static_cast<uint8_t>(str[index++]);
135  *num_read = 1;
136  // The length of the code point is determined from reading the first byte.
137  //
138  // If the first byte is between:
139  // 0..0x7f: that's the value of the code point.
140  // 0x80..0xbf: <invalid>
141  // 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
142  // bit 10-6, bit 5-0
143  // 0xe0..0xef: 16-bit code point encoded in 3 bytes.
144  // bit 15-12, bit 11-6, bit 5-0
145  // 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
146  // bit 20-18, bit 17-12, bit 11-6, bit 5-0
147  // 0xf8..0xff: <invalid>
148  //
149  // Meaning of each bit:
150  // <msb> bit 7: 0 - single byte code point: bits 6-0 are values.
151  // 1 - multibyte code point
152  // bit 6: 0 - subsequent bytes of multibyte code point:
153  // bits 5-0 are values.
154  // 1 - first byte of multibyte code point
155  // bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values.
156  // 1 - first byte of code point with >= 3 bytes.
157  // bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values.
158  // 1 - first byte of code point with >= 4 bytes.
159  // bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values.
160  // 1 - reserved for future expansion.
161  if (*cp <= 0x7f) {
162  return true;
163  } else if (*cp <= 0xbf) {
164  return false;
165  } else if (*cp <= 0xdf) {
166  *cp &= 0x1f;
167  *num_left = 1;
168  } else if (*cp <= 0xef) {
169  *cp &= 0x0f;
170  *num_left = 2;
171  } else if (*cp <= 0xf7) {
172  *cp &= 0x07;
173  *num_left = 3;
174  } else {
175  return false;
176  }
177  } else {
178  // Last read was partial. Initialize num_read to 0 and continue reading
179  // the last unicode code point.
180  *num_read = 0;
181  }
182  while (*num_left > 0 && index < str.size()) {
183  uint32_t ch = static_cast<uint8_t>(str[index++]);
184  --(*num_left);
185  ++(*num_read);
186  *cp = (*cp << 6) | (ch & 0x3f);
187  if (ch < 0x80 || ch > 0xbf) return false;
188  }
189  return *num_left > 0 || (!IsSurrogate(*cp) && IsValidCodePoint(*cp));
190 }
191 
192 // Stores the 16-bit unicode code point as its hexadecimal digits in buffer
193 // and returns a StringPiece that points to this buffer. The input buffer needs
194 // to be at least 6 bytes long.
195 StringPiece ToHex(uint16_t cp, char* buffer) {
196  buffer[5] = kHex[cp & 0x0f];
197  cp >>= 4;
198  buffer[4] = kHex[cp & 0x0f];
199  cp >>= 4;
200  buffer[3] = kHex[cp & 0x0f];
201  cp >>= 4;
202  buffer[2] = kHex[cp & 0x0f];
203  return StringPiece(buffer, 6);
204 }
205 
206 // Stores the 32-bit unicode code point as its hexadecimal digits in buffer
207 // and returns a StringPiece that points to this buffer. The input buffer needs
208 // to be at least 12 bytes long.
209 StringPiece ToSurrogateHex(uint32_t cp, char* buffer) {
210  uint16_t low = ToLowSurrogate(cp);
211  uint16_t high = ToHighSurrogate(cp);
212 
213  buffer[11] = kHex[low & 0x0f];
214  low >>= 4;
215  buffer[10] = kHex[low & 0x0f];
216  low >>= 4;
217  buffer[9] = kHex[low & 0x0f];
218  low >>= 4;
219  buffer[8] = kHex[low & 0x0f];
220 
221  buffer[5] = kHex[high & 0x0f];
222  high >>= 4;
223  buffer[4] = kHex[high & 0x0f];
224  high >>= 4;
225  buffer[3] = kHex[high & 0x0f];
226  high >>= 4;
227  buffer[2] = kHex[high & 0x0f];
228 
229  return StringPiece(buffer, 12);
230 }
231 
232 // If the given unicode code point needs escaping, then returns the
233 // escaped form. The returned StringPiece either points to statically
234 // pre-allocated char[] or to the given buffer. The input buffer needs
235 // to be at least 12 bytes long.
236 //
237 // If the given unicode code point does not need escaping, an empty
238 // StringPiece is returned.
239 StringPiece EscapeCodePoint(uint32_t cp, char* buffer) {
240  if (cp < 0xa0) return kCommonEscapes[cp];
241  switch (cp) {
242  // These are not required by json spec
243  // but used to prevent security bugs in javascript.
244  case 0xfeff: // Zero width no-break space
245  case 0xfff9: // Interlinear annotation anchor
246  case 0xfffa: // Interlinear annotation separator
247  case 0xfffb: // Interlinear annotation terminator
248 
249  case 0x00ad: // Soft-hyphen
250  case 0x06dd: // Arabic end of ayah
251  case 0x070f: // Syriac abbreviation mark
252  case 0x17b4: // Khmer vowel inherent Aq
253  case 0x17b5: // Khmer vowel inherent Aa
254  return ToHex(cp, buffer);
255 
256  default:
257  if ((cp >= 0x0600 && cp <= 0x0603) || // Arabic signs
258  (cp >= 0x200b && cp <= 0x200f) || // Zero width etc.
259  (cp >= 0x2028 && cp <= 0x202e) || // Separators etc.
260  (cp >= 0x2060 && cp <= 0x2064) || // Invisible etc.
261  (cp >= 0x206a && cp <= 0x206f)) { // Shaping etc.
262  return ToHex(cp, buffer);
263  }
264 
265  if (cp == 0x000e0001 || // Language tag
266  (cp >= 0x0001d173 && cp <= 0x0001d17a) || // Music formatting
267  (cp >= 0x000e0020 && cp <= 0x000e007f)) { // TAG symbols
268  return ToSurrogateHex(cp, buffer);
269  }
270  }
271  return StringPiece();
272 }
273 
274 // Tries to escape the given code point first. If the given code point
275 // does not need to be escaped, but force_output is true, then render
276 // the given multi-byte code point in UTF8 in the buffer and returns it.
277 StringPiece EscapeCodePoint(uint32_t cp, char* buffer,
278  bool force_output) {
279  StringPiece sp = EscapeCodePoint(cp, buffer);
280  if (force_output && sp.empty()) {
281  buffer[5] = (cp & 0x3f) | 0x80;
282  cp >>= 6;
283  if (cp <= 0x1f) {
284  buffer[4] = cp | 0xc0;
285  sp = StringPiece(buffer + 4, 2);
286  return sp;
287  }
288  buffer[4] = (cp & 0x3f) | 0x80;
289  cp >>= 6;
290  if (cp <= 0x0f) {
291  buffer[3] = cp | 0xe0;
292  sp = StringPiece(buffer + 3, 3);
293  return sp;
294  }
295  buffer[3] = (cp & 0x3f) | 0x80;
296  buffer[2] = ((cp >> 6) & 0x07) | 0xf0;
297  sp = StringPiece(buffer + 2, 4);
298  }
299  return sp;
300 }
301 
302 } // namespace
303 
304 void JsonEscaping::Escape(strings::ByteSource* input,
305  strings::ByteSink* output) {
306  char buffer[12] = "\\udead\\ubee";
307  uint32_t cp = 0; // Current unicode code point.
308  int num_left = 0; // Num of chars to read to complete the code point.
309  while (input->Available() > 0) {
310  StringPiece str = input->Peek();
311  StringPiece escaped;
312  int i = 0;
313  int num_read;
314  bool ok;
315  bool cp_was_split = num_left > 0;
316  // Loop until we encounter either
317  // i) a code point that needs to be escaped; or
318  // ii) a split code point is completely read; or
319  // iii) a character that is not a valid utf8; or
320  // iv) end of the StringPiece str is reached.
321  do {
322  ok = ReadCodePoint(str, i, &cp, &num_left, &num_read);
323  if (num_left > 0 || !ok) break; // case iii or iv
324  escaped = EscapeCodePoint(cp, buffer, cp_was_split);
325  if (!escaped.empty()) break; // case i or ii
326  i += num_read;
327  num_read = 0;
328  } while (i < str.length()); // case iv
329  // First copy the un-escaped prefix, if any, to the output ByteSink.
330  if (i > 0) input->CopyTo(output, i);
331  if (num_read > 0) input->Skip(num_read);
332  if (!ok) {
333  // Case iii: Report error.
334  // TODO(wpoon): Add error reporting.
335  num_left = 0;
336  } else if (num_left == 0 && !escaped.empty()) {
337  // Case i or ii: Append the escaped code point to the output ByteSink.
338  output->Append(escaped.data(), escaped.size());
339  }
340  }
341  if (num_left > 0) {
342  // Treat as case iii: report error.
343  // TODO(wpoon): Add error reporting.
344  }
345 }
346 
348  const size_t len = input.length();
349  const char* p = input.data();
350 
351  bool can_skip_escaping = true;
352  for (int i = 0; i < len; i++) {
353  char c = p[i];
354  if (c < 0x20 || c >= 0x7F || c == '"' || c == '<' || c == '>' ||
355  c == '\\') {
356  can_skip_escaping = false;
357  break;
358  }
359  }
360 
361  if (can_skip_escaping) {
362  output->Append(input.data(), input.length());
363  } else {
365  Escape(&source, output);
366  }
367 }
368 
369 } // namespace converter
370 } // namespace util
371 } // namespace protobuf
372 } // namespace google
xds_interop_client.str
str
Definition: xds_interop_client.py:487
uint16_t
unsigned short uint16_t
Definition: stdint-msvc2008.h:79
google::protobuf
Definition: bloaty/third_party/protobuf/benchmarks/util/data_proto2_to_proto3_util.h:12
make_cmakelists.converter
converter
Definition: make_cmakelists.py:317
uint8_t
unsigned char uint8_t
Definition: stdint-msvc2008.h:78
google::protobuf::ToHex
PROTOBUF_EXPORT string ToHex(uint64 num)
uint32_t
unsigned int uint32_t
Definition: stdint-msvc2008.h:80
absl::ABSL_NAMESPACE_BEGIN::IsSurrogate
bool IsSurrogate(char32_t c, absl::string_view src, std::string *error)
Definition: abseil-cpp/absl/strings/escaping.cc:56
google::protobuf::util::converter::JsonEscaping::kMinSupplementaryCodePoint
static const uint32 kMinSupplementaryCodePoint
Definition: bloaty/third_party/protobuf/src/google/protobuf/util/internal/json_escaping.h:66
gmock_output_test.output
output
Definition: bloaty/third_party/googletest/googlemock/test/gmock_output_test.py:175
google::protobuf::StringPiece
Definition: bloaty/third_party/protobuf/src/google/protobuf/stubs/stringpiece.h:180
google::protobuf::util::converter::JsonEscaping::kMaxLowSurrogate
static const uint16 kMaxLowSurrogate
Definition: bloaty/third_party/protobuf/src/google/protobuf/util/internal/json_escaping.h:62
buffer
char buffer[1024]
Definition: libuv/docs/code/idle-compute/main.c:8
google::protobuf::IsValidCodePoint
bool IsValidCodePoint(uint32 code_point)
Definition: bloaty/third_party/protobuf/src/google/protobuf/stubs/strutil.h:865
index
int index
Definition: bloaty/third_party/protobuf/php/ext/google/protobuf/protobuf.h:1184
ok
bool ok
Definition: async_end2end_test.cc:197
google::protobuf::util::converter::JsonEscaping::Escape
static void Escape(strings::ByteSource *input, strings::ByteSink *output)
Definition: bloaty/third_party/protobuf/src/google/protobuf/util/internal/json_escaping.cc:301
google::protobuf::util::converter::JsonEscaping::kMaxCodePoint
static const uint32 kMaxCodePoint
Definition: bloaty/third_party/protobuf/src/google/protobuf/util/internal/json_escaping.h:74
google::protobuf::strings::ByteSink
Definition: bloaty/third_party/protobuf/src/google/protobuf/stubs/bytestream.h:78
input
std::string input
Definition: bloaty/third_party/protobuf/src/google/protobuf/io/tokenizer_unittest.cc:197
google::protobuf::strings::ArrayByteSource
Definition: bloaty/third_party/protobuf/src/google/protobuf/stubs/bytestream.h:296
google::protobuf::util::converter::JsonEscaping::kMinHighSurrogate
static const uint16 kMinHighSurrogate
Definition: bloaty/third_party/protobuf/src/google/protobuf/util/internal/json_escaping.h:47
ch
char ch
Definition: bloaty/third_party/googletest/googlemock/test/gmock-matchers_test.cc:3621
len
int len
Definition: abseil-cpp/absl/base/internal/low_level_alloc_test.cc:46
google::protobuf::util::converter::JsonEscaping::kMinLowSurrogate
static const uint16 kMinLowSurrogate
Definition: bloaty/third_party/protobuf/src/google/protobuf/util/internal/json_escaping.h:57
google
Definition: bloaty/third_party/protobuf/benchmarks/util/data_proto2_to_proto3_util.h:11
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230


grpc
Author(s):
autogenerated on Fri May 16 2025 02:59:12