json_escaping.cc
Go to the documentation of this file.
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
32 
35 
36 namespace google {
37 namespace protobuf {
38 namespace util {
39 namespace converter {
40 
41 namespace {
42 
// Lookup table mapping a nibble value (0..15) to its lowercase hexadecimal
// digit. The 17th element is the string literal's terminating NUL.
static const char kHex[17] = "0123456789abcdef";
45 
// Escape table for the code points 0x00..0x9f, which are common enough to
// deserve a direct lookup instead of range checks.
//
// For a unicode code point ch < 0xa0, kCommonEscapes[ch] holds:
//   - the escaped spelling of ch, if ch must be escaped; or
//   - an empty string, if ch can be emitted unchanged.
//
// Each entry is at most 6 characters ("\uXXXX") plus the terminating NUL.
// The trailing comment on every row is the index of the row's first entry.
static const char kCommonEscapes[160][7] = {
    // 0x00..0x1f: C0 (ASCII and derivatives) control characters.
    "\\u0000", "\\u0001", "\\u0002", "\\u0003",  // 0x00
    "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x04
    "\\b",     "\\t",     "\\n",     "\\u000b",  // 0x08
    "\\f",     "\\r",     "\\u000e", "\\u000f",  // 0x0c
    "\\u0010", "\\u0011", "\\u0012", "\\u0013",  // 0x10
    "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x14
    "\\u0018", "\\u0019", "\\u001a", "\\u001b",  // 0x18
    "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x1c

    // 0x20..0x7f: printable ASCII plus DEL.
    // Escaping of " and \ is required by the www.json.org string definition.
    // Escaping of < and > is required for HTML security.
    "", "", "\\\"", "", "", "", "", "",            // 0x20 ('"' is 0x22)
    "", "", "", "", "", "", "", "",                // 0x28
    "", "", "", "", "", "", "", "",                // 0x30
    "", "", "", "", "\\u003c", "", "\\u003e", "",  // 0x38 ('<' 0x3c, '>' 0x3e)
    "", "", "", "", "", "", "", "",                // 0x40
    "", "", "", "", "", "", "", "",                // 0x48
    "", "", "", "", "", "", "", "",                // 0x50
    "", "", "", "", "\\\\", "", "", "",            // 0x58 ('\' is 0x5c)
    "", "", "", "", "", "", "", "",                // 0x60
    "", "", "", "", "", "", "", "",                // 0x68
    "", "", "", "", "", "", "", "",                // 0x70
    "", "", "", "", "", "", "", "\\u007f",         // 0x78 (DEL is 0x7f)

    // 0x80..0x9f: C1 (ISO 8859 and Unicode) extended control characters.
    "\\u0080", "\\u0081", "\\u0082", "\\u0083",  // 0x80
    "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x84
    "\\u0088", "\\u0089", "\\u008a", "\\u008b",  // 0x88
    "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x8c
    "\\u0090", "\\u0091", "\\u0092", "\\u0093",  // 0x90
    "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x94
    "\\u0098", "\\u0099", "\\u009a", "\\u009b",  // 0x98
    "\\u009c", "\\u009d", "\\u009e", "\\u009f"   // 0x9c
};
77 
78 // Determines if the given char value is a unicode surrogate code unit (either
79 // high-surrogate or low-surrogate).
80 inline bool IsSurrogate(uint32 c) {
81  // Optimized form of:
82  // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate;
83  // (Reduced from 3 ALU instructions to 2 ALU instructions)
84  return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate;
85 }
86 
87 // Returns true if the given unicode code point cp is a valid
88 // unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint).
89 inline bool IsValidCodePoint(uint32 cp) {
90  return cp <= JsonEscaping::kMaxCodePoint;
91 }
92 
93 // Returns the low surrogate for the given unicode code point. The result is
94 // meaningless if the given code point is not a supplementary character.
95 inline uint16 ToLowSurrogate(uint32 cp) {
96  return (cp &
99 }
100 
101 // Returns the high surrogate for the given unicode code point. The result is
102 // meaningless if the given code point is not a supplementary character.
103 inline uint16 ToHighSurrogate(uint32 cp) {
104  return (cp >> 10) + (JsonEscaping::kMinHighSurrogate -
106 }
107 
108 // Input str is encoded in UTF-8. A unicode code point could be encoded in
109 // UTF-8 using anywhere from 1 to 4 characters, and it could span multiple
110 // reads of the ByteSource.
111 //
112 // This function reads the next unicode code point from the input (str) at
113 // the given position (index), taking into account any left-over partial
114 // code point from the previous iteration (cp), together with the number
115 // of characters left to read to complete this code point (num_left).
116 //
117 // This function assumes that the input (str) is valid at the given position
118 // (index). In order words, at least one character could be read successfully.
119 //
120 // The code point read (partial or complete) is stored in (cp). Upon return,
121 // (num_left) stores the number of characters that has yet to be read in
122 // order to complete the current unicode code point. If the read is complete,
123 // then (num_left) is 0. Also, (num_read) is the number of characters read.
124 //
125 // Returns false if we encounter an invalid UTF-8 string. Returns true
126 // otherwise, including the case when we reach the end of the input (str)
127 // before a complete unicode code point is read.
128 bool ReadCodePoint(StringPiece str, int index, uint32* cp, int* num_left,
129  int* num_read) {
130  if (*num_left == 0) {
131  // Last read was complete. Start reading a new unicode code point.
132  *cp = static_cast<uint8>(str[index++]);
133  *num_read = 1;
134  // The length of the code point is determined from reading the first byte.
135  //
136  // If the first byte is between:
137  // 0..0x7f: that's the value of the code point.
138  // 0x80..0xbf: <invalid>
139  // 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
140  // bit 10-6, bit 5-0
141  // 0xe0..0xef: 16-bit code point encoded in 3 bytes.
142  // bit 15-12, bit 11-6, bit 5-0
143  // 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
144  // bit 20-18, bit 17-12, bit 11-6, bit 5-0
145  // 0xf8..0xff: <invalid>
146  //
147  // Meaning of each bit:
148  // <msb> bit 7: 0 - single byte code point: bits 6-0 are values.
149  // 1 - multibyte code point
150  // bit 6: 0 - subsequent bytes of multibyte code point:
151  // bits 5-0 are values.
152  // 1 - first byte of multibyte code point
153  // bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values.
154  // 1 - first byte of code point with >= 3 bytes.
155  // bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values.
156  // 1 - first byte of code point with >= 4 bytes.
157  // bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values.
158  // 1 - reserved for future expansion.
159  if (*cp <= 0x7f) {
160  return true;
161  } else if (*cp <= 0xbf) {
162  return false;
163  } else if (*cp <= 0xdf) {
164  *cp &= 0x1f;
165  *num_left = 1;
166  } else if (*cp <= 0xef) {
167  *cp &= 0x0f;
168  *num_left = 2;
169  } else if (*cp <= 0xf7) {
170  *cp &= 0x07;
171  *num_left = 3;
172  } else {
173  return false;
174  }
175  } else {
176  // Last read was partial. Initialize num_read to 0 and continue reading
177  // the last unicode code point.
178  *num_read = 0;
179  }
180  while (*num_left > 0 && index < str.size()) {
181  uint32 ch = static_cast<uint8>(str[index++]);
182  --(*num_left);
183  ++(*num_read);
184  *cp = (*cp << 6) | (ch & 0x3f);
185  if (ch < 0x80 || ch > 0xbf) return false;
186  }
187  return *num_left > 0 || (!IsSurrogate(*cp) && IsValidCodePoint(*cp));
188 }
189 
190 // Stores the 16-bit unicode code point as its hexadecimal digits in buffer
191 // and returns a StringPiece that points to this buffer. The input buffer needs
192 // to be at least 6 bytes long.
193 StringPiece ToHex(uint16 cp, char* buffer) {
194  buffer[5] = kHex[cp & 0x0f];
195  cp >>= 4;
196  buffer[4] = kHex[cp & 0x0f];
197  cp >>= 4;
198  buffer[3] = kHex[cp & 0x0f];
199  cp >>= 4;
200  buffer[2] = kHex[cp & 0x0f];
201  return StringPiece(buffer, 6);
202 }
203 
204 // Stores the 32-bit unicode code point as its hexadecimal digits in buffer
205 // and returns a StringPiece that points to this buffer. The input buffer needs
206 // to be at least 12 bytes long.
207 StringPiece ToSurrogateHex(uint32 cp, char* buffer) {
208  uint16 low = ToLowSurrogate(cp);
209  uint16 high = ToHighSurrogate(cp);
210 
211  buffer[11] = kHex[low & 0x0f];
212  low >>= 4;
213  buffer[10] = kHex[low & 0x0f];
214  low >>= 4;
215  buffer[9] = kHex[low & 0x0f];
216  low >>= 4;
217  buffer[8] = kHex[low & 0x0f];
218 
219  buffer[5] = kHex[high & 0x0f];
220  high >>= 4;
221  buffer[4] = kHex[high & 0x0f];
222  high >>= 4;
223  buffer[3] = kHex[high & 0x0f];
224  high >>= 4;
225  buffer[2] = kHex[high & 0x0f];
226 
227  return StringPiece(buffer, 12);
228 }
229 
230 // If the given unicode code point needs escaping, then returns the
231 // escaped form. The returned StringPiece either points to statically
232 // pre-allocated char[] or to the given buffer. The input buffer needs
233 // to be at least 12 bytes long.
234 //
235 // If the given unicode code point does not need escaping, an empty
236 // StringPiece is returned.
237 StringPiece EscapeCodePoint(uint32 cp, char* buffer) {
238  if (cp < 0xa0) return kCommonEscapes[cp];
239  switch (cp) {
240  // These are not required by json spec
241  // but used to prevent security bugs in javascript.
242  case 0xfeff: // Zero width no-break space
243  case 0xfff9: // Interlinear annotation anchor
244  case 0xfffa: // Interlinear annotation separator
245  case 0xfffb: // Interlinear annotation terminator
246 
247  case 0x00ad: // Soft-hyphen
248  case 0x06dd: // Arabic end of ayah
249  case 0x070f: // Syriac abbreviation mark
250  case 0x17b4: // Khmer vowel inherent Aq
251  case 0x17b5: // Khmer vowel inherent Aa
252  return ToHex(cp, buffer);
253 
254  default:
255  if ((cp >= 0x0600 && cp <= 0x0603) || // Arabic signs
256  (cp >= 0x200b && cp <= 0x200f) || // Zero width etc.
257  (cp >= 0x2028 && cp <= 0x202e) || // Separators etc.
258  (cp >= 0x2060 && cp <= 0x2064) || // Invisible etc.
259  (cp >= 0x206a && cp <= 0x206f)) { // Shaping etc.
260  return ToHex(cp, buffer);
261  }
262 
263  if (cp == 0x000e0001 || // Language tag
264  (cp >= 0x0001d173 && cp <= 0x0001d17a) || // Music formatting
265  (cp >= 0x000e0020 && cp <= 0x000e007f)) { // TAG symbols
266  return ToSurrogateHex(cp, buffer);
267  }
268  }
269  return StringPiece();
270 }
271 
272 // Tries to escape the given code point first. If the given code point
273 // does not need to be escaped, but force_output is true, then render
274 // the given multi-byte code point in UTF8 in the buffer and returns it.
275 StringPiece EscapeCodePoint(uint32 cp, char* buffer, bool force_output) {
276  StringPiece sp = EscapeCodePoint(cp, buffer);
277  if (force_output && sp.empty()) {
278  buffer[5] = (cp & 0x3f) | 0x80;
279  cp >>= 6;
280  if (cp <= 0x1f) {
281  buffer[4] = cp | 0xc0;
282  sp = StringPiece(buffer + 4, 2);
283  return sp;
284  }
285  buffer[4] = (cp & 0x3f) | 0x80;
286  cp >>= 6;
287  if (cp <= 0x0f) {
288  buffer[3] = cp | 0xe0;
289  sp = StringPiece(buffer + 3, 3);
290  return sp;
291  }
292  buffer[3] = (cp & 0x3f) | 0x80;
293  buffer[2] = ((cp >> 6) & 0x07) | 0xf0;
294  sp = StringPiece(buffer + 2, 4);
295  }
296  return sp;
297 }
298 
299 } // namespace
300 
301 void JsonEscaping::Escape(strings::ByteSource* input,
302  strings::ByteSink* output) {
303  char buffer[12] = "\\udead\\ubee";
304  uint32 cp = 0; // Current unicode code point.
305  int num_left = 0; // Num of chars to read to complete the code point.
306  while (input->Available() > 0) {
307  StringPiece str = input->Peek();
308  StringPiece escaped;
309  int i = 0;
310  int num_read;
311  bool ok;
312  bool cp_was_split = num_left > 0;
313  // Loop until we encounter either
314  // i) a code point that needs to be escaped; or
315  // ii) a split code point is completely read; or
316  // iii) a character that is not a valid utf8; or
317  // iv) end of the StringPiece str is reached.
318  do {
319  ok = ReadCodePoint(str, i, &cp, &num_left, &num_read);
320  if (num_left > 0 || !ok) break; // case iii or iv
321  escaped = EscapeCodePoint(cp, buffer, cp_was_split);
322  if (!escaped.empty()) break; // case i or ii
323  i += num_read;
324  num_read = 0;
325  } while (i < str.length()); // case iv
326  // First copy the un-escaped prefix, if any, to the output ByteSink.
327  if (i > 0) input->CopyTo(output, i);
328  if (num_read > 0) input->Skip(num_read);
329  if (!ok) {
330  // Case iii: Report error.
331  // TODO(wpoon): Add error reporting.
332  num_left = 0;
333  } else if (num_left == 0 && !escaped.empty()) {
334  // Case i or ii: Append the escaped code point to the output ByteSink.
335  output->Append(escaped.data(), escaped.size());
336  }
337  }
338  if (num_left > 0) {
339  // Treat as case iii: report error.
340  // TODO(wpoon): Add error reporting.
341  }
342 }
343 
344 } // namespace converter
345 } // namespace util
346 } // namespace protobuf
347 } // namespace google
google::protobuf::StringPiece::data
const char * data() const
Definition: stringpiece.h:247
input
std::string input
Definition: tokenizer_unittest.cc:197
google::protobuf::uint8
uint8_t uint8
Definition: protobuf/src/google/protobuf/stubs/port.h:153
google::protobuf::uint32
uint32_t uint32
Definition: protobuf/src/google/protobuf/stubs/port.h:155
ok
ROSCPP_DECL bool ok()
google::protobuf::util::converter::JsonEscaping::kMinSupplementaryCodePoint
static const uint32 kMinSupplementaryCodePoint
Definition: json_escaping.h:66
google::protobuf::StringPiece::empty
bool empty() const
Definition: stringpiece.h:250
google::protobuf::StringPiece
Definition: stringpiece.h:180
google::protobuf::uint16
uint16_t uint16
Definition: protobuf/src/google/protobuf/stubs/port.h:154
google::protobuf::util::converter::JsonEscaping::kMaxLowSurrogate
static const uint16 kMaxLowSurrogate
Definition: json_escaping.h:62
update_failure_list.str
str
Definition: update_failure_list.py:41
buffer
Definition: buffer_processor.h:43
i
int i
Definition: gmock-matchers_test.cc:764
google::protobuf::IsValidCodePoint
bool IsValidCodePoint(uint32 code_point)
Definition: strutil.h:865
common.h
ch
char ch
Definition: gmock-matchers_test.cc:3871
google::protobuf::ToHex
PROTOBUF_EXPORT string ToHex(uint64 num)
logging.h
google::protobuf::util::converter::JsonEscaping::Escape
static void Escape(strings::ByteSource *input, strings::ByteSink *output)
Definition: json_escaping.cc:301
google::protobuf::util::converter::JsonEscaping::kMaxCodePoint
static const uint32 kMaxCodePoint
Definition: json_escaping.h:74
json_escaping.h
google::protobuf::util::converter::JsonEscaping::kMinHighSurrogate
static const uint16 kMinHighSurrogate
Definition: json_escaping.h:47
output
const upb_json_parsermethod const upb_symtab upb_sink * output
Definition: ruby/ext/google/protobuf_c/upb.h:10503
google::protobuf::StringPiece::size
stringpiece_ssize_type size() const
Definition: stringpiece.h:248
index
GLuint index
Definition: glcorearb.h:3055
google::protobuf::util::converter::JsonEscaping::kMinLowSurrogate
static const uint16 kMinLowSurrogate
Definition: json_escaping.h:57
google
Definition: data_proto2_to_proto3_util.h:11


libaditof
Author(s):
autogenerated on Wed May 21 2025 02:06:55