structurally_valid.cc
Go to the documentation of this file.
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // Author: jrm@google.com (Jim Meehan)
32 
34 
36 
37 namespace google {
38 namespace protobuf {
39 namespace internal {
40 
41 // These four-byte entries compactly encode how many bytes 0..255 to delete
42 // in making a string replacement, how many bytes to add 0..255, and the offset
43 // 0..64k-1 of the replacement string in remap_string.
44 struct RemapEntry {
48 };
49 
50 // Exit type codes for state tables. All but the first get stuffed into
51 // signed one-byte entries. The first is only generated by executable code.
52 // To distinguish from next-state entries, these must be contiguous and
53 // all <= kExitNone
54 typedef enum {
57  kExitOK, // 241
58  kExitReject, // ...
71  kExitNone // 255
72 } ExitReason;
73 
74 
75 // This struct represents one entire state table. The three initialized byte
76 // areas are state_table, remap_base, and remap_string. state0 and state0_size
77 // give the byte offset and length within state_table of the initial state --
78 // table lookups are expected to start and end in this state, but for
79 // truncated UTF-8 strings, may end in a different state. These allow a quick
80 // test for that condition. entry_shift is 8 for tables subscripted by a full
81 // byte value and 6 for space-optimized tables subscripted by only six
82 // significant bits in UTF-8 continuation bytes.
83 typedef struct {
84  const uint32 state0;
87  const int max_expand;
88  const int entry_shift;
89  const int bytes_per_entry;
90  const uint32 losub;
91  const uint32 hiadd;
95  const uint8* fast_state;
97 
99 
100 #define X__ (kExitIllegalStructure)
101 #define RJ_ (kExitReject)
102 #define S1_ (kExitReplace1)
103 #define S2_ (kExitReplace2)
104 #define S3_ (kExitReplace3)
105 #define S21 (kExitReplace21)
106 #define S31 (kExitReplace31)
107 #define S32 (kExitReplace32)
108 #define T1_ (kExitReplaceOffset1)
109 #define T2_ (kExitReplaceOffset2)
110 #define S11 (kExitReplace1S0)
111 #define SP_ (kExitSpecial)
112 #define D__ (kExitDoAgain)
113 #define RJA (kExitRejectAlt)
114 
115 // Entire table has 9 state blocks of 256 entries each
116 static const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0]
117 static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // =[1]
118 static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
119 static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
120 static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
121 static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
122 static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
123 static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
124 
125 static const uint8 utf8acceptnonsurrogates[] = {
126 // state[0] 0x000000 Byte 1
127  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
128  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
129  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
131 
132  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
133  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
134  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
135  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
136 
137 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
138 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
139 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
140 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
141 
142 X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
143  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
144  2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3,
145  4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
146 
147 // state[1] 0x000080 Byte 2 of 2
148 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
149 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
150 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
151 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
152 
153 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
154 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
155 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
156 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
157 
158  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
159  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
160  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
161  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
162 
163 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
164 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
165 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
166 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
167 
168 // state[2] 0x000000 Byte 2 of 3
169 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
170 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
171 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
172 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
173 
174 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
175 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
176 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
177 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
178 
179 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
180 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
181  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
182  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
183 
184 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
185 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
186 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
187 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
188 
189 // state[3] 0x001000 Byte 2 of 3
190 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
191 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
192 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
193 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
194 
195 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
196 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
197 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
198 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
199 
200  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
201  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
202  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
203  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
204 
205 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
206 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
207 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
208 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
209 
210 // state[4] 0x000000 Byte 2 of 4
211 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
212 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
213 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
214 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
215 
216 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
217 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
218 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
219 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
220 
221 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
222  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
223  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
225 
226 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
227 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
228 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
229 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
230 
231 // state[5] 0x040000 Byte 2 of 4
232 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
233 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
234 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
235 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
236 
237 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
238 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
239 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
240 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
241 
242  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
243  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
244  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
245  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
246 
247 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
248 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
249 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
250 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
251 
252 // state[6] 0x100000 Byte 2 of 4
253 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
254 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
255 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
256 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
257 
258 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
259 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
260 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
261 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
262 
263  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
264 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
265 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
266 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
267 
268 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
269 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
270 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
271 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
272 
273 // state[7] 0x00d000 Byte 2 of 3
274 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
275 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
276 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
277 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
278 
279 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
280 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
281 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
282 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
283 
284  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
285  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
286  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
287  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
288 
289 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
290 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
291 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
292 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
293 
294 // state[8] 0x00d800 Byte 3 of 3
295 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
296 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
297 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
298 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
299 
300 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
301 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
302 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
303 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
304 
305 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
306 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
307 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
308 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
309 
310 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
311 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
312 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
313 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
314 };
315 
316 // Remap base[0] = (del, add, string_offset)
318 {0, 0, 0} };
319 
320 // Remap string[0]
321 static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
322 0 };
323 
324 static const unsigned char utf8acceptnonsurrogates_fast[256] = {
325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
329 
330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
331 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
332 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
333 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
334 
335 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
336 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
337 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
338 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
339 
340 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
341 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
342 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
343 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
344 };
345 
359 };
360 
361 
362 #undef X__
363 #undef RJ_
364 #undef S1_
365 #undef S2_
366 #undef S3_
367 #undef S21
368 #undef S31
369 #undef S32
370 #undef T1_
371 #undef T2_
372 #undef S11
373 #undef SP_
374 #undef D__
375 #undef RJA
376 
377 // Return true if current Tbl pointer is within state0 range
378 // Note that unsigned compare checks both ends of range simultaneously
379 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
380  const uint8* Tbl0 = &st->state_table[st->state0];
381  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
382 }
383 
384 // Scan a UTF-8 string based on state table.
385 // Always scan complete UTF-8 characters
386 // Set number of bytes scanned. Return reason for exiting
388  const char * str,
389  int str_length,
390  int* bytes_consumed) {
391  *bytes_consumed = 0;
392  if (str_length == 0) return kExitOK;
393 
394  int eshift = st->entry_shift;
395  const uint8* isrc = reinterpret_cast<const uint8*>(str);
396  const uint8* src = isrc;
397  const uint8* srclimit = isrc + str_length;
398  const uint8* srclimit8 = srclimit - 7;
399  const uint8* Tbl_0 = &st->state_table[st->state0];
400 
401  DoAgain:
402  // Do state-table scan
403  int e = 0;
404  uint8 c;
405  const uint8* Tbl2 = &st->fast_state[0];
406  const uint32 losub = st->losub;
407  const uint32 hiadd = st->hiadd;
408  // Check initial few bytes one at a time until 8-byte aligned
409  //----------------------------
410  while ((((uintptr_t)src & 0x07) != 0) &&
411  (src < srclimit) &&
412  Tbl2[src[0]] == 0) {
413  src++;
414  }
415  if (((uintptr_t)src & 0x07) == 0) {
416  // Do fast for groups of 8 identity bytes.
417  // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
418  // including slowing slightly on cr/lf/ht
419  //----------------------------
420  while (src < srclimit8) {
421  uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
422  uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
423  src += 8;
424  // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
425  uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
426  (s4567 - losub) | (s4567 + hiadd);
427  if ((temp & 0x80808080) != 0) {
428  // We typically end up here on cr/lf/ht; src was incremented
429  int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
430  (Tbl2[src[-6]] | Tbl2[src[-5]]);
431  if (e0123 != 0) {
432  src -= 8;
433  break;
434  } // Exit on Non-interchange
435  e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
436  (Tbl2[src[-2]] | Tbl2[src[-1]]);
437  if (e0123 != 0) {
438  src -= 4;
439  break;
440  } // Exit on Non-interchange
441  // Else OK, go around again
442  }
443  }
444  }
445  //----------------------------
446 
447  // Byte-at-a-time scan
448  //----------------------------
449  const uint8* Tbl = Tbl_0;
450  while (src < srclimit) {
451  c = *src;
452  e = Tbl[c];
453  src++;
454  if (e >= kExitIllegalStructure) {break;}
455  Tbl = &Tbl_0[e << eshift];
456  }
457  //----------------------------
458 
459 
460  // Exit posibilities:
461  // Some exit code, !state0, back up over last char
462  // Some exit code, state0, back up one byte exactly
463  // source consumed, !state0, back up over partial char
464  // source consumed, state0, exit OK
465  // For illegal byte in state0, avoid backup up over PREVIOUS char
466  // For truncated last char, back up to beginning of it
467 
468  if (e >= kExitIllegalStructure) {
469  // Back up over exactly one byte of rejected/illegal UTF-8 character
470  src--;
471  // Back up more if needed
472  if (!InStateZero(st, Tbl)) {
473  do {
474  src--;
475  } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
476  }
477  } else if (!InStateZero(st, Tbl)) {
478  // Back up over truncated UTF-8 character
480  do {
481  src--;
482  } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
483  } else {
484  // Normal termination, source fully consumed
485  e = kExitOK;
486  }
487 
488  if (e == kExitDoAgain) {
489  // Loop back up to the fast scan
490  goto DoAgain;
491  }
492 
493  *bytes_consumed = src - isrc;
494  return e;
495 }
496 
498  const char * str,
499  int str_length,
500  int* bytes_consumed) {
501  *bytes_consumed = 0;
502  if (str_length == 0) return kExitOK;
503 
504  const uint8* isrc = reinterpret_cast<const uint8*>(str);
505  const uint8* src = isrc;
506  const uint8* srclimit = isrc + str_length;
507  const uint8* srclimit8 = srclimit - 7;
508  int n;
509  int rest_consumed;
510  int exit_reason;
511  do {
512  // Check initial few bytes one at a time until 8-byte aligned
513  while ((((uintptr_t)src & 0x07) != 0) &&
514  (src < srclimit) && (src[0] < 0x80)) {
515  src++;
516  }
517  if (((uintptr_t)src & 0x07) == 0) {
518  while ((src < srclimit8) &&
519  (((reinterpret_cast<const uint32*>(src)[0] |
520  reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
521  src += 8;
522  }
523  }
524  while ((src < srclimit) && (src[0] < 0x80)) {
525  src++;
526  }
527  // Run state table on the rest
528  n = src - isrc;
529  exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
530  src += rest_consumed;
531  } while ( exit_reason == kExitDoAgain );
532 
533  *bytes_consumed = src - isrc;
534  return exit_reason;
535 }
536 
537 // Hack: On some compilers the static tables are initialized at startup.
538 // We can't use them until they are initialized. However, some Protocol
539 // Buffer parsing happens at static init time and may try to validate
540 // UTF-8 strings. Since UTF-8 validation is only used for debugging
541 // anyway, we simply always return success if initialization hasn't
542 // occurred yet.
543 namespace {
544 
545 bool module_initialized_ = false;
546 
547 struct InitDetector {
548  InitDetector() {
549  module_initialized_ = true;
550  }
551 };
552 InitDetector init_detector;
553 
554 } // namespace
555 
556 bool IsStructurallyValidUTF8(const char* buf, int len) {
557  if (!module_initialized_) return true;
558 
559  int bytes_consumed = 0;
561  buf, len, &bytes_consumed);
562  return (bytes_consumed == len);
563 }
564 
566  if (!module_initialized_) return str.size();
567 
568  int bytes_consumed = 0;
570  str.data(), str.size(), &bytes_consumed);
571  return bytes_consumed;
572 }
573 
574 // Coerce UTF-8 byte string in src_str to be
575 // a structurally-valid equal-length string by selectively
576 // overwriting illegal bytes with replace_char (typically blank).
577 // replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
578 // src_str is read-only. If any overwriting is needed, a modified byte string
579 // is created in idst, length isrclen.
580 //
581 // Returns pointer to output buffer, isrc if no changes were made,
582 // or idst if some bytes were changed.
583 //
584 // Fast case: all is structurally valid and no byte copying is done.
585 //
587  char* idst,
588  const char replace_char) {
589  const char* isrc = src_str.data();
590  const int len = src_str.length();
591  int n = UTF8SpnStructurallyValid(src_str);
592  if (n == len) { // Normal case -- all is cool, return
593  return const_cast<char*>(isrc);
594  } else { // Unusual case -- copy w/o bad bytes
595  const char* src = isrc;
596  const char* srclimit = isrc + len;
597  char* dst = idst;
598  memmove(dst, src, n); // Copy initial good chunk
599  src += n;
600  dst += n;
601  while (src < srclimit) { // src points to bogus byte or is off the end
602  dst[0] = replace_char; // replace one bad byte
603  src++;
604  dst++;
605  StringPiece str2(src, srclimit - src);
606  n = UTF8SpnStructurallyValid(str2); // scan the remainder
607  memmove(dst, src, n); // copy next good chunk
608  src += n;
609  dst += n;
610  }
611  }
612  return idst;
613 }
614 
615 } // namespace internal
616 } // namespace protobuf
617 } // namespace google
google::protobuf.internal::kExitRejectAlt
@ kExitRejectAlt
Definition: structurally_valid.cc:70
google::protobuf.internal::kExitDstSpaceFull
@ kExitDstSpaceFull
Definition: structurally_valid.cc:55
google::protobuf.internal::kExitDoAgain
@ kExitDoAgain
Definition: structurally_valid.cc:69
RJ_
#define RJ_
Definition: structurally_valid.cc:101
google::protobuf::StringPiece::length
stringpiece_ssize_type length() const
Definition: stringpiece.h:249
src
GLenum src
Definition: glcorearb.h:3364
google::protobuf.internal::kExitReplace2
@ kExitReplace2
Definition: structurally_valid.cc:60
google::protobuf.internal::UTF8SpnStructurallyValid
PROTOBUF_EXPORT int UTF8SpnStructurallyValid(const StringPiece &str)
Definition: structurally_valid.cc:565
google::protobuf::StringPiece::data
const char * data() const
Definition: stringpiece.h:247
google::protobuf::uint8
uint8_t uint8
Definition: protobuf/src/google/protobuf/stubs/port.h:153
google::protobuf.internal::utf8acceptnonsurrogates_SHIFT
static const unsigned int utf8acceptnonsurrogates_SHIFT
Definition: structurally_valid.cc:120
google::protobuf.internal::utf8acceptnonsurrogates_BYTES
static const unsigned int utf8acceptnonsurrogates_BYTES
Definition: structurally_valid.cc:121
google::protobuf.internal::utf8acceptnonsurrogates_remap_base
static const RemapEntry utf8acceptnonsurrogates_remap_base[]
Definition: structurally_valid.cc:317
google::protobuf.internal::utf8acceptnonsurrogates_fast
static const unsigned char utf8acceptnonsurrogates_fast[256]
Definition: structurally_valid.cc:324
google::protobuf.internal::UTF8StateMachineObj::hiadd
const uint32 hiadd
Definition: structurally_valid.cc:91
google::protobuf::uint32
uint32_t uint32
Definition: protobuf/src/google/protobuf/stubs/port.h:155
google::protobuf.internal::InStateZero
static bool InStateZero(const UTF8ScanObj *st, const uint8 *Tbl)
Definition: structurally_valid.cc:379
google::protobuf.internal::kExitReplace1S0
@ kExitReplace1S0
Definition: structurally_valid.cc:67
google::protobuf.internal::utf8acceptnonsurrogates_TOTAL_SIZE
static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE
Definition: structurally_valid.cc:118
google::protobuf.internal::kExitOK
@ kExitOK
Definition: structurally_valid.cc:57
google::protobuf.internal::UTF8StateMachineObj::entry_shift
const int entry_shift
Definition: structurally_valid.cc:88
google::protobuf.internal::RemapEntry::delete_bytes
uint8 delete_bytes
Definition: structurally_valid.cc:45
google::protobuf.internal::RemapEntry::bytes_offset
uint16 bytes_offset
Definition: structurally_valid.cc:47
google::protobuf.internal::kExitReplaceOffset2
@ kExitReplaceOffset2
Definition: structurally_valid.cc:66
stringpiece.h
google::protobuf.internal::UTF8StateMachineObj::state_table
const uint8 * state_table
Definition: structurally_valid.cc:92
google::protobuf.internal::UTF8StateMachineObj::total_size
const uint32 total_size
Definition: structurally_valid.cc:86
google::protobuf.internal::kExitIllegalStructure
@ kExitIllegalStructure
Definition: structurally_valid.cc:56
google::protobuf.internal::UTF8GenericScan
int UTF8GenericScan(const UTF8ScanObj *st, const char *str, int str_length, int *bytes_consumed)
Definition: structurally_valid.cc:387
google::protobuf.internal::UTF8StateMachineObj::max_expand
const int max_expand
Definition: structurally_valid.cc:87
google::protobuf.internal::utf8acceptnonsurrogates_obj
static const UTF8ScanObj utf8acceptnonsurrogates_obj
Definition: structurally_valid.cc:346
google::protobuf.internal::kExitReplace32
@ kExitReplace32
Definition: structurally_valid.cc:64
google::protobuf.internal::kExitReplace21
@ kExitReplace21
Definition: structurally_valid.cc:62
google::protobuf.internal::UTF8StateMachineObj::remap_string
const uint8 * remap_string
Definition: structurally_valid.cc:94
google::protobuf.internal::kExitReplace31
@ kExitReplace31
Definition: structurally_valid.cc:63
google::protobuf::StringPiece
Definition: stringpiece.h:180
google::protobuf.internal::UTF8StateMachineObj::state0_size
const uint32 state0_size
Definition: structurally_valid.cc:85
google::protobuf::uint16
uint16_t uint16
Definition: protobuf/src/google/protobuf/stubs/port.h:154
update_failure_list.str
str
Definition: update_failure_list.py:41
google::protobuf.internal::UTF8CoerceToStructurallyValid
PROTOBUF_EXPORT char * UTF8CoerceToStructurallyValid(const StringPiece &str, char *dst, char replace_char)
Definition: structurally_valid.cc:586
google::protobuf.internal::kExitSpecial
@ kExitSpecial
Definition: structurally_valid.cc:68
buf
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition: glcorearb.h:4175
google::protobuf.internal::utf8acceptnonsurrogates_MAX_EXPAND_X4
static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4
Definition: structurally_valid.cc:119
X__
#define X__
Definition: structurally_valid.cc:100
n
GLdouble n
Definition: glcorearb.h:4153
google::protobuf.internal::RemapEntry
Definition: structurally_valid.cc:44
google::protobuf.internal::kExitReplace1
@ kExitReplace1
Definition: structurally_valid.cc:59
len
int len
Definition: php/ext/google/protobuf/map.c:206
common.h
google::protobuf.internal::UTF8GenericScanFastAscii
int UTF8GenericScanFastAscii(const UTF8ScanObj *st, const char *str, int str_length, int *bytes_consumed)
Definition: structurally_valid.cc:497
google::protobuf.internal::IsStructurallyValidUTF8
PROTOBUF_EXPORT bool IsStructurallyValidUTF8(const char *buf, int len)
Definition: structurally_valid.cc:556
google::protobuf.internal::UTF8StateMachineObj::fast_state
const uint8 * fast_state
Definition: structurally_valid.cc:95
google::protobuf.internal::kExitReplaceOffset1
@ kExitReplaceOffset1
Definition: structurally_valid.cc:65
google::protobuf.internal::utf8acceptnonsurrogates
static const uint8 utf8acceptnonsurrogates[]
Definition: structurally_valid.cc:125
dst
GLenum GLenum dst
Definition: glcorearb.h:3364
google::protobuf.internal::utf8acceptnonsurrogates_STATE0
static const unsigned int utf8acceptnonsurrogates_STATE0
Definition: structurally_valid.cc:116
google::protobuf.internal::utf8acceptnonsurrogates_LOSUB
static const unsigned int utf8acceptnonsurrogates_LOSUB
Definition: structurally_valid.cc:122
google::protobuf.internal::ExitReason
ExitReason
Definition: structurally_valid.cc:54
google::protobuf.internal::utf8acceptnonsurrogates_HIADD
static const unsigned int utf8acceptnonsurrogates_HIADD
Definition: structurally_valid.cc:123
internal
Definition: any.pb.h:40
google::protobuf.internal::UTF8StateMachineObj
Definition: structurally_valid.cc:83
google::protobuf.internal::RemapEntry::add_bytes
uint8 add_bytes
Definition: structurally_valid.cc:46
google::protobuf.internal::UTF8StateMachineObj::losub
const uint32 losub
Definition: structurally_valid.cc:90
google::protobuf.internal::UTF8StateMachineObj::bytes_per_entry
const int bytes_per_entry
Definition: structurally_valid.cc:89
google::protobuf.internal::UTF8StateMachineObj::state0
const uint32 state0
Definition: structurally_valid.cc:84
google::protobuf.internal::UTF8ScanObj
UTF8StateMachineObj UTF8ScanObj
Definition: structurally_valid.cc:98
google::protobuf.internal::kExitNone
@ kExitNone
Definition: structurally_valid.cc:71
google::protobuf.internal::kExitReplace3
@ kExitReplace3
Definition: structurally_valid.cc:61
google::protobuf.internal::utf8acceptnonsurrogates_STATE0_SIZE
static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE
Definition: structurally_valid.cc:117
google::protobuf.internal::utf8acceptnonsurrogates_remap_string
static const unsigned char utf8acceptnonsurrogates_remap_string[]
Definition: structurally_valid.cc:321
google::protobuf.internal::kExitReject
@ kExitReject
Definition: structurally_valid.cc:58
google
Definition: data_proto2_to_proto3_util.h:11
google::protobuf.internal::UTF8StateMachineObj::remap_base
const RemapEntry * remap_base
Definition: structurally_valid.cc:93


libaditof
Author(s):
autogenerated on Wed May 21 2025 02:06:59