IsValidUtf8TestUtil.java
Go to the documentation of this file.
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import static org.junit.Assert.assertEquals;
34 import static org.junit.Assert.assertFalse;
35 import static org.junit.Assert.assertSame;
36 import static org.junit.Assert.assertTrue;
37 import static org.junit.Assert.fail;
38 
39 import java.lang.ref.SoftReference;
40 import java.nio.ByteBuffer;
41 import java.nio.CharBuffer;
42 import java.nio.charset.CharsetDecoder;
43 import java.nio.charset.CharsetEncoder;
44 import java.nio.charset.CoderResult;
45 import java.nio.charset.CodingErrorAction;
46 import java.util.ArrayList;
47 import java.util.Arrays;
48 import java.util.List;
49 import java.util.Random;
50 import java.util.logging.Logger;
51 
58 final class IsValidUtf8TestUtil {
59  private static Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName());
60 
61  private IsValidUtf8TestUtil() {}
62 
63  static interface ByteStringFactory {
64  ByteString newByteString(byte[] bytes);
65  }
66 
67  static final ByteStringFactory LITERAL_FACTORY =
68  new ByteStringFactory() {
69  @Override
70  public ByteString newByteString(byte[] bytes) {
71  return ByteString.wrap(bytes);
72  }
73  };
74 
75  static final ByteStringFactory HEAP_NIO_FACTORY =
76  new ByteStringFactory() {
77  @Override
78  public ByteString newByteString(byte[] bytes) {
79  return new NioByteString(ByteBuffer.wrap(bytes));
80  }
81  };
82 
83  private static ThreadLocal<SoftReference<ByteBuffer>> directBuffer =
84  new ThreadLocal<SoftReference<ByteBuffer>>();
85 
92  static final ByteStringFactory DIRECT_NIO_FACTORY =
93  new ByteStringFactory() {
94  @Override
95  public ByteString newByteString(byte[] bytes) {
96  SoftReference<ByteBuffer> ref = directBuffer.get();
97  ByteBuffer buffer = ref == null ? null : ref.get();
98  if (buffer == null || buffer.capacity() < bytes.length) {
99  buffer = ByteBuffer.allocateDirect(bytes.length);
100  directBuffer.set(new SoftReference<ByteBuffer>(buffer));
101  }
102  buffer.clear();
103  buffer.put(bytes);
104  buffer.flip();
105  return new NioByteString(buffer);
106  }
107  };
108 
109  // 128 - [chars 0x0000 to 0x007f]
110  static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
111 
112  // 128
113  static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
114 
115  // 1920 [chars 0x0080 to 0x07FF]
116  static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
117 
118  // 18,304
119  static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
120  // Both bytes are one byte characters
121  (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
122  +
123  // The possible number of two byte characters
124  TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
125 
126  // 2048
127  static final long THREE_BYTE_SURROGATES = 2 * 1024;
128 
129  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
130  static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
131  0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
132 
133  // 2,650,112
134  static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
135  // All one byte characters
136  (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
137  +
138  // One two byte character and a one byte character
139  2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
140  +
141  // Three byte characters
142  THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
143 
144  // 1,048,576 [chars 0x10000L to 0x10FFFF]
145  static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
146 
147  // 289,571,839
148  static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
149  // All one byte characters
150  (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
151  +
152  // One and three byte characters
153  2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
154  +
155  // Two two byte characters
156  TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
157  +
158  // Permutations of one and two byte characters
159  3
160  * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
161  * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
162  * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
163  +
164  // Four byte characters
165  FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
166 
167  static final class Shard {
168  final long index;
169  final long start;
170  final long lim;
171  final long expected;
172 
173  public Shard(long index, long start, long lim, long expected) {
174  assertTrue(start < lim);
175  this.index = index;
176  this.start = start;
177  this.lim = lim;
178  this.expected = expected;
179  }
180  }
181 
182  static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES =
183  generateFourByteShardsExpectedRunnables();
184 
185  private static long[] generateFourByteShardsExpectedRunnables() {
186  long[] expected = new long[128];
187 
188  // 0-63 are all 5300224
189  for (int i = 0; i <= 63; i++) {
190  expected[i] = 5300224;
191  }
192 
193  // 97-111 are all 2342912
194  for (int i = 97; i <= 111; i++) {
195  expected[i] = 2342912;
196  }
197 
198  // 113-117 are all 1048576
199  for (int i = 113; i <= 117; i++) {
200  expected[i] = 1048576;
201  }
202 
203  // One offs
204  expected[112] = 786432;
205  expected[118] = 786432;
206  expected[119] = 1048576;
207  expected[120] = 458752;
208  expected[121] = 524288;
209  expected[122] = 65536;
210 
211  // Anything not assigned was the default 0.
212  return expected;
213  }
214 
215  static final List<Shard> FOUR_BYTE_SHARDS =
216  generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES);
217 
218  private static List<Shard> generateFourByteShards(int numShards, long[] expected) {
219  assertEquals(numShards, expected.length);
220  List<Shard> shards = new ArrayList<Shard>(numShards);
221  long lim = 1L << 32;
222  long increment = lim / numShards;
223  assertTrue(lim % numShards == 0);
224  for (int i = 0; i < numShards; i++) {
225  shards.add(new Shard(i, increment * i, increment * (i + 1), expected[i]));
226  }
227  return shards;
228  }
229 
237  static void testBytes(ByteStringFactory factory, int numBytes, long expectedCount) {
238  testBytes(factory, numBytes, expectedCount, 0, -1);
239  }
240 
252  static void testBytes(
253  ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) {
254  Random rnd = new Random();
255  byte[] bytes = new byte[numBytes];
256 
257  if (lim == -1) {
258  lim = 1L << (numBytes * 8);
259  }
260  long count = 0;
261  long countRoundTripped = 0;
262  for (long byteChar = start; byteChar < lim; byteChar++) {
263  long tmpByteChar = byteChar;
264  for (int i = 0; i < numBytes; i++) {
265  bytes[bytes.length - i - 1] = (byte) tmpByteChar;
266  tmpByteChar = tmpByteChar >> 8;
267  }
268  ByteString bs = factory.newByteString(bytes);
269  boolean isRoundTrippable = bs.isValidUtf8();
270  String s = new String(bytes, Internal.UTF_8);
271  byte[] bytesReencoded = s.getBytes(Internal.UTF_8);
272  boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
273 
274  if (bytesEqual != isRoundTrippable) {
275  outputFailure(byteChar, bytes, bytesReencoded);
276  }
277 
278  // Check agreement with static Utf8 methods.
279  assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes));
280  assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes));
281 
282  try {
283  assertEquals(s, Utf8.decodeUtf8(bytes, 0, numBytes));
284  } catch (InvalidProtocolBufferException e) {
285  if (isRoundTrippable) {
286  System.out.println("Could not decode utf-8");
287  outputFailure(byteChar, bytes, bytesReencoded);
288  }
289  }
290 
291  // Test partial sequences.
292  // Partition numBytes into three segments (not necessarily non-empty).
293  int i = rnd.nextInt(numBytes);
294  int j = rnd.nextInt(numBytes);
295  if (j < i) {
296  int tmp = i;
297  i = j;
298  j = tmp;
299  }
300  int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i);
301  int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j);
302  int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes);
303  if (isRoundTrippable != (state3 == Utf8.COMPLETE)) {
304  System.out.printf("state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3, i, j);
305  outputFailure(byteChar, bytes, bytesReencoded);
306  }
307  assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE));
308 
309  // Test ropes built out of small partial sequences
310  ByteString rope =
311  RopeByteString.newInstanceForTest(
312  bs.substring(0, i),
313  RopeByteString.newInstanceForTest(bs.substring(i, j), bs.substring(j, numBytes)));
314  assertSame(RopeByteString.class, rope.getClass());
315 
316  ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope};
317  for (ByteString x : byteStrings) {
318  assertEquals(isRoundTrippable, x.isValidUtf8());
319  assertEquals(state3, x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes));
320 
321  assertEquals(state1, x.partialIsValidUtf8(Utf8.COMPLETE, 0, i));
322  assertEquals(state1, x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i));
323  assertEquals(state2, x.partialIsValidUtf8(state1, i, j - i));
324  assertEquals(state2, x.substring(i, j).partialIsValidUtf8(state1, 0, j - i));
325  assertEquals(state3, x.partialIsValidUtf8(state2, j, numBytes - j));
326  assertEquals(state3, x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j));
327  }
328 
329  // ByteString reduplication should not affect its UTF-8 validity.
330  ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes));
331  assertEquals(isRoundTrippable, ropeADope.isValidUtf8());
332 
333  if (isRoundTrippable) {
334  countRoundTripped++;
335  }
336  count++;
337  if (byteChar != 0 && byteChar % 1000000L == 0) {
338  logger.info("Processed " + (byteChar / 1000000L) + " million characters");
339  }
340  }
341  logger.info("Round tripped " + countRoundTripped + " of " + count);
342  assertEquals(expectedCount, countRoundTripped);
343  }
344 
358  static void testBytesUsingByteBuffers(
359  ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) {
360  CharsetDecoder decoder =
361  Internal.UTF_8
362  .newDecoder()
363  .onMalformedInput(CodingErrorAction.REPLACE)
364  .onUnmappableCharacter(CodingErrorAction.REPLACE);
365  CharsetEncoder encoder =
366  Internal.UTF_8
367  .newEncoder()
368  .onMalformedInput(CodingErrorAction.REPLACE)
369  .onUnmappableCharacter(CodingErrorAction.REPLACE);
370  byte[] bytes = new byte[numBytes];
371  int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1;
372  char[] charsDecoded = new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1];
373  int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1;
374  byte[] bytesReencoded = new byte[maxBytes];
375 
376  ByteBuffer bb = ByteBuffer.wrap(bytes);
377  CharBuffer cb = CharBuffer.wrap(charsDecoded);
378  ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
379  if (lim == -1) {
380  lim = 1L << (numBytes * 8);
381  }
382  long count = 0;
383  long countRoundTripped = 0;
384  for (long byteChar = start; byteChar < lim; byteChar++) {
385  bb.rewind();
386  bb.limit(bytes.length);
387  cb.rewind();
388  cb.limit(charsDecoded.length);
389  bbReencoded.rewind();
390  bbReencoded.limit(bytesReencoded.length);
391  encoder.reset();
392  decoder.reset();
393  long tmpByteChar = byteChar;
394  for (int i = 0; i < bytes.length; i++) {
395  bytes[bytes.length - i - 1] = (byte) tmpByteChar;
396  tmpByteChar = tmpByteChar >> 8;
397  }
398  boolean isRoundTrippable = factory.newByteString(bytes).isValidUtf8();
399  CoderResult result = decoder.decode(bb, cb, true);
400  assertFalse(result.isError());
401  result = decoder.flush(cb);
402  assertFalse(result.isError());
403 
404  int charLen = cb.position();
405  cb.rewind();
406  cb.limit(charLen);
407  result = encoder.encode(cb, bbReencoded, true);
408  assertFalse(result.isError());
409  result = encoder.flush(bbReencoded);
410  assertFalse(result.isError());
411 
412  boolean bytesEqual = true;
413  int bytesLen = bbReencoded.position();
414  if (bytesLen != numBytes) {
415  bytesEqual = false;
416  } else {
417  for (int i = 0; i < numBytes; i++) {
418  if (bytes[i] != bytesReencoded[i]) {
419  bytesEqual = false;
420  break;
421  }
422  }
423  }
424  if (bytesEqual != isRoundTrippable) {
425  outputFailure(byteChar, bytes, bytesReencoded, bytesLen);
426  }
427 
428  count++;
429  if (isRoundTrippable) {
430  countRoundTripped++;
431  }
432  if (byteChar != 0 && byteChar % 1000000 == 0) {
433  logger.info("Processed " + (byteChar / 1000000) + " million characters");
434  }
435  }
436  logger.info("Round tripped " + countRoundTripped + " of " + count);
437  assertEquals(expectedCount, countRoundTripped);
438  }
439 
440  private static void outputFailure(long byteChar, byte[] bytes, byte[] after) {
441  outputFailure(byteChar, bytes, after, after.length);
442  }
443 
444  private static void outputFailure(long byteChar, byte[] bytes, byte[] after, int len) {
445  fail(
446  String.format(
447  "Failure: (%s) %s => %s",
448  Long.toHexString(byteChar), toHexString(bytes), toHexString(after, len)));
449  }
450 
451  private static String toHexString(byte[] b) {
452  return toHexString(b, b.length);
453  }
454 
455  private static String toHexString(byte[] b, int len) {
456  StringBuilder s = new StringBuilder();
457  s.append("\"");
458  for (int i = 0; i < len; i++) {
459  if (i > 0) {
460  s.append(" ");
461  }
462  s.append(String.format("%02x", b[i] & 0xFF));
463  }
464  s.append("\"");
465  return s.toString();
466  }
467 }
java::lang
s
XmlRpcServer s
encoder
static char encoder[85+1]
Definition: zmq_utils.cpp:72
x
GLint GLenum GLint x
Definition: glcorearb.h:2834
bytes
uint8 bytes[10]
Definition: coded_stream_unittest.cc:153
b
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:3228
start
GLuint start
Definition: glcorearb.h:2858
buffer
Definition: buffer_processor.h:43
byte
SETUP_TEARDOWN_TESTCONTEXT typedef uint8_t byte
Definition: test_stream.cpp:12
decoder
static uint8_t decoder[96]
Definition: zmq_utils.cpp:85
i
int i
Definition: gmock-matchers_test.cc:764
java
len
int len
Definition: php/ext/google/protobuf/map.c:206
count
GLint GLsizei count
Definition: glcorearb.h:2830
ref
GLint ref
Definition: glcorearb.h:2789
index
GLuint index
Definition: glcorearb.h:3055
google::protobuf.internal.decoder.long
long
Definition: decoder.py:89


libaditof
Author(s):
autogenerated on Wed May 21 2025 02:06:54