31 package com.google.protobuf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.lang.ref.SoftReference;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.logging.Logger;
58 final class IsValidUtf8TestUtil {
59 private static Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName());
61 private IsValidUtf8TestUtil() {}
63 static interface ByteStringFactory {
64 ByteString newByteString(
byte[]
bytes);
67 static final ByteStringFactory LITERAL_FACTORY =
68 new ByteStringFactory() {
70 public ByteString newByteString(
byte[]
bytes) {
71 return ByteString.wrap(
bytes);
75 static final ByteStringFactory HEAP_NIO_FACTORY =
76 new ByteStringFactory() {
78 public ByteString newByteString(
byte[]
bytes) {
79 return new NioByteString(ByteBuffer.wrap(
bytes));
83 private static ThreadLocal<SoftReference<ByteBuffer>> directBuffer =
84 new ThreadLocal<SoftReference<ByteBuffer>>();
// Factory that stores a ByteBuffer in the thread-local `directBuffer` slot and
// returns a NioByteString built on that same buffer.
// NOTE(review): this definition is incomplete as shown: `buffer` is never
// declared, nothing visibly copies `bytes` into it, and the closing braces of
// the method and the anonymous class are absent. Restore the missing lines
// from the upstream file before relying on this block.
92 static final ByteStringFactory DIRECT_NIO_FACTORY =
93 new ByteStringFactory() {
95 public ByteString newByteString(
byte[]
bytes) {
// Fetch this thread's cached reference; ThreadLocal.get() yields null when
// nothing has been stored yet.
96 SoftReference<ByteBuffer>
ref = directBuffer.get();
// NOTE(review): `ref` is not used by any visible line and `buffer` has no
// visible declaration; presumably the buffer is taken from `ref` or freshly
// allocated in the lines missing here. TODO confirm.
// Publish the buffer for later calls on this thread.
100 directBuffer.set(
new SoftReference<ByteBuffer>(
buffer));
// Hand the buffer to the NIO-backed ByteString implementation.
105 return new NioByteString(
buffer);
110 static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
113 static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
116 static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
119 static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
121 (
long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
124 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
127 static final long THREE_BYTE_SURROGATES = 2 * 1024;
130 static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
131 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
134 static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
136 (
long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
139 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
142 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
145 static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
148 static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
150 (
long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
153 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
156 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
160 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
161 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
162 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
165 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
167 static final class Shard {
173 public Shard(
long index,
long start,
long lim,
long expected) {
174 assertTrue(
start < lim);
178 this.expected = expected;
182 static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES =
183 generateFourByteShardsExpectedRunnables();
185 private static long[] generateFourByteShardsExpectedRunnables() {
186 long[] expected =
new long[128];
189 for (
int i = 0;
i <= 63;
i++) {
190 expected[
i] = 5300224;
194 for (
int i = 97;
i <= 111;
i++) {
195 expected[
i] = 2342912;
199 for (
int i = 113;
i <= 117;
i++) {
200 expected[
i] = 1048576;
204 expected[112] = 786432;
205 expected[118] = 786432;
206 expected[119] = 1048576;
207 expected[120] = 458752;
208 expected[121] = 524288;
209 expected[122] = 65536;
215 static final List<Shard> FOUR_BYTE_SHARDS =
216 generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES);
218 private static List<Shard> generateFourByteShards(
int numShards,
long[] expected) {
219 assertEquals(numShards, expected.length);
220 List<Shard> shards =
new ArrayList<Shard>(numShards);
222 long increment = lim / numShards;
223 assertTrue(lim % numShards == 0);
224 for (
int i = 0;
i < numShards;
i++) {
225 shards.add(
new Shard(
i, increment *
i, increment * (
i + 1), expected[
i]));
237 static void testBytes(ByteStringFactory factory,
int numBytes,
long expectedCount) {
238 testBytes(factory, numBytes, expectedCount, 0, -1);
// Sweeps the values byteChar in [start, lim). For each one it builds a
// ByteString through `factory`, compares isValidUtf8() with a JDK String
// decode/re-encode round trip of the same array, and cross-checks the Utf8
// helper methods and several ByteString representations. At the end it asserts
// that countRoundTripped equals expectedCount.
//
// Parameters (as used below):
//   factory       - builds the ByteString under test from the working array
//   numBytes      - length of the working byte array
//   expectedCount - value countRoundTripped is asserted against at the end
//   start         - first byteChar value visited (inclusive)
//   lim           - loop bound (exclusive); the 3-argument overload passes -1
//
// NOTE(review): this text has gaps: a `catch` without its `try`, uses of
// `rope` and `count` with no visible declaration, and no closing braces. It
// does not compile as shown; restore the missing lines from the upstream file
// before relying on it.
252 static void testBytes(
253 ByteStringFactory factory,
int numBytes,
long expectedCount,
long start,
long lim) {
// Source of the random split points used by the partial-validation checks.
254 Random rnd =
new Random();
// Working array, reused for every value of byteChar.
255 byte[]
bytes =
new byte[numBytes];
// NOTE(review): as shown, `lim` is overwritten unconditionally, which would
// discard a caller-supplied limit. The 3-argument overload passes -1, so a
// guard on that sentinel presumably belongs here. TODO confirm.
258 lim = 1L << (numBytes * 8);
261 long countRoundTripped = 0;
262 for (
long byteChar =
start; byteChar < lim; byteChar++) {
263 long tmpByteChar = byteChar;
264 for (
int i = 0;
i < numBytes;
i++) {
// Move the next byte of the value into the low position.
// NOTE(review): the statement that stores it into `bytes` is not visible.
266 tmpByteChar = tmpByteChar >> 8;
268 ByteString bs = factory.newByteString(
bytes);
269 boolean isRoundTrippable = bs.isValidUtf8();
// Reference answer: decode with the JDK, re-encode, and compare the bytes.
270 String
s =
new String(
bytes, Internal.UTF_8);
271 byte[] bytesReencoded =
s.getBytes(Internal.UTF_8);
272 boolean bytesEqual = Arrays.equals(
bytes, bytesReencoded);
// The ByteString verdict must match the JDK round trip.
274 if (bytesEqual != isRoundTrippable) {
275 outputFailure(byteChar,
bytes, bytesReencoded);
// The static Utf8 helpers must give the same verdict.
279 assertEquals(isRoundTrippable, Utf8.isValidUtf8(
bytes));
280 assertEquals(isRoundTrippable, Utf8.isValidUtf8(
bytes, 0, numBytes));
// Utf8.decodeUtf8 must produce the same String as the JDK decoder.
283 assertEquals(s, Utf8.decodeUtf8(
bytes, 0, numBytes));
// NOTE(review): the matching `try {` is not visible.
284 }
catch (InvalidProtocolBufferException e) {
// A decode failure is reported only when the bytes were judged valid.
285 if (isRoundTrippable) {
286 System.out.println(
"Could not decode utf-8");
287 outputFailure(byteChar,
bytes, bytesReencoded);
// Two random split points for validating the array in three pieces.
// NOTE(review): the calls below use substring(i, j) and a length of j - i, so
// they appear to need i <= j; no visible line establishes that ordering.
293 int i = rnd.nextInt(numBytes);
294 int j = rnd.nextInt(numBytes);
// Validate [0, i), [i, j) and [j, numBytes) in turn, feeding each returned
// state into the next call.
300 int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE,
bytes, 0,
i);
301 int state2 = Utf8.partialIsValidUtf8(state1,
bytes,
i, j);
302 int state3 = Utf8.partialIsValidUtf8(state2,
bytes, j, numBytes);
// The final state must be COMPLETE exactly when the bytes round-trip.
303 if (isRoundTrippable != (state3 == Utf8.COMPLETE)) {
304 System.out.printf(
"state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3,
i, j);
305 outputFailure(byteChar,
bytes, bytesReencoded);
307 assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE));
// Build a nested rope from substrings of `bs`.
// NOTE(review): the declaration of `rope` and the first argument of the outer
// call are not visible.
311 RopeByteString.newInstanceForTest(
313 RopeByteString.newInstanceForTest(bs.substring(
i, j), bs.substring(j, numBytes)));
314 assertSame(RopeByteString.class, rope.getClass());
// Every representation must agree with the results computed on the raw array.
316 ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope};
317 for (ByteString
x : byteStrings) {
318 assertEquals(isRoundTrippable,
x.isValidUtf8());
319 assertEquals(state3,
x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes));
// The same three-piece checks, once through offset/length arguments on the
// whole string and once through substring().
321 assertEquals(state1,
x.partialIsValidUtf8(Utf8.COMPLETE, 0,
i));
322 assertEquals(state1,
x.substring(0,
i).partialIsValidUtf8(Utf8.COMPLETE, 0,
i));
323 assertEquals(state2,
x.partialIsValidUtf8(state1,
i, j -
i));
324 assertEquals(state2,
x.substring(
i, j).partialIsValidUtf8(state1, 0, j -
i));
325 assertEquals(state3,
x.partialIsValidUtf8(state2, j, numBytes - j));
326 assertEquals(state3,
x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j));
// A rope built from `bs` and a full-length substring of it must give the same
// verdict as `bs`.
330 ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes));
331 assertEquals(isRoundTrippable, ropeADope.isValidUtf8());
// NOTE(review): the body of this branch is not visible; presumably it
// increments countRoundTripped. TODO confirm.
333 if (isRoundTrippable) {
// Progress message at every multiple of one million values.
337 if (byteChar != 0 && byteChar % 1000000L == 0) {
338 logger.info(
"Processed " + (byteChar / 1000000L) +
" million characters");
// NOTE(review): `count` has no visible declaration.
341 logger.info(
"Round tripped " + countRoundTripped +
" of " +
count);
342 assertEquals(expectedCount, countRoundTripped);
// Variant of the byte sweep that performs the decode/re-encode round trip with
// a charset decoder and encoder operating on reusable NIO buffers, then
// compares the result with ByteString.isValidUtf8() for each value in
// [start, lim). At the end it asserts that countRoundTripped equals
// expectedCount.
//
// Parameters (as used below):
//   factory       - builds the ByteString whose isValidUtf8() is checked
//   numBytes      - length of the working byte array
//   expectedCount - value countRoundTripped is asserted against at the end
//   start         - first byteChar value visited (inclusive)
//   lim           - loop bound (exclusive)
//
// NOTE(review): this text has gaps: `decoder`, `encoder` and `count` have no
// visible declaration, two call chains have lost their receivers, several
// branch bodies are missing, and there are no closing braces. It does not
// compile as shown; restore the missing lines from the upstream file.
358 static void testBytesUsingByteBuffers(
359 ByteStringFactory factory,
int numBytes,
long expectedCount,
long start,
long lim) {
// Configuration chain that makes malformed or unmappable input be replaced.
// NOTE(review): the receiver is not visible; presumably this configures
// `decoder`. TODO confirm.
363 .onMalformedInput(CodingErrorAction.REPLACE)
364 .onUnmappableCharacter(CodingErrorAction.REPLACE);
// Same configuration on a second receiver. NOTE(review): presumably `encoder`.
368 .onMalformedInput(CodingErrorAction.REPLACE)
369 .onUnmappableCharacter(CodingErrorAction.REPLACE);
370 byte[]
bytes =
new byte[numBytes];
// Size the decode and re-encode arrays from the coders' per-unit maxima, plus
// one spare slot each.
371 int maxChars = (int) (
decoder.maxCharsPerByte() * numBytes) + 1;
// This repeats the maxChars expression instead of reusing the variable.
372 char[] charsDecoded =
new char[(int) (
decoder.maxCharsPerByte() * numBytes) + 1];
373 int maxBytes = (int) (
encoder.maxBytesPerChar() * maxChars) + 1;
374 byte[] bytesReencoded =
new byte[maxBytes];
// Buffer views over the three arrays, reused on every iteration.
376 ByteBuffer bb = ByteBuffer.wrap(
bytes);
377 CharBuffer cb = CharBuffer.wrap(charsDecoded);
378 ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
// NOTE(review): as shown, `lim` is overwritten unconditionally, which would
// discard a caller-supplied limit; a guard presumably belongs here. TODO
// confirm.
380 lim = 1L << (numBytes * 8);
383 long countRoundTripped = 0;
384 for (
long byteChar =
start; byteChar < lim; byteChar++) {
// Restore the buffers' limits for this iteration.
// NOTE(review): only bbReencoded is visibly rewound; no visible line resets
// the positions of bb and cb.
386 bb.limit(
bytes.length);
388 cb.limit(charsDecoded.length);
389 bbReencoded.rewind();
390 bbReencoded.limit(bytesReencoded.length);
393 long tmpByteChar = byteChar;
394 for (
int i = 0;
i <
bytes.length;
i++) {
// Move the next byte of the value into the low position.
// NOTE(review): the statement that stores it into `bytes` is not visible.
396 tmpByteChar = tmpByteChar >> 8;
398 boolean isRoundTrippable = factory.newByteString(
bytes).isValidUtf8();
// Decode the whole input in one call; `true` marks the end of input.
399 CoderResult result =
decoder.decode(bb, cb,
true);
400 assertFalse(result.isError());
// NOTE(review): `result` is checked a second time with no visible
// reassignment in between; presumably a flush call is missing. TODO confirm.
402 assertFalse(result.isError());
// Number of chars written to cb so far.
404 int charLen = cb.position();
// Re-encode the decoded chars and flush the encoder.
407 result =
encoder.encode(cb, bbReencoded,
true);
408 assertFalse(result.isError());
409 result =
encoder.flush(bbReencoded);
410 assertFalse(result.isError());
// Compare the re-encoded bytes with the input: first by length, then byte by
// byte.
412 boolean bytesEqual =
true;
413 int bytesLen = bbReencoded.position();
// NOTE(review): the bodies of this branch and of the mismatch check inside
// the loop are not visible; presumably they clear bytesEqual. TODO confirm.
414 if (bytesLen != numBytes) {
417 for (
int i = 0;
i < numBytes;
i++) {
418 if (
bytes[
i] != bytesReencoded[
i]) {
// The ByteString verdict must match the coder round trip.
424 if (bytesEqual != isRoundTrippable) {
425 outputFailure(byteChar,
bytes, bytesReencoded, bytesLen);
// NOTE(review): the body of this branch is not visible; presumably it
// increments countRoundTripped. TODO confirm.
429 if (isRoundTrippable) {
// Progress message at every multiple of one million values.
432 if (byteChar != 0 && byteChar % 1000000 == 0) {
433 logger.info(
"Processed " + (byteChar / 1000000) +
" million characters");
// NOTE(review): `count` has no visible declaration.
436 logger.info(
"Round tripped " + countRoundTripped +
" of " +
count);
437 assertEquals(expectedCount, countRoundTripped);
440 private static void outputFailure(
long byteChar,
byte[]
bytes,
byte[] after) {
441 outputFailure(byteChar,
bytes, after, after.length);
444 private static void outputFailure(
long byteChar,
byte[]
bytes,
byte[] after,
int len) {
447 "Failure: (%s) %s => %s",
448 Long.toHexString(byteChar), toHexString(
bytes), toHexString(after,
len)));
451 private static String toHexString(
byte[]
b) {
452 return toHexString(
b,
b.length);
455 private static String toHexString(
byte[]
b,
int len) {
456 StringBuilder
s =
new StringBuilder();
458 for (
int i = 0;
i <
len;
i++) {
462 s.append(String.format(
"%02x",
b[
i] & 0xFF));