31 package com.google.protobuf;
36 import static java.
lang.Character.MAX_SURROGATE;
37 import static java.
lang.Character.MIN_HIGH_SURROGATE;
38 import static java.
lang.Character.MIN_LOW_SURROGATE;
39 import static java.
lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
40 import static java.
lang.Character.MIN_SURROGATE;
41 import static java.
lang.Character.isSurrogatePair;
42 import static java.
lang.Character.toCodePoint;
44 import java.nio.ByteBuffer;
// Implementation selected once by this static initializer: UnsafeProcessor only when it reports
// itself available and the code is not running on an Android device; SafeProcessor otherwise.
83 private static final Processor processor =
84 (UnsafeProcessor.isAvailable() && !Android.isOnAndroidDevice())
85 ?
new UnsafeProcessor()
86 :
new SafeProcessor();
// Has the top bit of each of the eight bytes of a long set. AND-ing a long with it and getting
// zero means none of those eight bytes has its high bit set (used by the ASCII fast-scan loops).
92 private static final long ASCII_MASK_LONG = 0x8080808080808080L;
// NOTE(review): presumably the maximum number of UTF-8 bytes per UTF-16 char; no use is visible
// in this part of the file — confirm against callers.
99 static final int MAX_BYTES_PER_CHAR = 3;
// State passed in to start a fresh check; isValidUtf8 treats a returned COMPLETE as "valid".
105 public static final int COMPLETE = 0;
// State returned by the incompleteStateFor helpers when a byte is outside the accepted range.
108 public static final int MALFORMED = -1;
// Compared against the input size at the top of the unsafe ASCII estimators.
119 private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
/**
 * Checks the whole of {@code bytes} by delegating to the selected {@code processor} with
 * index {@code 0} and limit {@code bytes.length}.
 *
 * @param bytes the array to check
 * @return the result reported by {@code processor.isValidUtf8} for the full array
 */
146 public static boolean isValidUtf8(
byte[]
bytes) {
147 return processor.isValidUtf8(
bytes, 0,
bytes.length);
/**
 * Checks a sub-range of {@code bytes}; {@code index} and {@code limit} are forwarded unchanged
 * to the selected {@code processor}.
 *
 * @param bytes the array holding the data
 * @param index start position handed to the processor
 * @param limit end position handed to the processor
 * @return the result reported by {@code processor.isValidUtf8} for that range
 */
158 public static boolean isValidUtf8(
byte[]
bytes,
int index,
int limit) {
159 return processor.isValidUtf8(
bytes,
index, limit);
/**
 * Incremental form of the array check: forwards all four arguments to the selected
 * {@code processor} and returns its resulting state.
 *
 * @param state {@link #COMPLETE} to start fresh, or the state returned by an earlier call (the
 *     processor implementations read the pending leading bytes back out of it)
 * @param bytes the array holding the next piece of data
 * @param index start position handed to the processor
 * @param limit end position handed to the processor
 * @return the state produced by the processor for this piece
 */
175 public static int partialIsValidUtf8(
int state,
byte[]
bytes,
int index,
int limit) {
176 return processor.partialIsValidUtf8(state,
bytes,
index, limit);
/**
 * Builds the state for a sequence for which only the first byte is available.
 *
 * <p>{@code (byte) 0xF4} is a negative value, so this is a signed comparison: a {@code byte1}
 * greater than it yields {@link #MALFORMED}; otherwise {@code byte1} itself is the state.
 */
179 private static int incompleteStateFor(
int byte1) {
180 return (byte1 > (
byte) 0xF4) ? MALFORMED : byte1;
/**
 * Builds the state for a sequence for which the first two bytes are available.
 *
 * <p>Returns {@link #MALFORMED} when {@code byte1} is above {@code (byte) 0xF4} or
 * {@code byte2} is above {@code (byte) 0xBF} (both signed comparisons against negative
 * constants). Otherwise the two bytes are packed as {@code byte1 ^ (byte2 << 8)}.
 *
 * <p>Readers in this file unpack with {@code (byte) state} and {@code (byte) ~(state >> 8)}.
 * The complement undoes the bit flip caused by XOR-ing with a sign-extended negative
 * {@code byte1} — assumes {@code byte1} is negative here; TODO confirm against callers.
 */
183 private static int incompleteStateFor(
int byte1,
int byte2) {
184 return (byte1 > (
byte) 0xF4 || byte2 > (
byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8);
187 private static int incompleteStateFor(
int byte1,
int byte2,
int byte3) {
188 return (byte1 > (
byte) 0xF4 || byte2 > (
byte) 0xBF || byte3 > (
byte) 0xBF)
190 : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
193 private static int incompleteStateFor(
byte[]
bytes,
int index,
int limit) {
195 switch (limit -
index) {
197 return incompleteStateFor(byte1);
199 return incompleteStateFor(byte1,
bytes[
index]);
203 throw new AssertionError();
207 private static int incompleteStateFor(
208 final ByteBuffer
buffer,
final int byte1,
final int index,
final int remaining) {
211 return incompleteStateFor(byte1);
213 return incompleteStateFor(byte1,
buffer.get(
index));
217 throw new AssertionError();
/**
 * Unchecked exception (an {@link IllegalArgumentException}) thrown by the encoding and
 * length-computation code in this file when a surrogate char is not part of a valid pair.
 */
225 static class UnpairedSurrogateException
extends IllegalArgumentException {
// index and length are only used to build the message; they are not stored on the exception.
226 UnpairedSurrogateException(
int index,
int length) {
227 super(
"Unpaired surrogate at index " +
index +
" of " +
length);
239 static int encodedLength(CharSequence sequence) {
241 int utf16Length = sequence.length();
242 int utf8Length = utf16Length;
246 while (
i < utf16Length && sequence.charAt(
i) < 0x80) {
251 for (;
i < utf16Length;
i++) {
252 char c = sequence.charAt(
i);
254 utf8Length += ((0x7f - c) >>> 31);
256 utf8Length += encodedLengthGeneral(sequence,
i);
261 if (utf8Length < utf16Length) {
263 throw new IllegalArgumentException(
264 "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
269 private static int encodedLengthGeneral(CharSequence sequence,
int start) {
270 int utf16Length = sequence.length();
272 for (
int i =
start;
i < utf16Length;
i++) {
273 char c = sequence.charAt(
i);
275 utf8Length += (0x7f - c) >>> 31;
279 if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
281 int cp = Character.codePointAt(sequence,
i);
282 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
283 throw new UnpairedSurrogateException(
i, utf16Length);
292 static int encode(CharSequence in,
byte[] out,
int offset,
int length) {
306 static boolean isValidUtf8(ByteBuffer
buffer) {
/**
 * {@link ByteBuffer} counterpart of the incremental check: forwards all four arguments to the
 * selected {@code processor} and returns its resulting state.
 *
 * @param state {@link #COMPLETE} to start fresh, or the state returned by an earlier call
 * @param buffer the buffer holding the next piece of data
 * @param index start position handed to the processor
 * @param limit end position handed to the processor
 * @return the state produced by the processor for this piece
 */
319 static int partialIsValidUtf8(
int state, ByteBuffer
buffer,
int index,
int limit) {
320 return processor.partialIsValidUtf8(state,
buffer,
index, limit);
329 throws InvalidProtocolBufferException {
339 throws InvalidProtocolBufferException {
/**
 * Encodes {@code in} into {@code out} by delegating to the selected {@code processor}, which
 * chooses between an array-backed, direct, or default path based on the buffer.
 */
353 static void encodeUtf8(CharSequence in, ByteBuffer out) {
354 processor.encodeUtf8(in, out);
368 private static int estimateConsecutiveAscii(ByteBuffer
buffer,
int index,
int limit) {
370 final int lim = limit - 7;
374 for (;
i < lim && (
buffer.getLong(
i) & ASCII_MASK_LONG) == 0;
i += 8) {}
380 abstract static class Processor {
/**
 * Validates an array range in one shot: runs the incremental check starting from
 * {@code COMPLETE} and accepts only if it also ends in {@code COMPLETE}. Any other resulting
 * state (malformed or an unfinished sequence at the end) is reported as {@code false}.
 */
389 final boolean isValidUtf8(
byte[]
bytes,
int index,
int limit) {
390 return partialIsValidUtf8(COMPLETE,
bytes,
index, limit) == COMPLETE;
/**
 * Incremental array check supplied by each concrete processor.
 *
 * @param state {@code COMPLETE} to start fresh, or the state returned by a previous call
 * @param bytes the array holding the data
 * @param index start position within {@code bytes}
 * @param limit end position within {@code bytes}
 * @return the state after consuming the range; {@code isValidUtf8} treats only
 *     {@code COMPLETE} as success
 */
406 abstract int partialIsValidUtf8(
int state,
byte[]
bytes,
int index,
int limit);
/**
 * Validates a buffer range in one shot: runs the incremental buffer check starting from
 * {@code COMPLETE} and accepts only if it also ends in {@code COMPLETE}.
 */
416 final boolean isValidUtf8(ByteBuffer
buffer,
int index,
int limit) {
417 return partialIsValidUtf8(COMPLETE,
buffer,
index, limit) == COMPLETE;
426 final int partialIsValidUtf8(
427 final int state,
final ByteBuffer
buffer,
int index,
final int limit) {
431 }
else if (
buffer.isDirect()) {
432 return partialIsValidUtf8Direct(state,
buffer,
index, limit);
434 return partialIsValidUtf8Default(state,
buffer,
index, limit);
/**
 * Incremental check used by {@code partialIsValidUtf8(int, ByteBuffer, int, int)} on its
 * {@code buffer.isDirect()} branch. Parameters and result have the same meaning as in that
 * method.
 */
438 abstract int partialIsValidUtf8Direct(
439 final int state,
final ByteBuffer
buffer,
int index,
final int limit);
446 final int partialIsValidUtf8Default(
447 final int state,
final ByteBuffer
buffer,
int index,
final int limit) {
448 if (state != COMPLETE) {
456 if (
index >= limit) {
460 byte byte1 = (
byte) state;
462 if (byte1 < (
byte) 0xE0) {
467 if (byte1 < (
byte) 0xC2
472 }
else if (byte1 < (
byte) 0xF0) {
476 byte byte2 = (
byte) ~(state >> 8);
479 if (
index >= limit) {
480 return incompleteStateFor(byte1, byte2);
483 if (byte2 > (
byte) 0xBF
485 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
487 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
496 byte byte2 = (
byte) ~(state >> 8);
500 if (
index >= limit) {
501 return incompleteStateFor(byte1, byte2);
504 byte3 = (
byte) (state >> 16);
508 if (
index >= limit) {
509 return incompleteStateFor(byte1, byte2, byte3);
517 if (byte2 > (
byte) 0xBF
522 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
524 || byte3 > (
byte) 0xBF
540 private static int partialIsValidUtf8(
final ByteBuffer
buffer,
int index,
final int limit) {
549 if (
index >= limit) {
555 if (byte1 < (
byte) 0xE0) {
557 if (
index >= limit) {
564 if (byte1 < (
byte) 0xC2 ||
buffer.get(
index) > (
byte) 0xBF) {
568 }
else if (byte1 < (
byte) 0xF0) {
570 if (
index >= limit - 1) {
576 if (byte2 > (
byte) 0xBF
578 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
580 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
588 if (
index >= limit - 2) {
595 if (byte2 > (
byte) 0xBF
600 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
617 throws InvalidProtocolBufferException;
625 throws InvalidProtocolBufferException {
629 }
else if (
buffer.isDirect()) {
/**
 * Decoding hook supplied by each concrete processor; may throw
 * {@code InvalidProtocolBufferException}.
 *
 * <p>NOTE(review): presumably invoked for direct buffers and expected to decode {@code size}
 * bytes starting at {@code index} — the call site is not visible here; confirm.
 */
636 abstract String decodeUtf8Direct(ByteBuffer
buffer,
int index,
int size)
637 throws InvalidProtocolBufferException;
644 throws InvalidProtocolBufferException {
647 throw new ArrayIndexOutOfBoundsException(
648 String.format(
"buffer limit=%d, index=%d, limit=%d",
buffer.limit(),
index,
size));
656 char[] resultArr =
new char[
size];
663 if (!DecodeUtil.isOneByte(
b)) {
667 DecodeUtil.handleOneByte(
b, resultArr, resultPos++);
672 if (DecodeUtil.isOneByte(byte1)) {
673 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
678 if (!DecodeUtil.isOneByte(
b)) {
682 DecodeUtil.handleOneByte(
b, resultArr, resultPos++);
684 }
else if (DecodeUtil.isTwoBytes(byte1)) {
686 throw InvalidProtocolBufferException.invalidUtf8();
688 DecodeUtil.handleTwoBytes(
690 }
else if (DecodeUtil.isThreeBytes(byte1)) {
691 if (
offset >= limit - 1) {
692 throw InvalidProtocolBufferException.invalidUtf8();
694 DecodeUtil.handleThreeBytes(
701 if (
offset >= limit - 2) {
702 throw InvalidProtocolBufferException.invalidUtf8();
704 DecodeUtil.handleFourBytes(
716 return new String(resultArr, 0, resultPos);
/**
 * Array-based encoding hook supplied by each concrete processor.
 *
 * <p>NOTE(review): the return value appears to be the array index just past the last byte
 * written (the safe implementation returns {@code j + utf16Length} on its all-ASCII path, and
 * the buffer wrapper turns the result of {@code Utf8.encode} into a position by subtracting the
 * array offset) — confirm.
 */
749 abstract int encodeUtf8(CharSequence in,
byte[] out,
int offset,
int length);
768 final void encodeUtf8(CharSequence in, ByteBuffer out) {
769 if (out.hasArray()) {
770 final int offset = out.arrayOffset();
771 int endIndex = Utf8.encode(in, out.array(),
offset + out.position(), out.remaining());
772 out.position(endIndex -
offset);
773 }
else if (out.isDirect()) {
774 encodeUtf8Direct(in, out);
776 encodeUtf8Default(in, out);
/**
 * Encoding hook called by {@code encodeUtf8(CharSequence, ByteBuffer)} when {@code out} has no
 * accessible backing array and {@code out.isDirect()} is true.
 */
781 abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out);
787 final void encodeUtf8Default(CharSequence in, ByteBuffer out) {
788 final int inLength = in.length();
789 int outIx = out.position();
798 for (
char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) {
799 out.put(outIx + inIx, (
byte) c);
801 if (inIx == inLength) {
803 out.position(outIx + inIx);
808 for (
char c; inIx < inLength; ++inIx, ++outIx) {
812 out.put(outIx, (
byte) c);
813 }
else if (c < 0x800) {
817 out.put(outIx++, (
byte) (0xC0 | (c >>> 6)));
818 out.put(outIx, (
byte) (0x80 | (0x3F & c)));
819 }
else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
824 out.put(outIx++, (
byte) (0xE0 | (c >>> 12)));
825 out.put(outIx++, (
byte) (0x80 | (0x3F & (c >>> 6))));
826 out.put(outIx, (
byte) (0x80 | (0x3F & c)));
833 if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
834 throw new UnpairedSurrogateException(inIx, inLength);
837 int codePoint = toCodePoint(c, low);
838 out.put(outIx++, (
byte) ((0xF << 4) | (codePoint >>> 18)));
839 out.put(outIx++, (
byte) (0x80 | (0x3F & (codePoint >>> 12))));
840 out.put(outIx++, (
byte) (0x80 | (0x3F & (codePoint >>> 6))));
841 out.put(outIx, (
byte) (0x80 | (0x3F & codePoint)));
847 }
catch (IndexOutOfBoundsException e) {
852 int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1);
853 throw new ArrayIndexOutOfBoundsException(
854 "Failed writing " + in.charAt(inIx) +
" at index " + badWriteIndex);
860 static final class SafeProcessor
extends Processor {
862 int partialIsValidUtf8(
int state,
byte[]
bytes,
int index,
int limit) {
863 if (state != COMPLETE) {
871 if (
index >= limit) {
874 int byte1 = (
byte) state;
876 if (byte1 < (
byte) 0xE0) {
881 if (byte1 < (
byte) 0xC2
886 }
else if (byte1 < (
byte) 0xF0) {
890 int byte2 = (
byte) ~(state >> 8);
893 if (
index >= limit) {
894 return incompleteStateFor(byte1, byte2);
897 if (byte2 > (
byte) 0xBF
899 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
901 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
910 int byte2 = (
byte) ~(state >> 8);
914 if (
index >= limit) {
915 return incompleteStateFor(byte1, byte2);
918 byte3 = (
byte) (state >> 16);
922 if (
index >= limit) {
923 return incompleteStateFor(byte1, byte2, byte3);
931 if (byte2 > (
byte) 0xBF
936 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
938 || byte3 > (
byte) 0xBF
946 return partialIsValidUtf8(
bytes,
index, limit);
/**
 * The safe processor has no dedicated path for direct buffers; it forwards to
 * {@code partialIsValidUtf8Default} with the same arguments.
 */
950 int partialIsValidUtf8Direct(
int state, ByteBuffer
buffer,
int index,
int limit) {
952 return partialIsValidUtf8Default(state,
buffer,
index, limit);
956 String decodeUtf8(
byte[]
bytes,
int index,
int size)
throws InvalidProtocolBufferException {
959 throw new ArrayIndexOutOfBoundsException(
960 String.format(
"buffer length=%d, index=%d, size=%d",
bytes.length,
index,
size));
968 char[] resultArr =
new char[
size];
975 if (!DecodeUtil.isOneByte(
b)) {
979 DecodeUtil.handleOneByte(
b, resultArr, resultPos++);
984 if (DecodeUtil.isOneByte(byte1)) {
985 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
990 if (!DecodeUtil.isOneByte(
b)) {
994 DecodeUtil.handleOneByte(
b, resultArr, resultPos++);
996 }
else if (DecodeUtil.isTwoBytes(byte1)) {
998 throw InvalidProtocolBufferException.invalidUtf8();
1000 DecodeUtil.handleTwoBytes(byte1,
bytes[
offset++], resultArr, resultPos++);
1001 }
else if (DecodeUtil.isThreeBytes(byte1)) {
1002 if (
offset >= limit - 1) {
1003 throw InvalidProtocolBufferException.invalidUtf8();
1005 DecodeUtil.handleThreeBytes(
1012 if (
offset >= limit - 2) {
1013 throw InvalidProtocolBufferException.invalidUtf8();
1015 DecodeUtil.handleFourBytes(
1027 return new String(resultArr, 0, resultPos);
1032 throws InvalidProtocolBufferException {
1038 int encodeUtf8(CharSequence in,
byte[] out,
int offset,
int length) {
1039 int utf16Length = in.length();
1045 for (
char c;
i < utf16Length &&
i + j < limit && (c = in.charAt(
i)) < 0x80;
i++) {
1046 out[j +
i] = (
byte) c;
1048 if (
i == utf16Length) {
1049 return j + utf16Length;
1052 for (
char c;
i < utf16Length;
i++) {
1054 if (c < 0x80 && j < limit) {
1055 out[j++] = (
byte) c;
1056 }
else if (c < 0x800 && j <= limit - 2) {
1057 out[j++] = (
byte) ((0xF << 6) | (c >>> 6));
1058 out[j++] = (
byte) (0x80 | (0x3F & c));
1059 }
else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) {
1061 out[j++] = (
byte) ((0xF << 5) | (c >>> 12));
1062 out[j++] = (
byte) (0x80 | (0x3F & (c >>> 6)));
1063 out[j++] = (
byte) (0x80 | (0x3F & c));
1064 }
else if (j <= limit - 4) {
1068 if (
i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++
i)))) {
1069 throw new UnpairedSurrogateException((
i - 1), utf16Length);
1071 int codePoint = Character.toCodePoint(c, low);
1072 out[j++] = (
byte) ((0xF << 4) | (codePoint >>> 18));
1073 out[j++] = (
byte) (0x80 | (0x3F & (codePoint >>> 12)));
1074 out[j++] = (
byte) (0x80 | (0x3F & (codePoint >>> 6)));
1075 out[j++] = (
byte) (0x80 | (0x3F & codePoint));
1079 if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
1080 && (
i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(
i + 1)))) {
1081 throw new UnpairedSurrogateException(
i, utf16Length);
1083 throw new ArrayIndexOutOfBoundsException(
"Failed writing " + c +
" at index " + j);
/**
 * The safe processor has no dedicated path for direct buffers; it forwards to
 * {@code encodeUtf8Default} with the same arguments.
 */
1090 void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
1092 encodeUtf8Default(in, out);
1095 private static int partialIsValidUtf8(
byte[]
bytes,
int index,
int limit) {
1102 return (
index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(
bytes,
index, limit);
1105 private static int partialIsValidUtf8NonAscii(
byte[]
bytes,
int index,
int limit) {
1112 if (
index >= limit) {
1117 if (byte1 < (
byte) 0xE0) {
1120 if (
index >= limit) {
1127 if (byte1 < (
byte) 0xC2 ||
bytes[
index++] > (
byte) 0xBF) {
1130 }
else if (byte1 < (
byte) 0xF0) {
1133 if (
index >= limit - 1) {
1134 return incompleteStateFor(
bytes,
index, limit);
1138 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
1140 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
1148 if (
index >= limit - 2) {
1149 return incompleteStateFor(
bytes,
index, limit);
1156 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
1169 static final class UnsafeProcessor
extends Processor {
/**
 * Reports whether this processor can be used: true only when both
 * {@code hasUnsafeArrayOperations()} and {@code hasUnsafeByteBufferOperations()} return true.
 * Consulted when the enclosing class picks its processor implementation.
 */
1171 static boolean isAvailable() {
1172 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
1176 int partialIsValidUtf8(
int state,
byte[]
bytes,
final int index,
final int limit) {
1178 if ((
index | limit |
bytes.length - limit) < 0) {
1179 throw new ArrayIndexOutOfBoundsException(
1180 String.format(
"Array length=%d, index=%d, limit=%d",
bytes.length,
index, limit));
1183 final long offsetLimit = limit;
1184 if (state != COMPLETE) {
1192 if (
offset >= offsetLimit) {
1195 int byte1 = (
byte) state;
1197 if (byte1 < (
byte) 0xE0) {
1202 if (byte1 < (
byte) 0xC2
1207 }
else if (byte1 < (
byte) 0xF0) {
1211 int byte2 = (
byte) ~(state >> 8);
1214 if (
offset >= offsetLimit) {
1215 return incompleteStateFor(byte1, byte2);
1218 if (byte2 > (
byte) 0xBF
1220 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
1222 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
1224 || UnsafeUtil.getByte(
bytes,
offset++) > (
byte) 0xBF) {
1231 int byte2 = (
byte) ~(state >> 8);
1235 if (
offset >= offsetLimit) {
1236 return incompleteStateFor(byte1, byte2);
1239 byte3 = (
byte) (state >> 16);
1243 if (
offset >= offsetLimit) {
1244 return incompleteStateFor(byte1, byte2, byte3);
1252 if (byte2 > (
byte) 0xBF
1257 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
1259 || byte3 > (
byte) 0xBF
1261 || UnsafeUtil.getByte(
bytes,
offset++) > (
byte) 0xBF) {
1271 int partialIsValidUtf8Direct(
1272 final int state, ByteBuffer
buffer,
final int index,
final int limit) {
1274 if ((
index | limit |
buffer.limit() - limit) < 0) {
1275 throw new ArrayIndexOutOfBoundsException(
1276 String.format(
"buffer limit=%d, index=%d, limit=%d",
buffer.limit(),
index, limit));
1280 if (state != COMPLETE) {
1288 if (
address >= addressLimit) {
1292 final int byte1 = (
byte) state;
1294 if (byte1 < (
byte) 0xE0) {
1299 if (byte1 < (
byte) 0xC2
1304 }
else if (byte1 < (
byte) 0xF0) {
1308 int byte2 = (
byte) ~(state >> 8);
1310 byte2 = UnsafeUtil.getByte(
address++);
1311 if (
address >= addressLimit) {
1312 return incompleteStateFor(byte1, byte2);
1315 if (byte2 > (
byte) 0xBF
1317 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
1319 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
1321 || UnsafeUtil.getByte(
address++) > (
byte) 0xBF) {
1328 int byte2 = (
byte) ~(state >> 8);
1331 byte2 = UnsafeUtil.getByte(
address++);
1332 if (
address >= addressLimit) {
1333 return incompleteStateFor(byte1, byte2);
1336 byte3 = (
byte) (state >> 16);
1339 byte3 = UnsafeUtil.getByte(
address++);
1340 if (
address >= addressLimit) {
1341 return incompleteStateFor(byte1, byte2, byte3);
1349 if (byte2 > (
byte) 0xBF
1354 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
1356 || byte3 > (
byte) 0xBF
1358 || UnsafeUtil.getByte(
address++) > (
byte) 0xBF) {
1364 return partialIsValidUtf8(
address, (
int) (addressLimit -
address));
1368 String decodeUtf8(
byte[]
bytes,
int index,
int size)
throws InvalidProtocolBufferException {
1370 throw new ArrayIndexOutOfBoundsException(
1371 String.format(
"buffer length=%d, index=%d, size=%d",
bytes.length,
index,
size));
1379 char[] resultArr =
new char[
size];
1386 if (!DecodeUtil.isOneByte(
b)) {
1390 DecodeUtil.handleOneByte(
b, resultArr, resultPos++);
1395 if (DecodeUtil.isOneByte(byte1)) {
1396 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
1401 if (!DecodeUtil.isOneByte(
b)) {
1405 DecodeUtil.handleOneByte(
b, resultArr, resultPos++);
1407 }
else if (DecodeUtil.isTwoBytes(byte1)) {
1409 throw InvalidProtocolBufferException.invalidUtf8();
1411 DecodeUtil.handleTwoBytes(
1412 byte1, UnsafeUtil.getByte(
bytes,
offset++), resultArr, resultPos++);
1413 }
else if (DecodeUtil.isThreeBytes(byte1)) {
1414 if (
offset >= limit - 1) {
1415 throw InvalidProtocolBufferException.invalidUtf8();
1417 DecodeUtil.handleThreeBytes(
1424 if (
offset >= limit - 2) {
1425 throw InvalidProtocolBufferException.invalidUtf8();
1427 DecodeUtil.handleFourBytes(
1439 return new String(resultArr, 0, resultPos);
1444 throws InvalidProtocolBufferException {
1447 throw new ArrayIndexOutOfBoundsException(
1448 String.format(
"buffer limit=%d, index=%d, limit=%d",
buffer.limit(),
index,
size));
1455 char[] resultArr =
new char[
size];
1460 while (
address < addressLimit) {
1461 byte b = UnsafeUtil.getByte(
address);
1462 if (!DecodeUtil.isOneByte(
b)) {
1466 DecodeUtil.handleOneByte(
b, resultArr, resultPos++);
1469 while (
address < addressLimit) {
1470 byte byte1 = UnsafeUtil.getByte(
address++);
1471 if (DecodeUtil.isOneByte(byte1)) {
1472 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
1475 while (
address < addressLimit) {
1476 byte b = UnsafeUtil.getByte(
address);
1477 if (!DecodeUtil.isOneByte(
b)) {
1481 DecodeUtil.handleOneByte(
b, resultArr, resultPos++);
1483 }
else if (DecodeUtil.isTwoBytes(byte1)) {
1484 if (
address >= addressLimit) {
1485 throw InvalidProtocolBufferException.invalidUtf8();
1487 DecodeUtil.handleTwoBytes(
1488 byte1, UnsafeUtil.getByte(
address++), resultArr, resultPos++);
1489 }
else if (DecodeUtil.isThreeBytes(byte1)) {
1490 if (
address >= addressLimit - 1) {
1491 throw InvalidProtocolBufferException.invalidUtf8();
1493 DecodeUtil.handleThreeBytes(
1495 UnsafeUtil.getByte(
address++),
1496 UnsafeUtil.getByte(
address++),
1500 if (
address >= addressLimit - 2) {
1501 throw InvalidProtocolBufferException.invalidUtf8();
1503 DecodeUtil.handleFourBytes(
1505 UnsafeUtil.getByte(
address++),
1506 UnsafeUtil.getByte(
address++),
1507 UnsafeUtil.getByte(
address++),
1515 return new String(resultArr, 0, resultPos);
1519 int encodeUtf8(
final CharSequence in,
final byte[] out,
final int offset,
final int length) {
1521 final long outLimit = outIx +
length;
1522 final int inLimit = in.length();
1525 throw new ArrayIndexOutOfBoundsException(
1526 "Failed writing " + in.charAt(inLimit - 1) +
" at index " + (
offset +
length));
1532 for (
char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1533 UnsafeUtil.putByte(out, outIx++, (
byte) c);
1535 if (inIx == inLimit) {
1540 for (
char c; inIx < inLimit; ++inIx) {
1541 c = in.charAt(inIx);
1542 if (c < 0x80 && outIx < outLimit) {
1543 UnsafeUtil.putByte(out, outIx++, (
byte) c);
1544 }
else if (c < 0x800 && outIx <= outLimit - 2L) {
1545 UnsafeUtil.putByte(out, outIx++, (
byte) ((0xF << 6) | (c >>> 6)));
1546 UnsafeUtil.putByte(out, outIx++, (
byte) (0x80 | (0x3F & c)));
1547 }
else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1549 UnsafeUtil.putByte(out, outIx++, (
byte) ((0xF << 5) | (c >>> 12)));
1550 UnsafeUtil.putByte(out, outIx++, (
byte) (0x80 | (0x3F & (c >>> 6))));
1551 UnsafeUtil.putByte(out, outIx++, (
byte) (0x80 | (0x3F & c)));
1552 }
else if (outIx <= outLimit - 4L) {
1556 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1557 throw new UnpairedSurrogateException((inIx - 1), inLimit);
1559 int codePoint = toCodePoint(c, low);
1560 UnsafeUtil.putByte(out, outIx++, (
byte) ((0xF << 4) | (codePoint >>> 18)));
1561 UnsafeUtil.putByte(out, outIx++, (
byte) (0x80 | (0x3F & (codePoint >>> 12))));
1562 UnsafeUtil.putByte(out, outIx++, (
byte) (0x80 | (0x3F & (codePoint >>> 6))));
1563 UnsafeUtil.putByte(out, outIx++, (
byte) (0x80 | (0x3F & codePoint)));
1565 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1566 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1568 throw new UnpairedSurrogateException(inIx, inLimit);
1571 throw new ArrayIndexOutOfBoundsException(
"Failed writing " + c +
" at index " + outIx);
1580 void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
1581 final long address = addressOffset(out);
1582 long outIx =
address + out.position();
1583 final long outLimit =
address + out.limit();
1584 final int inLimit = in.length();
1585 if (inLimit > outLimit - outIx) {
1587 throw new ArrayIndexOutOfBoundsException(
1588 "Failed writing " + in.charAt(inLimit - 1) +
" at index " + out.limit());
1594 for (
char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1595 UnsafeUtil.putByte(outIx++, (
byte) c);
1597 if (inIx == inLimit) {
1599 out.position((
int) (outIx -
address));
1603 for (
char c; inIx < inLimit; ++inIx) {
1604 c = in.charAt(inIx);
1605 if (c < 0x80 && outIx < outLimit) {
1606 UnsafeUtil.putByte(outIx++, (
byte) c);
1607 }
else if (c < 0x800 && outIx <= outLimit - 2L) {
1608 UnsafeUtil.putByte(outIx++, (
byte) ((0xF << 6) | (c >>> 6)));
1609 UnsafeUtil.putByte(outIx++, (
byte) (0x80 | (0x3F & c)));
1610 }
else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1612 UnsafeUtil.putByte(outIx++, (
byte) ((0xF << 5) | (c >>> 12)));
1613 UnsafeUtil.putByte(outIx++, (
byte) (0x80 | (0x3F & (c >>> 6))));
1614 UnsafeUtil.putByte(outIx++, (
byte) (0x80 | (0x3F & c)));
1615 }
else if (outIx <= outLimit - 4L) {
1619 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1620 throw new UnpairedSurrogateException((inIx - 1), inLimit);
1622 int codePoint = toCodePoint(c, low);
1623 UnsafeUtil.putByte(outIx++, (
byte) ((0xF << 4) | (codePoint >>> 18)));
1624 UnsafeUtil.putByte(outIx++, (
byte) (0x80 | (0x3F & (codePoint >>> 12))));
1625 UnsafeUtil.putByte(outIx++, (
byte) (0x80 | (0x3F & (codePoint >>> 6))));
1626 UnsafeUtil.putByte(outIx++, (
byte) (0x80 | (0x3F & codePoint)));
1628 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1629 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1631 throw new UnpairedSurrogateException(inIx, inLimit);
1634 throw new ArrayIndexOutOfBoundsException(
"Failed writing " + c +
" at index " + outIx);
1639 out.position((
int) (outIx -
address));
1652 private static int unsafeEstimateConsecutiveAscii(
1654 if (maxChars < UNSAFE_COUNT_ASCII_THRESHOLD) {
1659 for (
int i = 0;
i < maxChars;
i++) {
1671 private static int unsafeEstimateConsecutiveAscii(
long address,
final int maxChars) {
1672 int remaining = maxChars;
1673 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1681 final int unaligned = 8 - ((int)
address & 7);
1682 for (
int j = unaligned; j > 0; j--) {
1683 if (UnsafeUtil.getByte(
address++) < 0) {
1684 return unaligned - j;
1691 remaining -= unaligned;
1693 remaining >= 8 && (UnsafeUtil.getLong(
address) & ASCII_MASK_LONG) == 0;
1694 address += 8, remaining -= 8) {}
1695 return maxChars - remaining;
1698 private static int partialIsValidUtf8(
final byte[]
bytes,
long offset,
int remaining) {
1700 final int skipped = unsafeEstimateConsecutiveAscii(
bytes,
offset, remaining);
1701 remaining -= skipped;
1709 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(
bytes,
offset++)) >= 0; --remaining) {}
1710 if (remaining == 0) {
1716 if (byte1 < (
byte) 0xE0) {
1718 if (remaining == 0) {
1726 if (byte1 < (
byte) 0xC2 || UnsafeUtil.getByte(
bytes,
offset++) > (
byte) 0xBF) {
1729 }
else if (byte1 < (
byte) 0xF0) {
1731 if (remaining < 2) {
1733 return unsafeIncompleteStateFor(
bytes, byte1,
offset, remaining);
1738 if ((byte2 = UnsafeUtil.getByte(
bytes,
offset++)) > (
byte) 0xBF
1740 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
1742 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
1744 || UnsafeUtil.getByte(
bytes,
offset++) > (
byte) 0xBF) {
1749 if (remaining < 3) {
1751 return unsafeIncompleteStateFor(
bytes, byte1,
offset, remaining);
1756 if ((byte2 = UnsafeUtil.getByte(
bytes,
offset++)) > (
byte) 0xBF
1761 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
1763 || UnsafeUtil.getByte(
bytes,
offset++) > (
byte) 0xBF
1765 || UnsafeUtil.getByte(
bytes,
offset++) > (
byte) 0xBF) {
1772 private static int partialIsValidUtf8(
long address,
int remaining) {
1774 final int skipped = unsafeEstimateConsecutiveAscii(
address, remaining);
1776 remaining -= skipped;
1783 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(
address++)) >= 0; --remaining) {}
1784 if (remaining == 0) {
1789 if (byte1 < (
byte) 0xE0) {
1792 if (remaining == 0) {
1800 if (byte1 < (
byte) 0xC2 || UnsafeUtil.getByte(
address++) > (
byte) 0xBF) {
1803 }
else if (byte1 < (
byte) 0xF0) {
1806 if (remaining < 2) {
1808 return unsafeIncompleteStateFor(
address, byte1, remaining);
1812 final byte byte2 = UnsafeUtil.getByte(
address++);
1813 if (byte2 > (
byte) 0xBF
1815 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
1817 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
1819 || UnsafeUtil.getByte(
address++) > (
byte) 0xBF) {
1825 if (remaining < 3) {
1827 return unsafeIncompleteStateFor(
address, byte1, remaining);
1831 final byte byte2 = UnsafeUtil.getByte(
address++);
1832 if (byte2 > (
byte) 0xBF
1837 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
1839 || UnsafeUtil.getByte(
address++) > (
byte) 0xBF
1841 || UnsafeUtil.getByte(
address++) > (
byte) 0xBF) {
1848 private static int unsafeIncompleteStateFor(
1849 byte[]
bytes,
int byte1,
long offset,
int remaining) {
1850 switch (remaining) {
1852 return incompleteStateFor(byte1);
1854 return incompleteStateFor(byte1, UnsafeUtil.getByte(
bytes,
offset));
1856 return incompleteStateFor(
1859 throw new AssertionError();
1863 private static int unsafeIncompleteStateFor(
long address,
final int byte1,
int remaining) {
1864 switch (remaining) {
1866 return incompleteStateFor(byte1);
1868 return incompleteStateFor(byte1, UnsafeUtil.getByte(
address));
1870 return incompleteStateFor(
1871 byte1, UnsafeUtil.getByte(
address), UnsafeUtil.getByte(
address + 1));
1873 throw new AssertionError();
1892 return b < (
byte) 0xE0;
1897 return b < (
byte) 0xF0;
/**
 * Writes a single-byte character: stores {@code byte1}, cast to {@code char}, at
 * {@code resultArr[resultPos]}. The position is not advanced here; the callers in this file
 * pass {@code resultPos++}.
 */
1900 private static void handleOneByte(
byte byte1,
char[] resultArr,
int resultPos) {
1901 resultArr[resultPos] = (char) byte1;
1904 private static void handleTwoBytes(
byte byte1,
byte byte2,
char[] resultArr,
int resultPos)
1911 resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) |
trailingByteValue(byte2));
1915 byte byte1,
byte byte2,
byte byte3,
char[] resultArr,
int resultPos)
1919 || (byte1 == (
byte) 0xE0 && byte2 < (
byte) 0xA0)
1921 || (byte1 == (
byte) 0xED && byte2 >= (
byte) 0xA0)
1925 resultArr[resultPos] =
1931 byte byte1,
byte byte2,
byte byte3,
byte byte4,
char[] resultArr,
int resultPos)
1941 || (((byte1 << 28) + (byte2 - (
byte) 0x90)) >> 30) != 0
1947 ((byte1 & 0x07) << 18)
1957 return b > (
byte) 0xBF;
1967 ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10));
1971 return (
char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));