Utf8Utils.java
Go to the documentation of this file.
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import static java.lang.Character.MIN_HIGH_SURROGATE;
34 import static java.lang.Character.MIN_LOW_SURROGATE;
35 import static java.lang.Character.MIN_SURROGATE;
36 
37 import java.util.Random;
38 
/**
 * Static helpers that build pseudo-random strings whose code points fall into chosen UTF-8
 * encoded-length classes. All generators use a fixed seed, so repeated calls with the same
 * arguments produce the same strings.
 */
final class Utf8Utils {
  private Utf8Utils() {}

  /**
   * Exclusive upper bound for randomly generated code points.
   *
   * <p>The bound is parsed from either a numeric literal accepted by {@link Integer#decode} or
   * one of a fixed set of case-insensitive, human-friendly script/region names.
   */
  static class MaxCodePoint {
    /** The exclusive upper bound passed to {@link Random#nextInt(int)} by the generators. */
    final int value;

    /**
     * Converts a user supplied description into a code point bound.
     *
     * @param userFriendly a numeric literal (decimal, hex, octal) or a recognised name
     * @return the decoded bound
     * @throws IllegalArgumentException if the text is neither a number nor a recognised name
     */
    private static int decode(String userFriendly) {
      try {
        return Integer.decode(userFriendly);
      } catch (NumberFormatException ignored) {
        // Not a number: fall back to the friendly names below.
        if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
          // 1-byte UTF-8 sequences - "American" ASCII text
          return 0x80;
        } else if (userFriendly.matches("(?i)(?:Danish|Latin|Western.*European)")) {
          // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
          // sequences - "Western European" text
          return 0x90;
        } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
          // Mostly 2-byte UTF-8 sequences - "European" text
          return 0x800;
        } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
          // Mostly 3-byte UTF-8 sequences - "Asian" text
          return Character.MIN_SUPPLEMENTARY_CODE_POINT;
        } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
          // Mostly 4-byte UTF-8 sequences - "rare exotic" text
          return Character.MAX_CODE_POINT;
        } else {
          throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
        }
      }
    }

    /**
     * Static factory equivalent to {@code new MaxCodePoint(userFriendly)}.
     *
     * @param userFriendly a numeric literal or a recognised name
     * @return a new bound
     * @throws IllegalArgumentException if the text cannot be decoded
     */
    public static MaxCodePoint valueOf(String userFriendly) {
      return new MaxCodePoint(userFriendly);
    }

    /**
     * Creates a bound from a numeric literal or a recognised name.
     *
     * @param userFriendly a numeric literal or a recognised name
     * @throws IllegalArgumentException if the text cannot be decoded
     */
    public MaxCodePoint(String userFriendly) {
      value = decode(userFriendly);
    }
  }

  /**
   * Predefined relative weights for characters by UTF-8 encoded length.
   *
   * <p>Each distribution is a four element array; element {@code i} is the weight given to
   * characters that need {@code i + 1} bytes in UTF-8 (as consumed by
   * {@link #randomStringsWithDistribution}).
   */
  public enum Utf8Distribution {
    GMM_UTF8_DISTRIBUTION(53059, 104, 0, 0),
    GSR_UTF8_DISTRIBUTION(119458, 74, 2706, 0);

    private final int[] weights;

    Utf8Distribution(int oneByte, int twoBytes, int threeBytes, int fourBytes) {
      this.weights = new int[] {oneByte, twoBytes, threeBytes, fourBytes};
    }

    /**
     * Returns the four weights for 1-, 2-, 3- and 4-byte characters.
     *
     * @return a fresh array on every call, so callers may modify it freely
     */
    public int[] getDistribution() {
      return weights.clone();
    }
  }

  /**
   * Generates an array of random strings, each built by {@link #randomString}.
   *
   * <p>A {@link Random} seeded with the constant 99 is used, so the output is reproducible.
   *
   * @param stringCount number of strings to create
   * @param charCount number of code points in each string
   * @param maxCodePoint exclusive upper bound for the generated code points
   * @return the generated strings
   */
  static String[] randomStrings(int stringCount, int charCount, MaxCodePoint maxCodePoint) {
    final long seed = 99;
    final Random rnd = new Random(seed);
    String[] strings = new String[stringCount];
    for (int i = 0; i < stringCount; i++) {
      strings[i] = randomString(rnd, charCount, maxCodePoint);
    }
    return strings;
  }

  /**
   * Generates one random string of {@code charCount} code points.
   *
   * <p>Code points are drawn uniformly from {@code [0, maxCodePoint.value)}; values in the
   * surrogate range are rejected and redrawn so the result is always well-formed UTF-16.
   *
   * @param rnd source of randomness
   * @param charCount number of code points to append
   * @param maxCodePoint exclusive upper bound for the generated code points
   * @return the generated string
   */
  static String randomString(Random rnd, int charCount, MaxCodePoint maxCodePoint) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < charCount; i++) {
      int codePoint;
      do {
        codePoint = rnd.nextInt(maxCodePoint.value);
      } while (Utf8Utils.isSurrogate(codePoint));
      sb.appendCodePoint(codePoint);
    }
    return sb.toString();
  }

  /**
   * Tells whether {@code c} lies in the UTF-16 surrogate range U+D800..U+DFFF.
   *
   * @param c the code point to test
   * @return {@code true} if {@code c} is a high or low surrogate value
   */
  static boolean isSurrogate(int c) {
    return Character.MIN_HIGH_SURROGATE <= c && c <= Character.MAX_LOW_SURROGATE;
  }

  /**
   * Generates random strings whose characters follow a UTF-8 byte-length distribution.
   *
   * <p>For every position a bucket (1, 2, 3 or 4 UTF-8 bytes) is chosen with probability
   * proportional to its weight, and one fixed representative of that bucket is appended:
   * U+007F, U+07FF, U+D7FF, or U+10000 (written as its surrogate pair). Each string therefore
   * holds exactly {@code charCount} code points. A {@link Random} seeded with the constant 99 is
   * used, so the output is reproducible.
   *
   * @param stringCount number of strings to create
   * @param charCount number of code points in each string
   * @param utf8Distribution relative weights of 1-, 2-, 3- and 4-byte characters
   * @return the generated strings
   */
  static String[] randomStringsWithDistribution(
      int stringCount, int charCount, Utf8Distribution utf8Distribution) {
    final int[] weights = utf8Distribution.getDistribution();
    // Running totals: cumulative[k] is the exclusive upper bound of bucket k.
    final int[] cumulative = new int[4];
    int runningTotal = 0;
    for (int k = 0; k < cumulative.length; k++) {
      runningTotal += weights[k];
      cumulative[k] = runningTotal;
    }

    final long seed = 99;
    final Random rnd = new Random(seed);
    String[] strings = new String[stringCount];
    for (int i = 0; i < stringCount; i++) {
      StringBuilder sb = new StringBuilder();
      for (int j = 0; j < charCount; j++) {
        int pick = rnd.nextInt(cumulative[3]);
        // The casts matter: append(int) would write the decimal digits of the number
        // instead of the character itself.
        if (pick < cumulative[0]) {
          // 1 byte
          sb.append((char) 0x7F);
        } else if (pick < cumulative[1]) {
          // 2 bytes
          sb.append((char) 0x7FF);
        } else if (pick < cumulative[2]) {
          // 3 bytes
          sb.append((char) (MIN_SURROGATE - 1));
        } else {
          // 4 bytes: the surrogate pair for U+10000
          sb.append(MIN_HIGH_SURROGATE);
          sb.append(MIN_LOW_SURROGATE);
        }
      }
      strings[i] = sb.toString();
    }
    return strings;
  }
}
java::lang
strings
GLsizei const GLchar *const * strings
Definition: glcorearb.h:4046
sb
void * sb
Definition: test_channel.cpp:8
i
int i
Definition: gmock-matchers_test.cc:764
java
com.google.protobuf.Utf8Utils.Utf8Distribution
Definition: Utf8Utils.java:92
value
GLsizei const GLfloat * value
Definition: glcorearb.h:3093


libaditof
Author(s):
autogenerated on Wed May 21 2025 02:07:01