naive.c
Go to the documentation of this file.
1 #include <stdio.h>
2 
3 /*
4  * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
5  *
6  * Table 3-7. Well-Formed UTF-8 Byte Sequences
7  *
8  * +--------------------+------------+-------------+------------+-------------+
9  * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
10  * +--------------------+------------+-------------+------------+-------------+
11  * | U+0000..U+007F | 00..7F | | | |
12  * +--------------------+------------+-------------+------------+-------------+
13  * | U+0080..U+07FF | C2..DF | 80..BF | | |
14  * +--------------------+------------+-------------+------------+-------------+
15  * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
16  * +--------------------+------------+-------------+------------+-------------+
17  * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
18  * +--------------------+------------+-------------+------------+-------------+
19  * | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
20  * +--------------------+------------+-------------+------------+-------------+
21  * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
22  * +--------------------+------------+-------------+------------+-------------+
23  * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
24  * +--------------------+------------+-------------+------------+-------------+
25  * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
26  * +--------------------+------------+-------------+------------+-------------+
27  * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
28  * +--------------------+------------+-------------+------------+-------------+
29  */
30 
/* Is b a UTF-8 continuation byte (80..BF, i.e. bit pattern 10xxxxxx)? */
static int utf8_naive_is_cont(unsigned char b)
{
	/*
	 * Mask test instead of a (signed char) comparison: converting a value
	 * above SCHAR_MAX to signed char is implementation-defined.
	 */
	return (b & 0xC0) == 0x80;
}

/*
 * Check that the first len bytes of data form well-formed UTF-8 according
 * to Table 3-7 of the Unicode standard (overlong encodings, surrogates
 * U+D800..U+DFFF and code points above U+10FFFF are rejected).
 *
 * data - bytes to validate; not dereferenced when len <= 0
 * len  - number of bytes in data; a value <= 0 is treated as empty input
 *
 * Return 0 - success, >0 - index(1 based) of first error char, i.e. the
 * position of the lead byte of the first ill-formed or truncated sequence.
 */
int utf8_naive(const unsigned char *data, int len)
{
	int err_pos = 1;

	/*
	 * "len > 0" rather than "len": a negative length must not enter the
	 * loop, otherwise it would read past the buffer and never terminate.
	 */
	while (len > 0) {
		const unsigned char byte1 = data[0];
		/* Length of the sequence starting at byte1; 0 = ill-formed */
		int bytes = 0;

		if (byte1 <= 0x7F) {
			/* 00..7F */
			bytes = 1;
		} else if (byte1 >= 0xC2 && byte1 <= 0xDF) {
			/* C2..DF, 80..BF (C0/C1 would be overlong) */
			if (len >= 2 && utf8_naive_is_cont(data[1])) {
				bytes = 2;
			}
		} else if (byte1 >= 0xE0 && byte1 <= 0xEF) {
			if (len >= 3 && utf8_naive_is_cont(data[1]) &&
			    utf8_naive_is_cont(data[2])) {
				const unsigned char byte2 = data[1];
				/* E0 requires A0..BF as second byte */
				const int overlong = (byte1 == 0xE0 && byte2 < 0xA0);
				/* ED requires 80..9F: excludes U+D800..U+DFFF */
				const int surrogate = (byte1 == 0xED && byte2 > 0x9F);

				if (!overlong && !surrogate) {
					bytes = 3;
				}
			}
		} else if (byte1 >= 0xF0 && byte1 <= 0xF4) {
			if (len >= 4 && utf8_naive_is_cont(data[1]) &&
			    utf8_naive_is_cont(data[2]) &&
			    utf8_naive_is_cont(data[3])) {
				const unsigned char byte2 = data[1];
				/* F0 requires 90..BF as second byte */
				const int overlong = (byte1 == 0xF0 && byte2 < 0x90);
				/* F4 requires 80..8F: nothing above U+10FFFF */
				const int too_large = (byte1 == 0xF4 && byte2 > 0x8F);

				if (!overlong && !too_large) {
					bytes = 4;
				}
			}
		}
		/* Remaining lead bytes (80..C1, F5..FF) are never valid */

		if (bytes == 0) {
			return err_pos;
		}

		len -= bytes;
		err_pos += bytes;
		data += bytes;
	}

	return 0;
}
utf8_naive
int utf8_naive(const unsigned char *data, int len)
Definition: naive.c:32
data
char data[kBufferLength]
Definition: abseil-cpp/absl/strings/internal/str_format/float_conversion.cc:1006
bytes
uint8 bytes[10]
Definition: bloaty/third_party/protobuf/src/google/protobuf/io/coded_stream_unittest.cc:153
len
int len
Definition: abseil-cpp/absl/base/internal/low_level_alloc_test.cc:46


grpc
Author(s):
autogenerated on Fri May 16 2025 02:59:31