ascii.cc
Go to the documentation of this file.
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/ascii.h"
16 
17 namespace absl {
18 namespace ascii_internal {
19 
20 // # Table generated by this Python code (bit 0x02 is currently unused):
21 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
22 
23 // NOTE: The kPropertyBits table defined below was generated by Python 2
24 // code of the following form. (Bit 0x02 is currently unused and
25 // available.)
26 //
27 // def Hex2(n):
28 //   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
29 // def IsPunct(ch):
30 //   return (ord(ch) >= 32 and ord(ch) < 127 and
31 //           not ch.isspace() and not ch.isalnum())
32 // def IsBlank(ch):
33 //   return ch in ' \t'
34 // def IsCntrl(ch):
35 //   return ord(ch) < 32 or ord(ch) == 127
36 // def IsXDigit(ch):
37 //   return ch.isdigit() or ch.lower() in 'abcdef'
38 // for i in range(128):
39 //   ch = chr(i)
40 //   mask = ((ch.isalpha() and 0x01 or 0) |
41 //           (ch.isalnum() and 0x04 or 0) |
42 //           (ch.isspace() and 0x08 or 0) |
43 //           (IsPunct(ch) and 0x10 or 0) |
44 //           (IsBlank(ch) and 0x20 or 0) |
45 //           (IsCntrl(ch) and 0x40 or 0) |
46 //           (IsXDigit(ch) and 0x80 or 0))
47 //   print Hex2(mask) + ',',
48 //   if i % 16 == 7:
49 //     print ' //', Hex2(i & 0x78)
50 //   elif i % 16 == 15:
51 //     print
52 
53 // clang-format off
54 // Array of bitfields holding character information. Each bit value corresponds
55 // to a particular character feature. For readability, and because the value
56 // of these bits is tightly coupled to this implementation, the individual bits
57 // are not named. Note that bitfields for all characters above ASCII 127 are
58 // zero-initialized.
// Per-character property bitmask table, one entry per byte value.
//
// Bit assignments, as produced by the generator script quoted in the comment
// above:
//   0x01 alpha    0x04 alnum    0x08 space    0x10 punct
//   0x20 blank    0x40 cntrl    0x80 xdigit   (0x02 is unused)
//
// Only the first 128 entries are spelled out. The array is declared with 256
// elements, so the trailing 128 have no initializer and are zero. The trailing
// comment on each row gives the index of that row's first element.
59 const unsigned char kPropertyBits[256] = {
60  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00
61  0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40, // 0x08
62  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10
63  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x18
64  0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x20
65  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x28
66  0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, // 0x30
67  0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x38
68  0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x40
69  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x48
70  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x50
71  0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x58
72  0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x60
73  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x68
74  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x70
75  0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40, // 0x78
76 };
77 
78 // Array of characters for the ascii_tolower() function. For values 'A'
79 // through 'Z', return the lower-case character; otherwise, return the
80 // identity of the passed character.
// 256-entry lower-casing table, one entry per byte value.
//
// The entries at indices 0x41-0x5a hold the literals 'a'..'z'; every other
// entry is the hex escape of its own index, i.e. the identity mapping. The
// trailing comment on each row gives the index of that row's first element.
// NOTE(review): presumably indexed with an unsigned char by ascii_tolower()
// in ascii.h; the lookup itself is not visible in this file.
81 const char kToLower[256] = {
82  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', // 0x00
83  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f', // 0x08
84  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', // 0x10
85  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', // 0x18
86  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', // 0x20
87  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f', // 0x28
88  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', // 0x30
89  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f', // 0x38
90  '\x40', 'a', 'b', 'c', 'd', 'e', 'f', 'g', // 0x40
91  'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', // 0x48
92  'p', 'q', 'r', 's', 't', 'u', 'v', 'w', // 0x50
93  'x', 'y', 'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f', // 0x58
94  '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67', // 0x60
95  '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f', // 0x68
96  '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77', // 0x70
97  '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f', // 0x78
98  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', // 0x80
99  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', // 0x88
100  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', // 0x90
101  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', // 0x98
102  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', // 0xa0
103  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', // 0xa8
104  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', // 0xb0
105  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', // 0xb8
106  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', // 0xc0
107  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', // 0xc8
108  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', // 0xd0
109  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', // 0xd8
110  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', // 0xe0
111  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', // 0xe8
112  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', // 0xf0
113  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', // 0xf8
114 };
115 
116 // Array of characters for the ascii_toupper() function. For values 'a'
117 // through 'z', return the upper-case character; otherwise, return the
118 // identity of the passed character.
// 256-entry upper-casing table, one entry per byte value.
//
// The entries at indices 0x61-0x7a hold the literals 'A'..'Z'; every other
// entry is the hex escape of its own index, i.e. the identity mapping. The
// trailing comment on each row gives the index of that row's first element.
// NOTE(review): presumably indexed with an unsigned char by ascii_toupper()
// in ascii.h; the lookup itself is not visible in this file.
119 const char kToUpper[256] = {
120  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', // 0x00
121  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f', // 0x08
122  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', // 0x10
123  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', // 0x18
124  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', // 0x20
125  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f', // 0x28
126  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', // 0x30
127  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f', // 0x38
128  '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47', // 0x40
129  '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f', // 0x48
130  '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57', // 0x50
131  '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f', // 0x58
132  '\x60', 'A', 'B', 'C', 'D', 'E', 'F', 'G', // 0x60
133  'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', // 0x68
134  'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', // 0x70
135  'X', 'Y', 'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f', // 0x78
136  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', // 0x80
137  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', // 0x88
138  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', // 0x90
139  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', // 0x98
140  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', // 0xa0
141  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', // 0xa8
142  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', // 0xb0
143  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', // 0xb8
144  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', // 0xc0
145  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', // 0xc8
146  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', // 0xd0
147  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', // 0xd8
148  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', // 0xe0
149  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', // 0xe8
150  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', // 0xf0
151  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', // 0xf8
152 };
153 // clang-format on
154 
155 } // namespace ascii_internal
156 
157 void AsciiStrToLower(std::string* s) {
158  for (auto& ch : *s) {
159  ch = absl::ascii_tolower(ch);
160  }
161 }
162 
163 void AsciiStrToUpper(std::string* s) {
164  for (auto& ch : *s) {
165  ch = absl::ascii_toupper(ch);
166  }
167 }
168 
169 void RemoveExtraAsciiWhitespace(std::string* str) {
170  auto stripped = StripAsciiWhitespace(*str);
171 
172  if (stripped.empty()) {
173  str->clear();
174  return;
175  }
176 
177  auto input_it = stripped.begin();
178  auto input_end = stripped.end();
179  auto output_it = &(*str)[0];
180  bool is_ws = false;
181 
182  for (; input_it < input_end; ++input_it) {
183  if (is_ws) {
184  // Consecutive whitespace? Keep only the last.
185  is_ws = absl::ascii_isspace(*input_it);
186  if (is_ws) --output_it;
187  } else {
188  is_ws = absl::ascii_isspace(*input_it);
189  }
190 
191  *output_it = *input_it;
192  ++output_it;
193  }
194 
195  str->erase(output_it - &(*str)[0]);
196 }
197 
198 } // namespace absl
void AsciiStrToLower(std::string *s)
Definition: ascii.cc:157
const char kToUpper[256]
Definition: ascii.cc:119
bool ascii_isspace(unsigned char c)
Definition: ascii.h:93
void AsciiStrToUpper(std::string *s)
Definition: ascii.cc:163
void RemoveExtraAsciiWhitespace(std::string *str)
Definition: ascii.cc:169
Definition: algorithm.h:29
const char kToLower[256]
Definition: ascii.cc:81
char ascii_toupper(unsigned char c)
Definition: ascii.h:179
ABSL_MUST_USE_RESULT absl::string_view StripAsciiWhitespace(absl::string_view str)
Definition: ascii.h:223
char ascii_tolower(unsigned char c)
Definition: ascii.h:161
const unsigned char kPropertyBits[256]
Definition: ascii.cc:59


abseil_cpp
Author(s):
autogenerated on Mon Feb 28 2022 21:31:17