bloaty/third_party/re2/re2/unicode.py
Go to the documentation of this file.
1 # Copyright 2008 The RE2 Authors. All Rights Reserved.
2 # Use of this source code is governed by a BSD-style
3 # license that can be found in the LICENSE file.
4 
5 """Parser for Unicode data files (as distributed by unicode.org)."""
6 
7 from __future__ import absolute_import
8 from __future__ import division
9 from __future__ import print_function
10 
11 import os
12 import re
13 from six.moves import urllib
14 
# Default location of the Unicode data tables: a directory path or an
# https:// URL. It is the default unicode_dir argument of CaseGroups,
# Scripts and Categories.
_UNICODE_DIR = "https://www.unicode.org/Public/12.1.0/ucd"

# Largest valid Unicode code value; _UInt and _UStr reject anything above it.
_RUNE_MAX = 0x10FFFF
20 
21 
class Error(Exception):
    """Base class for Unicode errors."""
24 
25 
# NOTE(review): the class header was missing from this listing; the base
# class Error is inferred from the "error base class" docstring -- confirm.
class InputError(Error):
    """Unicode input error class. Raised on invalid input."""
28 
29 
def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    The string must consist of 4 to 6 ASCII hexadecimal digits and nothing
    else. The digits are checked explicitly because int(s, 16) alone is more
    permissive than the Unicode data file format: it also accepts a '0x'
    prefix, a sign, surrounding white space, underscores and non-ASCII
    digits.

    Args:
      s: string to convert

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    hex_digits = "0123456789abcdefABCDEF"
    well_formed = 4 <= len(s) <= 6 and all(c in hex_digits for c in s)
    if not well_formed:
        raise InputError("invalid Unicode value %s" % (s,))

    v = int(s, 16)
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v
50 
51 
def _URange(s):
    """Converts string to Unicode range.

      '0001..0003' => [1, 2, 3].
      '0001' => [1].

    The result is always a real list, for single codes and for ranges alike,
    so callers get the same type on Python 2 and Python 3.

    Args:
      s: string to convert

    Returns:
      Unicode range, as a list of code points

    Raises:
      InputError: the string is not a valid Unicode range.
    """
    a = s.split("..")
    if len(a) == 1:
        return [_UInt(a[0])]
    if len(a) == 2:
        lo = _UInt(a[0])
        hi = _UInt(a[1])
        # A range must be strictly increasing; 'X..X' is rejected below.
        if lo < hi:
            return list(range(lo, hi + 1))
    raise InputError("invalid Unicode range %s" % (s,))
76 
77 
def _UStr(v):
    """Formats a Unicode code point as a hex string.

      0x263a => '0x263A'.

    Args:
      v: code point to format

    Returns:
      '0x' followed by the value in upper-case hex, zero-padded to at
      least four digits

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    out_of_range = v < 0 or v > _RUNE_MAX
    if out_of_range:
        message = "invalid Unicode value %s" % (v,)
        raise InputError(message)
    hex_text = "0x%04X" % (v,)
    return hex_text
95 
96 
98  """Parses a Unicode continuation field.
99 
100  These are of the form '<Name, First>' or '<Name, Last>'.
101  Instead of giving an explicit range in a single table entry,
102  some Unicode tables use two entries, one for the first
103  code value in the range and one for the last.
104  The first entry's description is '<Name, First>' instead of 'Name'
105  and the second is '<Name, Last>'.
106 
107  '<Name, First>' => ('Name', 'First')
108  '<Name, Last>' => ('Name', 'Last')
109  'Anything else' => ('Anything else', None)
110 
111  Args:
112  s: continuation field string
113 
114  Returns:
115  pair: name and ('First', 'Last', or None)
116  """
117 
118  match = re.match("<(.*), (First|Last)>", s)
119  if match is not None:
120  return match.groups()
121  return (s, None)
122 
123 
def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields. The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    If parsing a line or doline raises an exception, the reader prints
    that exception, prefixed with the file name and line number, and
    re-raises it immediately; the rest of the file is not processed.

    A file or URL opened by the reader is closed before returning.
    A file-like object supplied by the caller is left open.

    Arguments:
      filename: the Unicode data file to read (a path or an https:// URL),
        or a file-like object yielding lines as bytes or text.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or a First line is not followed by its matching Last line.
    """
    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Only a stream opened here is ours to close.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith("https://"):
            fil = urllib.request.urlopen(filename)
        else:
            fil = open(filename, "rb")
    else:
        fil = filename

    first = None  # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0  # current line number
    try:
        for line in fil:
            lineno += 1
            try:
                # Binary streams yield bytes; text-mode file-like objects
                # already yield decoded text and must not be decoded again.
                if isinstance(line, bytes):
                    line = line.decode("latin1")

                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range. Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = list(range(first, codes[0] + 1))
                    first = None
                    expect_last = None
                    # Present the two-line range to doline as if it had
                    # been written in .. notation on a single line.
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                print("%s:%d: %s" % (filename, lineno, e))
                raise

        if expect_last is not None:
            raise InputError("expected Last line for %s; got EOF" %
                             (expect_last,))
    finally:
        if opened_here:
            fil.close()
223 
224 
def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Builds the groups of code points equivalent under case folding.

    Reads CaseFolding.txt from unicode_dir and keeps only the entries
    whose fold type is "C" or "S".

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups):
        togroup: dict mapping each fold target code point to its group,
          a sorted list holding the target and every code folding to it.
        groups: list of those same group lists, sorted by their contents
          (so by first code point in the group).
    """
    togroup = {}

    def DoLine(codes, fields):
        """Adds the codes of one CaseFolding.txt entry to their group."""
        _code, foldtype, target, _trailing = fields
        if foldtype != "C" and foldtype != "S":
            return
        target = _UInt(target)
        if target not in togroup:
            # A new group starts out holding the fold target itself.
            togroup[target] = [target]
        togroup[target].extend(codes)

    ReadUnicodeTable(unicode_dir + "/CaseFolding.txt", 4, DoLine)

    # Sort in place: the lists in the returned groups are the very same
    # objects stored as values of togroup.
    for members in togroup.values():
        members.sort()
    return togroup, sorted(togroup.values())
257 
258 
def Scripts(unicode_dir=_UNICODE_DIR):
    """Collects the code points listed for each script in Scripts.txt.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """
    by_script = {}

    def DoLine(codes, fields):
        """Appends the codes of one Scripts.txt entry to its script."""
        _code_field, script = fields
        if script not in by_script:
            by_script[script] = []
        by_script[script].extend(codes)

    ReadUnicodeTable(unicode_dir + "/Scripts.txt", 2, DoLine)
    return by_script
278 
279 
def Categories(unicode_dir=_UNICODE_DIR):
    """Collects the code points listed for each category in UnicodeData.txt.

    Each entry is filed under its full category name (third field) and,
    when that name is longer than one character, also under its first
    character (codes from Lu go into L as well, etc.).

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """
    by_category = {}

    def DoLine(codes, fields):
        """Files the codes of one UnicodeData.txt entry by category."""
        category = fields[2]
        names = [category]
        if len(category) > 1:
            # Also file the codes under the one-letter major category.
            names.append(category[0])
        for name in names:
            by_category.setdefault(name, []).extend(codes)

    ReadUnicodeTable(unicode_dir + "/UnicodeData.txt", 15, DoLine)
    return by_category
303 
capstone.range
range
Definition: third_party/bloaty/third_party/capstone/bindings/python/capstone/__init__.py:6
unicode._UInt
def _UInt(s)
Definition: bloaty/third_party/re2/re2/unicode.py:30
unicode.ReadUnicodeTable
def ReadUnicodeTable(filename, nfields, doline)
Definition: bloaty/third_party/re2/re2/unicode.py:124
unicode.Categories
def Categories(unicode_dir=_UNICODE_DIR)
Definition: bloaty/third_party/re2/re2/unicode.py:280
unicode.Error
Definition: bloaty/third_party/re2/re2/unicode.py:22
xds_interop_client.int
int
Definition: xds_interop_client.py:113
unicode._UStr
def _UStr(v)
Definition: bloaty/third_party/re2/re2/unicode.py:78
unicode.Scripts
def Scripts(unicode_dir=_UNICODE_DIR)
Definition: bloaty/third_party/re2/re2/unicode.py:259
unicode.InputError
Definition: bloaty/third_party/re2/re2/unicode.py:26
unicode.CaseGroups
def CaseGroups(unicode_dir=_UNICODE_DIR)
Definition: bloaty/third_party/re2/re2/unicode.py:225
unicode._ParseContinue
def _ParseContinue(s)
Definition: bloaty/third_party/re2/re2/unicode.py:97
open
#define open
Definition: test-fs.c:46
asyncio_get_stats.type
type
Definition: asyncio_get_stats.py:37
len
int len
Definition: abseil-cpp/absl/base/internal/low_level_alloc_test.cc:46
unicode._URange
def _URange(s)
Definition: bloaty/third_party/re2/re2/unicode.py:52


grpc
Author(s):
autogenerated on Fri May 16 2025 03:00:43