5 """Parser for Unicode data files (as distributed by unicode.org)."""
7 from __future__
import absolute_import
8 from __future__
import division
9 from __future__
import print_function
13 from six.moves
import urllib
16 _UNICODE_DIR =
"https://www.unicode.org/Public/12.1.0/ucd"
23 """Unicode error base class."""
27 """Unicode input error class. Raised on invalid input."""
31 """Converts string to Unicode code point ('263A' => 0x263a).
40 InputError: the string is not a valid Unicode value.
47 if len(s) < 4
or len(s) > 6
or v < 0
or v > _RUNE_MAX:
48 raise InputError(
"invalid Unicode value %s" % (s,))
53 """Converts string to Unicode range.
55 '0001..0003' => [1, 2, 3].
65 InputError: the string is not a valid Unicode range.
74 return range(lo, hi + 1)
75 raise InputError(
"invalid Unicode range %s" % (s,))
79 """Converts Unicode code point to hex string.
84 v: code point to convert
90 InputError: the argument is not a valid Unicode value.
92 if v < 0
or v > _RUNE_MAX:
93 raise InputError(
"invalid Unicode value %s" % (v,))
94 return "0x%04X" % (v,)
98 """Parses a Unicode continuation field.
100 These are of the form '<Name, First>' or '<Name, Last>'.
101 Instead of giving an explicit range in a single table entry,
102 some Unicode tables use two entries, one for the first
103 code value in the range and one for the last.
104 The first entry's description is '<Name, First>' instead of 'Name'
105 and the second is '<Name, Last>'.
107 '<Name, First>' => ('Name', 'First')
108 '<Name, Last>' => ('Name', 'Last')
109 'Anything else' => ('Anything else', None)
112 s: continuation field string
115 pair: name and ('First', 'Last', or None)
118 match = re.match(
"<(.*), (First|Last)>", s)
119 if match
is not None:
120 return match.groups()
125 """Generic Unicode table text file reader.
127 The reader takes care of stripping out comments and also
128 parsing the two different ways that the Unicode tables specify
129 code ranges (using the .. notation and splitting the range across
132 Each non-comment line in the table is expected to have the given
133 number of fields. The first field is known to be the Unicode value
134 and the second field its description.
136 The reader calls doline(codes, fields) for each entry in the table.
137 If doline raises an exception, the reader prints that exception,
138 prefixed with the file name and line number, and continues
139 processing the file. When done with the file, the reader re-raises
140 the first exception encountered during the file.
143 filename: the Unicode data file to read, or a file-like object.
144 nfields: the number of expected fields per line in that file.
145 doline: the function to call for each table entry.
148 InputError: nfields is invalid (must be >= 2).
152 raise InputError(
"invalid number of fields %d" % (nfields,))
154 if type(filename) == str:
155 if filename.startswith(
"https://"):
156 fil = urllib.request.urlopen(filename)
158 fil =
open(filename,
"rb")
168 line = line.decode(
'latin1')
171 sharp = line.find(
"#")
180 fields = [s.strip()
for s
in line.split(
";")]
181 if len(fields) != nfields:
182 raise InputError(
"wrong number of fields %d %d - %s" %
183 (
len(fields), nfields, line))
193 if expect_last
is not None:
196 if (
len(codes) != 1
or codes[0] <= first
or
197 cont !=
"Last" or name != expect_last):
198 raise InputError(
"expected Last line for %s" %
200 codes =
range(first, codes[0] + 1)
203 fields[0] =
"%04X..%04X" % (codes[0], codes[-1])
205 elif cont ==
"First":
209 raise InputError(
"bad First line: range given")
214 doline(codes, fields)
216 except Exception
as e:
217 print(
"%s:%d: %s" % (filename, lineno, e))
220 if expect_last
is not None:
221 raise InputError(
"expected Last line for %s; got EOF" %
226 """Returns list of Unicode code groups equivalent under case folding.
228 Each group is a sorted list of code points,
229 and the list of groups is sorted by first code point
233 unicode_dir: Unicode data directory
236 list of Unicode code groups
242 def DoLine(codes, fields):
243 """Process single CaseFolding.txt line, updating togroup."""
244 (_, foldtype, lower, _) = fields
245 if foldtype
not in (
"C",
"S"):
248 togroup.setdefault(lower, [lower]).extend(codes)
252 groups = list(togroup.values())
256 return togroup, groups
260 """Returns dict mapping script names to code lists.
263 unicode_dir: Unicode data directory
266 dict mapping script names to code lists
271 def DoLine(codes, fields):
272 """Process single Scripts.txt line, updating scripts."""
274 scripts.setdefault(name, []).extend(codes)
281 """Returns dict mapping category names to code lists.
284 unicode_dir: Unicode data directory
287 dict mapping category names to code lists
292 def DoLine(codes, fields):
293 """Process single UnicodeData.txt line, updating categories."""
295 categories.setdefault(category, []).extend(codes)
297 if len(category) > 1:
299 categories.setdefault(short, []).extend(codes)