17 from collections
import namedtuple
18 from datetime
import datetime, timedelta
20 from dateutil.relativedelta
import relativedelta
24 _LONG_ORDINAL_STRING_EN, _LONG_SCALE_EN, \
25 _SHORT_SCALE_EN, _SHORT_ORDINAL_STRING_EN
def _invert_dict(original):
    """
    Produce a dictionary with the keys and values
    inverted, relative to the dict passed in.

    Args:
        original dict: The dict like object to invert

    Returns:
        dict: a new dict mapping each value of `original` to its key.
            If several keys share a value, the key visited last by
            `original.items()` wins.
    """
    return {value: key for key, value in original.items()}
def _generate_plurals(originals):
    """
    Return a new set or dict containing the original values,
    all with 's' appended to them.

    For a dict, the 's' is appended to each key and the mapped value is
    carried over unchanged. The input is never modified.

    Args:
        originals set(str) or dict(str, any): values to pluralize

    Returns:
        set(str) or dict(str, any)
    """
    if isinstance(originals, dict):
        return {key + 's': value for key, value in originals.items()}
    return {value + "s" for value in originals}
# Words treated as negation markers: the whole-number parser checks whether
# the word preceding a parsed value is one of these.
_NEGATIVES = {"negative", "minus"}

# Tens words together with their digit spellings. The whole-number parser
# tests the previous word against this set before handling a following
# value that is below 10.
_SUMS = {
    'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50',
    'sixty', '60', 'seventy', '70', 'eighty', '80', 'ninety', '90',
}
69 _MULTIPLIES_LONG_SCALE_EN = set(_LONG_SCALE_EN.values()) | \
72 _MULTIPLIES_SHORT_SCALE_EN = set(_SHORT_SCALE_EN.values()) | \
# Words that join a whole number to a fraction, as in "2 and 3/4"; the
# fraction extractor iterates over this set.
_FRACTION_MARKER = {"and"}

# Words that join a whole number to its decimal part, as in "2 point 5";
# the decimal extractor iterates over this set.
_DECIMAL_MARKER = {"point", "dot"}
84 _STRING_NUM_EN.update({
# A single whitespace-separated word of the input together with its
# position in the token sequence.
#   word:  the token text
#   index: the position assigned when the text is split into tokens
_Token = namedtuple('_Token', 'word index')
103 Similar to _Token, this class is used in number parsing. 105 Once we've found a number in a string, this class contains all 106 the info about the value, and where it came from in the original text. 107 In other words, it is the text, and the number that can replace it in 116 return bool(self.
value is not None and self.
value is not False)
120 return self.
tokens[0].index
124 return self.
tokens[-1].index
128 return ' '.join([t.word
for t
in self.
tokens])
133 except AttributeError:
136 raise Exception(
"Immutable!")
139 return "({v}, {t})".format(v=self.
value, t=self.
tokens)
142 return "{n}({v}, {t})".format(n=self.__class__.__name__, v=self.
value,
148 Generate a list of token object, given a string. 150 text str: Text to tokenize. 156 return [
_Token(word, index)
for index, word
in enumerate(text.split())]
def _partition_list(items, split_on):
    """
    Partition a list of items.

    Works similarly to str.partition

    Every item for which `split_on` returns True closes the partition
    collected so far and is emitted as its own single-item partition.
    Empty partitions are dropped from the result.

    Args:
        items: the items to partition, consumed in order
        split_on callable:
            Should return a boolean. Each item will be passed to
            this callable in succession, and partitions will be
            created any time it returns True.

    Returns:
        [[any]]: the non-empty partitions, in their original order
    """
    splits = []
    current_split = []
    for item in items:
        if split_on(item):
            # Close the running partition, emit the separator on its own,
            # then start collecting a fresh partition.
            splits.append(current_split)
            splits.append([item])
            current_split = []
        else:
            current_split.append(item)
    splits.append(current_split)
    # Separators at the edges or next to each other leave empty partitions.
    return [split for split in splits if split]
191 Convert words in a string into their equivalent numbers. 194 short_scale boolean: True if short scale numbers should be used. 195 ordinals boolean: True if ordinals (e.g. first, second, third) should 196 be parsed to their number values (1, 2, 3...) 200 The original text, with numbers subbed in where appropriate. 205 numbers_to_replace = \
207 numbers_to_replace.sort(key=
lambda number: number.start_index)
211 if not numbers_to_replace
or \
212 token.index < numbers_to_replace[0].start_index:
213 results.append(token.word)
215 if numbers_to_replace
and \
216 token.index == numbers_to_replace[0].start_index:
217 results.append(str(numbers_to_replace[0].value))
218 if numbers_to_replace
and \
219 token.index == numbers_to_replace[0].end_index:
220 numbers_to_replace.pop(0)
222 return ' '.join(results)
226 ordinals=
False, fractional_numbers=
True):
228 Extract all numbers from a list of _Tokens, with the words that 232 [_Token]: The tokens to parse. 233 short_scale bool: True if short scale numbers should be used, False for 234 long scale. True by default. 235 ordinals bool: True if ordinal words (first, second, third, etc) should 237 fractional_numbers bool: True if we should look for fractions and 241 [_ReplaceableNumber]: A list of tuples, each containing a number and a 245 placeholder =
"<placeholder>" 250 ordinals, fractional_numbers)
255 results.append(to_replace)
259 to_replace.start_index <= t.index <= to_replace.end_index
261 _Token(placeholder, t.index)
for t
in tokens
263 results.sort(key=
lambda n: n.start_index)
268 ordinals=
False, fractional_numbers=
True):
270 This function extracts a number from a list of _Tokens. 273 tokens str: the string to normalize 274 short_scale (bool): use short scale if True, long scale if False 275 ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 276 fractional_numbers (bool): True if we should look for fractions and 284 ordinals, fractional_numbers)
285 while tokens
and tokens[0].word
in _ARTICLES:
291 short_scale=
True, ordinals=
False,
292 fractional_numbers=
True):
294 Helper for _extract_number_with_text_en. 296 This contains the real logic for parsing, but produces 297 a result that needs a little cleaning (specific, it may 298 contain leading articles that can be trimmed off). 304 fractional_numbers boolean: 307 int or float, [_Tokens] 310 if fractional_numbers:
311 fraction, fraction_text = \
314 return fraction, fraction_text
316 decimal, decimal_text = \
319 return decimal, decimal_text
326 Extract fraction numbers from a string. 328 This function handles text such as '2 and 3/4'. Note that "one half" or 329 similar will be parsed by the whole number function. 332 tokens [_Token]: words and their indexes in the original string. 337 (int or float, [_Token]) 338 The value found, and the list of relevant tokens. 339 (None, None) if no fraction value is found. 342 for c
in _FRACTION_MARKER:
345 if len(partitions) == 3:
348 ordinals, fractional_numbers=
False)
351 ordinals, fractional_numbers=
True)
353 if not numbers1
or not numbers2:
359 if num1.value >= 1
and 0 < num2.value < 1:
360 return num1.value + num2.value, \
361 num1.tokens + partitions[1] + num2.tokens
368 Extract decimal numbers from a string. 370 This function handles text such as '2 point 5'. 373 While this is a helper for extractnumber_en, it also depends on 374 extractnumber_en, to parse out the components of the decimal. 376 This does not currently handle things like: 377 number dot number number number 380 tokens [_Token]: The text to parse. 386 The value found and relevant tokens. 387 (None, None) if no decimal value is found. 390 for c
in _DECIMAL_MARKER:
393 if len(partitions) == 3:
396 ordinals, fractional_numbers=
False)
399 ordinals, fractional_numbers=
False)
401 if not numbers1
or not numbers2:
404 number = numbers1[-1]
405 decimal = numbers2[0]
408 if "." not in str(decimal.text):
409 return number.value + float(
'0.' + str(decimal.value)), \
410 number.tokens + partitions[1] + decimal.tokens
416 Handle numbers not handled by the decimal or fraction functions. This is 417 generally whole numbers. Note that phrases such as "one half" will be 418 handled by this function, while "one and a half" are handled by the 427 int or float, [_Tokens] 428 The value parsed, and tokens that it corresponds to. 431 multiplies, string_num_ordinal, string_num_scale = \
439 for idx, token
in enumerate(tokens):
446 if word
in _ARTICLES
or word
in _NEGATIVES:
447 number_words.append(token)
450 prev_word = tokens[idx - 1].word
if idx > 0
else "" 451 next_word = tokens[idx + 1].word
if idx + 1 < len(tokens)
else "" 453 if word
not in string_num_scale
and \
454 word
not in _STRING_NUM_EN
and \
455 word
not in _SUMS
and \
456 word
not in multiplies
and \
457 not (ordinals
and word
in string_num_ordinal)
and \
461 words_only = [token.word
for token
in number_words]
462 if number_words
and not all([w
in _ARTICLES |
463 _NEGATIVES
for w
in words_only]):
468 elif word
not in multiplies \
469 and prev_word
not in multiplies \
470 and prev_word
not in _SUMS \
471 and not (ordinals
and prev_word
in string_num_ordinal) \
472 and prev_word
not in _NEGATIVES \
473 and prev_word
not in _ARTICLES:
474 number_words = [token]
475 elif prev_word
in _SUMS
and word
in _SUMS:
476 number_words = [token]
478 number_words.append(token)
489 if word
in _STRING_NUM_EN:
490 val = _STRING_NUM_EN.get(word)
492 elif word
in string_num_scale:
493 val = string_num_scale.get(word)
495 elif ordinals
and word
in string_num_ordinal:
496 val = string_num_ordinal[word]
501 if ordinals
and prev_word
in string_num_ordinal
and val
is 1:
506 if prev_word
in _SUMS
and val
and val < 10:
511 if word
in multiplies:
529 number_words.append(tokens[idx + 1])
532 if val
and prev_word
and prev_word
in _NEGATIVES:
538 aPieces = word.split(
'/')
540 val = float(aPieces[0]) / float(aPieces[1])
544 if prev_word
in _SUMS
and word
not in _SUMS
and current_val >= 10:
554 if word
in multiplies
and next_word
not in multiplies:
559 if val
is not None and to_sum:
562 return val, number_words
567 Generate dictionaries of words to numbers, based on scale. 569 This is a helper function for _extract_whole_number. 575 (set(str), dict(str, number), dict(str, number)) 576 multiplies, string_num_ordinal, string_num_scale 579 multiplies = _MULTIPLIES_SHORT_SCALE_EN
if short_scale \
580 else _MULTIPLIES_LONG_SCALE_EN
582 string_num_ordinal_en = _STRING_SHORT_ORDINAL_EN
if short_scale \
583 else _STRING_LONG_ORDINAL_EN
585 string_num_scale_en = _SHORT_SCALE_EN
if short_scale
else _LONG_SCALE_EN
589 return multiplies, string_num_ordinal_en, string_num_scale_en
594 This function extracts a number from a text string, 595 handles pronunciations in long scale and short scale 597 https://en.wikipedia.org/wiki/Names_of_large_numbers 600 text (str): the string to normalize 601 short_scale (bool): use short scale if True, long scale if False 602 ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 604 (int) or (float) or False: The extracted number or False if no number 609 short_scale, ordinals).value
614 Convert an english phrase into a number of seconds 619 "3 days 8 hours 10 minutes and 49 seconds" 620 into an int, representing the total number of seconds. 622 The words used in the duration will be consumed, and 623 the remainder returned. 625 As an example, "set a timer for 5 minutes" would return 626 (300, "set a timer for"). 629 text (str): string containing a duration 633 A tuple containing the duration and the remaining text 634 not consumed in the parsing. The first value will 635 be None if no duration is found. The text returned 636 will have whitespace stripped from the ends. 642 'microseconds':
None,
643 'milliseconds':
None,
651 pattern =
r"(?P<value>\d+(?:\.?\d+)?)\s+{unit}s?" 654 for unit
in time_units:
655 unit_pattern = pattern.format(unit=unit[:-1])
656 matches = re.findall(unit_pattern, text)
657 value = sum(map(float, matches))
658 time_units[unit] = value
659 text = re.sub(unit_pattern,
'', text)
662 duration = timedelta(**time_units)
if any(time_units.values())
else None 664 return (duration, text)
668 """ Convert a human date reference into an exact datetime 673 "next Tuesday at 4pm" 675 into a datetime. If a reference date is not provided, the current 676 local time is used. Also consumes the words used to define the date 677 returning the remaining string. For example, the string 678 "what is Tuesday's weather forecast" 679 returns the date for the forthcoming Tuesday relative to the reference 680 date and the remainder string 681 "what is weather forecast". 684 string (str): string containing date words 685 dateNow (datetime): A reference date/time for "tommorrow", etc 686 default_time (time): Time to set if no time was found in the string 689 [datetime, str]: An array containing the datetime and the remaining 690 text not consumed in the parsing, or None if no 691 date or time related text was found. 696 s = s.lower().replace(
'?',
'').replace(
'.',
'').replace(
',',
'') \
697 .replace(
' the ',
' ').replace(
' a ',
' ').replace(
' an ',
' ') \
698 .replace(
"o' clock",
"o'clock").replace(
"o clock",
"o'clock") \
699 .replace(
"o ' clock",
"o'clock").replace(
"o 'clock",
"o'clock") \
700 .replace(
"oclock",
"o'clock").replace(
"couple",
"2") \
701 .replace(
"centuries",
"century").replace(
"decades",
"decade") \
702 .replace(
"millenniums",
"millennium")
705 for idx, word
in enumerate(wordList):
706 word = word.replace(
"'s",
"")
708 ordinals = [
"rd",
"st",
"nd",
"th"]
709 if word[0].isdigit():
710 for ordinal
in ordinals:
712 if ordinal
in word
and "second" not in word:
713 word = word.replace(ordinal,
"")
722 yearOffset != 0
or monthOffset != 0
or 723 dayOffset
is True or hrOffset != 0
or 724 hrAbs
or minOffset != 0
or 725 minAbs
or secOffset != 0
728 if string ==
"" or not dateNow:
736 today = dateNow.strftime(
"%w")
737 currentYear = dateNow.strftime(
"%Y")
743 timeQualifiersAM = [
'morning']
744 timeQualifiersPM = [
'afternoon',
'evening',
'night',
'tonight']
745 timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM)
746 markers = [
'at',
'in',
'on',
'by',
'this',
'around',
'for',
'of',
"within"]
747 days = [
'monday',
'tuesday',
'wednesday',
748 'thursday',
'friday',
'saturday',
'sunday']
749 months = [
'january',
'february',
'march',
'april',
'may',
'june',
750 'july',
'august',
'september',
'october',
'november',
752 recur_markers = days + [d+
's' for d
in days] + [
'weekend',
'weekday',
753 'weekends',
'weekdays']
754 monthsShort = [
'jan',
'feb',
'mar',
'apr',
'may',
'june',
'july',
'aug',
755 'sept',
'oct',
'nov',
'dec']
756 year_multiples = [
"decade",
"century",
"millennium"]
757 day_multiples = [
"weeks",
"months",
"years"]
759 words = clean_string(string)
761 for idx, word
in enumerate(words):
764 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 765 wordPrev = words[idx - 1]
if idx > 0
else "" 766 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 767 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 770 word = word.rstrip(
's')
775 if word ==
"now" and not datestr:
776 resultStr =
" ".join(words[idx + 1:])
777 resultStr =
' '.join(resultStr.split())
778 extractedDate = dateNow.replace(microsecond=0)
779 return [extractedDate, resultStr]
780 elif wordNext
in year_multiples:
784 multiplier = multiplier
or 1
785 multiplier = int(multiplier)
787 if wordNext ==
"decade":
788 yearOffset = multiplier * 10
789 elif wordNext ==
"century":
790 yearOffset = multiplier * 100
791 elif wordNext ==
"millennium":
792 yearOffset = multiplier * 1000
794 elif word ==
"2" and wordNext ==
"of" and \
795 wordNextNext
in year_multiples:
798 if wordNextNext ==
"decade":
799 yearOffset = multiplier * 10
800 elif wordNextNext ==
"century":
801 yearOffset = multiplier * 100
802 elif wordNextNext ==
"millennium":
803 yearOffset = multiplier * 1000
804 elif word ==
"2" and wordNext ==
"of" and \
805 wordNextNext
in day_multiples:
808 if wordNextNext ==
"years":
809 yearOffset = multiplier
810 elif wordNextNext ==
"months":
811 monthOffset = multiplier
812 elif wordNextNext ==
"weeks":
813 dayOffset = multiplier * 7
814 elif word
in timeQualifiersList:
817 elif word ==
"today" and not fromFlag:
820 elif word ==
"tomorrow" and not fromFlag:
823 elif (word ==
"day" and 824 wordNext ==
"after" and 825 wordNextNext ==
"tomorrow" and 827 not wordPrev[0].isdigit()):
830 if wordPrev ==
"the":
835 if wordPrev[0].isdigit():
836 dayOffset += int(wordPrev)
839 elif word ==
"week" and not fromFlag:
840 if wordPrev[0].isdigit():
841 dayOffset += int(wordPrev) * 7
844 elif wordPrev ==
"next":
848 elif wordPrev ==
"last":
853 elif word ==
"month" and not fromFlag:
854 if wordPrev[0].isdigit():
855 monthOffset = int(wordPrev)
858 elif wordPrev ==
"next":
862 elif wordPrev ==
"last":
867 elif word ==
"year" and not fromFlag:
868 if wordPrev[0].isdigit():
869 yearOffset = int(wordPrev)
872 elif wordPrev ==
"next":
876 elif wordPrev ==
"last":
882 elif word
in days
and not fromFlag:
884 dayOffset = (d + 1) - int(today)
888 if wordPrev ==
"next":
892 elif wordPrev ==
"last":
897 elif word
in months
or word
in monthsShort
and not fromFlag:
899 m = months.index(word)
901 m = monthsShort.index(word)
904 if wordPrev
and (wordPrev[0].isdigit()
or 905 (wordPrev ==
"of" and wordPrevPrev[0].isdigit())):
906 if wordPrev ==
"of" and wordPrevPrev[0].isdigit():
907 datestr +=
" " + words[idx - 2]
911 datestr +=
" " + wordPrev
914 if wordNext
and wordNext[0].isdigit():
915 datestr +=
" " + wordNext
921 elif wordNext
and wordNext[0].isdigit():
922 datestr +=
" " + wordNext
924 if wordNextNext
and wordNextNext[0].isdigit():
925 datestr +=
" " + wordNextNext
932 validFollowups = days + months + monthsShort
933 validFollowups.append(
"today")
934 validFollowups.append(
"tomorrow")
935 validFollowups.append(
"next")
936 validFollowups.append(
"last")
937 validFollowups.append(
"now")
938 if (word ==
"from" or word ==
"after")
and wordNext
in validFollowups:
941 if wordNext ==
"tomorrow":
943 elif wordNext
in days:
944 d = days.index(wordNext)
945 tmpOffset = (d + 1) - int(today)
949 dayOffset += tmpOffset
950 elif wordNextNext
and wordNextNext
in days:
951 d = days.index(wordNextNext)
952 tmpOffset = (d + 1) - int(today)
954 if wordNext ==
"next":
958 elif wordNext ==
"last":
962 dayOffset += tmpOffset
964 if start - 1 > 0
and words[start - 1] ==
"this":
968 for i
in range(0, used):
969 words[i + start] =
"" 971 if start - 1 >= 0
and words[start - 1]
in markers:
972 words[start - 1] =
"" 984 for idx, word
in enumerate(words):
988 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 989 wordPrev = words[idx - 1]
if idx > 0
else "" 990 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 991 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 997 elif word ==
"midnight":
1000 elif word ==
"morning":
1004 elif word ==
"afternoon":
1008 elif word ==
"evening":
1013 elif word ==
"2" and wordNext ==
"of" and \
1014 wordNextNext
in [
"hours",
"minutes",
"seconds"]:
1016 if wordNextNext ==
"hours":
1018 elif wordNextNext ==
"minutes":
1020 elif wordNextNext ==
"seconds":
1023 elif word ==
"hour" and \
1024 (wordPrev
in markers
or wordPrevPrev
in markers):
1025 if wordPrev ==
"half":
1027 elif wordPrev ==
"quarter":
1029 elif wordPrevPrev ==
"quarter":
1031 if idx > 2
and words[idx - 3]
in markers:
1033 if words[idx - 3] ==
"this":
1036 elif wordPrev ==
"within":
1040 if wordPrevPrev
in markers:
1042 if wordPrevPrev ==
"this":
1050 elif word ==
"minute" and wordPrev ==
"in":
1055 elif word ==
"second" and wordPrev ==
"in":
1059 elif word[0].isdigit():
1064 wordNextNextNext = words[idx + 3] \
1065 if idx + 3 < len(words)
else "" 1066 if wordNext ==
"tonight" or wordNextNext ==
"tonight" or \
1067 wordPrev ==
"tonight" or wordPrevPrev ==
"tonight" or \
1068 wordNextNextNext ==
"tonight":
1071 if wordPrev ==
"tonight":
1073 if wordPrevPrev ==
"tonight":
1075 if wordNextNext ==
"tonight":
1077 if wordNextNextNext ==
"tonight":
1085 for i
in range(length):
1087 if word[i].isdigit():
1089 elif word[i] ==
":":
1095 if word[i].isdigit():
1101 remainder = word[i:].replace(
".",
"")
1104 nextWord = wordNext.replace(
".",
"")
1105 if nextWord ==
"am" or nextWord ==
"pm":
1106 remainder = nextWord
1109 elif wordNext ==
"in" and wordNextNext ==
"the" and \
1110 words[idx + 3] ==
"morning":
1113 elif wordNext ==
"in" and wordNextNext ==
"the" and \
1114 words[idx + 3] ==
"afternoon":
1117 elif wordNext ==
"in" and wordNextNext ==
"the" and \
1118 words[idx + 3] ==
"evening":
1121 elif wordNext ==
"in" and wordNextNext ==
"morning":
1124 elif wordNext ==
"in" and wordNextNext ==
"afternoon":
1127 elif wordNext ==
"in" and wordNextNext ==
"evening":
1130 elif wordNext ==
"this" and wordNextNext ==
"morning":
1134 elif wordNext ==
"this" and wordNextNext ==
"afternoon":
1138 elif wordNext ==
"this" and wordNextNext ==
"evening":
1142 elif wordNext ==
"at" and wordNextNext ==
"night":
1143 if strHH
and int(strHH) > 5:
1150 if timeQualifier !=
"":
1152 if strHH
and int(strHH) <= 12
and \
1153 (timeQualifier
in timeQualifiersPM):
1154 strHH += str(int(strHH) + 12)
1162 for i
in range(length):
1163 if word[i].isdigit():
1166 remainder += word[i]
1169 remainder = wordNext.replace(
".",
"").lstrip().rstrip()
1171 remainder ==
"pm" or 1173 remainder ==
"p.m." or 1174 wordNext ==
"p.m."):
1179 remainder ==
"am" or 1181 remainder ==
"a.m." or 1182 wordNext ==
"a.m."):
1187 remainder
in recur_markers
or 1188 wordNext
in recur_markers
or 1189 wordNextNext
in recur_markers):
1197 int(strNum) > 100
and 1203 strHH = str(int(strNum) // 100)
1204 strMM = str(int(strNum) % 100)
1206 if wordNext ==
"hours":
1209 (wordNext ==
"hours" or wordNext ==
"hour" or 1210 remainder ==
"hours" or remainder ==
"hour")
and 1213 int(strNum) < 100
or 1218 hrOffset = int(strNum)
1224 elif wordNext ==
"minutes" or wordNext ==
"minute" or \
1225 remainder ==
"minutes" or remainder ==
"minute":
1227 minOffset = int(strNum)
1232 elif wordNext ==
"seconds" or wordNext ==
"second" \
1233 or remainder ==
"seconds" or remainder ==
"second":
1235 secOffset = int(strNum)
1240 elif int(strNum) > 100:
1242 strHH = str(int(strNum) // 100)
1243 strMM = str(int(strNum) % 100)
1245 if wordNext ==
"hours" or wordNext ==
"hour" or \
1246 remainder ==
"hours" or remainder ==
"hour":
1248 elif wordNext
and wordNext[0].isdigit():
1254 if (wordNextNext ==
"hours" or 1255 wordNextNext ==
"hour" or 1256 remainder ==
"hours" or remainder ==
"hour"):
1259 wordNext ==
"" or wordNext ==
"o'clock" or 1261 wordNext ==
"in" and 1263 wordNextNext ==
"the" or 1264 wordNextNext == timeQualifier
1266 )
or wordNext ==
'tonight' or 1267 wordNextNext ==
'tonight'):
1271 if wordNext ==
"o'clock":
1274 if wordNext ==
"in" or wordNextNext ==
"in":
1275 used += (1
if wordNext ==
"in" else 2)
1276 wordNextNextNext = words[idx + 3] \
1277 if idx + 3 < len(words)
else "" 1279 if (wordNextNext
and 1280 (wordNextNext
in timeQualifier
or 1281 wordNextNextNext
in timeQualifier)):
1282 if (wordNextNext
in timeQualifiersPM
or 1283 wordNextNextNext
in timeQualifiersPM):
1286 if (wordNextNext
in timeQualifiersAM
or 1287 wordNextNextNext
in timeQualifiersAM):
1291 if timeQualifier !=
"":
1292 if timeQualifier
in timeQualifiersPM:
1296 elif timeQualifier
in timeQualifiersAM:
1305 HH = int(strHH)
if strHH
else 0
1306 MM = int(strMM)
if strMM
else 0
1307 HH = HH + 12
if remainder ==
"pm" and HH < 12
else HH
1308 HH = HH - 12
if remainder ==
"am" and HH >= 12
else HH
1310 if (
not military
and 1311 remainder
not in [
'am',
'pm',
'hours',
'minutes',
1312 "second",
"seconds",
1313 "hour",
"minute"]
and 1314 ((
not daySpecified)
or dayOffset < 1)):
1317 if dateNow.hour < HH
or (dateNow.hour == HH
and 1318 dateNow.minute < MM):
1320 elif dateNow.hour < HH + 12:
1326 if timeQualifier
in timeQualifiersPM
and HH < 12:
1329 if HH > 24
or MM > 59:
1339 for i
in range(used):
1340 if idx + i >= len(words):
1344 if wordPrev ==
"o" or wordPrev ==
"oh":
1345 words[words.index(wordPrev)] =
"" 1347 if wordPrev ==
"early":
1351 elif wordPrev ==
"late":
1355 if idx > 0
and wordPrev
in markers:
1357 if wordPrev ==
"this":
1359 if idx > 1
and wordPrevPrev
in markers:
1361 if wordPrevPrev ==
"this":
1370 if dayOffset
is False:
1375 extractedDate = dateNow.replace(microsecond=0)
1380 temp = datetime.strptime(datestr,
"%B %d")
1383 temp = datetime.strptime(datestr,
"%B %d %Y")
1384 extractedDate = extractedDate.replace(hour=0, minute=0, second=0)
1386 temp = temp.replace(year=extractedDate.year,
1387 tzinfo=extractedDate.tzinfo)
1388 if extractedDate < temp:
1389 extractedDate = extractedDate.replace(
1390 year=int(currentYear),
1391 month=int(temp.strftime(
"%m")),
1392 day=int(temp.strftime(
"%d")),
1393 tzinfo=extractedDate.tzinfo)
1395 extractedDate = extractedDate.replace(
1396 year=int(currentYear) + 1,
1397 month=int(temp.strftime(
"%m")),
1398 day=int(temp.strftime(
"%d")),
1399 tzinfo=extractedDate.tzinfo)
1401 extractedDate = extractedDate.replace(
1402 year=int(temp.strftime(
"%Y")),
1403 month=int(temp.strftime(
"%m")),
1404 day=int(temp.strftime(
"%d")),
1405 tzinfo=extractedDate.tzinfo)
1408 if hrOffset == 0
and minOffset == 0
and secOffset == 0:
1409 extractedDate = extractedDate.replace(hour=0, minute=0, second=0)
1412 extractedDate = extractedDate + relativedelta(years=yearOffset)
1413 if monthOffset != 0:
1414 extractedDate = extractedDate + relativedelta(months=monthOffset)
1416 extractedDate = extractedDate + relativedelta(days=dayOffset)
1417 if hrAbs != -1
and minAbs != -1:
1420 if hrAbs
is None and minAbs
is None and default_time
is not None:
1421 hrAbs, minAbs = default_time.hour, default_time.minute
1424 minAbs = minAbs
or 0
1426 extractedDate = extractedDate + relativedelta(hours=hrAbs,
1428 if (hrAbs != 0
or minAbs != 0)
and datestr ==
"":
1429 if not daySpecified
and dateNow > extractedDate:
1430 extractedDate = extractedDate + relativedelta(days=1)
1432 extractedDate = extractedDate + relativedelta(hours=hrOffset)
1434 extractedDate = extractedDate + relativedelta(minutes=minOffset)
1436 extractedDate = extractedDate + relativedelta(seconds=secOffset)
1437 for idx, word
in enumerate(words):
1438 if words[idx] ==
"and" and \
1439 words[idx - 1] ==
"" and words[idx + 1] ==
"":
1442 resultStr =
" ".join(words)
1443 resultStr =
' '.join(resultStr.split())
1444 return [extractedDate, resultStr]
1449 This function takes the given text and checks if it is a fraction. 1452 input_str (str): the string to check if fractional 1453 short_scale (bool): use short scale if True, long scale if False 1455 (bool) or (float): False if not a fraction, otherwise the fraction 1458 if input_str.endswith(
's', -1):
1459 input_str = input_str[:len(input_str) - 1]
1461 fracts = {
"whole": 1,
"half": 2,
"halve": 2,
"quarter": 4}
1463 for num
in _SHORT_ORDINAL_STRING_EN:
1465 fracts[_SHORT_ORDINAL_STRING_EN[num]] = num
1467 for num
in _LONG_ORDINAL_STRING_EN:
1469 fracts[_LONG_ORDINAL_STRING_EN[num]] = num
1471 if input_str.lower()
in fracts:
1472 return 1.0 / fracts[input_str.lower()]
1478 Takes in a string and extracts a list of numbers. 1481 text (str): the string to extract a number from 1482 short_scale (bool): Use "short scale" or "long scale" for large 1483 numbers -- over a million. The default is short scale, which 1484 is now common in most English speaking countries. 1485 See https://en.wikipedia.org/wiki/Names_of_large_numbers 1486 ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 1488 list: list of extracted numbers as floats 1491 short_scale, ordinals)
1492 return [float(result.value)
for result
in results]
1496 """ English string normalization """ 1498 words = text.split()
1501 if remove_articles
and word
in [
"the",
"a",
"an"]:
1505 contraction = [
"ain't",
"aren't",
"can't",
"could've",
"couldn't",
1506 "didn't",
"doesn't",
"don't",
"gonna",
"gotta",
1507 "hadn't",
"hasn't",
"haven't",
"he'd",
"he'll",
"he's",
1508 "how'd",
"how'll",
"how's",
"I'd",
"I'll",
"I'm",
1509 "I've",
"isn't",
"it'd",
"it'll",
"it's",
"mightn't",
1510 "might've",
"mustn't",
"must've",
"needn't",
1512 "shan't",
"she'd",
"she'll",
"she's",
"shouldn't",
1513 "should've",
"somebody's",
"someone'd",
"someone'll",
1514 "someone's",
"that'll",
"that's",
"that'd",
"there'd",
1515 "there're",
"there's",
"they'd",
"they'll",
"they're",
1516 "they've",
"wasn't",
"we'd",
"we'll",
"we're",
"we've",
1517 "weren't",
"what'd",
"what'll",
"what're",
"what's",
1519 "what've",
"when's",
"when'd",
"where'd",
"where's",
1520 "where've",
"who'd",
"who'd've",
"who'll",
"who're",
1521 "who's",
"who've",
"why'd",
"why're",
"why's",
"won't",
1522 "won't've",
"would've",
"wouldn't",
"wouldn't've",
1523 "y'all",
"ya'll",
"you'd",
"you'd've",
"you'll",
1524 "y'aint",
"y'ain't",
"you're",
"you've"]
1525 if word
in contraction:
1526 expansion = [
"is not",
"are not",
"can not",
"could have",
1527 "could not",
"did not",
"does not",
"do not",
1528 "going to",
"got to",
"had not",
"has not",
1529 "have not",
"he would",
"he will",
"he is",
1531 "how will",
"how is",
"I would",
"I will",
"I am",
1532 "I have",
"is not",
"it would",
"it will",
"it is",
1533 "might not",
"might have",
"must not",
"must have",
1534 "need not",
"ought not",
"shall not",
"she would",
1535 "she will",
"she is",
"should not",
"should have",
1536 "somebody is",
"someone would",
"someone will",
1537 "someone is",
"that will",
"that is",
"that would",
1538 "there would",
"there are",
"there is",
"they would",
1539 "they will",
"they are",
"they have",
"was not",
1540 "we would",
"we will",
"we are",
"we have",
1541 "were not",
"what did",
"what will",
"what are",
1543 "what is",
"what have",
"when is",
"when did",
1544 "where did",
"where is",
"where have",
"who would",
1545 "who would have",
"who will",
"who are",
"who is",
1546 "who have",
"why did",
"why are",
"why is",
1547 "will not",
"will not have",
"would have",
1548 "would not",
"would not have",
"you all",
"you all",
1549 "you would",
"you would have",
"you will",
1550 "you are not",
"you are not",
"you are",
"you have"]
1551 word = expansion[contraction.index(word)]
1554 textNumbers = [
"zero",
"one",
"two",
"three",
"four",
"five",
"six",
1555 "seven",
"eight",
"nine",
"ten",
"eleven",
"twelve",
1556 "thirteen",
"fourteen",
"fifteen",
"sixteen",
1557 "seventeen",
"eighteen",
"nineteen",
"twenty"]
1559 if word
in textNumbers:
1560 word = str(textNumbers.index(word))
1562 normalized +=
" " + word
1564 return normalized[1:]
def normalize_en(text, remove_articles)
def _invert_dict(original)
def extract_datetime_en(string, dateNow, default_time)
def _extract_decimal_with_text_en(tokens, short_scale, ordinals)
def _generate_plurals(originals)
def extractnumber_en(text, short_scale=True, ordinals=False)
def isFractional_en(input_str, short_scale=True)
def _extract_whole_number_with_text_en(tokens, short_scale, ordinals)
def extract_numbers_en(text, short_scale=True, ordinals=False)
def extract_duration_en(text)
def _partition_list(items, split_on)
def is_numeric(input_str)
def look_for_fractions(split_list)
def _extract_number_with_text_en(tokens, short_scale=True, ordinals=False, fractional_numbers=True)
def _extract_fraction_with_text_en(tokens, short_scale, ordinals)
def __setattr__(self, key, value)
def _initialize_number_data(short_scale)
def _extract_number_with_text_en_helper(tokens, short_scale=True, ordinals=False, fractional_numbers=True)
def _extract_numbers_with_text(tokens, short_scale=True, ordinals=False, fractional_numbers=True)
def _convert_words_to_numbers(text, short_scale=True, ordinals=False)