parse_en.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 from collections import namedtuple
18 from datetime import datetime, timedelta
19 
20 from dateutil.relativedelta import relativedelta
21 
22 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
23 from mycroft.util.lang.common_data_en import _ARTICLES, _NUM_STRING_EN, \
24  _LONG_ORDINAL_STRING_EN, _LONG_SCALE_EN, \
25  _SHORT_SCALE_EN, _SHORT_ORDINAL_STRING_EN
26 
27 import re
28 
29 
def _invert_dict(original):
    """
    Build the reverse mapping of a dict like object.

    Each value of the input becomes a key of the result and points at
    the key it was stored under. If several keys share one value, the
    pair visited last by items() is the one that is kept.

    Args:
        original dict: The dict like object to invert

    Returns:
        dict

    """
    inverted = {}
    for key, value in original.items():
        inverted[value] = key
    return inverted
43 
44 
def _generate_plurals(originals):
    """
    Pluralize a collection of words by appending 's' to each of them.

    A dict keeps its values and gets pluralized keys; any other iterable
    of strings is returned as a set of pluralized strings. The input is
    not modified.

    Args:
        originals set(str) or dict(str, any): values to pluralize

    Returns:
        set(str) or dict(str, any)

    """
    if not isinstance(originals, dict):
        return set(word + "s" for word in originals)
    return dict((word + 's', number) for word, number in originals.items())
60 
61 
# negate next number (-2 = 0 - 2)
_NEGATIVES = {"negative", "minus"}

# sum the next number (twenty two = 20 + 2)
_SUMS = {'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50',
         'sixty', '60', 'seventy', '70', 'eighty', '80', 'ninety', '90'}

# scale words and their 's' plurals; a scale word multiplies the number
# that precedes it (six hundred = 6 * 100)
_MULTIPLIES_LONG_SCALE_EN = set(_LONG_SCALE_EN.values()) | \
    _generate_plurals(_LONG_SCALE_EN.values())

_MULTIPLIES_SHORT_SCALE_EN = set(_SHORT_SCALE_EN.values()) | \
    _generate_plurals(_SHORT_SCALE_EN.values())


# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
_FRACTION_MARKER = {"and"}

# decimal marker ( 1 point 5 = 1 + 0.5)
_DECIMAL_MARKER = {"point", "dot"}

# number word -> value lookup, extended with 's' plurals of every word
# and a few extra spoken quantities
_STRING_NUM_EN = _invert_dict(_NUM_STRING_EN)
_STRING_NUM_EN.update(_generate_plurals(_STRING_NUM_EN))
_STRING_NUM_EN.update({
    "half": 0.5,
    "halves": 0.5,
    "couple": 2
})

# ordinal word -> value lookups, one per scale
_STRING_SHORT_ORDINAL_EN = _invert_dict(_SHORT_ORDINAL_STRING_EN)
_STRING_LONG_ORDINAL_EN = _invert_dict(_LONG_ORDINAL_STRING_EN)


# _Token is intended to be used in the number processing functions in
# this module. The parsing requires slicing and dividing of the original
# text. To ensure things parse correctly, we need to know where text came
# from in the original input, hence this namedtuple.
_Token = namedtuple('_Token', 'word index')
99 
100 
class _ReplaceableNumber():
    """
    Similar to _Token, this class is used in number parsing.

    Once we've found a number in a string, this class contains all
    the info about the value, and where it came from in the original text.
    In other words, it is the text, and the number that can replace it in
    the string.

    Instances are write-once: an attribute can be assigned while it does
    not exist yet (which is how __init__ fills in value and tokens), but
    assigning to an attribute that already exists raises an Exception.

    Attributes:
        value: the parsed number; None or False means no number was found
        tokens [_Token]: the tokens of the original text that spell it
    """

    def __init__(self, value, tokens: "[_Token]"):
        self.value = value
        self.tokens = tokens

    def __bool__(self):
        # 0 is a legitimate parsed value, so only None and False count
        # as "nothing found".
        return bool(self.value is not None and self.value is not False)

    @property
    def start_index(self):
        """Index of the first token that belongs to this number."""
        return self.tokens[0].index

    @property
    def end_index(self):
        """Index of the last token that belongs to this number."""
        return self.tokens[-1].index

    @property
    def text(self):
        """The words of all tokens, joined with single spaces."""
        return ' '.join([t.word for t in self.tokens])

    def __setattr__(self, key, value):
        # Probe for the attribute first: only names that cannot be read
        # yet may be assigned, anything else is treated as immutable.
        try:
            getattr(self, key)
        except AttributeError:
            super().__setattr__(key, value)
        else:
            raise Exception("Immutable!")

    def __str__(self):
        return "({v}, {t})".format(v=self.value, t=self.tokens)

    def __repr__(self):
        return "{n}({v}, {t})".format(n=self.__class__.__name__, v=self.value,
                                      t=self.tokens)
144 
145 
def _tokenize(text):
    """
    Split a string on whitespace and wrap every word in a _Token.

    Each token records the word together with its position in the
    resulting word sequence, counting from 0.

    Args:
        text str: Text to tokenize.

    Returns:
        [_Token]

    """
    tokens = []
    for position, word in enumerate(text.split()):
        tokens.append(_Token(word, position))
    return tokens
157 
158 
def _partition_list(items, split_on):
    """
    Cut a sequence of items into consecutive runs.

    Works similarly to str.partition, except that every item matched by
    split_on becomes a one-element partition of its own, and empty
    partitions are never part of the result.

    Args:
        items:
            The items to partition, visited in order.
        split_on callable:
            Should return a boolean. Each item will be passed to
            this callable in succession, and partitions will be
            created any time it returns True.

    Returns:
        [[any]]

    """
    partitions = []
    pending = []
    for item in items:
        if not split_on(item):
            pending.append(item)
            continue
        # A separator closes the run collected so far (if there is one)
        # and is stored as a partition of its own.
        if pending:
            partitions.append(pending)
        partitions.append([item])
        pending = []
    if pending:
        partitions.append(pending)
    return partitions
187 
188 
def _convert_words_to_numbers(text, short_scale=True, ordinals=False):
    """
    Replace the number words of a string with the numbers they spell.

    The text is lower-cased and split on whitespace first, so the result
    is lower case and has its words joined by single spaces.

    Args:
        text str:
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. first, second, third) should
                          be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The original text, with numbers subbed in where appropriate.

    """
    tokens = _tokenize(text.lower())
    pending = _extract_numbers_with_text(tokens, short_scale, ordinals)
    pending.sort(key=lambda number: number.start_index)

    pieces = []
    for token in tokens:
        if not pending:
            # every number has been emitted, copy the rest verbatim
            pieces.append(token.word)
            continue

        upcoming = pending[0]
        if token.index < upcoming.start_index:
            pieces.append(token.word)
            continue

        # The first token of a number is replaced by the value, the
        # remaining tokens of that number are dropped.
        if token.index == upcoming.start_index:
            pieces.append(str(upcoming.value))
        if token.index == upcoming.end_index:
            pending.pop(0)

    return ' '.join(pieces)
223 
224 
def _extract_numbers_with_text(tokens, short_scale=True,
                               ordinals=False, fractional_numbers=True):
    """
    Collect every number found in a list of _Tokens, together with the
    tokens that spell it.

    Numbers are extracted one at a time; after each hit the tokens it
    covers are swapped for placeholders and the search is repeated until
    nothing more is found.

    Args:
        [_Token]: The tokens to parse.
        short_scale bool: True if short scale numbers should be used, False for
                          long scale. True by default.
        ordinals bool: True if ordinal words (first, second, third, etc) should
                       be parsed.
        fractional_numbers bool: True if we should look for fractions and
                                 decimals.

    Returns:
        [_ReplaceableNumber]: The numbers found, ordered by the index of
                              their first token.

    """
    placeholder = "<placeholder>"  # inserted to maintain correct indices
    found = []

    number = _extract_number_with_text_en(tokens, short_scale,
                                          ordinals, fractional_numbers)
    while number:
        found.append(number)

        # Mask the span just consumed so the next pass cannot match it
        # again, while every remaining token keeps its index.
        masked = []
        for token in tokens:
            inside = number.start_index <= token.index <= number.end_index
            masked.append(_Token(placeholder, token.index) if inside
                          else token)
        tokens = masked

        number = _extract_number_with_text_en(tokens, short_scale,
                                              ordinals, fractional_numbers)

    return sorted(found, key=lambda n: n.start_index)
265 
266 
def _extract_number_with_text_en(tokens, short_scale=True,
                                 ordinals=False, fractional_numbers=True):
    """
    Extract a single number from a list of _Tokens.

    The parsing itself is done by _extract_number_with_text_en_helper;
    this wrapper strips leading articles from the tokens it reports and
    packs the outcome into a _ReplaceableNumber.

    Args:
        tokens [_Token]: the tokens to parse
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        fractional_numbers (bool): True if we should look for fractions and
                                   decimals.
    Returns:
        _ReplaceableNumber

    """
    value, words = \
        _extract_number_with_text_en_helper(tokens, short_scale,
                                            ordinals, fractional_numbers)
    if words:
        # count the articles at the front, then cut them off in place
        leading = 0
        while leading < len(words) and words[leading].word in _ARTICLES:
            leading += 1
        del words[:leading]
    return _ReplaceableNumber(value, words)
288 
289 
def _extract_number_with_text_en_helper(tokens,
                                        short_scale=True, ordinals=False,
                                        fractional_numbers=True):
    """
    Helper for _extract_number_with_text_en.

    This contains the real logic for parsing, but produces
    a result that needs a little cleaning (specifically, it may
    contain leading articles that can be trimmed off).

    When fractional_numbers is set, the fraction parser is tried first,
    then the decimal parser; the whole number parser is the fallback and
    the only parser used otherwise.

    Args:
        tokens [_Token]:
        short_scale boolean:
        ordinals boolean:
        fractional_numbers boolean:

    Returns:
        int or float, [_Tokens]

    """
    if fractional_numbers:
        fraction, fraction_text = \
            _extract_fraction_with_text_en(tokens, short_scale, ordinals)
        if fraction:
            return fraction, fraction_text

        decimal, decimal_text = \
            _extract_decimal_with_text_en(tokens, short_scale, ordinals)
        if decimal:
            return decimal, decimal_text

    return _extract_whole_number_with_text_en(tokens, short_scale, ordinals)
322 
323 
def _extract_fraction_with_text_en(tokens, short_scale, ordinals):
    """
    Extract a mixed number such as '2 and 3/4' from a list of tokens.

    The tokens are split around a fraction marker word. The last number
    in front of the marker must be at least 1 and the first number
    behind it must lie strictly between 0 and 1; the two are then added.
    Note that "one half" or similar will be parsed by the whole number
    function.

    Args:
        tokens [_Token]: words and their indexes in the original string.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (int or float, [_Token])
        The value found, and the list of relevant tokens.
        (None, None) if no fraction value is found.

    """
    for marker in _FRACTION_MARKER:
        pieces = _partition_list(tokens, lambda t: t.word == marker)
        if len(pieces) != 3:
            continue

        before, separator, after = pieces
        whole_candidates = \
            _extract_numbers_with_text(before, short_scale,
                                       ordinals, fractional_numbers=False)
        fraction_candidates = \
            _extract_numbers_with_text(after, short_scale,
                                       ordinals, fractional_numbers=True)

        if not (whole_candidates and fraction_candidates):
            return None, None

        # ensure first is not a fraction and second is a fraction
        whole = whole_candidates[-1]
        part = fraction_candidates[0]
        if whole.value >= 1 and 0 < part.value < 1:
            return whole.value + part.value, \
                whole.tokens + separator + part.tokens

    return None, None
364 
365 
def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
    """
    Extract a spoken decimal such as '2 point 5' from a list of tokens.

    The tokens are split around a decimal marker word. The last number
    in front of the marker is the integer part, and the first number
    behind it supplies the digits after the decimal point.

    Notes:
        While this is a helper for extractnumber_en, it also depends on
        extractnumber_en, to parse out the components of the decimal.

        This does not currently handle things like:
            number dot number number number

    Args:
        tokens [_Token]: The text to parse.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (float, [_Token])
        The value found and relevant tokens.
        (None, None) if no decimal value is found.

    """
    for marker in _DECIMAL_MARKER:
        pieces = _partition_list(tokens, lambda t: t.word == marker)
        if len(pieces) != 3:
            continue

        before, separator, after = pieces
        integer_candidates = \
            _extract_numbers_with_text(before, short_scale,
                                       ordinals, fractional_numbers=False)
        digit_candidates = \
            _extract_numbers_with_text(after, short_scale,
                                       ordinals, fractional_numbers=False)

        if not (integer_candidates and digit_candidates):
            return None, None

        number = integer_candidates[-1]
        decimal = digit_candidates[0]

        # TODO handle number dot number number number
        if "." in str(decimal.text):
            # the part after the marker already contains a decimal point
            continue
        return number.value + float('0.' + str(decimal.value)), \
            number.tokens + separator + decimal.tokens

    return None, None
412 
413 
def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
    """
    Handle numbers not handled by the decimal or fraction functions. This is
    generally whole numbers. Note that phrases such as "one half" will be
    handled by this function, while "one and a half" are handled by the
    fraction function.

    The tokens are scanned left to right. number_words collects the
    tokens of the number currently being built, val holds its running
    value, and to_sum holds groups that were closed by a scale word
    ("six hundred" in "six hundred sixty six") and are added at the end.

    Args:
        tokens [_Token]:
        short_scale boolean:
        ordinals boolean:

    Returns:
        int or float, [_Tokens]
        The value parsed, and tokens that it corresponds to.
        The value is False when no number word was seen at all.

    """
    multiplies, string_num_ordinal, string_num_scale = \
        _initialize_number_data(short_scale)

    number_words = []  # type: [_Token]
    val = False
    prev_val = None
    next_val = None
    to_sum = []
    for idx, token in enumerate(tokens):
        current_val = None
        if next_val:
            # this token was already consumed as the fraction that
            # followed the previous word ("2 fifths")
            next_val = None
            continue

        word = token.word
        if word in _ARTICLES or word in _NEGATIVES:
            number_words.append(token)
            continue

        prev_word = tokens[idx - 1].word if idx > 0 else ""
        next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else ""

        if word not in string_num_scale and \
                word not in _STRING_NUM_EN and \
                word not in _SUMS and \
                word not in multiplies and \
                not (ordinals and word in string_num_ordinal) and \
                not is_numeric(word) and \
                not isFractional_en(word, short_scale=short_scale) and \
                not look_for_fractions(word.split('/')):
            # Not a number word: stop if a number has been collected,
            # otherwise forget any articles/negatives and keep looking.
            words_only = [token.word for token in number_words]
            if number_words and not all([w in _ARTICLES |
                                         _NEGATIVES for w in words_only]):
                break
            else:
                number_words = []
                continue
        elif word not in multiplies \
                and prev_word not in multiplies \
                and prev_word not in _SUMS \
                and not (ordinals and prev_word in string_num_ordinal) \
                and prev_word not in _NEGATIVES \
                and prev_word not in _ARTICLES:
            # nothing connects this word to the previous one: new number
            number_words = [token]
        elif prev_word in _SUMS and word in _SUMS:
            number_words = [token]
        else:
            number_words.append(token)

        # is this word already a number ?
        if is_numeric(word):
            if word.isdigit():  # doesn't work with decimals
                val = int(word)
            else:
                val = float(word)
            current_val = val

        # is this word the name of a number ?
        if word in _STRING_NUM_EN:
            val = _STRING_NUM_EN.get(word)
            current_val = val
        elif word in string_num_scale:
            val = string_num_scale.get(word)
            current_val = val
        elif ordinals and word in string_num_ordinal:
            val = string_num_ordinal[word]
            current_val = val

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        # (compared by value: identity checks against an int literal
        # depend on interpreter caching and warn on Python 3.8+)
        if ordinals and prev_word in string_num_ordinal and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in _SUMS and val and val < 10:
            val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = isFractional_en(word, short_scale=short_scale)
            current_val = val

        # 2 fifths
        if not ordinals:
            next_val = isFractional_en(next_word, short_scale=short_scale)
            if next_val:
                if not val:
                    val = 1
                val = val * next_val
                number_words.append(tokens[idx + 1])

        # is this a negative number?
        if val and prev_word and prev_word in _NEGATIVES:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
                current_val = val

        else:
            # NOTE(review): current_val looks like it can still be None
            # here (e.g. a bare "1/2" right after a _SUMS word), which
            # would make this comparison raise TypeError - confirm.
            if prev_word in _SUMS and word not in _SUMS and current_val >= 10:
                # Backtrack - we've got numbers we can't sum.
                number_words.pop()
                val = prev_val
                break
            prev_val = val

            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    if val is not None and to_sum:
        val += sum(to_sum)

    return val, number_words
563 
564 
def _initialize_number_data(short_scale):
    """
    Pick the word tables that match the requested number scale.

    This is a helper function for _extract_whole_number. The scale table
    is inverted so that it maps words to numbers, and is extended with
    the 's' plural of every word.

    Args:
        short_scale boolean:

    Returns:
        (set(str), dict(str, number), dict(str, number))
        multiplies, string_num_ordinal, string_num_scale

    """
    if short_scale:
        multiplies = _MULTIPLIES_SHORT_SCALE_EN
        ordinal_lookup = _STRING_SHORT_ORDINAL_EN
        scale_names = _SHORT_SCALE_EN
    else:
        multiplies = _MULTIPLIES_LONG_SCALE_EN
        ordinal_lookup = _STRING_LONG_ORDINAL_EN
        scale_names = _LONG_SCALE_EN

    scale_lookup = _invert_dict(scale_names)
    scale_lookup.update(_generate_plurals(scale_lookup))

    return multiplies, ordinal_lookup, scale_lookup
590 
591 
def extractnumber_en(text, short_scale=True, ordinals=False):
    """
    This function extracts a number from a text string,
    handles pronunciations in long scale and short scale

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    The text is tokenized on whitespace and handed to
    _extract_number_with_text_en; only the value of the result is
    returned, the matching words are discarded.

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """
    return _extract_number_with_text_en(_tokenize(text),
                                        short_scale, ordinals).value
610 
611 
def extract_duration_en(text):
    """
    Convert an english phrase into a duration

    Convert things like:
        "10 minute"
        "2 and a half hours"
        "3 days 8 hours 10 minutes and 49 seconds"
    into a timedelta, representing the total duration.

    The words used in the duration will be consumed, and
    the remainder returned.

    As an example, "set a timer for 5 minutes" would return
    (timedelta(minutes=5), "set a timer for").

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    Note that for empty input (None or "") a bare None
                    is returned instead of a tuple.
    """
    if not text:
        return None

    # plural unit names double as the keyword arguments of timedelta
    time_units = {
        'microseconds': None,
        'milliseconds': None,
        'seconds': None,
        'minutes': None,
        'hours': None,
        'days': None,
        'weeks': None
    }

    pattern = r"(?P<value>\d+(?:\.?\d+)?)\s+{unit}s?"
    # number words become digits first, so the pattern can match them
    text = _convert_words_to_numbers(text)

    for unit in time_units:
        unit_pattern = pattern.format(unit=unit[:-1])  # remove 's' from unit
        matches = re.findall(unit_pattern, text)
        # several mentions of the same unit are added up
        value = sum(map(float, matches))
        time_units[unit] = value
        text = re.sub(unit_pattern, '', text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return (duration, text)
665 
666 
667 def extract_datetime_en(string, dateNow, default_time):
668  """ Convert a human date reference into an exact datetime
669 
670  Convert things like
671  "today"
672  "tomorrow afternoon"
673  "next Tuesday at 4pm"
674  "August 3rd"
675  into a datetime. If a reference date is not provided, the current
676  local time is used. Also consumes the words used to define the date
677  returning the remaining string. For example, the string
678  "what is Tuesday's weather forecast"
679  returns the date for the forthcoming Tuesday relative to the reference
680  date and the remainder string
681  "what is weather forecast".
682 
683  Args:
684  string (str): string containing date words
685  dateNow (datetime): A reference date/time for "tommorrow", etc
686  default_time (time): Time to set if no time was found in the string
687 
688  Returns:
689  [datetime, str]: An array containing the datetime and the remaining
690  text not consumed in the parsing, or None if no
691  date or time related text was found.
692  """
693 
694  def clean_string(s):
695  # clean unneeded punctuation and capitalization among other things.
696  s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
697  .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \
698  .replace("o' clock", "o'clock").replace("o clock", "o'clock") \
699  .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \
700  .replace("oclock", "o'clock").replace("couple", "2") \
701  .replace("centuries", "century").replace("decades", "decade") \
702  .replace("millenniums", "millennium")
703 
704  wordList = s.split()
705  for idx, word in enumerate(wordList):
706  word = word.replace("'s", "")
707 
708  ordinals = ["rd", "st", "nd", "th"]
709  if word[0].isdigit():
710  for ordinal in ordinals:
711  # "second" is the only case we should not do this
712  if ordinal in word and "second" not in word:
713  word = word.replace(ordinal, "")
714  wordList[idx] = word
715 
716  return wordList
717 
718  def date_found():
719  return found or \
720  (
721  datestr != "" or
722  yearOffset != 0 or monthOffset != 0 or
723  dayOffset is True or hrOffset != 0 or
724  hrAbs or minOffset != 0 or
725  minAbs or secOffset != 0
726  )
727 
728  if string == "" or not dateNow:
729  return None
730 
731  found = False
732  daySpecified = False
733  dayOffset = False
734  monthOffset = 0
735  yearOffset = 0
736  today = dateNow.strftime("%w")
737  currentYear = dateNow.strftime("%Y")
738  fromFlag = False
739  datestr = ""
740  hasYear = False
741  timeQualifier = ""
742 
743  timeQualifiersAM = ['morning']
744  timeQualifiersPM = ['afternoon', 'evening', 'night', 'tonight']
745  timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM)
746  markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of', "within"]
747  days = ['monday', 'tuesday', 'wednesday',
748  'thursday', 'friday', 'saturday', 'sunday']
749  months = ['january', 'february', 'march', 'april', 'may', 'june',
750  'july', 'august', 'september', 'october', 'november',
751  'december']
752  recur_markers = days + [d+'s' for d in days] + ['weekend', 'weekday',
753  'weekends', 'weekdays']
754  monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
755  'sept', 'oct', 'nov', 'dec']
756  year_multiples = ["decade", "century", "millennium"]
757  day_multiples = ["weeks", "months", "years"]
758 
759  words = clean_string(string)
760 
761  for idx, word in enumerate(words):
762  if word == "":
763  continue
764  wordPrevPrev = words[idx - 2] if idx > 1 else ""
765  wordPrev = words[idx - 1] if idx > 0 else ""
766  wordNext = words[idx + 1] if idx + 1 < len(words) else ""
767  wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
768 
769  # this isn't in clean string because I don't want to save back to words
770  word = word.rstrip('s')
771  start = idx
772  used = 0
773  # save timequalifier for later
774 
775  if word == "now" and not datestr:
776  resultStr = " ".join(words[idx + 1:])
777  resultStr = ' '.join(resultStr.split())
778  extractedDate = dateNow.replace(microsecond=0)
779  return [extractedDate, resultStr]
780  elif wordNext in year_multiples:
781  multiplier = None
782  if is_numeric(word):
783  multiplier = extractnumber_en(word)
784  multiplier = multiplier or 1
785  multiplier = int(multiplier)
786  used += 2
787  if wordNext == "decade":
788  yearOffset = multiplier * 10
789  elif wordNext == "century":
790  yearOffset = multiplier * 100
791  elif wordNext == "millennium":
792  yearOffset = multiplier * 1000
793  # couple of
794  elif word == "2" and wordNext == "of" and \
795  wordNextNext in year_multiples:
796  multiplier = 2
797  used += 3
798  if wordNextNext == "decade":
799  yearOffset = multiplier * 10
800  elif wordNextNext == "century":
801  yearOffset = multiplier * 100
802  elif wordNextNext == "millennium":
803  yearOffset = multiplier * 1000
804  elif word == "2" and wordNext == "of" and \
805  wordNextNext in day_multiples:
806  multiplier = 2
807  used += 3
808  if wordNextNext == "years":
809  yearOffset = multiplier
810  elif wordNextNext == "months":
811  monthOffset = multiplier
812  elif wordNextNext == "weeks":
813  dayOffset = multiplier * 7
814  elif word in timeQualifiersList:
815  timeQualifier = word
816  # parse today, tomorrow, day after tomorrow
817  elif word == "today" and not fromFlag:
818  dayOffset = 0
819  used += 1
820  elif word == "tomorrow" and not fromFlag:
821  dayOffset = 1
822  used += 1
823  elif (word == "day" and
824  wordNext == "after" and
825  wordNextNext == "tomorrow" and
826  not fromFlag and
827  not wordPrev[0].isdigit()):
828  dayOffset = 2
829  used = 3
830  if wordPrev == "the":
831  start -= 1
832  used += 1
833  # parse 5 days, 10 weeks, last week, next week
834  elif word == "day":
835  if wordPrev[0].isdigit():
836  dayOffset += int(wordPrev)
837  start -= 1
838  used = 2
839  elif word == "week" and not fromFlag:
840  if wordPrev[0].isdigit():
841  dayOffset += int(wordPrev) * 7
842  start -= 1
843  used = 2
844  elif wordPrev == "next":
845  dayOffset = 7
846  start -= 1
847  used = 2
848  elif wordPrev == "last":
849  dayOffset = -7
850  start -= 1
851  used = 2
852  # parse 10 months, next month, last month
853  elif word == "month" and not fromFlag:
854  if wordPrev[0].isdigit():
855  monthOffset = int(wordPrev)
856  start -= 1
857  used = 2
858  elif wordPrev == "next":
859  monthOffset = 1
860  start -= 1
861  used = 2
862  elif wordPrev == "last":
863  monthOffset = -1
864  start -= 1
865  used = 2
866  # parse 5 years, next year, last year
867  elif word == "year" and not fromFlag:
868  if wordPrev[0].isdigit():
869  yearOffset = int(wordPrev)
870  start -= 1
871  used = 2
872  elif wordPrev == "next":
873  yearOffset = 1
874  start -= 1
875  used = 2
876  elif wordPrev == "last":
877  yearOffset = -1
878  start -= 1
879  used = 2
880  # parse Monday, Tuesday, etc., and next Monday,
881  # last Tuesday, etc.
882  elif word in days and not fromFlag:
883  d = days.index(word)
884  dayOffset = (d + 1) - int(today)
885  used = 1
886  if dayOffset < 0:
887  dayOffset += 7
888  if wordPrev == "next":
889  dayOffset += 7
890  used += 1
891  start -= 1
892  elif wordPrev == "last":
893  dayOffset -= 7
894  used += 1
895  start -= 1
896  # parse 15 of July, June 20th, Feb 18, 19 of February
897  elif word in months or word in monthsShort and not fromFlag:
898  try:
899  m = months.index(word)
900  except ValueError:
901  m = monthsShort.index(word)
902  used += 1
903  datestr = months[m]
904  if wordPrev and (wordPrev[0].isdigit() or
905  (wordPrev == "of" and wordPrevPrev[0].isdigit())):
906  if wordPrev == "of" and wordPrevPrev[0].isdigit():
907  datestr += " " + words[idx - 2]
908  used += 1
909  start -= 1
910  else:
911  datestr += " " + wordPrev
912  start -= 1
913  used += 1
914  if wordNext and wordNext[0].isdigit():
915  datestr += " " + wordNext
916  used += 1
917  hasYear = True
918  else:
919  hasYear = False
920 
921  elif wordNext and wordNext[0].isdigit():
922  datestr += " " + wordNext
923  used += 1
924  if wordNextNext and wordNextNext[0].isdigit():
925  datestr += " " + wordNextNext
926  used += 1
927  hasYear = True
928  else:
929  hasYear = False
930  # parse 5 days from tomorrow, 10 weeks from next thursday,
931  # 2 months from July
932  validFollowups = days + months + monthsShort
933  validFollowups.append("today")
934  validFollowups.append("tomorrow")
935  validFollowups.append("next")
936  validFollowups.append("last")
937  validFollowups.append("now")
938  if (word == "from" or word == "after") and wordNext in validFollowups:
939  used = 2
940  fromFlag = True
941  if wordNext == "tomorrow":
942  dayOffset += 1
943  elif wordNext in days:
944  d = days.index(wordNext)
945  tmpOffset = (d + 1) - int(today)
946  used = 2
947  if tmpOffset < 0:
948  tmpOffset += 7
949  dayOffset += tmpOffset
950  elif wordNextNext and wordNextNext in days:
951  d = days.index(wordNextNext)
952  tmpOffset = (d + 1) - int(today)
953  used = 3
954  if wordNext == "next":
955  tmpOffset += 7
956  used += 1
957  start -= 1
958  elif wordNext == "last":
959  tmpOffset -= 7
960  used += 1
961  start -= 1
962  dayOffset += tmpOffset
963  if used > 0:
964  if start - 1 > 0 and words[start - 1] == "this":
965  start -= 1
966  used += 1
967 
968  for i in range(0, used):
969  words[i + start] = ""
970 
971  if start - 1 >= 0 and words[start - 1] in markers:
972  words[start - 1] = ""
973  found = True
974  daySpecified = True
975 
976  # parse time
977  hrOffset = 0
978  minOffset = 0
979  secOffset = 0
980  hrAbs = None
981  minAbs = None
982  military = False
983 
984  for idx, word in enumerate(words):
985  if word == "":
986  continue
987 
988  wordPrevPrev = words[idx - 2] if idx > 1 else ""
989  wordPrev = words[idx - 1] if idx > 0 else ""
990  wordNext = words[idx + 1] if idx + 1 < len(words) else ""
991  wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
992  # parse noon, midnight, morning, afternoon, evening
993  used = 0
994  if word == "noon":
995  hrAbs = 12
996  used += 1
997  elif word == "midnight":
998  hrAbs = 0
999  used += 1
1000  elif word == "morning":
1001  if hrAbs is None:
1002  hrAbs = 8
1003  used += 1
1004  elif word == "afternoon":
1005  if hrAbs is None:
1006  hrAbs = 15
1007  used += 1
1008  elif word == "evening":
1009  if hrAbs is None:
1010  hrAbs = 19
1011  used += 1
1012  # couple of time_unit
1013  elif word == "2" and wordNext == "of" and \
1014  wordNextNext in ["hours", "minutes", "seconds"]:
1015  used += 3
1016  if wordNextNext == "hours":
1017  hrOffset = 2
1018  elif wordNextNext == "minutes":
1019  minOffset = 2
1020  elif wordNextNext == "seconds":
1021  secOffset = 2
1022  # parse half an hour, quarter hour
1023  elif word == "hour" and \
1024  (wordPrev in markers or wordPrevPrev in markers):
1025  if wordPrev == "half":
1026  minOffset = 30
1027  elif wordPrev == "quarter":
1028  minOffset = 15
1029  elif wordPrevPrev == "quarter":
1030  minOffset = 15
1031  if idx > 2 and words[idx - 3] in markers:
1032  words[idx - 3] = ""
1033  if words[idx - 3] == "this":
1034  daySpecified = True
1035  words[idx - 2] = ""
1036  elif wordPrev == "within":
1037  hrOffset = 1
1038  else:
1039  hrOffset = 1
1040  if wordPrevPrev in markers:
1041  words[idx - 2] = ""
1042  if wordPrevPrev == "this":
1043  daySpecified = True
1044  words[idx - 1] = ""
1045  used += 1
1046  hrAbs = -1
1047  minAbs = -1
1048  # parse 5:00 am, 12:00 p.m., etc
1049  # parse in a minute
1050  elif word == "minute" and wordPrev == "in":
1051  minOffset = 1
1052  words[idx - 1] = ""
1053  used += 1
1054  # parse in a second
1055  elif word == "second" and wordPrev == "in":
1056  secOffset = 1
1057  words[idx - 1] = ""
1058  used += 1
1059  elif word[0].isdigit():
1060  isTime = True
1061  strHH = ""
1062  strMM = ""
1063  remainder = ""
1064  wordNextNextNext = words[idx + 3] \
1065  if idx + 3 < len(words) else ""
1066  if wordNext == "tonight" or wordNextNext == "tonight" or \
1067  wordPrev == "tonight" or wordPrevPrev == "tonight" or \
1068  wordNextNextNext == "tonight":
1069  remainder = "pm"
1070  used += 1
1071  if wordPrev == "tonight":
1072  words[idx - 1] = ""
1073  if wordPrevPrev == "tonight":
1074  words[idx - 2] = ""
1075  if wordNextNext == "tonight":
1076  used += 1
1077  if wordNextNextNext == "tonight":
1078  used += 1
1079 
1080  if ':' in word:
1081  # parse colons
1082  # "3:00 in the morning"
1083  stage = 0
1084  length = len(word)
1085  for i in range(length):
1086  if stage == 0:
1087  if word[i].isdigit():
1088  strHH += word[i]
1089  elif word[i] == ":":
1090  stage = 1
1091  else:
1092  stage = 2
1093  i -= 1
1094  elif stage == 1:
1095  if word[i].isdigit():
1096  strMM += word[i]
1097  else:
1098  stage = 2
1099  i -= 1
1100  elif stage == 2:
1101  remainder = word[i:].replace(".", "")
1102  break
1103  if remainder == "":
1104  nextWord = wordNext.replace(".", "")
1105  if nextWord == "am" or nextWord == "pm":
1106  remainder = nextWord
1107  used += 1
1108 
1109  elif wordNext == "in" and wordNextNext == "the" and \
1110  words[idx + 3] == "morning":
1111  remainder = "am"
1112  used += 3
1113  elif wordNext == "in" and wordNextNext == "the" and \
1114  words[idx + 3] == "afternoon":
1115  remainder = "pm"
1116  used += 3
1117  elif wordNext == "in" and wordNextNext == "the" and \
1118  words[idx + 3] == "evening":
1119  remainder = "pm"
1120  used += 3
1121  elif wordNext == "in" and wordNextNext == "morning":
1122  remainder = "am"
1123  used += 2
1124  elif wordNext == "in" and wordNextNext == "afternoon":
1125  remainder = "pm"
1126  used += 2
1127  elif wordNext == "in" and wordNextNext == "evening":
1128  remainder = "pm"
1129  used += 2
1130  elif wordNext == "this" and wordNextNext == "morning":
1131  remainder = "am"
1132  used = 2
1133  daySpecified = True
1134  elif wordNext == "this" and wordNextNext == "afternoon":
1135  remainder = "pm"
1136  used = 2
1137  daySpecified = True
1138  elif wordNext == "this" and wordNextNext == "evening":
1139  remainder = "pm"
1140  used = 2
1141  daySpecified = True
1142  elif wordNext == "at" and wordNextNext == "night":
1143  if strHH and int(strHH) > 5:
1144  remainder = "pm"
1145  else:
1146  remainder = "am"
1147  used += 2
1148 
1149  else:
1150  if timeQualifier != "":
1151  military = True
1152  if strHH and int(strHH) <= 12 and \
1153  (timeQualifier in timeQualifiersPM):
1154  strHH += str(int(strHH) + 12)
1155 
1156  else:
1157  # try to parse numbers without colons
1158  # 5 hours, 10 minutes etc.
1159  length = len(word)
1160  strNum = ""
1161  remainder = ""
1162  for i in range(length):
1163  if word[i].isdigit():
1164  strNum += word[i]
1165  else:
1166  remainder += word[i]
1167 
1168  if remainder == "":
1169  remainder = wordNext.replace(".", "").lstrip().rstrip()
1170  if (
1171  remainder == "pm" or
1172  wordNext == "pm" or
1173  remainder == "p.m." or
1174  wordNext == "p.m."):
1175  strHH = strNum
1176  remainder = "pm"
1177  used = 1
1178  elif (
1179  remainder == "am" or
1180  wordNext == "am" or
1181  remainder == "a.m." or
1182  wordNext == "a.m."):
1183  strHH = strNum
1184  remainder = "am"
1185  used = 1
1186  elif (
1187  remainder in recur_markers or
1188  wordNext in recur_markers or
1189  wordNextNext in recur_markers):
1190  # Ex: "7 on mondays" or "3 this friday"
1191  # Set strHH so that isTime == True
1192  # when am or pm is not specified
1193  strHH = strNum
1194  used = 1
1195  else:
1196  if (
1197  int(strNum) > 100 and
1198  (
1199  wordPrev == "o" or
1200  wordPrev == "oh"
1201  )):
1202  # 0800 hours (pronounced oh-eight-hundred)
1203  strHH = str(int(strNum) // 100)
1204  strMM = str(int(strNum) % 100)
1205  military = True
1206  if wordNext == "hours":
1207  used += 1
1208  elif (
1209  (wordNext == "hours" or wordNext == "hour" or
1210  remainder == "hours" or remainder == "hour") and
1211  word[0] != '0' and
1212  (
1213  int(strNum) < 100 or
1214  int(strNum) > 2400
1215  )):
1216  # ignores military time
1217  # "in 3 hours"
1218  hrOffset = int(strNum)
1219  used = 2
1220  isTime = False
1221  hrAbs = -1
1222  minAbs = -1
1223 
1224  elif wordNext == "minutes" or wordNext == "minute" or \
1225  remainder == "minutes" or remainder == "minute":
1226  # "in 10 minutes"
1227  minOffset = int(strNum)
1228  used = 2
1229  isTime = False
1230  hrAbs = -1
1231  minAbs = -1
1232  elif wordNext == "seconds" or wordNext == "second" \
1233  or remainder == "seconds" or remainder == "second":
1234  # in 5 seconds
1235  secOffset = int(strNum)
1236  used = 2
1237  isTime = False
1238  hrAbs = -1
1239  minAbs = -1
1240  elif int(strNum) > 100:
1241  # military time, eg. "3300 hours"
1242  strHH = str(int(strNum) // 100)
1243  strMM = str(int(strNum) % 100)
1244  military = True
1245  if wordNext == "hours" or wordNext == "hour" or \
1246  remainder == "hours" or remainder == "hour":
1247  used += 1
1248  elif wordNext and wordNext[0].isdigit():
1249  # military time, e.g. "04 38 hours"
1250  strHH = strNum
1251  strMM = wordNext
1252  military = True
1253  used += 1
1254  if (wordNextNext == "hours" or
1255  wordNextNext == "hour" or
1256  remainder == "hours" or remainder == "hour"):
1257  used += 1
1258  elif (
1259  wordNext == "" or wordNext == "o'clock" or
1260  (
1261  wordNext == "in" and
1262  (
1263  wordNextNext == "the" or
1264  wordNextNext == timeQualifier
1265  )
1266  ) or wordNext == 'tonight' or
1267  wordNextNext == 'tonight'):
1268 
1269  strHH = strNum
1270  strMM = "00"
1271  if wordNext == "o'clock":
1272  used += 1
1273 
1274  if wordNext == "in" or wordNextNext == "in":
1275  used += (1 if wordNext == "in" else 2)
1276  wordNextNextNext = words[idx + 3] \
1277  if idx + 3 < len(words) else ""
1278 
1279  if (wordNextNext and
1280  (wordNextNext in timeQualifier or
1281  wordNextNextNext in timeQualifier)):
1282  if (wordNextNext in timeQualifiersPM or
1283  wordNextNextNext in timeQualifiersPM):
1284  remainder = "pm"
1285  used += 1
1286  if (wordNextNext in timeQualifiersAM or
1287  wordNextNextNext in timeQualifiersAM):
1288  remainder = "am"
1289  used += 1
1290 
1291  if timeQualifier != "":
1292  if timeQualifier in timeQualifiersPM:
1293  remainder = "pm"
1294  used += 1
1295 
1296  elif timeQualifier in timeQualifiersAM:
1297  remainder = "am"
1298  used += 1
1299  else:
1300  # TODO: Unsure if this is 100% accurate
1301  used += 1
1302  military = True
1303  else:
1304  isTime = False
1305  HH = int(strHH) if strHH else 0
1306  MM = int(strMM) if strMM else 0
1307  HH = HH + 12 if remainder == "pm" and HH < 12 else HH
1308  HH = HH - 12 if remainder == "am" and HH >= 12 else HH
1309 
1310  if (not military and
1311  remainder not in ['am', 'pm', 'hours', 'minutes',
1312  "second", "seconds",
1313  "hour", "minute"] and
1314  ((not daySpecified) or dayOffset < 1)):
1315  # ambiguous time, detect whether they mean this evening or
1316  # the next morning based on whether it has already passed
1317  if dateNow.hour < HH or (dateNow.hour == HH and
1318  dateNow.minute < MM):
1319  pass # No modification needed
1320  elif dateNow.hour < HH + 12:
1321  HH += 12
1322  else:
1323  # has passed, assume the next morning
1324  dayOffset += 1
1325 
1326  if timeQualifier in timeQualifiersPM and HH < 12:
1327  HH += 12
1328 
1329  if HH > 24 or MM > 59:
1330  isTime = False
1331  used = 0
1332  if isTime:
1333  hrAbs = HH
1334  minAbs = MM
1335  used += 1
1336 
1337  if used > 0:
1338  # removed parsed words from the sentence
1339  for i in range(used):
1340  if idx + i >= len(words):
1341  break
1342  words[idx + i] = ""
1343 
1344  if wordPrev == "o" or wordPrev == "oh":
1345  words[words.index(wordPrev)] = ""
1346 
1347  if wordPrev == "early":
1348  hrOffset = -1
1349  words[idx - 1] = ""
1350  idx -= 1
1351  elif wordPrev == "late":
1352  hrOffset = 1
1353  words[idx - 1] = ""
1354  idx -= 1
1355  if idx > 0 and wordPrev in markers:
1356  words[idx - 1] = ""
1357  if wordPrev == "this":
1358  daySpecified = True
1359  if idx > 1 and wordPrevPrev in markers:
1360  words[idx - 2] = ""
1361  if wordPrevPrev == "this":
1362  daySpecified = True
1363 
1364  idx += used - 1
1365  found = True
1366  # check that we found a date
1367  if not date_found:
1368  return None
1369 
1370  if dayOffset is False:
1371  dayOffset = 0
1372 
1373  # perform date manipulation
1374 
1375  extractedDate = dateNow.replace(microsecond=0)
1376 
1377  if datestr != "":
1378  # date included an explicit date, e.g. "june 5" or "june 2, 2017"
1379  try:
1380  temp = datetime.strptime(datestr, "%B %d")
1381  except ValueError:
1382  # Try again, allowing the year
1383  temp = datetime.strptime(datestr, "%B %d %Y")
1384  extractedDate = extractedDate.replace(hour=0, minute=0, second=0)
1385  if not hasYear:
1386  temp = temp.replace(year=extractedDate.year,
1387  tzinfo=extractedDate.tzinfo)
1388  if extractedDate < temp:
1389  extractedDate = extractedDate.replace(
1390  year=int(currentYear),
1391  month=int(temp.strftime("%m")),
1392  day=int(temp.strftime("%d")),
1393  tzinfo=extractedDate.tzinfo)
1394  else:
1395  extractedDate = extractedDate.replace(
1396  year=int(currentYear) + 1,
1397  month=int(temp.strftime("%m")),
1398  day=int(temp.strftime("%d")),
1399  tzinfo=extractedDate.tzinfo)
1400  else:
1401  extractedDate = extractedDate.replace(
1402  year=int(temp.strftime("%Y")),
1403  month=int(temp.strftime("%m")),
1404  day=int(temp.strftime("%d")),
1405  tzinfo=extractedDate.tzinfo)
1406  else:
1407  # ignore the current HH:MM:SS if relative using days or greater
1408  if hrOffset == 0 and minOffset == 0 and secOffset == 0:
1409  extractedDate = extractedDate.replace(hour=0, minute=0, second=0)
1410 
1411  if yearOffset != 0:
1412  extractedDate = extractedDate + relativedelta(years=yearOffset)
1413  if monthOffset != 0:
1414  extractedDate = extractedDate + relativedelta(months=monthOffset)
1415  if dayOffset != 0:
1416  extractedDate = extractedDate + relativedelta(days=dayOffset)
1417  if hrAbs != -1 and minAbs != -1:
1418  # If no time was supplied in the string set the time to default
1419  # time if it's available
1420  if hrAbs is None and minAbs is None and default_time is not None:
1421  hrAbs, minAbs = default_time.hour, default_time.minute
1422  else:
1423  hrAbs = hrAbs or 0
1424  minAbs = minAbs or 0
1425 
1426  extractedDate = extractedDate + relativedelta(hours=hrAbs,
1427  minutes=minAbs)
1428  if (hrAbs != 0 or minAbs != 0) and datestr == "":
1429  if not daySpecified and dateNow > extractedDate:
1430  extractedDate = extractedDate + relativedelta(days=1)
1431  if hrOffset != 0:
1432  extractedDate = extractedDate + relativedelta(hours=hrOffset)
1433  if minOffset != 0:
1434  extractedDate = extractedDate + relativedelta(minutes=minOffset)
1435  if secOffset != 0:
1436  extractedDate = extractedDate + relativedelta(seconds=secOffset)
1437  for idx, word in enumerate(words):
1438  if words[idx] == "and" and \
1439  words[idx - 1] == "" and words[idx + 1] == "":
1440  words[idx] = ""
1441 
1442  resultStr = " ".join(words)
1443  resultStr = ' '.join(resultStr.split())
1444  return [extractedDate, resultStr]
1445 
1446 
def isFractional_en(input_str, short_scale=True):
    """
    Check whether the given text names a fraction and return its value.

    Matching is case-insensitive. A single trailing "s" is dropped before
    the lookup so that plural forms (e.g. "fifths", "halves") match too.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    # Lower-case *before* stripping the plural: the lookup below is
    # case-insensitive, so "FIFTHS" must be treated like "fifths".
    name = input_str.lower()
    if name.endswith('s'):
        name = name[:-1]  # e.g. "fifths" -> "fifth"

    if short_scale:
        ordinals = _SHORT_ORDINAL_STRING_EN
    else:
        ordinals = _LONG_ORDINAL_STRING_EN

    fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
    # Only ordinals above 2 are used as denominators; 1 and 2 are covered
    # by the explicit "whole" / "half" entries above.
    for num in ordinals:
        if num > 2:
            fracts[ordinals[num]] = num

    if name in fracts:
        return 1.0 / fracts[name]
    return False
1474 
1475 
def extract_numbers_en(text, short_scale=True, ordinals=False):
    """
    Find every number mentioned in a string.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    tokens = _tokenize(text)
    matches = _extract_numbers_with_text(tokens, short_scale, ordinals)

    numbers = []
    for match in matches:
        # Each match exposes the parsed number as ``value``.
        numbers.append(float(match.value))
    return numbers
1493 
1494 
# Common contractions and their expanded form, e.g. "isn't" -> "is not".
# Built once at import time instead of once per word being normalized.
_CONTRACTION_EXPANSIONS_EN = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "mightn't": "might not",
    "might've": "might have",
    "mustn't": "must not",
    "must've": "must have",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "should've": "should have",
    "somebody's": "somebody is",
    "someone'd": "someone would",
    "someone'll": "someone will",
    "someone's": "someone is",
    "that'll": "that will",
    "that's": "that is",
    "that'd": "that would",
    "there'd": "there would",
    "there're": "there are",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",  # technically incorrect but some STT outputs
    "what've": "what have",
    "when's": "when is",
    "when'd": "when did",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'd've": "who would have",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why're": "why are",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "ya'll": "you all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "y'aint": "you are not",
    "y'ain't": "you are not",
    "you're": "you are",
    "you've": "you have",
}

# Spelled-out numbers zero..twenty mapped to their digit string,
# e.g. "two" -> "2".
_NORMALIZE_NUMBER_WORDS_EN = {
    name: str(value) for value, name in enumerate(
        ["zero", "one", "two", "three", "four", "five", "six",
         "seven", "eight", "nine", "ten", "eleven", "twelve",
         "thirteen", "fourteen", "fifteen", "sixteen",
         "seventeen", "eighteen", "nineteen", "twenty"])
}


def normalize_en(text, remove_articles):
    """
    English string normalization.

    Collapses runs of whitespace, optionally drops the articles
    "the", "a" and "an", expands common contractions
    (e.g. "isn't" -> "is not") and converts the number words
    "zero" through "twenty" into digits (e.g. "two" -> "2").
    All matching is case-sensitive.

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop "the", "a" and "an" if True
    Returns:
        str: the normalized string, words separated by single spaces

    """
    normalized = []
    for word in text.split():  # split() also removes extra spaces
        if remove_articles and word in ("the", "a", "an"):
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = _CONTRACTION_EXPANSIONS_EN.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2"
        word = _NORMALIZE_NUMBER_WORDS_EN.get(word, word)

        normalized.append(word)

    return " ".join(normalized)
def normalize_en(text, remove_articles)
Definition: parse_en.py:1495
def _invert_dict(original)
Definition: parse_en.py:30
def extract_datetime_en(string, dateNow, default_time)
Definition: parse_en.py:667
def _extract_decimal_with_text_en(tokens, short_scale, ordinals)
Definition: parse_en.py:366
def _generate_plurals(originals)
Definition: parse_en.py:45
def extractnumber_en(text, short_scale=True, ordinals=False)
Definition: parse_en.py:592
def isFractional_en(input_str, short_scale=True)
Definition: parse_en.py:1447
def _extract_whole_number_with_text_en(tokens, short_scale, ordinals)
Definition: parse_en.py:414
def extract_numbers_en(text, short_scale=True, ordinals=False)
Definition: parse_en.py:1476
def extract_duration_en(text)
Definition: parse_en.py:612
def _partition_list(items, split_on)
Definition: parse_en.py:159
def look_for_fractions(split_list)
Definition: parse_common.py:36
def _extract_number_with_text_en(tokens, short_scale=True, ordinals=False, fractional_numbers=True)
Definition: parse_en.py:268
def _extract_fraction_with_text_en(tokens, short_scale, ordinals)
Definition: parse_en.py:324
def _initialize_number_data(short_scale)
Definition: parse_en.py:565
def _extract_number_with_text_en_helper(tokens, short_scale=True, ordinals=False, fractional_numbers=True)
Definition: parse_en.py:292
def _extract_numbers_with_text(tokens, short_scale=True, ordinals=False, fractional_numbers=True)
Definition: parse_en.py:226
def _convert_words_to_numbers(text, short_scale=True, ordinals=False)
Definition: parse_en.py:189


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40