parse_es.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 """
18  Parse functions for spanish (es)
19  TODO: numbers greater than 999999
20 """
21 from datetime import datetime
22 from dateutil.relativedelta import relativedelta
23 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
24 
# Indefinite articles ["un", "una", "unos", "unas"] cannot be suppressed:
# in Spanish, "un caballo" means both "a horse" and "one horse".
es_articles = ["el", "la", "los", "las"]

# Spanish number words mapped to their integer value.  Both the accented
# spelling and the unaccented spelling are listed wherever an accent exists,
# because input text may arrive with or without diacritics.
es_numbers = {
    "cero": 0,
    "un": 1,
    "uno": 1,
    "una": 1,
    "dos": 2,
    "tres": 3,
    u"trés": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    "dieciseis": 16,
    u"dieciséis": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
    "veinte": 20,
    "veintiuno": 21,
    "veintiun": 21,
    u"veintiún": 21,
    "veintiuna": 21,
    # These two accented keys used to be stored with a U+FFFD replacement
    # character (an encoding accident), so "veintidós" never matched.
    u"veintidós": 22,
    "veintidos": 22,
    u"veintitrés": 23,
    "veintitres": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    u"veintiséis": 26,
    "veintiseis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "doscientas": 200,
    "trescientos": 300,
    "trescientas": 300,
    "cuatrocientos": 400,
    "cuatrocientas": 400,
    "quinientos": 500,
    "quinientas": 500,
    "seiscientos": 600,
    "seiscientas": 600,
    "setecientos": 700,
    "setecientas": 700,
    "ochocientos": 800,
    "ochocientas": 800,
    "novecientos": 900,
    "novecientas": 900,
    "mil": 1000}
94 
95 
def isFractional_es(input_str):
    """
    Check whether a Spanish word names a fraction and return its value.

    The comparison is case-insensitive and a single trailing "s" (plural,
    e.g. "quintos") is ignored.

    Args:
        input_str (str): the word to check
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
                           (e.g. 0.25 for "cuarto")

    """
    word = input_str.lower()
    if word.endswith('s'):
        word = word[:-1]  # e.g. "quintos" -> "quinto"

    # An explicit word -> denominator table.  Masculine and feminine forms
    # share one denominator, which is why a positional index into a flat
    # list cannot be used to derive it.
    denominators = {
        "medio": 2, "media": 2,
        "tercio": 3,
        "cuarto": 4, "cuarta": 4,
        "quinto": 5, "quinta": 5,
        "sexto": 6, "sexta": 6,
        u"séptimo": 7, u"séptima": 7,
        "octavo": 8, "octava": 8,
        "noveno": 9, "novena": 9,
        u"décimo": 10, u"décima": 10,
        "onceavo": 11, "onceava": 11,
        "doceavo": 12, "doceava": 12,
        u"vigésimo": 20, u"vigésima": 20,
        u"trigésimo": 30, u"trigésima": 30,
        u"centésimo": 100, u"centésima": 100,
        u"milésimo": 1000, u"milésima": 1000,
    }

    if word in denominators:
        return 1.0 / denominators[word]
    return False
127 
128 
def extractnumber_es(text):
    """
    Scan the words of a Spanish phrase and return the number they express.

    Number words (see es_numbers), digit strings, "a/b" fractions and
    fraction words (see isFractional_es) are accumulated while walking the
    phrase; a decimal marker ("punto", "coma", ".", ",") makes the rest of
    the phrase the fractional part.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or (bool): the value of the extracted number, or
            False when no number was found.  A float whose fractional part
            is zero is returned as an int.

    """
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_next_word = None
        if count + 1 < len(aWords):
            next_word = aWords[count + 1]
            if count + 2 < len(aWords):
                next_next_word = aWords[count + 2]
        else:
            next_word = None

        # is current word a number?
        if word in es_numbers:
            val = es_numbers[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_es(word):
            # a fraction word scales what was read so far ("tres cuartos")
            if not result:
                result = 1
            result = result * isFractional_es(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions: "<n> <m> avos" means n / m
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        # NOTE(review): "e" looks inherited from the Portuguese parser; the
        # Spanish conjunction is "y" -- confirm before changing.
        ands = ["e"]
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_es(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # treat the trailing number as a decimal part
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for word in newWords:
                        if word == "cero" or word == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_es(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        decimals = ["punto", "coma", ".", ","]
        if next_word in decimals:
            zeros = 0
            newWords = aWords[count + 2:]
            # leading zeros would be lost by the numeric round-trip below,
            # so they are counted and re-attached as text
            for word in newWords:
                if word == "cero" or word == "0":
                    zeros += 1
                else:
                    break
            # NOTE(review): if nothing numeric follows the marker, or no
            # number precedes it, the text built here is not a valid float
            # literal and float() raises ValueError.
            afterDotVal = str(extractnumber_es(" ".join(newWords)))
            afterDotVal = zeros * "0" + afterDotVal
            result = float(str(result) + "." + afterDotVal)
            break
        count += 1

    if result is None:
        return False

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int when there is no fractional part
        if dec == "0":
            result = int(integer)

    return result
258 
259 
def es_number_parse(words, i):
    """
    Parse a spelled-out Spanish number starting at position i of words.

    Recognised grammar (values 0 .. 999999):
        cero | [1-999]? "mil" [1-999]? | [1-999]

    Args:
        words (list of str): tokenized sentence
        i (int): index of the token where the number should start
    Returns:
        (tuple) or None: (value, next_index) when a number starts at i,
            where next_index is the position just after the consumed
            tokens; None when no number starts at i.

    """
    def es_cte(i, s):
        # Match the literal token s at position i.
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def es_number_word(i, mi, ma):
        # Match a single number word whose value lies in [mi, ma].
        if i < len(words):
            v = es_numbers.get(words[i])
            # Compare against None, not truthiness: "cero" maps to 0.
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def es_number_1_99(i):
        # Single words cover 1..29 ("veintinueve" is one word).
        r1 = es_number_word(i, 1, 29)
        if r1:
            return r1

        # tens [30-90], optionally followed by "y" [1-9]
        r1 = es_number_word(i, 30, 90)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "y")
            if r2:
                i2 = r2[1]
                r3 = es_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            return r1
        return None

    def es_number_1_999(i):
        # hundreds [100-900], optionally followed by [1-99]
        r1 = es_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            return r1

        # [1-99]
        return es_number_1_99(i)

    def es_number(i):
        # check for cero
        r1 = es_number_word(i, 0, 0)
        if r1:
            return r1

        # check for [1-999]? (mil [1-999]?)?
        r1 = es_number_1_999(i)
        # A bare "mil" has an implicit multiplier of one ("mil doscientos").
        v1, i1 = r1 if r1 else (1, i)
        r2 = es_cte(i1, "mil")
        if r2:
            i2 = r2[1]
            r3 = es_number_1_999(i2)
            if r3:
                v3, i3 = r3
                return v1 * 1000 + v3, i3
            return v1 * 1000, i2
        return r1

    return es_number(i)
334 
335 
def normalize_es(text, remove_articles):
    """
    Spanish string normalization.

    Collapses runs of whitespace, optionally drops definite articles and
    rewrites spelled-out numbers as digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): when true, tokens listed in es_articles
            are dropped
    Returns:
        (str): the normalized string
    """
    tokens = text.split()  # split() also discards extra spaces
    total = len(tokens)
    pieces = []
    pos = 0
    while pos < total:
        token = tokens[pos]

        if remove_articles and token in es_articles:
            pos += 1
            continue

        # Convert numbers into digits; the parser reports where to resume
        parsed = es_number_parse(tokens, pos)
        if parsed:
            value, pos = parsed
            pieces.append(str(value))
        else:
            pieces.append(token)
            pos += 1

    return " ".join(pieces)
361 
362 
def extract_datetime_es(input_str, currentDate=None, default_time=None):
    """
    Extract a date/time from a Spanish phrase.

    The phrase is cleaned and split into words, then scanned twice: the
    first pass looks for date expressions (today/tomorrow, weekdays,
    months, "N days/weeks/months/years"), the second for time expressions
    (parts of the day, "HH:MM", am/pm, "N hours/minutes/seconds").  Words
    that were consumed are blanked out of the word list.

    Args:
        input_str (str): the phrase to parse
        currentDate (datetime): reference date; datetime.now() when None
        default_time: object whose .hour and .minute are used when no
            time of day was parsed; ignored when falsy
    Returns:
        None when input_str is empty, otherwise a two-element list
        [extracted datetime, remaining words joined by single spaces].
    """
    def clean_string(s):
        # cleans the input string of unneeded punctuation and capitalization
        # among other things
        symbols = [".", ",", ";", "?", "!", u"º", u"ª"]
        noise_words = ["entre", "la", "del", "al", "el", "de",
                       "por", "para", "una", "cualquier", "a",
                       "e'", "esta", "este"]

        for word in symbols:
            s = s.replace(word, "")
        for word in noise_words:
            s = s.replace(" " + word + " ", " ")
        # NOTE(review): á/é/ó are stripped of their accent here, yet several
        # word lists used later still contain those accented letters
        # (e.g. u"miércoles", u"sábado", u"próximo", u"después") -- verify
        # those entries can still match.
        s = s.lower().replace(
            u"á",
            "a").replace(
            u"é",
            "e").replace(
            u"ó",
            "o").replace(
            "-",
            " ").replace(
            "_",
            "")
        # handle synonyms and equivalents, e.g. "tomorrow early" is treated
        # as "tomorrow morning"
        synonims = {u"mañana": ["amanecer", "temprano", "muy temprano"],
                    "tarde": ["media tarde", "atardecer"],
                    "noche": ["anochecer", "tarde"]}
        for syn in synonims:
            for word in synonims[syn]:
                s = s.replace(" " + word + " ", " " + syn + " ")
        # only the relevant plurals are singularised; stripping every
        # trailing "s" would be wrong
        wordlist = [u"mañanas", "tardes", "noches", u"días", "semanas",
                    u"años", "minutos", "segundos", "las", "los", "siguientes",
                    u"próximas", u"próximos", "horas"]
        for _, word in enumerate(wordlist):
            s = s.replace(word, word.rstrip('s'))
        s = s.replace("meses", "mes").replace("anteriores", "anterior")
        return s

    def date_found():
        # True when either pass flagged a match or any offset/absolute
        # value differs from its initial state.
        return found or \
            (
                datestr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is True or hrOffset != 0 or
                hrAbs or minOffset != 0 or
                minAbs or secOffset != 0
            )

    if input_str == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    words = clean_string(input_str).split(" ")
    timeQualifiersList = [u'mañana', 'tarde', 'noche']
    time_indicators = ["en", "la", "al", "por", "pasados",
                       "pasadas", u"día", "hora"]
    days = ['lunes', 'martes', u'miércoles',
            'jueves', 'viernes', u'sábado', 'domingo']
    months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio',
              'julio', 'agosto', 'septiembre', 'octubre', 'noviembre',
              'diciembre']
    monthsShort = ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago',
                   'sep', 'oct', 'nov', 'dic']
    nexts = ["siguiente", u"próximo", u"próxima"]
    suffix_nexts = ["siguientes", "subsecuentes"]
    lasts = [u"último", u"última"]
    suffix_lasts = ["pasada", "pasado", "anterior", "antes"]
    nxts = [u"después", "siguiente", u"próximo", u"próxima"]
    prevs = ["antes", "previa", "previo", "anterior"]
    froms = ["desde", "en", "para", u"después de", "por", u"próximo",
             u"próxima", "de"]
    thises = ["este", "esta"]
    froms += thises
    lists = nxts + prevs + froms + time_indicators
    # ---- first pass: date expressions ----
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""

        # start/used describe the slice of words consumed by this match
        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word

        # parse today, tomorrow, yesterday
        elif word == "hoy" and not fromFlag:
            dayOffset = 0
            used += 1
        # NOTE(review): u"mañana" is also in timeQualifiersList, so the
        # branch above always wins and this one cannot be reached.
        elif word == u"mañana" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "ayer" and not fromFlag:
            dayOffset -= 1
            used += 1
        # "before yesterday" and "before before yesterday"
        elif (word == "anteayer" or
              (word == "ante" and wordNext == "ayer")) and not fromFlag:
            dayOffset -= 2
            used += 1
            if wordNext == "ayer":
                used += 1
        elif word == "ante" and wordNext == "ante" and wordNextNext == \
                "ayer" and not fromFlag:
            dayOffset -= 3
            used += 3
        # NOTE(review): words come from split(" "), so a single word cannot
        # contain a space and this comparison cannot succeed.
        elif word == "ante anteayer" and not fromFlag:
            dayOffset -= 3
            used += 1
        # day after tomorrow
        elif word == "pasado" and wordNext == u"mañana" and not fromFlag:
            dayOffset += 2
            used = 2
        # day before yesterday
        # NOTE(review): same condition as the "ante" + "ayer" case handled
        # a few branches above, so this one is shadowed.
        elif word == "ante" and wordNext == "ayer" and not fromFlag:
            dayOffset -= 2
            used = 2
        # parse 5 days, 10 weeks, last week, next week, week after
        elif word == u"día":
            if wordNext == "pasado" or wordNext == "ante":
                used += 1
                if wordPrev and wordPrev[0].isdigit():
                    dayOffset += int(wordPrev)
                    start -= 1
                    used += 1
            elif (wordPrev and wordPrev[0].isdigit() and
                  wordNext not in months and
                  wordNext not in monthsShort):
                dayOffset += int(wordPrev)
                start -= 1
                used += 2
            elif wordNext and wordNext[0].isdigit() and wordNextNext not in \
                    months and wordNextNext not in monthsShort:
                dayOffset += int(wordNext)
                start -= 1
                used += 2

        elif word == "semana" and not fromFlag:
            # NOTE(review): wordPrev is "" when this is the first word, so
            # wordPrev[0] raises IndexError in that case (same pattern in
            # the "mes" and u"año" branches below).
            if wordPrev[0].isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            for w in nexts:
                if wordPrev == w:
                    dayOffset = 7
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    dayOffset = -7
                    start -= 1
                    used = 2
            for w in suffix_nexts:
                if wordNext == w:
                    dayOffset = 7
                    start -= 1
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    dayOffset = -7
                    start -= 1
                    used = 2
        # parse 10 months, next month, last month
        elif word == "mes" and not fromFlag:
            if wordPrev[0].isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            # NOTE(review): "next"/"last" month shift by 7 months here; the
            # 7 looks copied from the week branch -- confirm (1 expected?).
            for w in nexts:
                if wordPrev == w:
                    monthOffset = 7
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    monthOffset = -7
                    start -= 1
                    used = 2
            for w in suffix_nexts:
                if wordNext == w:
                    monthOffset = 7
                    start -= 1
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    monthOffset = -7
                    start -= 1
                    used = 2
        # parse 5 years, next year, last year
        elif word == u"año" and not fromFlag:
            if wordPrev[0].isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            # NOTE(review): as for months, the +/-7 looks copied from the
            # week branch -- confirm.
            for w in nexts:
                if wordPrev == w:
                    yearOffset = 7
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    yearOffset = -7
                    start -= 1
                    used = 2
            for w in suffix_nexts:
                if wordNext == w:
                    yearOffset = 7
                    start -= 1
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    yearOffset = -7
                    start -= 1
                    used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            # strftime("%w") numbers Sunday as 0, while `days` starts at
            # Monday, hence the + 1
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "siguiente":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "pasado":
                dayOffset -= 7
                used += 1
                start -= 1
            if wordNext == "siguiente":
                # dayOffset += 7
                used += 1
            elif wordNext == "pasado":
                # dayOffset -= 7
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif word in months or word in monthsShort:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            # datestr is built as "<month name> <day>[ <year>]"
            datestr = months[m]
            if wordPrev and wordPrev[0].isdigit():
                # 13 mayo
                datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                # mayo 13
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordPrevPrev and wordPrevPrev[0].isdigit():
                # 13 dia mayo
                datestr += " " + wordPrevPrev

                start -= 2
                used += 2
                # NOTE(review): this tests word[0] (the month name) rather
                # than wordNext[0], so the year is never picked up here.
                if wordNext and word[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNextNext and wordNextNext[0].isdigit():
                # mayo dia 13
                datestr += " " + wordNextNext
                used += 2
                if wordNextNextNext and wordNextNextNext[0].isdigit():
                    datestr += " " + wordNextNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            # a month name with no day attached is not a usable date
            if datestr in months:
                datestr = ""

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        validFollowups = days + months + monthsShort
        validFollowups.append("hoy")
        validFollowups.append(u"mañana")
        validFollowups.append("ayer")
        validFollowups.append("anteayer")
        validFollowups.append("ahora")
        validFollowups.append("ya")
        validFollowups.append("ante")

        # TODO debug word "depois" that one is failing for some reason
        if word in froms and wordNext in validFollowups:

            # NOTE(review): wordNext cannot equal both u"mañana" and "ayer",
            # so the first `not (...)` is always true -- `or` intended?
            if not (wordNext == u"mañana" and wordNext == "ayer") and not (
                    word == "pasado" or word == "antes"):
                used = 2
                fromFlag = True
            if wordNext == u"mañana" and word != "pasado":
                dayOffset += 1
            elif wordNext == "ayer":
                dayOffset -= 1
            elif wordNext == "anteayer":
                dayOffset -= 2
            elif wordNext == "ante" and wordNextNext == "ayer":
                dayOffset -= 2
            # NOTE(review): wordNext is compared to "ante" twice; the second
            # test was presumably meant for wordNextNext.
            elif (wordNext == "ante" and wordNext == "ante" and
                  wordNextNextNext == "ayer"):
                dayOffset -= 3
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                # if wordNextNext == "feira":
                #     used += 1
                if tmpOffset < 0:
                    tmpOffset += 7
                if wordNextNext:
                    if wordNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNextNextNext:
                    if wordNextNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
                # if wordNextNextNext == "feira":
                #     used += 1
        # leave a following month name in place so it gets its own turn
        if wordNext in months:
            used -= 1
        if used > 0:

            # also swallow a connector word right before the match
            if start - 1 > 0 and words[start - 1] in lists:
                start -= 1
                used += 1

            # blank out the consumed words
            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in lists:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # parse time
    # hrAbs/minAbs: None = no time of day parsed, -1 = a relative offset
    # was parsed instead (see the check before the final arithmetic)
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None
    minAbs = None

    # ---- second pass: time expressions ----
    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "medio" and wordNext == u"día":
            hrAbs = 12
            used += 2
        elif word == "media" and wordNext == "noche":
            hrAbs = 0
            used += 2
        elif word == u"mañana":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word == "tarde":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word == "media" and wordNext == "tarde":
            if not hrAbs:
                hrAbs = 17
            used += 2
        # NOTE(review): shadowed by the plain "tarde" branch above.
        elif word == "tarde" and wordNext == "noche":
            if not hrAbs:
                hrAbs = 20
            used += 2
        elif word == "media" and wordNext == u"mañana":
            if not hrAbs:
                hrAbs = 10
            used += 2
        # elif word == "fim" and wordNext == "tarde":
        #     if not hrAbs:
        #         hrAbs = 19
        #     used += 2
        # elif word == "fim" and wordNext == "manha":
        #     if not hrAbs:
        #         hrAbs = 11
        #     used += 2
        elif word == "madrugada":
            if not hrAbs:
                hrAbs = 1
            # NOTE(review): consumes two words for a one-word match --
            # confirm this is intended.
            used += 2
        elif word == "noche":
            if not hrAbs:
                hrAbs = 21
            used += 1
        # parse half an hour, quarter hour
        elif word == "hora" and \
                (wordPrev in time_indicators or wordPrevPrev in
                 time_indicators):
            if wordPrev == "media":
                minOffset = 30
            elif wordPrev == "cuarto":
                minOffset = 15
            elif wordPrevPrev == "cuarto":
                minOffset = 15
                if idx > 2 and words[idx - 3] in time_indicators:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in time_indicators:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            # mark that the result is relative, not an absolute time
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                # stage 0 collects hour digits, stage 1 minute digits,
                # stage 2 keeps whatever follows (e.g. "pm")
                stage = 0
                length = len(word)
                for i in range(length):
                    if stage == 0:
                        if word[i].isdigit():
                            strHH += word[i]
                        elif word[i] == ":":
                            stage = 1
                        else:
                            stage = 2
                            # NOTE(review): rebinding i does not rewind a
                            # for/range loop, so this has no effect.
                            i -= 1
                    elif stage == 1:
                        if word[i].isdigit():
                            strMM += word[i]
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 2:
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif wordNext == u"mañana" or wordNext == "madrugada":
                        remainder = "am"
                        used += 1
                    elif wordNext == "tarde":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "noche":
                        if 0 < int(word[0]) < 6:
                            remainder = "am"
                        else:
                            remainder = "pm"
                        used += 1
                    elif wordNext in thises and wordNextNext == u"mañana":
                        remainder = "am"
                        used = 2
                    elif wordNext in thises and wordNextNext == "tarde":
                        remainder = "pm"
                        used = 2
                    elif wordNext in thises and wordNextNext == "noche":
                        remainder = "pm"
                        used = 2
                    else:
                        if timeQualifier != "":
                            # NOTE(review): strHH is still a str here (built
                            # by concatenation above), so comparing it with
                            # and adding 12 looks wrong -- confirm.
                            if strHH <= 12 and \
                                    (timeQualifier == u"mañana" or
                                     timeQualifier == "tarde"):
                                strHH += 12

            else:
                # try to parse # s without colons
                # 5 hours, 10 minutes etc.
                length = len(word)
                strNum = ""
                remainder = ""
                # split the word into its digits and its other characters
                for i in range(length):
                    if word[i].isdigit():
                        strNum += word[i]
                    else:
                        remainder += word[i]

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                else:
                    if (wordNext == "pm" or
                            wordNext == "p.m." or
                            wordNext == "tarde"):
                        strHH = strNum
                        remainder = "pm"
                        used = 1
                    elif (wordNext == "am" or
                          wordNext == "a.m." or
                          wordNext == u"mañana"):
                        strHH = strNum
                        remainder = "am"
                        used = 1
                    elif (int(word) > 100 and
                          (
                              # wordPrev == "o" or
                              # wordPrev == "oh" or
                              wordPrev == "cero"
                          )):
                        # 0800 hours (pronounced oh-eight-hundred)
                        # NOTE(review): "/" is true division under
                        # Python 3, giving a float hour -- confirm whether
                        # "//" was intended.
                        strHH = int(word) / 100
                        strMM = int(word) - strHH * 100
                        if wordNext == "hora":
                            used += 1
                    # NOTE(review): a value cannot be both < 100 and
                    # > 2400, so this "in N hours" branch never matches.
                    elif (
                            wordNext == "hora" and
                            word[0] != '0' and
                            (
                                int(word) < 100 and
                                int(word) > 2400
                            )):
                        # ignores military time
                        # "in 3 hours"
                        hrOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1

                    elif wordNext == "minuto":
                        # "in 10 minutes"
                        minOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "segundo":
                        # in 5 seconds
                        secOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif int(word) > 100:
                        # NOTE(review): same true-division concern as the
                        # "cero" branch above.
                        strHH = int(word) / 100
                        strMM = int(word) - strHH * 100
                        if wordNext == "hora":
                            used += 1

                    elif wordNext == "" or (
                            wordNext == "en" and wordNextNext == "punto"):
                        # a bare number, or "<n> en punto" (n o'clock)
                        strHH = word
                        strMM = 00
                        if wordNext == "en" and wordNextNext == "punto":
                            used += 2
                            if wordNextNextNext == "tarde":
                                remainder = "pm"
                                used += 1
                            elif wordNextNextNext == u"mañana":
                                remainder = "am"
                                used += 1
                            elif wordNextNextNext == "noche":
                                # NOTE(review): strHH is a str here and
                                # "0 > x > 6" has no solution; compare with
                                # the "0 < ... < 6" test in the colon branch.
                                if 0 > strHH > 6:
                                    remainder = "am"
                                else:
                                    remainder = "pm"
                                used += 1

                    elif wordNext[0].isdigit():
                        # two consecutive numbers: hour then minutes
                        strHH = word
                        strMM = wordNext
                        used += 1
                        if wordNextNext == "hora":
                            used += 1
                    else:
                        isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            # convert to a 24-hour clock
            strHH = strHH + 12 if (remainder == "pm" and
                                   0 < strHH < 12) else strHH
            strHH = strHH - 12 if (remainder == "am" and
                                   0 < strHH >= 12) else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1

        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                words[idx + i] = ""

            if wordPrev == "en" or wordPrev == "punto":
                words[words.index(wordPrev)] = ""

            if idx > 0 and wordPrev in time_indicators:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in time_indicators:
                words[idx - 2] = ""

            # NOTE(review): enumerate() rebinds idx on the next iteration,
            # so this adjustment does not skip anything.
            idx += used - 1
            found = True

    # check that we found a date
    # NOTE(review): date_found is referenced without being called; a
    # function object is always truthy, so this return is never taken.
    if not date_found:
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    # start from midnight of the reference date
    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        # strptime is given English month names, so translate them first
        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
        en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
                          'aug',
                          'sept', 'oct', 'nov', 'dec']
        for idx, en_month in enumerate(en_months):
            datestr = datestr.replace(months[idx], en_month)
        for idx, en_month in enumerate(en_monthsShort):
            datestr = datestr.replace(monthsShort[idx], en_month)

        # NOTE(review): the format has no year field although datestr may
        # carry a third component when hasYear is set -- confirm.
        temp = datetime.strptime(datestr, "%B %d")
        if not hasYear:
            temp = temp.replace(year=extractedDate.year)
            # a date not after the reference day rolls over to next year
            if extractedDate < temp:
                extractedDate = extractedDate.replace(year=int(currentYear),
                                                      month=int(
                                                          temp.strftime(
                                                              "%m")),
                                                      day=int(temp.strftime(
                                                          "%d")))
            else:
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
        else:
            extractedDate = extractedDate.replace(
                year=int(temp.strftime("%Y")),
                month=int(temp.strftime("%m")),
                day=int(temp.strftime("%d")))

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    # fall back to the caller-supplied time when none was parsed
    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute

    # -1 marks "relative offset only": skip the absolute time in that case
    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare time that already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # join what is left and collapse the gaps left by blanked words
    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    # resultStr = pt_pruning(resultStr)
    return [extractedDate, resultStr]
1118 
1119 
def get_gender_es(word, raw_string=""):
    """
    Guess the grammatical gender of a Spanish word.

    The rules are imprecise and incomplete, but a good starting point.
    For a more detailed explanation, see
    http://www.wikilengua.org/index.php/Género_gramatical

    Args:
        word (str): the word to classify; trailing "s" characters are
            ignored
        raw_string (str): optional sentence containing the word.  When the
            word is found in it (not as the first token), the gender of the
            preceding token, typically an article, takes precedence.
    Returns:
        (str) or (bool): "f", "m", or False when no rule applies
    """
    word = word.rstrip("s")
    gender = False
    words = raw_string.split(" ")
    for idx, w in enumerate(words):
        # Singularise the candidate as well, otherwise a plural word could
        # never be located in the sentence.
        if idx != 0 and w.rstrip("s") == word:
            gender = get_gender_es(words[idx - 1])
            break
    # `word` may be empty ("s", "", or an empty token produced by double
    # spaces in raw_string); there is no ending to inspect then.
    if not gender and word:
        ending = word[-1]
        if ending == "a":
            gender = "f"
        elif ending in ("o", "e"):
            gender = "m"
    return gender
def es_number_parse(words, i)
Definition: parse_es.py:260
def isFractional_es(input_str)
Definition: parse_es.py:96
def extract_datetime_es(input_str, currentDate=None, default_time=None)
Definition: parse_es.py:363
def look_for_fractions(split_list)
Definition: parse_common.py:36
def get_gender_es(word, raw_string="")
Definition: parse_es.py:1120
def normalize_es(text, remove_articles)
Definition: parse_es.py:336


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40