parse_pt.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 """
18  Parse functions for Portuguese (PT-PT)
19 
20  TODO: numbers greater than 999999
21  TODO: date time pt
22 """
23 
24 from datetime import datetime
25 from dateutil.relativedelta import relativedelta
26 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
27 from mycroft.util.lang.common_data_pt import _FRACTION_STRING_PT, \
28  _PT_ARTICLES, _PT_NUMBERS
29 
30 
def isFractional_pt(input_str):
    """
    Check whether the given word names a fraction in Portuguese.

    The comparison is case-insensitive and a single trailing "s" (plural,
    e.g. "quintos") is ignored.

    Args:
        input_str (str): the word to check
    Returns:
        (bool) or (float): False if not a fraction, otherwise the value of
        the fraction (e.g. 0.5 for "meio")

    """
    # denominator for every recognised fraction word
    denominators = {
        "meio": 2,
        u"terço": 3,
        "quarto": 4,
        "quinto": 5,
        "sexto": 6,
        "setimo": 7,
        u"sétimo": 7,
        "septimo": 7,
        u"séptimo": 7,
        "oitavo": 8,
        "nono": 9,
        u"décimo": 10,
        u"vigésimo": 20,
        u"trigésimo": 30,
        u"centésimo": 100,
        u"milésimo": 1000,
    }

    # normalise once so lookup and result always refer to the same string
    word = input_str.lower()
    if word.endswith("s"):
        word = word[:-1]  # e.g. "quintos" -> "quinto"

    denominator = denominators.get(word)
    if denominator is None:
        return False
    return 1.0 / denominator
62 
63 
def extractnumber_pt(text):
    """
    Extract the first number expressed in the given Portuguese text.

    Handles digits, number words found in _PT_NUMBERS, fraction words
    ("meio", "quinto"), "N/M" fractions, "N avos", numbers joined by "e"
    and decimals spoken with "ponto" / "virgula".

    Args:
        text (str): the string to search for a number
    Returns:
        (int) or (float): The value of extracted number, or False when no
        number was found

    """
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = aWords[count + 2] if count + 2 < len(aWords) \
            else None

        # is current word a number?
        if word in _PT_NUMBERS:
            val = _PT_NUMBERS[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_pt(word):
            # fraction words scale whatever was collected so far
            if not result:
                result = 1
            result = result * isFractional_pt(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions ("N avos" divides instead of adding)
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        ands = ["e"]
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_pt(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # treat the part after "e" as decimal places
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for following in newWords:
                        if following == "zero" or following == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_pt(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        decimals = ["ponto", "virgula", "vírgula", ".", ","]
        # a decimal separator only makes sense after an integer part;
        # without one keep scanning instead of building "None.5"
        if next_word in decimals and result is not None:
            zeros = 0
            newWords = aWords[count + 2:]
            for following in newWords:
                if following == "zero" or following == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = str(extractnumber_pt(" ".join(newWords)))
            afterDotVal = zeros * "0" + afterDotVal
            try:
                result = float(str(result) + "." + afterDotVal)
            except ValueError:
                # nothing usable after the separator (e.g. "3 ponto"):
                # keep the integer part collected so far
                pass
            break
        count += 1

    if result is None:
        return False

    # cast floats with no fractional part back to int (2.0 -> 2)
    if "." in str(result):
        integer, dec = str(result).split(".")
        if dec == "0":
            result = int(integer)

    return result
193 
194 
def pt_number_parse(words, i):
    """
    Parse a spelled-out Portuguese number starting at words[i].

    Recognises zero and 1..999999 written with the words in _PT_NUMBERS,
    including the conjunction "e" used in Portuguese between hundreds,
    tens and units ("cento e vinte e um", "dois mil e quinhentos").

    Args:
        words (list of str): tokenised utterance
        i (int): index of the first token to look at
    Returns:
        (value, next_index) when a number starts at words[i], where
        next_index is the index of the first token after the number;
        None otherwise
    """

    def pt_cte(i, s):
        # match the literal token s at position i
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def pt_number_word(i, mi, ma):
        # match a single number word whose value lies in [mi, ma]
        if i < len(words):
            v = _PT_NUMBERS.get(words[i])
            # "is not None" so that the value 0 ("zero") can match
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def pt_skip_e(i):
        # index after an optional "e" conjunction at position i
        r = pt_cte(i, "e")
        return r[1] if r else i

    def pt_number_1_99(i):
        # 1..19 are single words
        r1 = pt_number_word(i, 1, 19)
        if r1:
            return r1

        # 20..90, optionally followed by "e" + unit ("vinte e um")
        r1 = pt_number_word(i, 20, 90)
        if r1:
            v1, i1 = r1
            if v1 % 10 == 0:
                r2 = pt_cte(i1, "e")
                if r2:
                    r3 = pt_number_word(r2[1], 1, 9)
                    if r3:
                        v3, i3 = r3
                        return v1 + v3, i3
            return r1
        return None

    def pt_number_1_999(i):
        # hundreds, optionally followed by ["e"] [1-99]
        r1 = pt_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = pt_number_1_99(pt_skip_e(i1))
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            # a dangling "e" is not consumed
            return r1

        # [1-99]
        return pt_number_1_99(i)

    def pt_number(i):
        # check for zero
        r1 = pt_number_word(i, 0, 0)
        if r1:
            return r1

        # check for [1-999] (mil ["e"] [1-999])?
        r1 = pt_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = pt_cte(i1, "mil")
            if r2:
                i2 = r2[1]
                r3 = pt_number_1_999(pt_skip_e(i2))
                if r3:
                    v3, i3 = r3
                    return v1 * 1000 + v3, i3
                return v1 * 1000, i2
            return r1
        return None

    return pt_number(i)
269 
270 
def normalize_pt(text, remove_articles):
    """
    Normalize a Portuguese string.

    Splits the text on whitespace (which also collapses repeated spaces),
    optionally drops articles, rewrites spelled-out numbers as digits and
    finally passes the result through pt_pruning.

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop words listed in _PT_ARTICLES and let
            pt_pruning remove its word list as well
    Returns:
        (str): the normalized string
    """
    tokens = text.split()
    pieces = []
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]

        # articles are skipped before any number parsing is attempted
        if remove_articles and token in _PT_ARTICLES:
            pos += 1
            continue

        # multi-word numbers: the parser tells us where to resume
        parsed = pt_number_parse(tokens, pos)
        if parsed:
            value, pos = parsed
            pieces.append(str(value))
            continue

        # NOTE temporary, handles some numbers the parser does not cover
        if token in _PT_NUMBERS:
            token = str(_PT_NUMBERS[token])

        pieces.append(token)
        pos += 1

    # some articles in pt-pt can not be removed, but many words can;
    # this is experimental and some meaning may be lost
    return pt_pruning(" ".join(pieces), agressive=remove_articles)
307 
308 
def extract_datetime_pt(input_str, currentDate, default_time):
    """
    Extract a date and/or time from a Portuguese utterance.

    The utterance is cleaned (punctuation, accents, some plurals), scanned
    once for date expressions and once for time expressions; every word
    that was understood is blanked out of the remaining text.

    Args:
        input_str (str): the utterance to parse
        currentDate (datetime): the anchor ("now") that relative
            expressions are resolved against
        default_time: object with ``hour`` and ``minute`` attributes, used
            when no time of day was given; may be None
    Returns:
        [datetime, str]: the extracted datetime and the leftover text
        (passed through pt_pruning), or None when input_str is empty,
        currentDate is falsy, or no date/time expression was found
    """

    def clean_string(s):
        # cleans the input string of unneeded punctuation and
        # capitalization among other things
        symbols = [".", ",", ";", "?", "!", u"º", u"ª"]
        noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de",
                       "ao", "aos"]

        for symbol in symbols:
            s = s.replace(symbol, "")
        for noise in noise_words:
            s = s.replace(" " + noise + " ", " ")
        s = s.lower()
        # strip accents so that e.g. "último" matches "ultimo" below
        for accented, plain in ((u"á", "a"), (u"à", "a"), (u"ã", "a"),
                                (u"â", "a"), (u"ç", "c"), (u"é", "e"),
                                (u"è", "e"), (u"ê", "e"), (u"í", "i"),
                                (u"ó", "o"), (u"ò", "o"), (u"ô", "o"),
                                (u"õ", "o"), (u"ú", "u")):
            s = s.replace(accented, plain)
        s = s.replace("-", " ").replace("_", "")
        # handle synonyms and equivalents, "tomorrow early" = "tomorrow
        # morning"; a list keeps the replacement order deterministic
        synonims = [("manha", ["manhazinha", "cedo", "cedinho"]),
                    ("tarde", ["tardinha"]),
                    ("noite", ["noitinha", "anoitecer"]),
                    ("todos", ["ao", "aos"]),
                    ("em", ["do", "da", "dos", "das", "de"])]
        for syn, variants in synonims:
            for variant in variants:
                s = s.replace(" " + variant + " ", " " + syn + " ")
        # relevant plurals, can't just strip every trailing "s" in pt
        wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos",
                    "minutos", "segundos", "nas", "nos", "proximas",
                    "seguintes", "horas"]
        for plural in wordlist:
            s = s.replace(plural, plural.rstrip('s'))
        s = s.replace("meses", "mes").replace("anteriores", "anterior")
        return s

    def date_found():
        # True when either pass understood at least one expression
        return bool(
            found or
            datestr != "" or
            yearOffset != 0 or monthOffset != 0 or
            dayOffset is not False or hrOffset != 0 or
            hrAbs or minOffset != 0 or
            minAbs or secOffset != 0
        )

    def relative_count(prev_word, next_word):
        # Interpret "<N> unit", "proximo unit", "ultimo unit",
        # "unit seguinte" and "unit passado".
        # Returns (count, numeric, uses_prev, uses_next); count is None
        # when nothing matched, numeric tells whether it came from <N>.
        count = None
        numeric = uses_prev = uses_next = False
        if prev_word and prev_word[0].isdigit():
            count = int(prev_word)
            numeric = uses_prev = True
        if prev_word in nexts:
            count, numeric, uses_prev = 1, False, True
        if prev_word in lasts:
            count, numeric, uses_prev = -1, False, True
        if next_word in suffix_nexts:
            count, numeric, uses_next = 1, False, True
        if next_word in suffix_lasts:
            count, numeric, uses_next = -1, False, True
        return count, numeric, uses_prev, uses_next

    if input_str == "" or not currentDate:
        return None

    found = False
    daySpecified = False
    dayOffset = False  # False means "no day expression seen yet"
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")  # 0 = Sunday ... 6 = Saturday
    fromFlag = False
    datestr = ""  # "<month name> <day> [<year>]" once a date is found
    hasYear = False
    timeQualifier = ""

    words = clean_string(input_str).split(" ")
    timeQualifiersList = ['manha', 'tarde', 'noite']
    time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas",
                       "no", "dia", "hora"]
    days = ['segunda', 'terca', 'quarta',
            'quinta', 'sexta', 'sabado', 'domingo']
    months = ['janeiro', 'fevereiro', 'marco', 'abril', 'maio', 'junho',
              'julho', 'agosto', 'setembro', 'outubro', 'novembro',
              'dezembro']
    monthsShort = ['jan', 'fev', 'mar', 'abr', 'mai', 'jun', 'jul', 'ago',
                   'set', 'out', 'nov', 'dez']
    # every accepted spelling -> index into months; the last entries keep
    # the spellings earlier versions of this parser accepted
    month_index = {}
    for position, name in enumerate(months):
        month_index[name] = position
    for position, name in enumerate(monthsShort):
        month_index[name] = position
    month_index.update({'febreiro': 1, 'feb': 1, 'ag': 7, 'dec': 11})
    full_month_names = months + ['febreiro']

    nexts = ["proximo", "proxima"]
    suffix_nexts = ["seguinte", "subsequente", "seguir"]
    lasts = ["ultimo", "ultima"]
    suffix_lasts = ["passada", "passado", "anterior", "antes"]
    nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"]
    prevs = ["antes", "ante", "previa", "previamente", "anterior"]
    froms = ["partir", "em", "para", "na", "no", "daqui", "seguir",
             "depois", "por", "proxima", "proximo", "da", "do", "de"]
    thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse",
              "nessa"]
    froms += thises
    lists = nxts + prevs + froms + time_indicators
    # words that may follow a "from" word: 5 days from tomorrow,
    # 10 weeks from next thursday, 2 months from July
    validFollowups = days + list(month_index) + [
        "hoje", "amanha", "ontem", "anteontem", "agora", "ja", "ante"]

    # ---- first pass: dates ----
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""

        start = idx  # first word to blank
        used = 0  # number of words to blank, counted from start
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word

        # parse today, tomorrow, yesterday
        elif word == "hoje" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "amanha" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "ontem" and not fromFlag:
            dayOffset -= 1
            used += 1
        # "before yesterday" and "before before yesterday"
        elif (word == "anteontem" or
              (word == "ante" and wordNext == "ontem")) and not fromFlag:
            dayOffset -= 2
            used += 1
            if wordNext == "ontem":
                used += 1
        elif word == "ante" and wordNext == "ante" and wordNextNext == \
                "ontem" and not fromFlag:
            dayOffset -= 3
            used += 3
        elif word == "anteanteontem" and not fromFlag:
            dayOffset -= 3
            used += 1
        # day after tomorrow
        elif word == "depois" and wordNext == "amanha" and not fromFlag:
            dayOffset += 2
            used = 2
        # day before yesterday
        elif word == "antes" and wordNext == "ontem" and not fromFlag:
            dayOffset -= 2
            used = 2
        # parse 5 days, 10 weeks, last week, next week, week after
        elif word == "dia":
            if wordNext == "depois" or wordNext == "antes":
                used += 1
                if wordPrev and wordPrev[0].isdigit():
                    dayOffset += int(wordPrev)
                    start -= 1
                    used += 1
            elif (wordPrev and wordPrev[0].isdigit() and
                  wordNext not in month_index):
                dayOffset += int(wordPrev)
                start -= 1
                used += 2
            elif (wordNext and wordNext[0].isdigit() and
                  wordNextNext not in month_index):
                # the number follows "dia", so blanking starts at "dia"
                dayOffset += int(wordNext)
                used += 2

        elif word == "semana" and not fromFlag:
            count, numeric, usesPrev, usesNext = relative_count(wordPrev,
                                                                wordNext)
            if count is not None:
                if numeric:
                    dayOffset += count * 7
                else:
                    dayOffset = count * 7
                start -= int(usesPrev)
                used = 1 + int(usesPrev) + int(usesNext)
        # parse 10 months, next month, last month
        elif word == "mes" and not fromFlag:
            count, numeric, usesPrev, usesNext = relative_count(wordPrev,
                                                                wordNext)
            if count is not None:
                monthOffset = count
                start -= int(usesPrev)
                used = 1 + int(usesPrev) + int(usesNext)
        # parse 5 years, next year, last year
        elif word == "ano" and not fromFlag:
            count, numeric, usesPrev, usesNext = relative_count(wordPrev,
                                                                wordNext)
            if count is not None:
                yearOffset = count
                start -= int(usesPrev)
                used = 1 + int(usesPrev) + int(usesNext)
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev in nexts:
                dayOffset += 7
                used += 1
                start -= 1
            if wordPrev in lasts:
                dayOffset -= 7
                used += 1
                start -= 1
            # suffixes come after the day name: start stays where it is
            if wordNext in suffix_nexts:
                dayOffset += 7
                used += 1
            if wordNext in suffix_lasts:
                dayOffset -= 7
                used += 1
            if wordNext == "feira":
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif word in month_index:
            used += 1
            datestr = months[month_index[word]]
            if wordPrev and wordPrev[0].isdigit():
                # 13 maio
                datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                # maio 13
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordPrevPrev and wordPrevPrev[0].isdigit():
                # 13 dia maio
                datestr += " " + wordPrevPrev
                start -= 2
                used += 2
                # a numeric next word would have matched the branch above,
                # so no year can follow here
                hasYear = False

            elif wordNextNext and wordNextNext[0].isdigit():
                # maio dia 13
                datestr += " " + wordNextNext
                used += 2
                if wordNextNextNext and wordNextNextNext[0].isdigit():
                    datestr += " " + wordNextNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            # a month without a day is not a date
            if datestr in months:
                datestr = ""

        # TODO debug word "depois" that one is failing for some reason
        if word in froms and wordNext in validFollowups:

            if word not in ("depois", "antes", "em"):
                used = 2
                fromFlag = True
            if wordNext == "amanha" and word != "depois":
                dayOffset += 1
            elif wordNext == "ontem":
                dayOffset -= 1
            elif wordNext == "anteontem":
                dayOffset -= 2
            elif wordNext == "ante" and wordNextNext == "ontem":
                dayOffset -= 2
            elif (wordNext == "ante" and wordNextNext == "ante" and
                  wordNextNextNext == "ontem"):
                dayOffset -= 3
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if wordNextNext == "feira":
                    used += 1
                if tmpOffset < 0:
                    tmpOffset += 7
                if wordNextNext:
                    if wordNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNextNextNext:
                    if wordNextNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
                if wordNextNextNext == "feira":
                    used += 1
        # leave a following month name for its own iteration
        if wordNext in full_month_names:
            used -= 1
        if used > 0:

            if start - 1 > 0 and words[start - 1] in lists:
                start -= 1
                used += 1

            for i in range(0, used):
                if 0 <= start + i < len(words):
                    words[start + i] = ""

            if start - 1 >= 0 and words[start - 1] in lists:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: times ----
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None  # -1 marks "relative time given, no absolute time"
    minAbs = None

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "meio" and wordNext == "dia":
            hrAbs = 12
            used += 2
        elif word == "meia" and wordNext == "noite":
            hrAbs = 0
            used += 2
        elif word == "manha":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word == "tarde":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word == "meio" and wordNext == "tarde":
            if not hrAbs:
                hrAbs = 17
            used += 2
        elif word == "meio" and wordNext == "manha":
            if not hrAbs:
                hrAbs = 10
            used += 2
        elif word == "fim" and wordNext == "tarde":
            if not hrAbs:
                hrAbs = 19
            used += 2
        elif word == "fim" and wordNext == "manha":
            if not hrAbs:
                hrAbs = 11
            used += 2
        elif word == "tantas" and wordNext == "manha":
            if not hrAbs:
                hrAbs = 4
            used += 2
        elif word == "noite":
            if not hrAbs:
                hrAbs = 22
            used += 1
        # parse half an hour, quarter hour
        elif word == "hora" and \
                (wordPrev in time_indicators or wordPrevPrev in
                 time_indicators):
            if wordPrev == "meia":
                minOffset = 30
            elif wordPrev == "quarto":
                minOffset = 15
            elif wordPrevPrev == "quarto":
                minOffset = 15
                if idx > 2 and words[idx - 3] in time_indicators:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in time_indicators:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""  # "am" / "pm" once known
            if ':' in word:
                # parse colons: "<hh>:<mm><suffix>"
                # "3:00 in the morning"
                pos = 0
                while pos < len(word) and word[pos].isdigit():
                    strHH += word[pos]
                    pos += 1
                if pos < len(word) and word[pos] == ":":
                    pos += 1
                    while pos < len(word) and word[pos].isdigit():
                        strMM += word[pos]
                        pos += 1
                remainder = word[pos:].replace(".", "")
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif wordNext == "manha":
                        remainder = "am"
                        used += 1
                    elif wordNext == "tarde":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "noite":
                        # "2:00 da noite" is 2 am, "10:00 da noite" 10 pm
                        if 0 < int(strHH) < 6:
                            remainder = "am"
                        else:
                            remainder = "pm"
                        used += 1
                    elif wordNext in thises and wordNextNext == "manha":
                        remainder = "am"
                        used = 2
                    elif wordNext in thises and wordNextNext == "tarde":
                        remainder = "pm"
                        used = 2
                    elif wordNext in thises and wordNextNext == "noite":
                        remainder = "pm"
                        used = 2
                    elif timeQualifier != "":
                        # fall back on a qualifier seen elsewhere in the
                        # utterance ("... tarde ... 3:00")
                        hour = int(strHH)
                        if hour < 12 and (
                                timeQualifier == "tarde" or
                                (timeQualifier == "noite" and hour >= 6)):
                            strHH = str(hour + 12)

            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = "".join(ch for ch in word if ch.isdigit())
                suffix = "".join(ch for ch in word if not ch.isdigit())
                nextMarker = wordNext.replace(".", "").strip()

                pmOwn = suffix in ("pm", "p.m.")
                amOwn = suffix in ("am", "a.m.")
                pmNext = (wordNext in ("pm", "p.m.") or
                          (suffix == "" and nextMarker == "pm"))
                amNext = (wordNext in ("am", "a.m.") or
                          (suffix == "" and nextMarker == "am"))

                if pmOwn or pmNext:
                    strHH = strNum
                    remainder = "pm"
                    # only consume the next word if the marker was there
                    used = 0 if pmOwn else 1
                elif amOwn or amNext:
                    strHH = strNum
                    remainder = "am"
                    used = 0 if amOwn else 1
                elif wordNext == "tarde":
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif wordNext == "manha":
                    strHH = strNum
                    remainder = "am"
                    used = 1
                elif not word.isdigit():
                    # e.g. "10h" or "3/4": not a plain number
                    isTime = False
                elif int(word) > 100 and wordPrev in ("o", "oh", "zero"):
                    # 0800 hours (pronounced oh-eight-hundred)
                    strHH = int(word) // 100
                    strMM = int(word) - strHH * 100
                    if wordNext == "hora":
                        used += 1
                elif (wordNext == "hora" and
                      word[0] != '0' and
                      (int(word) < 100 or int(word) > 2400)):
                    # ignores military time
                    # "in 3 hours"
                    hrOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "minuto":
                    # "in 10 minutes"
                    minOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "segundo":
                    # in 5 seconds
                    secOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif int(word) > 100:
                    # military time without the leading "oh"
                    strHH = int(word) // 100
                    strMM = int(word) - strHH * 100
                    if wordNext == "hora":
                        used += 1
                elif wordNext == "" or (
                        wordNext == "em" and wordNextNext == "ponto"):
                    strHH = word
                    strMM = 0
                    if wordNext == "em" and wordNextNext == "ponto":
                        used += 2
                        if wordNextNextNext == "tarde":
                            remainder = "pm"
                            used += 1
                        elif wordNextNextNext == "manha":
                            remainder = "am"
                            used += 1
                        elif wordNextNextNext == "noite":
                            if 0 < int(strHH) < 6:
                                remainder = "am"
                            else:
                                remainder = "pm"
                            used += 1
                elif wordNext.isdigit():
                    # "15 30" -> 15:30
                    strHH = word
                    strMM = wordNext
                    used += 1
                    if wordNextNext == "hora":
                        used += 1
                else:
                    isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            if remainder == "pm" and 0 < strHH < 12:
                strHH += 12
            if remainder == "am" and strHH >= 12:
                strHH -= 12
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1

        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                if idx + i < len(words):
                    words[idx + i] = ""

            # also drop the indicator words leading up to the time
            if wordPrev == "ponto" or wordPrev in time_indicators:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in time_indicators:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    extractedDate = dateNow.replace(microsecond=0)
    if hrOffset == 0 and minOffset == 0 and secOffset == 0:
        # without a relative time ("em 10 minutos") work from midnight
        extractedDate = extractedDate.replace(second=0, minute=0, hour=0)

    if datestr != "":
        parts = datestr.split()
        month = months.index(parts[0]) + 1
        day = int(parts[1])
        if hasYear:
            year = int(parts[2])
        elif (dateNow.month, dateNow.day) < (month, day):
            # the date is still ahead of us this year
            year = dateNow.year
        else:
            year = dateNow.year + 1
        extractedDate = extractedDate.replace(year=year, month=month,
                                              day=day)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)
    if (hrAbs or 0) != -1 and (minAbs or 0) != -1:
        if hrAbs is None and minAbs is None and default_time:
            hrAbs = default_time.hour
            minAbs = default_time.minute
        # absolute times are counted from midnight of the chosen day
        extractedDate = extractedDate.replace(hour=0, minute=0, second=0)
        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare time that already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    resultStr = pt_pruning(resultStr)
    return [extractedDate, resultStr]
1091 
1092 
def pt_pruning(text, symbols=True, accents=True, agressive=True):
    """
    Prune a Portuguese string of punctuation, accents and filler words.

    Args:
        text (str): the string to prune
        symbols (bool): remove punctuation and the ordinal marks "º"/"ª",
            and turn "-" and "_" into spaces
        accents (bool): replace accented letters by their plain form
        agressive (bool): drop common articles, prepositions and pronouns
            and collapse repeated whitespace
    Returns:
        (str): the pruned string
    """
    # agressive pt word pruning
    stop_words = ["a", "o", "os", "as", "de", "dos", "das",
                  "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em",
                  "para", "este",
                  "esta", "deste", "desta", "neste", "nesta", "nesse",
                  "nessa", "foi", "que"]
    if symbols:
        for symbol in (".", ",", ";", ":", "!", "?", u"º", u"ª"):
            text = text.replace(symbol, "")
        text = text.replace("-", " ").replace("_", " ")
    if accents:
        plain_forms = (("a", (u"á", u"à", u"ã", u"â")),
                       ("e", (u"ê", u"è", u"é")),
                       ("i", (u"í", u"ì")),
                       ("o", (u"ò", u"ó", u"ô", u"õ")),
                       ("u", (u"ú", u"ù")),
                       ("c", (u"ç",)))
        for plain, accented_chars in plain_forms:
            for accented in accented_chars:
                text = text.replace(accented, plain)
    if agressive:
        # blank the stop words, then collapse the whitespace left behind
        kept = ["" if word in stop_words else word
                for word in text.split(" ")]
        text = ' '.join(" ".join(kept).split())
    return text
1123 
1124 
def get_gender_pt(word, raw_string=""):
    """
    Guess the grammatical gender of a Portuguese word.

    When the word appears in raw_string after another word, the gender of
    that preceding word (typically an article) is used; otherwise the
    gender is guessed from the word's last letter.

    Args:
        word (str): the word to classify; trailing "s" are ignored
        raw_string (str): optional sentence in which to look the word up
    Returns:
        (str) or None: "f" for feminine, "m" for masculine, None when the
        gender cannot be determined
    """
    word = word.rstrip("s")
    gender = None
    words = raw_string.split(" ")
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            # take the gender of the word right before it (e.g. "a", "o")
            gender = get_gender_pt(words[idx - 1])
            break
    if not gender:
        # endswith() is safe for an empty word, unlike word[-1]
        if word.endswith("a"):
            gender = "f"
        if word.endswith("o") or word.endswith("e"):
            gender = "m"
    return gender
def pt_number_parse(words, i)
Definition: parse_pt.py:195
def isFractional_pt(input_str)
Definition: parse_pt.py:31
def get_gender_pt(word, raw_string="")
Definition: parse_pt.py:1125
def extract_datetime_pt(input_str, currentDate, default_time)
Definition: parse_pt.py:309
def look_for_fractions(split_list)
Definition: parse_common.py:36
def extractnumber_pt(text)
Definition: parse_pt.py:64
def pt_pruning(text, symbols=True, accents=True, agressive=True)
Definition: parse_pt.py:1093
def normalize_pt(text, remove_articles)
Definition: parse_pt.py:271


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40