parse_fr.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 """ Parse functions for french (fr)
18 
19  Todo:
20  * extractnumber_fr: ordinal numbers ("cinquième")
21  * extractnumber_fr: numbers greater than 999 999 ("cinq millions")
22  * extract_datetime_fr: "quatrième lundi de janvier"
23  * get_gender_fr
24 """
25 
26 from datetime import datetime
27 from dateutil.relativedelta import relativedelta
28 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
29  extract_numbers_generic
30 from mycroft.util.lang.format_fr import pronounce_number_fr
31 
# Indefinite articles ["un", "une"] cannot be suppressed:
# in French, "un cheval" means both "a horse" and "one horse".
articles_fr = ["le", "la", "du", "de", "les", "des"]

# French number words mapped to their integer value. Regional variants
# ("septante", "octante", "huitante", "nonante") and plural spellings
# ("cents", "milliers", ...) map to the same value as the standard form.
numbers_fr = {
    "zéro": 0,
    "un": 1,
    "une": 1,
    "deux": 2,
    "trois": 3,
    "quatre": 4,
    "cinq": 5,
    "six": 6,
    "sept": 7,
    "huit": 8,
    "neuf": 9,
    "dix": 10,
    "onze": 11,
    "douze": 12,
    "treize": 13,
    "quatorze": 14,
    "quinze": 15,
    "seize": 16,
    "vingt": 20,
    "trente": 30,
    "quarante": 40,
    "cinquante": 50,
    "soixante": 60,
    "soixante-dix": 70,
    "septante": 70,
    "quatre-vingt": 80,
    "quatre-vingts": 80,
    "octante": 80,
    "huitante": 80,
    "quatre-vingt-dix": 90,
    "nonante": 90,
    "cent": 100,
    "cents": 100,
    "mille": 1000,
    "mil": 1000,
    "millier": 1000,
    "milliers": 1000,
    "million": 1000000,
    "millions": 1000000,
    "milliard": 1000000000,
    "milliards": 1000000000}

# Suffixes that turn a digit string into an ordinal ("1er", "2nde", "3ième").
# Every entry needs its own comma: adjacent string literals are silently
# concatenated by Python.
ordinals_fr = ("er", "re", "ère", "nd", "nde", "ième", "ème", "e")
80 
81 
def number_parse_fr(words, i):
    """ Parses a list of words to find a number
    Takes in a list of words (strings without whitespace) and
    extracts a number that starts at the given index.
    Args:
        words (array): the list to extract a number from
        i (int): the index in words where to look for the number
    Returns:
        tuple with number, index of next word after the number.

        Returns None if no number was found.
    """

    def cte_fr(i, s):
        # Check if string s is equal to words[i].
        # If it is return tuple with s, index of next word.
        # If it is not return None.
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def number_word_fr(i, mi, ma):
        # Check if words[i] is a number in numbers_fr between mi and ma.
        # If it is return tuple with number, index of next word.
        # If it is not return None.
        if i >= len(words):
            return None

        val = numbers_fr.get(words[i])
        # Plain dictionary words
        # [0-16,20,30,40,50,60,70,80,90,100,1000,...]
        if val is not None:
            if mi <= val <= ma:
                return val, i + 1
            return None

        # The number may be hyphenated (numbers [17-999])
        splitWord = words[i].split('-')
        if len(splitWord) < 2:
            return None
        val1 = numbers_fr.get(splitWord[0])
        if not val1:
            return None

        i1 = 0  # number of hyphenated parts consumed so far
        val2 = 0
        val3 = 0
        # Hundreds, e.g. "deux-cents", "deux-cent-trois". "cent" drops its
        # "s" when another number follows, so both spellings are accepted
        # (matching only "cents" made "deux-cent-trois" add up to 105).
        if val1 < 10 and splitWord[1] in ("cent", "cents"):
            val1 = val1 * 100
            i1 = 2

        # For [81-99], e.g. "quatre-vingt-deux", possibly after the
        # hundreds: "deux-cent-quatre-vingt-deux"
        if len(splitWord) > i1 + 1 and splitWord[i1] == "quatre" and \
                splitWord[i1 + 1] == "vingt":
            val1 = 80 if i1 == 0 else val1 + 80
            i1 += 2

        # We still found a number
        if i1 == 0:
            i1 = 1

        if len(splitWord) > i1:
            # For [21,31,41,51,61,71]
            if len(splitWord) > i1 + 1 and splitWord[i1] == "et":
                val2 = numbers_fr.get(splitWord[i1 + 1])
                if val2 is not None:
                    i1 += 2
            # For [77-79],[97-99] e.g. "soixante-dix-sept"
            elif splitWord[i1] == "dix" and \
                    len(splitWord) > i1 + 1:
                val2 = numbers_fr.get(splitWord[i1 + 1])
                if val2 is not None:
                    val2 += 10
                    i1 += 2
            else:
                val2 = numbers_fr.get(splitWord[i1])
                if val2 is not None:
                    i1 += 1
                    if len(splitWord) > i1:
                        val3 = numbers_fr.get(splitWord[i1])
                        if val3 is not None:
                            i1 += 1

            if not val2:
                return None
            val = val1 + val2 + (val3 if val3 else 0)
        else:
            # Nothing follows the prefix, e.g. "deux-cents"
            val = val1

        # Every hyphenated part must have been consumed
        if i1 == len(splitWord) and val and ma >= val >= mi:
            return val, i + 1

        return None

    def number_1_99_fr(i):
        # Check if words[i] is a number between 1 and 99.
        # If it is return tuple with number, index of next word.
        # If it is not return None.

        # Is it a number between 1 and 16?
        result1 = number_word_fr(i, 1, 16)
        if result1:
            return result1

        # Is it a number between 10 and 99?
        result1 = number_word_fr(i, 10, 99)
        if result1:
            val1, i1 = result1
            result2 = cte_fr(i1, "et")
            # If the number is not hyphenated [21,31,41,51,61,71]
            if result2:
                i2 = result2[1]
                result3 = number_word_fr(i2, 1, 11)
                if result3:
                    val3, i3 = result3
                    return val1 + val3, i3
            return result1

        # It is not a number
        return None

    def number_1_999_fr(i):
        # Check if words[i] is a number between 1 and 999.
        # If it is return tuple with number, index of next word.
        # If it is not return None.

        # Is it 100 ?
        result = number_word_fr(i, 100, 100)

        # Is it [200,300,400,500,600,700,800,900]?
        if not result:
            resultH1 = number_word_fr(i, 2, 9)
            if resultH1:
                valH1, iH1 = resultH1
                resultH2 = number_word_fr(iH1, 100, 100)
                if resultH2:
                    iH2 = resultH2[1]
                    result = valH1 * 100, iH2

        if result:
            val1, i1 = result
            result2 = number_1_99_fr(i1)
            if result2:
                val2, i2 = result2
                return val1 + val2, i2
            else:
                return result

        # Is it hyphenated? [101-999]
        result = number_word_fr(i, 101, 999)
        if result:
            return result

        # [1-99]
        result = number_1_99_fr(i)
        if result:
            return result

        return None

    def number_1_999999_fr(i):
        """ Find a number in a list of words
        Checks if words[i] is a number between 1 and 999,999.

        Args:
            i (int): the index in words where to look for the number
        Returns:
            tuple with number, index of next word after the number.

            Returns None if no number was found.
        """

        # check for zero
        result1 = number_word_fr(i, 0, 0)
        if result1:
            return result1

        # check for [1-999]
        result1 = number_1_999_fr(i)
        if result1:
            val1, i1 = result1
        else:
            # a bare "mille" counts as one thousand
            val1 = 1
            i1 = i
        # check for 1000
        result2 = number_word_fr(i1, 1000, 1000)
        if result2:
            # it's [1000-999000]
            i2 = result2[1]
            # check again for [1-999]
            result3 = number_1_999_fr(i2)
            if result3:
                val3, i3 = result3
                return val1 * 1000 + val3, i3
            else:
                return val1 * 1000, i2
        elif result1:
            return result1
        return None

    return number_1_999999_fr(i)
278 
279 
def getOrdinal_fr(word):
    """ Get the ordinal number
    Takes in a word (string without whitespace) and
    extracts the ordinal number.
    Args:
        word (string): the word to extract the number from
    Returns:
        number (int)

        Returns None if no ordinal number was found.
    """
    # Only digit-led tokens such as "1er" or "21e" are handled here.
    if not word or not word[0].isdigit():
        return None

    for ordinal in ordinals_fr:
        # The marker must be a suffix. Matching it anywhere in the word and
        # removing every occurrence turned tokens like "1e3" into 13.
        if word.endswith(ordinal):
            digits = word[:-len(ordinal)]
            if digits.isdigit():
                return int(digits)

    return None
299 
300 
def number_ordinal_fr(words, i):
    """ Find an ordinal number in a list of words
    Takes in a list of words (strings without whitespace) and
    extracts an ordinal number that starts at the given index.
    Args:
        words (array): the list to extract a number from
        i (int): the index in words where to look for the ordinal number
    Returns:
        tuple with ordinal number (str),
        index of next word after the number (int).

        Returns None if no ordinal number was found.
    """
    if i >= len(words):
        return None

    strOrd = ""
    # it's already a digit, normalize to "1er" or "5e"
    val1 = getOrdinal_fr(words[i])
    if val1 is not None:
        if val1 == 1:
            strOrd = "1er"
        else:
            strOrd = str(val1) + "e"
        return strOrd, i + 1

    # if it's a big number the beginning should be detected as a number
    result = number_parse_fr(words, i)
    if result:
        val1, i = result
    else:
        val1 = 0

    if i < len(words):
        word = words[i]
        if word in ["premier", "première"]:
            strOrd = "1er"
        elif word == "second":
            strOrd = "2e"
        elif word.endswith("ième"):
            val2 = None
            word = word[:-4]
            # "centième" (hundredth)
            if word == "cent":
                if val1:
                    strOrd = str(val1 * 100) + "e"
                else:
                    strOrd = "100e"
            # "millième" (thousandth)
            elif word == "mill":
                if val1:
                    strOrd = str(val1 * 1000) + "e"
                else:
                    strOrd = "1000e"
            else:
                # "cinquième", "trente-cinquième"
                if word.endswith("cinqu"):
                    word = word[:-1]
                # "neuvième", "dix-neuvième"
                elif word.endswith("neuv"):
                    word = word[:-1] + "f"
                result = number_parse_fr([word], 0)
                if not result:
                    # "trentième", "douzième"
                    word = word + "e"
                    result = number_parse_fr([word], 0)
                if result:
                    # Keep only the value: the index returned here refers
                    # to the one-word list above, not to `words`, and must
                    # not overwrite the caller's position.
                    val2 = result[0]
                if val2 is not None:
                    strOrd = str(val1 + val2) + "e"
        if strOrd:
            return strOrd, i + 1

    return None
373 
374 
def extractnumber_fr(text):
    """Takes in a string and extracts a number.
    Args:
        text (str): the string to extract a number from
    Returns:
        (str): The number extracted or the original text.
    """
    # NOTE(review): the `def` line was absent from this listing; the name and
    # parameter were restored from the docstring and from the reference in
    # extract_numbers_fr -- confirm against the upstream file.

    # normalize text, keep articles for ordinals versus fractionals
    text = normalize_fr(text, False)
    # split words by whitespace
    aWords = text.split()
    count = 0
    result = None
    add = False
    while count < len(aWords):
        val = None
        word = aWords[count]
        wordNext = ""
        wordPrev = ""
        if count < (len(aWords) - 1):
            wordNext = aWords[count + 1]
        if count > 0:
            wordPrev = aWords[count - 1]

        if word in articles_fr:
            count += 1
            continue
        if word in ["et", "plus", "+"]:
            count += 1
            add = True
            continue

        # is current word a numeric number?
        if word.isdigit():
            val = int(word)
            count += 1
        elif is_numeric(word):
            val = float(word)
            count += 1
        elif wordPrev in articles_fr and getOrdinal_fr(word):
            val = getOrdinal_fr(word)
            count += 1
        # is current word the denominator of a fraction?
        elif isFractional_fr(word):
            val = isFractional_fr(word)
            count += 1

        # is current word the numerator of a fraction?
        if val and wordNext:
            valNext = isFractional_fr(wordNext)
            if valNext:
                val = float(val) * valNext
                count += 1

        # Nothing matched so far. Test for None rather than falsiness: a
        # literal "0" has already advanced the cursor once.
        if val is None:
            count += 1
            # is current word a numeric fraction like "2/3"?
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        # is current word followed by a decimal value?
        if wordNext == "virgule":
            newWords = aWords[count + 1:]
            # count the number of zeros after the decimal sign
            zeros = 0
            for decimalWord in newWords:
                if decimalWord == "zéro" or decimalWord == "0":
                    zeros += 1
                else:
                    break
            # Only treat "virgule" as a decimal sign when digits follow the
            # zeros (a comma is also a punctuation sign). The bounds check
            # covers a sentence ending right after "virgule" or its zeros.
            if zeros < len(newWords) and newWords[zeros].isdigit():
                afterDotVal = newWords[zeros]
                count = count + zeros + 2
                if not val:
                    val = 0
                # add the zeros
                afterDotString = zeros * "0" + afterDotVal
                val = float(str(val) + "." + afterDotString)
        if val:
            # "plus 5" with no earlier number has nothing to add to
            if add and result is not None:
                result += val
            else:
                result = val
            add = False

    # no (non-zero) number found: return the normalized text instead
    if not result:
        return normalize_fr(text, True)

    return result
474 
475 
def extract_datetime_fr(string, currentDate, default_time):
    """Extract a date and time from a French sentence.

    Args:
        string (str): the sentence to parse
        currentDate (datetime): reference date against which relative
            expressions ("demain", "dans 3 heures") are resolved
        default_time: object whose ``hour`` and ``minute`` attributes are
            used when the sentence contains no time of day; may be falsy
    Returns:
        [datetime, str]: the extracted date and the words of the sentence
        that were not consumed, joined by single spaces.

        Returns None if string is empty, currentDate is falsy or no date
        or time expression was found.
    """

    def clean_string(s):
        """
        cleans the input string of unneeded punctuation and capitalization
        among other things.
        """
        s = normalize_fr(s, True)
        wordList = s.split()
        for idx, word in enumerate(wordList):
            # remove a trailing comma or dot
            if word[-1] in [",", "."]:
                word = word[:-1]
            wordList[idx] = word

        return wordList

    def date_found():
        return found or \
            (
                datestr != "" or
                yearOffset != 0 or monthOffset != 0 or dayOffset or
                (isTime and (hrAbs or minAbs)) or
                hrOffset != 0 or minOffset != 0 or secOffset != 0
            )

    if string == "" or not currentDate:
        return None

    found = False
    daySpecified = False
    # False means "no day offset seen"; it still adds up like 0
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ["matin", "après-midi", "soir", "nuit"]
    words_in = ["dans", "après"]
    markers = ["à", "dès", "autour", "vers", "environs", "ce",
               "cette"] + words_in
    days = ["lundi", "mardi", "mercredi",
            "jeudi", "vendredi", "samedi", "dimanche"]
    months = ["janvier", "février", "mars", "avril", "mai", "juin",
              "juillet", "août", "septembre", "octobre", "novembre",
              "décembre"]
    monthsShort = ["jan", "fév", "mar", "avr", "mai", "juin", "juil", "aoû",
                   "sept", "oct", "nov", "déc"]
    # English names: datestr is parsed below with strptime's "%B"
    # NOTE(review): "%B" is locale dependent -- presumably this assumes an
    # English/C locale; confirm.
    months_en = ['january', 'february', 'march', 'april', 'may', 'june',
                 'july', 'august', 'september', 'october', 'november',
                 'december']
    # words that may follow "après"/"depuis", e.g. "5 jours après demain",
    # "10 semaines après jeudi prochain", "2 mois après juillet"
    validFollowups = days + months + monthsShort + [
        "aujourd'hui", "demain", "prochain", "prochaine", "suivant",
        "suivante", "dernier", "dernière", "précédent", "précédente",
        "maintenant"]

    words = clean_string(string)

    # first pass: dates and date offsets
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
            used = 1
            if wordPrev in ["ce", "cet", "cette"]:
                used = 2
                start -= 1
        # parse aujourd'hui, demain, après-demain
        elif word == "aujourd'hui" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "demain" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "après-demain" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 jours, 10 semaines, semaine dernière, semaine prochaine
        elif word in ["jour", "jours"]:
            if wordPrev.isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
            # "3e jour"
            elif getOrdinal_fr(wordPrev) is not None:
                dayOffset += getOrdinal_fr(wordPrev) - 1
                start -= 1
                used = 2
        elif word in ["semaine", "semaines"] and not fromFlag:
            # isdigit() on the whole word: wordPrev may be empty (first
            # word of the sentence) or not a plain integer
            if wordPrev.isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordNext in ["prochaine", "suivante"]:
                dayOffset = 7
                used = 2
            elif wordNext in ["dernière", "précédente"]:
                dayOffset = -7
                used = 2
        # parse 10 mois, mois prochain, mois dernier
        elif word == "mois" and not fromFlag:
            if wordPrev.isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordNext in ["prochain", "suivant"]:
                monthOffset = 1
                used = 2
            elif wordNext in ["dernier", "précédent"]:
                monthOffset = -1
                used = 2
        # parse 5 ans, an prochain, année dernière
        elif word in ["an", "ans", "année", "années"] and not fromFlag:
            if wordPrev.isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordNext in ["prochain", "prochaine", "suivant", "suivante"]:
                yearOffset = 1
                used = 2
            elif wordNext in ["dernier", "dernière", "précédent",
                              "précédente"]:
                yearOffset = -1
                used = 2
        # parse lundi, mardi etc., and lundi prochain, mardi dernier, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext in ["prochain", "suivant"]:
                dayOffset += 7
                used += 1
            elif wordNext in ["dernier", "précédent"]:
                dayOffset -= 7
                used += 1
        # parse 15 juillet, 15 juil, 1er janvier
        # (parenthesised so that fromFlag applies to both month lists, as
        # it does for every other branch)
        elif (word in months or word in monthsShort) and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months_en[m]
            # the day must be a plain number for strptime: convert an
            # ordinal such as "1er" instead of passing it through
            dayOfMonth = None
            if wordPrev.isdigit():
                dayOfMonth = wordPrev
            elif getOrdinal_fr(wordPrev) is not None:
                dayOfMonth = str(getOrdinal_fr(wordPrev))
            if dayOfMonth is not None:
                datestr += " " + dayOfMonth
                start -= 1
                used += 1
            else:
                datestr += " 1"
            if wordNext and wordNext[0].isdigit():
                datestr += " " + wordNext
                used += 1
                hasYear = True
            else:
                hasYear = False

        # parse 5 jours après demain, 10 semaines après jeudi prochain,
        # 2 mois après juillet
        if word in ["après", "depuis"] and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "demain":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if wordNextNext == "prochain":
                    tmpOffset += 7
                    used += 1
                elif wordNextNext == "dernier":
                    tmpOffset -= 7
                    used += 1
                elif tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1] in ["ce", "cette"]:
                start -= 1
                used += 1

            # blank out the words that were consumed
            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # parse time
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None
    minAbs = None
    ampm = ""
    isTime = False

    for idx, word in enumerate(words):
        if word == "":
            continue

        # recomputed for this loop: a value left over from the first loop
        # would describe a different word
        wordPrevPrevPrev = words[idx - 3] if idx > 2 else ""
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        used = 0
        start = idx

        # parse midi et quart, minuit et demi, midi 10, minuit moins 20
        if word in ["midi", "minuit"]:
            isTime = True
            if word == "midi":
                hrAbs = 12
                used += 1
            elif word == "minuit":
                hrAbs = 0
                used += 1
            if wordNext.isdigit():
                minAbs = int(wordNext)
                used += 1
            elif wordNext == "et":
                if wordNextNext == "quart":
                    minAbs = 15
                    used += 2
                elif wordNextNext == "demi":
                    minAbs = 30
                    used += 2
            elif wordNext == "moins":
                if wordNextNext.isdigit():
                    minAbs = 60 - int(wordNextNext)
                    if not hrAbs:
                        hrAbs = 23
                    else:
                        hrAbs -= 1
                    used += 2
                if wordNextNext == "quart":
                    minAbs = 45
                    if not hrAbs:
                        hrAbs = 23
                    else:
                        hrAbs -= 1
                    used += 2
        # parse une demi-heure, un quart d'heure
        # ("heure" alone only counts when a marker precedes the quantity)
        elif word == "demi-heure" or (
                word == "heure" and
                (wordPrevPrev in markers or wordPrevPrevPrev in markers)):
            used = 1
            isTime = True
            if word == "demi-heure":
                minOffset = 30
            elif wordPrev == "quart":
                minOffset = 15
                used += 1
                start -= 1
            elif wordPrev == "quarts" and wordPrevPrev.isdigit():
                minOffset = int(wordPrevPrev) * 15
                used += 1
                start -= 1
            if wordPrev.isdigit() or wordPrevPrev.isdigit():
                start -= 1
                used += 1
        # parse 5:00 du matin, 12:00, etc
        elif word[0].isdigit() and getOrdinal_fr(word) is None:
            isTime = True
            if ":" in word or "h" in word or "min" in word:
                # parse hours on short format
                # "3:00 du matin", "4h14", "3h15min"
                strHH = ""
                strMM = ""
                # stage 0: hour digits, 1: minute digits, 2: done
                stage = 0
                for char in word:
                    if stage == 0:
                        if char.isdigit():
                            strHH += char
                            used = 1
                        elif char in [":", "h", "m"]:
                            stage = 1
                        else:
                            stage = 2
                    elif stage == 1:
                        if char.isdigit():
                            strMM += char
                            used = 1
                        else:
                            stage = 2
                    else:
                        break
                if wordPrev in words_in:
                    hrOffset = int(strHH) if strHH else 0
                    minOffset = int(strMM) if strMM else 0
                else:
                    hrAbs = int(strHH) if strHH else 0
                    minAbs = int(strMM) if strMM else 0
            else:
                # try to parse time without colons
                # 5 hours, 10 minutes etc.
                ampm = ""
                if (
                        word.isdigit() and
                        wordNext in ["heures", "heure"] and word != "0" and
                        (
                            int(word) < 100 or
                            int(word) > 2400
                        )):
                    # "dans 3 heures", "à 3 heures"
                    if wordPrev in words_in:
                        hrOffset = int(word)
                    else:
                        hrAbs = int(word)
                    used = 2
                    idxHr = idx + 2
                    # "dans 1 heure 40", "à 1 heure 40"
                    if idxHr < len(words):
                        # "3 heures 45"
                        if words[idxHr].isdigit():
                            if wordPrev in words_in:
                                minOffset = int(words[idxHr])
                            else:
                                minAbs = int(words[idxHr])
                            used += 1
                            idxHr += 1
                        # "3 heures et quart", "4 heures et demi"
                        elif words[idxHr] == "et" and idxHr + 1 < len(words):
                            if words[idxHr + 1] == "quart":
                                if wordPrev in words_in:
                                    minOffset = 15
                                else:
                                    minAbs = 15
                                used += 2
                                idxHr += 2
                            elif words[idxHr + 1] == "demi":
                                if wordPrev in words_in:
                                    minOffset = 30
                                else:
                                    minAbs = 30
                                used += 2
                                idxHr += 2
                        # "5 heures moins 20", "6 heures moins le quart"
                        elif words[idxHr] == "moins" and \
                                idxHr + 1 < len(words):
                            if words[idxHr + 1].isdigit():
                                if wordPrev in words_in:
                                    hrOffset -= 1
                                    minOffset = 60 - int(words[idxHr + 1])
                                else:
                                    hrAbs = hrAbs - 1
                                    minAbs = 60 - int(words[idxHr + 1])
                                used += 2
                                idxHr += 2
                            elif words[idxHr + 1] == "quart":
                                if wordPrev in words_in:
                                    hrOffset -= 1
                                    minOffset = 45
                                else:
                                    hrAbs = hrAbs - 1
                                    minAbs = 45
                                used += 2
                                idxHr += 2
                        # remove word minutes if present
                        if idxHr < len(words) and \
                                words[idxHr] in ["minutes", "minute"]:
                            used += 1
                            idxHr += 1
                elif not word.isdigit():
                    # e.g. "2.5" or "3/4": starts with a digit but is not
                    # an integer, so int(word) below would raise
                    pass
                elif wordNext == "minutes":
                    # "dans 10 minutes"
                    if wordPrev in words_in:
                        minOffset = int(word)
                    else:
                        minAbs = int(word)
                    used = 2
                elif wordNext == "secondes":
                    # "dans 5 secondes"
                    secOffset = int(word)
                    used = 2
                elif int(word) > 100:
                    # military format, e.g. "1430"; floor division keeps
                    # the hour an int so the minutes are not lost
                    hrAbs = int(word) // 100
                    minAbs = int(word) - hrAbs * 100
                    used = 1
                    if wordNext == "heures":
                        used += 1

            # handle am/pm
            if timeQualifier:
                if timeQualifier == "matin":
                    ampm = "am"
                elif timeQualifier == "après-midi":
                    ampm = "pm"
                elif timeQualifier == "soir":
                    ampm = "pm"
                elif timeQualifier == "nuit":
                    if (hrAbs or 0) > 8:
                        ampm = "pm"
                    else:
                        ampm = "am"
            hrAbs = ((hrAbs or 0) + 12 if ampm == "pm" and (hrAbs or 0) < 12
                     else hrAbs)
            hrAbs = ((hrAbs or 0) - 12 if ampm == "am" and (hrAbs or 0) >= 12
                     else hrAbs)
            if (hrAbs or 0) > 24 or ((minAbs or 0) > 59):
                isTime = False
                used = 0
            elif wordPrev in words_in:
                isTime = False
            else:
                isTime = True

        # a time qualifier without an explicit hour implies a default hour
        elif not hrAbs and timeQualifier:
            if timeQualifier == "matin":
                hrAbs = 8
            elif timeQualifier == "après-midi":
                hrAbs = 15
            elif timeQualifier == "soir":
                hrAbs = 19
            elif timeQualifier == "nuit":
                hrAbs = 2
            isTime = True

        if used > 0:
            # removed parsed words from the sentence
            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation
    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        if not hasYear:
            temp = datetime.strptime(datestr, "%B %d")
            temp = temp.replace(year=extractedDate.year)
            # a date already past this year means next year
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
            else:
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
        else:
            temp = datetime.strptime(datestr, "%B %d %Y")
            extractedDate = extractedDate.replace(
                year=int(temp.strftime("%Y")),
                month=int(temp.strftime("%m")),
                day=int(temp.strftime("%d")))

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute
    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        # a bare time that is already past refers to tomorrow
        if (hrAbs or minAbs) and datestr == "":
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "et" left between two consumed expressions; only interior
    # positions have both neighbours, so the edges are left alone
    for idx in range(1, len(words) - 1):
        if words[idx] == "et" and words[idx - 1] == "" and \
                words[idx + 1] == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]
999 
1000 
def isFractional_fr(input_str):
    """
    Tell whether a word names a fraction and, if so, which one.
    Args:
        input_str (str): the word to check
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    # position in this sequence + 1 is the denominator
    denominators = ("entier", "demi", "tiers", "quart", "cinquième",
                    "sixième", "septième", "huitième", "neuvième",
                    "dixième", "onzième", "douzième", "treizième",
                    "quatorzième", "quinzième", "seizième", "dix-septième",
                    "dix-huitième", "dix-neuvième", "vingtième")
    # round denominators that are not part of the sequence above
    extra_denominators = (("trentième", 30), ("centième", 100),
                          ("millième", 1000))

    candidate = input_str.lower()
    # drop a plural "s" (e.g. "quarts"); "tiers" ends in "s" when singular
    if candidate[-1:] == "s" and candidate != "tiers":
        candidate = candidate[:-1]

    if candidate in denominators:
        return 1.0 / (denominators.index(candidate) + 1)

    # digit-based ordinal such as "5e"
    ordinal = getOrdinal_fr(candidate)
    if ordinal:
        return 1.0 / ordinal

    for name, denominator in extra_denominators:
        if candidate == name:
            return 1.0 / denominator

    return False
1031 
1032 
def normalize_fr(text, remove_articles):
    """ French string normalization

    Lower-cases the text, optionally drops articles and elided "l'"/"d'"
    prefixes, removes stand-alone punctuation signs and rewrites number
    words and ordinals as digits.
    Args:
        text (str): the text to normalize
        remove_articles (bool): whether articles are removed
    Returns:
        (str): the normalized words joined by single spaces
    """
    tokens = text.lower().split()  # splitting also collapses extra spaces
    pieces = []
    pos = 0
    total = len(tokens)
    while pos < total:
        if remove_articles:
            # remove articles
            if tokens[pos] in articles_fr:
                pos += 1
                continue
            # remove elided articles
            if tokens[pos][:2] in ["l'", "d'"]:
                tokens[pos] = tokens[pos][2:]

        # remove useless punctuation signs
        if tokens[pos] in ["?", "!", ";", "…"]:
            pos += 1
            continue

        # Normalize ordinal numbers: only looked for right after an article
        if pos > 0 and tokens[pos - 1] in articles_fr:
            ordinal = number_ordinal_fr(tokens, pos)
            if ordinal is not None:
                value, pos = ordinal
                pieces.append(str(value))
                continue

        # Convert numbers into digits
        number = number_parse_fr(tokens, pos)
        if number is not None:
            value, pos = number
            pieces.append(str(value))
            continue

        pieces.append(tokens[pos])
        pos += 1

    return " ".join(pieces)
1068 
1069 
def extract_numbers_fr(text, short_scale=True, ordinals=False):
    """
    Extract every number found in a string.

    Delegates to extract_numbers_generic, handing it the French
    pronounce and extract handlers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million. The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: list of extracted numbers as floats
    """
    options = {"short_scale": short_scale, "ordinals": ordinals}
    numbers = extract_numbers_generic(text, pronounce_number_fr,
                                      extractnumber_fr, **options)
    return numbers
def extract_datetime_fr(string, currentDate, default_time)
Definition: parse_fr.py:476
def number_parse_fr(words, i)
Definition: parse_fr.py:82
def number_ordinal_fr(words, i)
Definition: parse_fr.py:301
def extract_numbers_generic(text, pronounce_handler, extract_handler, short_scale=True, ordinals=False)
Definition: parse_common.py:55
def look_for_fractions(split_list)
Definition: parse_common.py:36
def isFractional_fr(input_str)
Definition: parse_fr.py:1001
def normalize_fr(text, remove_articles)
Definition: parse_fr.py:1033
def extract_numbers_fr(text, short_scale=True, ordinals=False)
Definition: parse_fr.py:1070


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40