parse_it.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 """
18  Parse functions for Italian (IT-IT)
19 
20 """
21 
22 import collections
23 from datetime import datetime
24 from dateutil.relativedelta import relativedelta
25 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
26  extract_numbers_generic
27 from mycroft.util.lang.format_it import LONG_SCALE_IT, SHORT_SCALE_IT, \
28  pronounce_number_it
29 
# Italian ordinal names keyed by their numeric value, using the naming
# selected when callers pass short_scale=True.
# Values are the masculine singular form; plural forms are folded onto
# these by isFractional_it before lookup.
SHORT_ORDINAL_STRING_IT = {
    1: 'primo',
    2: 'secondo',
    3: 'terzo',
    4: 'quarto',
    5: 'quinto',
    6: 'sesto',
    7: 'settimo',
    8: 'ottavo',
    9: 'nono',
    10: 'decimo',
    11: 'undicesimo',
    12: 'dodicesimo',
    13: 'tredicesimo',
    14: 'quattordicesimo',
    15: 'quindicesimo',
    16: 'sedicesimo',
    17: 'diciassettesimo',
    18: 'diciottesimo',
    19: 'diciannovesimo',
    20: 'ventesimo',
    30: 'trentesimo',
    40: 'quarantesimo',
    50: 'cinquantesimo',
    60: 'sessantesimo',
    70: 'settantesimo',
    80: 'ottantesimo',
    90: 'novantesimo',
    1e2: 'centesimo',
    1e3: 'millesimo',
    1e6: 'milionesimo',
    1e9: 'miliardesimo',
    1e12: 'trilionesimo',
    1e15: 'quadrilionesimo',
    1e18: 'quintilionesimo',  # was misspelled 'quintilionesim'
    1e21: 'sestilionesimo',
    1e24: 'settilionesimo',
    1e27: 'ottilionesimo',
    1e30: 'nonilionesimo',
    1e33: 'decilionesimo'
    # TODO > 1e33
}
72 
# Italian ordinal names keyed by their numeric value, using the naming
# selected when callers pass short_scale=False (the default).
# For the entries above 1e12 only the suffix was adapted from the
# cardinal name; they still have to be reviewed once debugging is done.
LONG_ORDINAL_STRING_IT = {
    1: 'primo',
    2: 'secondo',
    3: 'terzo',
    4: 'quarto',
    5: 'quinto',
    6: 'sesto',
    7: 'settimo',
    8: 'ottavo',
    9: 'nono',
    10: 'decimo',
    11: 'undicesimo',
    12: 'dodicesimo',
    13: 'tredicesimo',
    14: 'quattordicesimo',
    15: 'quindicesimo',
    16: 'sedicesimo',
    17: 'diciassettesimo',
    18: 'diciottesimo',
    19: 'diciannovesimo',
    20: 'ventesimo',
    30: 'trentesimo',
    40: 'quarantesimo',
    50: 'cinquantesimo',
    60: 'sessantesimo',
    70: 'settantesimo',
    80: 'ottantesimo',
    90: 'novantesimo',
    1e2: 'centesimo',
    1e3: 'millesimo',
    1e6: 'milionesimo',
    # 1e9 was missing although 'miliardi' (1e9) is a recognised multiplier
    # and the short-scale table already lists it.
    1e9: 'miliardesimo',
    1e12: 'bilionesimo',
    1e18: 'trilionesimo',
    1e24: 'quadrilionesimo',
    1e30: 'quintilionesimo',
    1e36: 'sestilionesimo',
    1e42: 'settilionesimo',
    1e48: 'ottilionesimo',
    1e54: 'nonilionesimo',
    1e60: 'decilionesimo'
    # TODO > 1e60
}
116 
# Definite articles that normalize_it may drop.
# The indefinite articles ['un', 'una', 'un\''] cannot be suppressed:
# in Italian 'un cavallo' means both 'a horse' and 'one horse'.
ARTICLES_IT = ['il', 'lo', 'la', 'i', 'gli', 'le']
120 
# Single-word Italian number names mapped to their numeric value.
STRING_NUM_ITA = {
    # units; the indefinite articles double as "one"
    'zero': 0,
    'un': 1,
    'uno': 1,
    'una': 1,
    'un\'': 1,
    'due': 2,
    'tre': 3,
    'quattro': 4,
    'cinque': 5,
    'sei': 6,
    'sette': 7,
    'otto': 8,
    'nove': 9,
    # ten to nineteen
    'dieci': 10,
    'undici': 11,
    'dodici': 12,
    'tredici': 13,
    'quattordici': 14,
    'quindici': 15,
    'sedici': 16,
    'diciassette': 17,
    'diciotto': 18,
    'diciannove': 19,
    # tens, each followed by its elided stem
    'venti': 20,
    'vent': 20,
    'trenta': 30,
    'trent': 30,
    'quaranta': 40,
    'quarant': 40,
    'cinquanta': 50,
    'cinquant': 50,
    'sessanta': 60,
    'sessant': 60,
    'settanta': 70,
    'settant': 70,
    'ottanta': 80,
    'ottant': 80,
    'novanta': 90,
    'novant': 90,
    # hundreds
    'cento': 100,
    'duecento': 200,
    'trecento': 300,
    'quattrocento': 400,
    'cinquecento': 500,
    'seicento': 600,
    'settecento': 700,
    'ottocento': 800,
    'novecento': 900,
    # thousands and above
    'mille': 1000,
    'mila': 1000,
    'centomila': 100000,
    'milione': 1000000,
    'miliardo': 1000000000,
    # the two ordinals that are always read as numbers
    'primo': 1,
    'secondo': 2,
    # halves and collective nouns
    'mezzo': 0.5,
    'mezza': 0.5,
    'paio': 2,
    'decina': 10,
    'decine': 10,
    'dozzina': 12,
    'dozzine': 12,
    'centinaio': 100,
    'centinaia': 100,
    'migliaio': 1000,
    'migliaia': 1000
}
189 
190 
def isFractional_it(input_str, short_scale=False):
    """
    Check whether the given word names a fraction.

    The word is lower-cased and a trailing plural 'i' is folded to 'o'
    ('terzi' -> 'terzo') before it is looked up among 'intero', 'mezza',
    'mezzo' and the ordinal names greater than two.
    Updated to Italian from the English version 18.8.9.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): use short scale if True, long scale if False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
            (1.0 divided by the denominator the word names)
    """
    word = input_str.lower()
    # normalise plurals; words of one or two letters are left untouched
    if len(word) > 2 and word.endswith('i'):
        word = word[:-1] + "o"

    if short_scale:
        ordinal_names = SHORT_ORDINAL_STRING_IT
    else:
        ordinal_names = LONG_ORDINAL_STRING_IT

    denominators = {"intero": 1, "mezza": 2, "mezzo": 2}
    # 'primo' and 'secondo' are skipped on purpose: they are not read as
    # fractions here.
    for number in ordinal_names:
        if number > 2:
            denominators[ordinal_names[number]] = number

    if word in denominators:
        return 1.0 / denominators[word]
    return False
221 
222 
# Lookup tables for extractnumber_long_it. They are sequences of pairs so
# that the textual substitutions always run in the same order.
_UNITS_LONG_IT = (
    ('zero', 0), ('uno', 1), ('due', 2), ('tre', 3), ('quattro', 4),
    ('cinque', 5), ('sei', 6), ('sette', 7), ('otto', 8), ('nove', 9))

_TENS_LONG_IT = (
    ('dieci', 10), ('venti', 20), ('trenta', 30), ('quaranta', 40),
    ('cinquanta', 50), ('sessanta', 60), ('settanta', 70),
    ('ottanta', 80), ('novanta', 90))

# tens with the final vowel elided, as in 'ventuno' or 'quarantotto'
_TENS_SHORT_LONG_IT = (
    ('vent', 20), ('trent', 30), ('quarant', 40), ('cinquant', 50),
    ('sessant', 60), ('settant', 70), ('ottant', 80), ('novant', 90))

_TEENS_LONG_IT = (
    ('undici', 11), ('dodici', 12), ('tredici', 13), ('quattordici', 14),
    ('quindici', 15), ('sedici', 16), ('diciassette', 17),
    ('diciotto', 18), ('diciannove', 19))

# plural multiples, largest first (currently limited to 1e21)
_MULTIPLES_LONG_IT = (
    ('triliardi', 10 ** 21),  # zetta
    ('trilioni', 10 ** 18),   # exa
    ('biliardi', 10 ** 15),   # peta
    ('bilioni', 10 ** 12),    # tera
    ('miliardi', 10 ** 9),    # giga
    ('milioni', 10 ** 6))     # mega

# singular multiples fused with their leading 'un'
_SINGLE_MULTIPLES_LONG_IT = (
    ('untriliardo', 10 ** 21),
    ('untrilione', 10 ** 18),
    ('unbiliardo', 10 ** 15),
    ('unbilione', 10 ** 12),
    ('unmiliardo', 10 ** 9),
    ('unmilione', 10 ** 6))

# letters to put back on a cardinal stem once '-esimo' has been removed
_ORDINAL_STEM3_LONG_IT = {'tre': '', 'ttr': 'o', 'sei': '', 'ott': 'o'}
_ORDINAL_STEM2_LONG_IT = {'un': 'o', 'du': 'e', 'qu': 'e', 'tt': 'e',
                          'ov': 'e'}


def extractnumber_long_it(word):
    """
    Convert a number written as one long Italian word into its value.

    The word is rewritten step by step into an arithmetic expression made
    of digits, '+' and '*' (for instance 'diecimilaquarantuno' becomes
    '10*1000+40+1'), which is then evaluated. Multiples up to 'triliardi'
    (1e21) are recognised.

    Examples:
        milleventisette -> 1027
        diecimilaquarantuno -> 10041
        centottomiladuecentotredici -> 108213

    Args:
        word (str): the word to convert in number
    Returns:
        (bool) or (int): The extracted number, or False if the word is
            not a number. Words that only contain a numeral fragment
            ('milano' contains 'mila') also yield False.
    """
    # normalise single or plural ordinals (-esimo, -esimi, ...)
    if word[-5:-1] == 'esim':
        base = word[:-5]
        if base[-3:] in _ORDINAL_STEM3_LONG_IT:
            base += _ORDINAL_STEM3_LONG_IT[base[-3:]]
        elif base[-2:] in _ORDINAL_STEM2_LONG_IT:
            base += _ORDINAL_STEM2_LONG_IT[base[-2:]]
        word = base

    # word starting with a singular multiple: 'unmilione...'
    for name, number in _SINGLE_MULTIPLES_LONG_IT:
        components = word.split(name, 1)
        if len(components) == 2 and not components[0]:
            if not components[1]:  # unmilione
                word = str(number)
            else:  # unmilione + x
                word = str(number + extractnumber_long_it(components[1]))

    # plural multiples: '<x>milioni<y>' -> 'x*1000000+y'
    for name, number in _MULTIPLES_LONG_IT:
        components = word.split(name, 1)
        if len(components) == 2:
            if not components[0]:  # the word starts with the multiple
                word = str(number + extractnumber_long_it(components[1]))
            elif not components[1]:
                word = str(extractnumber_long_it(components[0])) + '*' \
                    + str(number)
            else:
                word = str(extractnumber_long_it(components[0])) + '*' \
                    + str(number) + '+' \
                    + str(extractnumber_long_it(components[1]))

    for name, number in _TENS_LONG_IT:
        word = word.replace(name, '+' + str(number))

    for name, number in _TENS_SHORT_LONG_IT:
        word = word.replace(name, '+' + str(number))

    for name, number in _TEENS_LONG_IT:
        word = word.replace(name, '+' + str(number))

    # 'cento' + 'otto' is written 'centotto'; restore the elided vowel so
    # that replacing 'cento' does not swallow the first letter of 'otto'
    word = word.replace('centotto', 'centootto')
    word = word.replace('cento', '+1xx')
    word = word.replace('cent', '+1xx')
    word = word.replace('mille', '+1000')  # unmilionemille
    word = word.replace('mila', '*1000')  # unmilioneduemila

    for name, number in _UNITS_LONG_IT:
        word = word.replace(name, '+' + str(number))

    # resolve every hundreds placeholder, right to left: it is an addend
    # after a round number ('mille' + 'cento'), a factor otherwise
    for _ in range(word.count('+1xx')):
        components = word.rsplit('+1xx', 1)
        if len(components[0]) > 1 and components[0].endswith('0'):
            word = components[0] + '+100' + components[1]
        else:
            word = components[0] + '*100' + components[1]

    # everything left of the last '*1000' is the number of thousands
    components = word.rsplit('*1000', 1)
    if len(components) == 2:
        if components[0].startswith('*'):  # centomila
            components[0] = components[0][1:]
        word = str(extractnumber_long_it(components[0])) + \
            '*1000' + str(components[1])

    # drop a leading operator
    if word.startswith('*') or word.startswith('+'):
        word = word[1:]

    total = 0
    for addend in word.split('+'):
        factors = addend.split('*')
        # any leftover text (including the 'False' of a failed sub-parse)
        # means the word was not a number; report it instead of letting
        # int() raise ValueError
        if not all(factor.isdecimal() for factor in factors):
            return False
        product = 1
        for factor in factors:
            product *= int(factor)
        total += product
    return total
383 
384 
def extractnumber_it(text, short_scale=False, ordinals=False):
    """
    This function extracts a number from a text string,
    handles pronunciations in long scale and short scale

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    The names added for the requested scale and for ordinals are kept in a
    table private to the call, so one call cannot change the result of
    another.

    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
            was found
    """
    # private copy: the shared table must not be modified
    number_names = dict(STRING_NUM_ITA)

    # first, second...
    if ordinals:
        if short_scale:
            ordinal_names = SHORT_ORDINAL_STRING_IT
        else:
            ordinal_names = LONG_ORDINAL_STRING_IT
        for num in ordinal_names:
            number_names[ordinal_names[num]] = num

    # negate next number (-2 = 0 - 2)
    negatives = ['meno']  # 'negativo' is not usual in Italian

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ['decina', 'decine', 'dozzina', 'dozzine',
                  'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila']

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [' e ']

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [' punto ', ' virgola ']

    scale_names = SHORT_SCALE_IT if short_scale else LONG_SCALE_IT
    for num in scale_names:
        num_string = scale_names[num]
        number_names[num_string] = num
        multiplies.append(num_string)

    # '2 e 3/4' and similar cases
    for separator in fraction_marker:
        components = text.split(separator)
        zeros = 0

        if len(components) == 2:
            # count zeros in fraction part
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break
            # ensure first is not a fraction and second is a fraction
            num1 = extractnumber_it(components[0], short_scale=short_scale)
            num2 = extractnumber_it(components[1], short_scale=short_scale)
            if num1 is not None and num2 is not None \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2
            # 'sette e quaranta', 'sette e zero zero due'
            elif num1 is not None and num2 is not None \
                    and num1 >= 1 and num2 > 1:
                return num1 + num2 / pow(10, len(str(num2)) + zeros)

    # '2 punto 5'
    for separator in decimal_marker:
        zeros = 0
        # count zeros in fraction part
        components = text.split(separator)

        if len(components) == 2:
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break

            # int() turns a failed sub-parse (False) into 0
            number = int(extractnumber_it(components[0],
                                          short_scale=short_scale))
            decimal = int(extractnumber_it(components[1],
                                           short_scale=short_scale))
            if number is not None and decimal is not None:
                if '.' not in str(decimal):
                    return number + decimal / pow(10,
                                                  len(str(decimal)) + zeros)

    all_words = text.split()
    # val is deliberately not reset between words: it carries the running
    # value from one word to the next
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(all_words):

        if not word:
            continue
        prev_word = all_words[idx - 1] if idx > 0 else ''
        next_word = all_words[idx + 1] if idx + 1 < len(all_words) else ''

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in number_names:
            val = number_names[word]

        # 'tre quarti', 'un quarto', 'trenta secondi'
        if isFractional_it(word) and prev_val:
            if word[:-1] == 'second' and not ordinals:
                val = prev_val * 2
            else:
                val = prev_val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # 'mezza tazza'
        if val is False:
            val = isFractional_it(word, short_scale=short_scale)

        # '2 quinti'
        if not ordinals:
            next_value = isFractional_it(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        if not val:
            val = extractnumber_long_it(word)

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like '2/3'
            all_pieces = word.split('/')
            if look_for_fractions(all_pieces):
                val = float(all_pieces[0]) / float(all_pieces[1])
        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            elif extractnumber_long_it(word) > 100 and \
                    extractnumber_long_it(next_word) and \
                    next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    if val is not None:
        for addend in to_sum:
            val = val + addend
    return val
564 
565 
def normalize_it(text, remove_articles):
    """
    IT string normalization.

    Collapses repeated whitespace, optionally drops the definite articles
    listed in ARTICLES_IT and rewrites number words as digits, e.g.
    'quarantadue' -> '42'. Indefinite articles are never removed because
    in Italian they also mean "one".

    Args:
        text (str): the sentence to normalize
        remove_articles (bool): drop definite articles if True
    Returns:
        (str): the normalized sentence
    """
    # replace ambiguous words
    text = text.replace('un paio', 'due')

    normalized = []
    # split() also removes extra spaces
    for word in text.split():
        # Italian needs the article to mark grammatical gender, so it is
        # only dropped on request
        if remove_articles and word in ARTICLES_IT:
            continue

        if word in STRING_NUM_ITA:
            word = str(STRING_NUM_ITA[word])

        number = extractnumber_it(word)
        # Only whole numbers are rewritten: forcing every value through
        # int() used to truncate tokens such as '2.7' to '2'.
        if number and number == int(number):
            word = str(int(number))

        normalized.append(word)

    return ' '.join(normalized)
598 
599 
600 def extract_datetime_it(string, dateNow, default_time):
601  def clean_string(s):
602  """
603  cleans the input string of unneeded punctuation and capitalization
604  among other things.
605  Normalize italian plurals
606  """
607  symbols = ['.', ',', ';', '?', '!', 'Āŗ', 'ĀŖ', 'Ā°', 'l\'']
608 
609  for word in symbols:
610  s = s.replace(word, '')
611 
612  s = s.lower().replace('Ć”', 'a').replace('Ć ', 'a').replace('ĆØ', "e'")\
613  .replace('Ć©', "e'").replace('Ƭ', 'i').replace('Ć¹', 'u')\
614  .replace('Ć²', 'o').replace('-', ' ').replace('_', '')
615 
616  # normalizza plurali per semplificare analisi
617  s = s.replace('secondi', 'secondo').replace('minuti', 'minuto')\
618  .replace('ore', 'ora').replace('giorni', 'giorno')\
619  .replace('settimane', 'settimana').replace('mesi', 'mese')\
620  .replace('anni', 'anno').replace('mattino', 'mattina')\
621  .replace('prossima', 'prossimo').replace('questa', 'questo')\
622  .replace('quarti', 'quarto').replace('in punto', 'in_punto')\
623  .replace('decennio', 'decenni').replace('secoli', 'secolo')\
624  .replace('millennio', 'millenni').replace(' un ', ' uno ')\
625  .replace('scorsa', 'scorso').replace('passata', 'passato')\
626  .replace('uno paio', 'due')
627 
628  noise_words = ['dello', 'la', 'del', 'al', 'il', 'di', 'tra', 'lo',
629  'le', 'alle', 'alla', 'dai', 'delle', 'della',
630  'a', 'e\'', 'era', 'questa', 'questo', 'e', 'nel',
631  'nello', 'dallo', ' ']
632 
633  word_list = s.split()
634  word_list = [x for x in word_list if x not in noise_words]
635  # normalizza alcuni formati orari
636  for idx in range(0, len(word_list) - 1):
637  if word_list[idx][0].isdigit() and word_list[idx+1][0].isdigit():
638  num0 = int(word_list[idx])
639  num1 = int(word_list[idx+1])
640  if 0 <= num0 <= 23 and 10 <= num1 <= 59:
641  word_list[idx] = str(num0) + ':' + str(num1)
642  word_list[idx+1] = ''
643 
644  word_list = [x for x in word_list if x]
645 
646  return word_list
647 
648  def date_found():
649  return found or \
650  (datestr != '' or time_str != '' or year_offset != 0 or
651  month_offset != 0 or day_offset is True or hr_offset != 0 or
652  hr_abs or min_offset != 0 or min_abs or sec_offset != 0)
653 
654  if string == '' or not dateNow:
655  return None
656 
657  found = False
658  day_specified = False
659  day_offset = False
660  month_offset = 0
661  year_offset = 0
662  today = dateNow.strftime('%w')
663  current_year = dateNow.strftime('%Y')
664  from_flag = False
665  datestr = ''
666  has_year = False
667  time_qualifier = ''
668  time_qualifiers_am = ['mattina', 'stamani', 'stamane']
669  time_qualifiers_pm = ['pomeriggio', 'sera', 'stasera', 'stanotte']
670  time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm)
671  markers = ['alle', 'in', 'questo', 'per', 'di', 'tra', 'fra', 'entro']
672  days = ['lunedi', 'martedi', 'mercoledi',
673  'giovedi', 'venerdi', 'sabato', 'domenica']
674  months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
675  'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
676  'dicembre']
677  months_short = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago',
678  'set', 'ott', 'nov', 'dic']
679  year_multiples = ['decenni', 'secolo', 'millenni'] # decennio <- decenni
680  time_multiples = ['ora', 'minuto', 'secondo']
681  day_multiples = ['settimana', 'mese', 'anno']
682  noise_words_2 = ['tra', 'di', 'per', 'fra', 'un ', 'uno', 'lo', 'del',
683  'l', 'in_punto', ' ', 'nella', 'dell']
684 
685  words = clean_string(string)
686 
687  for idx, word in enumerate(words):
688  if word == '':
689  continue
690  word_prev_prev = words[idx - 2] if idx > 1 else ''
691  word_prev = words[idx - 1] if idx > 0 else ''
692  word_next = words[idx + 1] if idx + 1 < len(words) else ''
693  word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
694  start = idx
695  used = 0
696  # save timequalifier for later
697  if word == 'adesso' and not datestr:
698  # word == 'ora' va in conflitto con 'tra un ora'
699  words = [x for x in words if x != 'adesso']
700  words = [x for x in words if x]
701  result_str = ' '.join(words)
702  extracted_date = dateNow.replace(microsecond=0)
703  return [extracted_date, result_str]
704 
705  # un paio di o tra tre settimane --> secoli
706  elif extractnumber_it(word) and (word_next in year_multiples or
707  word_next in day_multiples):
708  multiplier = int(extractnumber_it(word))
709  used += 2
710  if word_next == 'decenni':
711  year_offset = multiplier * 10
712  elif word_next == 'secolo':
713  year_offset = multiplier * 100
714  elif word_next == 'millenni':
715  year_offset = multiplier * 1000
716  elif word_next == 'anno':
717  year_offset = multiplier
718  elif word_next == 'mese':
719  month_offset = multiplier
720  elif word_next == 'settimana':
721  day_offset = multiplier * 7
722  elif word in time_qualifiers_list:
723  time_qualifier = word
724  # parse today, tomorrow, day after tomorrow
725  elif word == 'oggi' and not from_flag:
726  day_offset = 0
727  used += 1
728  elif word == 'domani' and not from_flag:
729  day_offset = 1
730  used += 1
731  elif word == 'ieri' and not from_flag:
732  day_offset -= 1
733  used += 1
734  elif word == 'dopodomani' and not from_flag: # after tomorrow
735  day_offset += 2
736  used += 1
737  elif word == 'dopo' and word_next == 'domani' and not from_flag:
738  day_offset += 1
739  used += 2
740  elif word == 'giorno':
741  if word_prev[0].isdigit():
742  day_offset += int(word_prev)
743  start -= 1
744  used = 2
745  if word_next == 'dopo' and word_next_next == 'domani':
746  day_offset += 1
747  used += 2
748  elif word == 'settimana' and not from_flag:
749  if word_prev == 'prossimo':
750  day_offset = 7
751  start -= 1
752  used = 2
753  elif word_prev == 'passato' or word_prev == 'scorso':
754  day_offset = -7
755  start -= 1
756  used = 2
757  elif word_next == 'prossimo':
758  day_offset = 7
759  used += 2
760  elif word_next == 'passato' or word_next == 'scorso':
761  day_offset = -7
762  used += 2
763  # parse next month, last month
764  elif word == 'mese' and not from_flag:
765  if word_prev == 'prossimo':
766  month_offset = 1
767  start -= 1
768  used = 2
769  elif word_prev == 'passato' or word_prev == 'scorso':
770  month_offset = -1
771  start -= 1
772  used = 2
773  elif word_next == 'prossimo':
774  month_offset = 1
775  used += 2
776  elif word_next == 'passato' or word_next == 'scorso':
777  month_offset = -1
778  used += 2
779  # parse next year, last year
780  elif word == 'anno' and not from_flag:
781  if word_prev == 'prossimo': # prossimo anno
782  year_offset = 1
783  start -= 1
784  used = 2
785  elif word_next == 'prossimo': # anno prossimo
786  year_offset = 1
787  used = 2
788  elif word_prev == 'passato' or word_prev == 'scorso':
789  year_offset = -1
790  start -= 1
791  used = 2
792  elif word_next == 'passato' or word_next == 'scorso':
793  year_offset = -1
794  used = 2
795  elif word == 'decenni' and not from_flag:
796  if word_prev == 'prossimo': # prossimo mese
797  year_offset = 10
798  start -= 1
799  used = 2
800  elif word_next == 'prossimo': # mese prossimo
801  year_offset = 10
802  used = 2
803  elif word_prev == 'passato' or word_prev == 'scorso':
804  year_offset = -10
805  start -= 1
806  used = 2
807  elif word_next == 'passato' or word_next == 'scorso':
808  year_offset = -10
809  used = 2
810  # parse Monday, Tuesday, etc., and next Monday,
811  # last Tuesday, etc.
812  elif word in days and not from_flag:
813  ddd = days.index(word)
814  day_offset = (ddd + 1) - int(today)
815  used = 1
816  if day_offset < 0:
817  day_offset += 7
818  if word_prev == 'prossimo':
819  day_offset += 7
820  start -= 1
821  used += 1
822  elif word_prev == 'passato' or word_prev == 'scorso':
823  day_offset -= 7
824  start -= 1
825  used += 1
826  if word_next == 'prossimo':
827  day_offset += 7
828  used += 1
829  elif word_next == 'passato' or word_next == 'scorso':
830  day_offset -= 7
831  used += 1
832  # parse 15 of July, June 20th, Feb 18, 19 of February
833  elif word in months or word in months_short and not from_flag:
834  try:
835  mmm = months.index(word)
836  except ValueError:
837  mmm = months_short.index(word)
838  used += 1
839  datestr = months[mmm]
840  if word_prev and extractnumber_it(word_prev):
841  datestr += ' ' + str(int(extractnumber_it(word_prev)))
842  start -= 1
843  used += 1
844  if word_next and extractnumber_it(word_next):
845  datestr += ' ' + str(int(extractnumber_it(word_next)))
846  used += 1
847  has_year = True
848  else:
849  has_year = False
850  elif word_next and word_next[0].isdigit():
851  datestr += ' ' + word_next
852  used += 1
853  if word_next_next and word_next_next[0].isdigit():
854  datestr += ' ' + word_next_next
855  used += 1
856  has_year = True
857  else:
858  has_year = False
859  # parse 5 days from tomorrow, 10 weeks from next thursday,
860  # 2 months from July
861  validFollowups = days + months + months_short
862  validFollowups.append('oggi')
863  validFollowups.append('domani')
864  validFollowups.append('prossimo')
865  validFollowups.append('passato')
866  validFollowups.append('adesso')
867 
868  if (word == 'da' or word == 'dopo') and word_next in validFollowups:
869  used = 0
870  from_flag = True
871  if word_next == 'domani':
872  day_offset += 1
873  used += 2
874  elif word_next == 'oggi' or word_next == 'adesso':
875  used += 2
876  elif word_next in days:
877  ddd = days.index(word_next)
878  tmp_offset = (ddd + 1) - int(today)
879  used += 2
880  if tmp_offset < 0:
881  tmp_offset += 7
882  if word_next_next == 'prossimo':
883  tmp_offset += 7
884  used += 1
885  elif word_next_next == 'passato' or word_next_next == 'scorso':
886  tmp_offset = (ddd + 1) - int(today)
887  used += 1
888  day_offset += tmp_offset
889  elif word_next_next and word_next_next in days:
890  ddd = days.index(word_next_next)
891  tmp_offset = (ddd + 1) - int(today)
892  if word_next == 'prossimo':
893  tmp_offset += 7
894  # elif word_next == 'passato' or word_next == 'scorso':
895  # tmp_offset -= 7
896  day_offset += tmp_offset
897  used += 3
898 
899  if used > 0:
900  if start - 1 > 0 and words[start - 1] == 'questo':
901  start -= 1
902  used += 1
903 
904  for i in range(0, used):
905  words[i + start] = ''
906 
907  if start - 1 >= 0 and words[start - 1] in markers:
908  words[start - 1] = ''
909  found = True
910  day_specified = True
911 
912  # parse time
913  time_str = ''
914  hr_offset = 0
915  min_offset = 0
916  sec_offset = 0
917  hr_abs = None
918  min_abs = None
919  military = False
920 
921  for idx, word in enumerate(words):
922  if word == '':
923  continue
924  word_prev_prev = words[idx - 2] if idx > 1 else ''
925  word_prev = words[idx - 1] if idx > 0 else ''
926  word_next = words[idx + 1] if idx + 1 < len(words) else ''
927  word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
928  # parse noon, midnight, morning, afternoon, evening
929  used = 0
930  if word == 'mezzogiorno':
931  hr_abs = 12
932  used += 1
933  elif word == 'mezzanotte':
934  hr_abs = 24
935  used += 1
936  if word == 'mezzo' and word_next == 'giorno':
937  hr_abs = 12
938  used += 2
939  elif word == 'mezza' and word_next == 'notte':
940  hr_abs = 24
941  used += 2
942  elif word == 'mattina':
943  if not hr_abs:
944  hr_abs = 8
945  used += 1
946  if word_next and word_next[0].isdigit(): # mattina alle 5
947  hr_abs = int(word_next)
948  used += 1
949  elif word == 'pomeriggio':
950  if not hr_abs:
951  hr_abs = 15
952  used += 1
953  if word_next and word_next[0].isdigit(): # pomeriggio alle 5
954  hr_abs = int(word_next)
955  used += 1
956  if (hr_abs or 0) < 12:
957  hr_abs = (hr_abs or 0) + 12
958  elif word == 'sera':
959  if not hr_abs:
960  hr_abs = 19
961  used += 1
962  if word_next and word_next[0].isdigit() \
963  and ':' not in word_next:
964  hr_abs = int(word_next)
965  used += 1
966  if (hr_abs or 0) < 12:
967  hr_abs = (hr_abs or 0) + 12
968  # da verificare più a fondo
969  elif word == 'presto':
970  hr_abs -= 1
971  used += 1
972  elif word == 'tardi':
973  hr_abs += 1
974  used += 1
975  # un paio di minuti tra cinque minuti tra 5 ore
976  elif extractnumber_it(word) and (word_next in time_multiples):
977  d_time = int(extractnumber_it(word))
978  used += 2
979  if word_next == 'ora':
980  hr_offset = d_time
981  isTime = False
982  hr_abs = -1
983  min_abs = -1
984  elif word_next == 'minuto':
985  min_offset = d_time
986  isTime = False
987  hr_abs = -1
988  min_abs = -1
989  elif word_next == 'secondo':
990  sec_offset = d_time
991  isTime = False
992  hr_abs = -1
993  min_abs = -1
994  elif word == 'mezzora':
995  min_offset = 30
996  used = 1
997  isTime = False
998  hr_abs = -1
999  min_abs = -1
1000  # if word_prev == 'uno' or word_prev == 'una':
1001  # start -= 1
1002  # used += 1
1003  elif extractnumber_it(word) and word_next and \
1004  word_next == 'quarto' and word_next_next == 'ora':
1005  if int(extractnumber_it(word)) == 1 \
1006  or int(extractnumber_it(word)) == 3:
1007  min_offset = 15 * int(extractnumber_it(word))
1008  else: # elimina eventuali errori
1009  min_offset = 15
1010  used = 3
1011  start -= 1
1012  isTime = False
1013  hr_abs = -1
1014  min_abs = -1
1015  elif word[0].isdigit():
1016  isTime = True
1017  str_hh = ''
1018  str_mm = ''
1019  remainder = ''
1020  if ':' in word:
1021  # parse colons
1022  # '3:00 in the morning'
1023  components = word.split(':')
1024  if len(components) == 2:
1025  num0 = int(extractnumber_it(components[0]))
1026  num1 = int(extractnumber_it(components[1]))
1027  if num0 is not False and num1 is not False \
1028  and 0 <= num0 <= 23 and 0 <= num1 <= 59:
1029  str_hh = str(num0)
1030  str_mm = str(num1)
1031  elif 0 < int(extractnumber_it(word)) < 24 \
1032  and word_next != 'quarto':
1033  str_hh = str(int(word))
1034  str_mm = '00'
1035  elif 100 <= int(word) <= 2400:
1036  str_hh = int(word) / 100
1037  str_mm = int(word) - str_hh * 100
1038  military = True
1039  isTime = False
1040  if extractnumber_it(word) and word_next \
1041  and word_next == 'quarto' and word_next_next != 'ora':
1042  if int(extractnumber_it(word)) == 1 \
1043  or int(extractnumber_it(word)) == 3:
1044  str_mm = str(15 * int(extractnumber_it(word)))
1045  else: # elimina eventuali errori
1046  str_mm = '0'
1047  str_hh = str(hr_abs)
1048  used = 2
1049  words[idx + 1] = ''
1050  isTime = False
1051  if extractnumber_it(word) and word_next \
1052  and word_next == 'in_punto':
1053  str_hh = str(int(extractnumber_it(word)))
1054  used = 2
1055  if word_next == 'pm':
1056  remainder = 'pm'
1057  hr_abs = int(str_hh)
1058  min_abs = int(str_mm)
1059  if hr_abs <= 12:
1060  hr_abs = hr_abs + 12
1061  used = 2
1062  elif word_next == 'am':
1063  remainder = 'am'
1064  hr_abs = int(str_hh)
1065  min_abs = int(str_mm)
1066  used = 2
1067  elif word_next == 'mattina':
1068  # ' 11 del mattina'
1069  hh = int(str_hh)
1070  mm = int(str_mm)
1071  used = 2
1072  remainder = 'am'
1073  isTime = False
1074  hr_abs = hh
1075  min_abs = mm
1076  elif word_next == 'pomeriggio':
1077  # ' 2 del pomeriggio'
1078  hh = int(str_hh)
1079  mm = int(str_mm)
1080  if hh < 12:
1081  hh += 12
1082  used = 2
1083  remainder = 'pm'
1084  isTime = False
1085  hr_abs = hh
1086  min_abs = mm
1087  elif word_next == 'sera':
1088  # 'alle 8 di sera'
1089  hh = int(str_hh)
1090  mm = int(str_mm)
1091  if hh < 12:
1092  hh += 12
1093  used = 2
1094  remainder = 'pm'
1095  isTime = False
1096  hr_abs = hh
1097  min_abs = mm
1098  elif word_next == 'notte':
1099  hh = int(str_hh)
1100  mm = int(str_mm)
1101  if hh > 5:
1102  remainder = 'pm'
1103  else:
1104  remainder = 'am'
1105  used = 2
1106  isTime = False
1107  hr_abs = hh
1108  min_abs = mm
1109  # parse half an hour : undici e mezza
1110  elif word_next and word_next == 'mezza':
1111  hr_abs = int(str_hh)
1112  min_abs = 30
1113  used = 2
1114  isTime = False
1115  elif word_next and word_next == 'in_punto':
1116  hr_abs = int(str_hh)
1117  min_abs = 0
1118  str_mm = '0'
1119  used = 2
1120  isTime = False
1121  else:
1122  # 17:30
1123  remainder = ''
1124  hr_abs = int(str_hh)
1125  min_abs = int(str_mm)
1126  used = 1
1127  isTime = False
1128  if word_prev == 'ora':
1129  words[idx - 1] = ''
1130 
1131  if time_qualifier != '':
1132  # military = True
1133  if str_hh and int(str_hh) <= 12 and \
1134  (time_qualifier in time_qualifiers_pm):
1135  str_hh = str(int(str_hh) + 12)
1136  else:
1137  isTime = False
1138 
1139  str_hh = int(str_hh) if str_hh else 0
1140  str_mm = int(str_mm) if str_mm else 0
1141 
1142  str_hh = str_hh + 12 if remainder == 'pm' \
1143  and str_hh < 12 else str_hh
1144  str_hh = str_hh - 12 if remainder == 'am' \
1145  and str_hh >= 12 else str_hh
1146 
1147  if (not military and
1148  remainder not in ['am', 'pm'] and
1149  ((not day_specified) or day_offset < 1)):
1150  # ambiguous time, detect whether they mean this evening or
1151  # the next morning based on whether it has already passed
1152  hr_abs = str_hh
1153  if dateNow.hour < str_hh:
1154  pass # No modification needed
1155  elif dateNow.hour < str_hh + 12:
1156  str_hh += 12
1157  hr_abs = str_hh
1158  else:
1159  # has passed, assume the next morning
1160  day_offset += 1
1161 
1162  if time_qualifier in time_qualifiers_pm and str_hh < 12:
1163  str_hh += 12
1164 
1165  if str_hh > 24 or str_mm > 59:
1166  isTime = False
1167  used = 0
1168  if isTime:
1169  hr_abs = str_hh * 1
1170  min_abs = str_mm * 1
1171  used += 1
1172 
1173  if (hr_abs or 0) <= 12 and (time_qualifier == 'sera' or
1174  time_qualifier == 'pomeriggio'):
1175  hr_abs = (hr_abs or 0) + 12
1176 
1177  if used > 0:
1178  # removed parsed words from the sentence
1179  for i in range(used):
1180  words[idx + i] = ''
1181 
1182  if word_prev == 'o' or word_prev == 'oh':
1183  words[words.index(word_prev)] = ''
1184 
1185  if idx > 0 and word_prev in markers:
1186  words[idx - 1] = ''
1187  if idx > 1 and word_prev_prev in markers:
1188  words[idx - 2] = ''
1189 
1190  idx += used - 1
1191  found = True
1192 
1193  # check that we found a date
1194  if not date_found:
1195  return None
1196 
1197  if day_offset is False:
1198  day_offset = 0
1199 
1200  # perform date manipulation
1201 
1202  extracted_date = dateNow.replace(microsecond=0)
1203 
1204  if datestr != '':
1205  en_months = ['january', 'february', 'march', 'april', 'may', 'june',
1206  'july', 'august', 'september', 'october', 'november',
1207  'december']
1208  en_months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
1209  'aug', 'sept', 'oct', 'nov', 'dec']
1210 
1211  for idx, en_month in enumerate(en_months):
1212  datestr = datestr.replace(months[idx], en_month)
1213 
1214  for idx, en_month in enumerate(en_months_short):
1215  datestr = datestr.replace(months_short[idx], en_month)
1216 
1217  try:
1218  temp = datetime.strptime(datestr, '%B %d')
1219  except ValueError:
1220  # Try again, allowing the year
1221  temp = datetime.strptime(datestr, '%B %d %Y')
1222  extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
1223  if not has_year:
1224  temp = temp.replace(year=extracted_date.year,
1225  tzinfo=extracted_date.tzinfo)
1226  if extracted_date < temp:
1227  extracted_date = extracted_date.replace(
1228  year=int(current_year),
1229  month=int(temp.strftime('%m')),
1230  day=int(temp.strftime('%d')),
1231  tzinfo=extracted_date.tzinfo)
1232  else:
1233  extracted_date = extracted_date.replace(
1234  year=int(current_year) + 1,
1235  month=int(temp.strftime('%m')),
1236  day=int(temp.strftime('%d')),
1237  tzinfo=extracted_date.tzinfo)
1238  else:
1239  extracted_date = extracted_date.replace(
1240  year=int(temp.strftime('%Y')),
1241  month=int(temp.strftime('%m')),
1242  day=int(temp.strftime('%d')),
1243  tzinfo=extracted_date.tzinfo)
1244  else:
1245  # ignore the current HH:MM:SS if relative using days or greater
1246  if hr_offset == 0 and min_offset == 0 and sec_offset == 0:
1247  extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
1248 
1249  if year_offset != 0:
1250  extracted_date = extracted_date + relativedelta(years=year_offset)
1251  if month_offset != 0:
1252  extracted_date = extracted_date + relativedelta(months=month_offset)
1253  if day_offset != 0:
1254  extracted_date = extracted_date + relativedelta(days=day_offset)
1255  if hr_abs != -1 and min_abs != -1:
1256  # If no time was supplied in the string set the time to default
1257  # time if it's available
1258  if hr_abs is None and min_abs is None and default_time is not None:
1259  hr_abs, min_abs = default_time.hour, default_time.minute
1260  else:
1261  hr_abs = hr_abs or 0
1262  min_abs = min_abs or 0
1263 
1264  extracted_date = extracted_date + relativedelta(hours=hr_abs,
1265  minutes=min_abs)
1266  if (hr_abs != 0 or min_abs != 0) and datestr == '':
1267  if not day_specified and dateNow > extracted_date:
1268  extracted_date = extracted_date + relativedelta(days=1)
1269  if hr_offset != 0:
1270  extracted_date = extracted_date + relativedelta(hours=hr_offset)
1271  if min_offset != 0:
1272  extracted_date = extracted_date + relativedelta(minutes=min_offset)
1273  if sec_offset != 0:
1274  extracted_date = extracted_date + relativedelta(seconds=sec_offset)
1275 
1276  words = [x for x in words if x not in noise_words_2]
1277  words = [x for x in words if x]
1278  result_str = ' '.join(words)
1279 
1280  return [extracted_date, result_str]
1281 
1282 
def get_gender_it(word, raw_string=""):
    """
    Guess the grammatical gender of an Italian word.

    In Italian the gender of a word is better determined by the article
    (or, more generally, the token) that precedes it than by its own
    last letter, so the preceding token in raw_string is checked first.

    Args:
        word (str): the word whose gender is wanted
        raw_string (str): optional sentence containing the word; it is
            split on single spaces and the token right before the first
            occurrence of word (other than at position 0) is used to
            infer the gender
    Returns:
        str or None: 'f' for feminine, 'm' for masculine, None when
            neither the preceding token nor the word ending is recognised

    TODO: check if useful
    """
    gender = None
    tokens = raw_string.split(' ')
    for idx, token in enumerate(tokens):
        if token == word and idx != 0:
            # The gender of the preceding token (usually the article)
            # takes precedence over the ending of the word itself.
            gender = get_gender_it(tokens[idx - 1])
            break

    if not gender:
        # Slice rather than index: consecutive spaces in raw_string yield
        # empty tokens, and word[-1] would raise IndexError on ''.
        ending = word[-1:]
        if ending in ('a', 'e'):
            gender = 'f'
        elif ending in ('o', 'n', 'l', 'i'):
            gender = 'm'

    return gender
1308 
1309 
def extract_numbers_it(text, short_scale=False, ordinals=False):
    """
    Extract every number found in an Italian string.

    Thin wrapper that delegates to extract_numbers_generic, supplying the
    Italian handlers pronounce_number_it and extractnumber_it.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): forwarded unchanged to the generic extractor;
            selects "short scale" rather than "long scale" naming for
            large numbers. Defaults to False in this function.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): forwarded unchanged; consider ordinal numbers,
            e.g. third=3 instead of 1/3
    Returns:
        list: whatever extract_numbers_generic returns for these handlers
            (a list of the extracted numbers)
    """
    options = {'short_scale': short_scale, 'ordinals': ordinals}
    return extract_numbers_generic(text, pronounce_number_it,
                                   extractnumber_it, **options)
def extractnumber_it(text, short_scale=False, ordinals=False)
Definition: parse_it.py:385
def normalize_it(text, remove_articles)
Definition: parse_it.py:566
def extract_datetime_it(string, dateNow, default_time)
Definition: parse_it.py:600
def isFractional_it(input_str, short_scale=False)
Definition: parse_it.py:191
def extractnumber_long_it(word)
Definition: parse_it.py:223
def extract_numbers_generic(text, pronounce_handler, extract_handler, short_scale=True, ordinals=False)
Definition: parse_common.py:55
def extract_numbers_it(text, short_scale=False, ordinals=False)
Definition: parse_it.py:1310
def look_for_fractions(split_list)
Definition: parse_common.py:36
def get_gender_it(word, raw_string="")
Definition: parse_it.py:1283


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40