mycroft_ros: parse_it.py Source File

Go to the documentation of this file.
 # -*- coding: utf-8 -*-
 #
 # Copyright 2017 Mycroft AI Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 """
     Parse functions for Italian (IT-IT)
 
 """
 
 import collections
 from datetime import datetime
 from dateutil.relativedelta import relativedelta
 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
     extract_numbers_generic
 from mycroft.util.lang.format_it import LONG_SCALE_IT, SHORT_SCALE_IT, \
     pronounce_number_it
 
 SHORT_ORDINAL_STRING_IT = {
     1: 'primo',
     2: 'secondo',
     3: 'terzo',
     4: 'quarto',
     5: 'quinto',
     6: 'sesto',
     7: 'settimo',
     8: 'ottavo',
     9: 'nono',
     10: 'decimo',
     11: 'undicesimo',
     12: 'dodicesimo',
     13: 'tredicesimo',
     14: 'quattordicesimo',
     15: 'quindicesimo',
     16: 'sedicesimo',
     17: 'diciassettesimo',
     18: 'diciottesimo',
     19: 'diciannovesimo',
     20: 'ventesimo',
     30: 'trentesimo',
     40: 'quarantesimo',
     50: 'cinquantesimo',
     60: 'sessantesimo',
     70: 'settantesimo',
     80: 'ottantesimo',
     90: 'novantesimo',
     1e2: 'centesimo',
     1e3: 'millesimo',
     1e6: 'milionesimo',
     1e9: 'miliardesimo',
     1e12: 'trilionesimo',
     1e15: 'quadrilionesimo',
     1e18: 'quintilionesim',
     1e21: 'sestilionesimo',
     1e24: 'settilionesimo',
     1e27: 'ottilionesimo',
     1e30: 'nonilionesimo',
     1e33: 'decilionesimo'
     # TODO > 1e-33
 }
 
 #  per i > 10e12 modificata solo la desinenza: da sistemare a fine debug
 LONG_ORDINAL_STRING_IT = {
     1: 'primo',
     2: 'secondo',
     3: 'terzo',
     4: 'quarto',
     5: 'quinto',
     6: 'sesto',
     7: 'settimo',
     8: 'ottavo',
     9: 'nono',
     10: 'decimo',
     11: 'undicesimo',
     12: 'dodicesimo',
     13: 'tredicesimo',
     14: 'quattordicesimo',
     15: 'quindicesimo',
     16: 'sedicesimo',
     17: 'diciassettesimo',
     18: 'diciottesimo',
     19: 'diciannovesimo',
     20: 'ventesimo',
     30: 'trentesimo',
     40: 'quarantesimo',
     50: 'cinquantesimo',
     60: 'sessantesimo',
     70: 'settantesimo',
     80: 'ottantesimo',
     90: 'novantesimo',
     1e2: 'centesimo',
     1e3: 'millesimo',
     1e6: 'milionesimo',
     1e12: 'bilionesimo',
     1e18: 'trilionesimo',
     1e24: 'quadrilionesimo',
     1e30: 'quintilionesimo',
     1e36: 'sestilionesimo',
     1e42: 'settilionesimo',
     1e48: 'ottilionesimo',
     1e54: 'nonilionesimo',
     1e60: 'decilionesimo'
     # TODO > 1e60
 }
 
 # Undefined articles ['un', 'una', 'un\''] can not be supressed,
 # in Italian, 'un cavallo' means 'a horse' or 'one horse'.
 ARTICLES_IT = ['il', 'lo', 'la', 'i', 'gli', 'le']
 
 STRING_NUM_ITA = {
     'zero': 0,
     'un': 1,
     'uno': 1,
     'una': 1,
     'un\'': 1,
     'due': 2,
     'tre': 3,
     'quattro': 4,
     'cinque': 5,
     'sei': 6,
     'sette': 7,
     'otto': 8,
     'nove': 9,
     'dieci': 10,
     'undici': 11,
     'dodici': 12,
     'tredici': 13,
     'quattordici': 14,
     'quindici': 15,
     'sedici': 16,
     'diciassette': 17,
     'diciotto': 18,
     'diciannove': 19,
     'venti': 20,
     'vent': 20,
     'trenta': 30,
     'trent': 30,
     'quaranta': 40,
     'quarant': 40,
     'cinquanta': 50,
     'cinquant': 50,
     'sessanta': 60,
     'sessant': 60,
     'settanta': 70,
     'settant': 70,
     'ottanta': 80,
     'ottant': 80,
     'novanta': 90,
     'novant': 90,
     'cento': 100,
     'duecento': 200,
     'trecento': 300,
     'quattrocento': 400,
     'cinquecento': 500,
     'seicento': 600,
     'settecento': 700,
     'ottocento': 800,
     'novecento': 900,
     'mille': 1000,
     'mila': 1000,
     'centomila': 100000,
     'milione': 1000000,
     'miliardo': 1000000000,
     'primo': 1,
     'secondo': 2,
     'mezzo': 0.5,
     'mezza': 0.5,
     'paio': 2,
     'decina': 10,
     'decine': 10,
     'dozzina': 12,
     'dozzine': 12,
     'centinaio': 100,
     'centinaia': 100,
     'migliaio': 1000,
     'migliaia': 1000
 }
 
 
 def isFractional_it(input_str, short_scale=False):
     """
     This function takes the given text and checks if it is a fraction.
     Updated to italian from en version 18.8.9
 
     Args:
         input_str (str): the string to check if fractional
         short_scale (bool): use short scale if True, long scale if False
     Returns:
         (bool) or (float): False if not a fraction, otherwise the fraction
 
     """
     input_str = input_str.lower()
     if input_str.endswith('i', -1) and len(input_str) > 2:
         input_str = input_str[:-1] + "o"  # normalizza plurali
 
     fracts_it = {"intero": 1, "mezza": 2, "mezzo": 2}
 
     if short_scale:
         for num in SHORT_ORDINAL_STRING_IT:
             if num > 2:
                 fracts_it[SHORT_ORDINAL_STRING_IT[num]] = num
     else:
         for num in LONG_ORDINAL_STRING_IT:
             if num > 2:
                 fracts_it[LONG_ORDINAL_STRING_IT[num]] = num
 
     if input_str in fracts_it:
         return 1.0 / fracts_it[input_str]
     return False
 
 
 def extractnumber_long_it(word):
     """
      This function converts a long textual number like
      milleventisette -> 1027 diecimila -> 10041 in
      integer value, covers from  0 to 999999999999999
      for now limited to 999_e21 but ready for 999_e63
      example:
         milleventisette -> 1027
         diecimilaquarantuno-> 10041
         centottomiladuecentotredici -> 108213
     Args:
          word (str): the word to convert in number
     Returns:
          (bool) or (int): The extracted number or False if no number
                                    was found
     """
 
     units = {'zero': 0, 'uno': 1, 'due': 2, 'tre': 3, 'quattro': 4,
              'cinque': 5, 'sei': 6, 'sette': 7, 'otto': 8, 'nove': 9}
 
     tens = {'dieci': 10, 'venti': 20, 'trenta': 30, 'quaranta': 40,
             'cinquanta': 50, 'sessanta': 60, 'settanta': 70, 'ottanta': 80,
             'novanta': 90}
 
     tens_short = {'vent': 20, 'trent': 30, 'quarant': 40, 'cinquant': 50,
                   'sessant': 60, 'settant': 70, 'ottant': 80, 'novant': 90}
 
     nums_long = {'undici': 11, 'dodici': 12, 'tredici': 13, 'quattordici': 14,
                  'quindici': 15, 'sedici': 16, 'diciassette': 17,
                  'diciotto': 18, 'diciannove': 19}
 
     multipli_it = collections.OrderedDict([
         # (1e63, 'deciliardi'),
         # (1e60, 'decilioni'),
         # (1e57, 'noviliardi'),
         # (1e54, 'novilioni'),
         # (1e51, 'ottiliardi'),
         # (1e48, 'ottilioni'),
         # (1e45, 'settiliardi'),
         # (1e42, 'settilioni'),
         # (1e39, 'sestiliardi'),
         # (1e36, 'sestilioni'),
         # (1e33, 'quintiliardi'),
         # (1e30, 'quintilioni'),
         # (1e27, 'quadriliardi'),
         # (1e24, 'quadrilioni'),    # yotta
         (1e21, 'triliardi'),      # zetta
         (1e18, 'trilioni'),       # exa
         (1e15, 'biliardi'),       # peta
         (1e12, 'bilioni'),        # tera
         (1e9, 'miliardi'),        # giga
         (1e6, 'milioni')          # mega
     ])
 
     multiplier = {}
     un_multiplier = {}
 
     for num in multipli_it:
         if num > 1000 and num <= 1e21:
             # plurali
             multiplier[multipli_it[num]] = int(num)
             # singolari - modificare per eccezioni *liardo
             if multipli_it[num][-5:-1] == 'iard':
                 un_multiplier['un' + multipli_it[num][:-1] + 'o'] = int(num)
             else:
                 un_multiplier['un' + multipli_it[num][:-1] + 'e'] = int(num)
 
     value = False
 
     # normalizza ordinali singoli o plurali -esimo -esimi
     if word[-5:-1] == 'esim':
         base = word[:-5]
         normalize_ita3 = {'tre': '', 'ttr': 'o', 'sei': '', 'ott': 'o'}
         normalize_ita2 = {'un': 'o', 'du': 'e', 'qu': 'e', 'tt': 'e',
                           'ov': 'e'}
 
         if base[-3:] in normalize_ita3:
             base += normalize_ita3[base[-3:]]
         elif base[-2:] in normalize_ita2:
             base += normalize_ita2[base[-2:]]
 
         word = base
 
     for item in un_multiplier:
         components = word.split(item, 1)
         if len(components) == 2:
             if not components[0]:  # inizia con un1^x
                 if not components[1]:  # unmilione
                     word = str(int(un_multiplier[item]))
                 else:                  # unmilione + x
                     word = str(int(un_multiplier[item]) +
                                extractnumber_long_it(components[1]))
 
     for item in multiplier:
         components = word.split(item, 1)
         if len(components) == 2:
             if not components[0]:  # inizia con un1^x
                 word = str(int(multiplier[item]) +
                            extractnumber_long_it(components[1]))
             else:
                 if not components[1]:
                     word = str(extractnumber_long_it(components[0])) + '*' \
                         + str(int(multiplier[item]))
                 else:
                     word = str(extractnumber_long_it(components[0])) + '*' \
                         + str(int(multiplier[item])) + '+' \
                         + str(extractnumber_long_it(components[1]))
 
     for item in tens:
         word = word.replace(item, '+' + str(tens[item]))
 
     for item in tens_short:
         word = word.replace(item, '+' + str(tens_short[item]))
 
     for item in nums_long:
         word = word.replace(item, '+' + str(nums_long[item]))
 
     word = word.replace('cento', '+1xx')
     word = word.replace('cent', '+1xx')
     word = word.replace('mille', '+1000')   # unmilionemille
     word = word.replace('mila', '*1000')   # unmilioneduemila
 
     for item in units:
         word = word.replace(item, '+' + str(units[item]))
 
     # normalizzo i cento
     occorrenze = word.count('+1xx')
     for _ in range(0, occorrenze):
         components = word.rsplit('+1xx', 1)
         if len(components[0]) > 1 and components[0].endswith('0'):
             word = components[0] + '+100' + components[1]
         else:
             word = components[0] + '*100' + components[1]
 
     components = word.rsplit('*1000', 1)
     if len(components) == 2:
         if components[0].startswith('*'):  # centomila
             components[0] = components[0][1:]
         word = str(extractnumber_long_it(components[0])) + \
             '*1000' + str(components[1])
 
     # gestione eccezioni
     if word.startswith('*') or word.startswith('+'):
         word = word[1:]
 
     addends = word.split('+')
     for c, _ in enumerate(addends):
         if '*' in addends[c]:
             factors = addends[c].split('*')
             result = int(factors[0]) * int(factors[1])
             if len(factors) == 3:
                 result *= int(factors[2])
             addends[c] = str(result)
 
     # check if all token are numbers
     if all([s.isdecimal() for s in addends]):
         value = sum([int(s) for s in addends])
     else:
         value = False
     return value
 
 
 def extractnumber_it(text, short_scale=False, ordinals=False):
     """
     This function extracts a number from a text string,
     handles pronunciations in long scale and short scale
 
     https://en.wikipedia.org/wiki/Names_of_large_numbers
 
     Args:
         text (str): the string to normalize
         short_scale (bool): use short scale if True, long scale if False
         ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
     Returns:
         (int) or (float) or False: The extracted number or False if no number
                                    was found
 
     """
 
     string_num_ordinal_it = {}
     # first, second...
     if ordinals:
         if short_scale:
             for num in SHORT_ORDINAL_STRING_IT:
                 num_string = SHORT_ORDINAL_STRING_IT[num]
                 string_num_ordinal_it[num_string] = num
                 STRING_NUM_ITA[num_string] = num
         else:
             for num in LONG_ORDINAL_STRING_IT:
                 num_string = LONG_ORDINAL_STRING_IT[num]
                 string_num_ordinal_it[num_string] = num
                 STRING_NUM_ITA[num_string] = num
 
     # negate next number (-2 = 0 - 2)
     negatives = ['meno']  # 'negativo' non è usuale in italiano
 
     # multiply the previous number (one hundred = 1 * 100)
     multiplies = ['decina', 'decine', 'dozzina', 'dozzine',
                   'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila']
 
     # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
     fraction_marker = [' e ']
 
     # decimal marker ( 1 point 5 = 1 + 0.5)
     decimal_marker = [' punto ', ' virgola ']
 
     if short_scale:
         for num in SHORT_SCALE_IT:
             num_string = SHORT_SCALE_IT[num]
             STRING_NUM_ITA[num_string] = num
             multiplies.append(num_string)
     else:
         for num in LONG_SCALE_IT:
             num_string = LONG_SCALE_IT[num]
             STRING_NUM_ITA[num_string] = num
             multiplies.append(num_string)
 
     # 2 e 3/4 ed altri casi
     for separator in fraction_marker:
         components = text.split(separator)
         zeros = 0
 
         if len(components) == 2:
             # count zeros in fraction part
             sub_components = components[1].split(' ')
             for element in sub_components:
                 if element == 'zero' or element == '0':
                     zeros += 1
                 else:
                     break
             # ensure first is not a fraction and second is a fraction
             num1 = extractnumber_it(components[0])
             num2 = extractnumber_it(components[1])
             if num1 is not None and num2 is not None \
                     and num1 >= 1 and 0 < num2 < 1:
                 return num1 + num2
             # sette e quaranta  sette e zero zero due
             elif num1 is not None and num2 is not None \
                     and num1 >= 1 and num2 > 1:
                 return num1 + num2 / pow(10, len(str(num2)) + zeros)
 
     # 2 punto 5
     for separator in decimal_marker:
         zeros = 0
         # count zeros in fraction part
         components = text.split(separator)
 
         if len(components) == 2:
             sub_components = components[1].split(' ')
             for element in sub_components:
                 if element == 'zero' or element == '0':
                     zeros += 1
                 else:
                     break
 
             number = int(extractnumber_it(components[0]))
             decimal = int(extractnumber_it(components[1]))
             if number is not None and decimal is not None:
                 if '.' not in str(decimal):
                     return number + decimal / pow(10,
                                                   len(str(decimal)) + zeros)
 
     all_words = text.split()
     val = False
     prev_val = None
     to_sum = []
     for idx, word in enumerate(all_words):
 
         if not word:
             continue
         prev_word = all_words[idx - 1] if idx > 0 else ''
         next_word = all_words[idx + 1] if idx + 1 < len(all_words) else ''
 
         # is this word already a number ?
         if is_numeric(word):
             val = float(word)
 
         # is this word the name of a number ?
         if word in STRING_NUM_ITA:
             val = STRING_NUM_ITA[word]
 
         #  tre quarti  un quarto  trenta secondi
         if isFractional_it(word) and prev_val:
             if word[:-1] == 'second' and not ordinals:
                 val = prev_val * 2
             else:
                 val = prev_val
 
         # is the prev word a number and should we multiply it?
         # twenty hundred, six hundred
         if word in multiplies:
             if not prev_val:
                 prev_val = 1
             val = prev_val * val
 
         # is this a spoken fraction?
         # mezza tazza
         if val is False:
             val = isFractional_it(word, short_scale=short_scale)
 
         # 2 quinti
         if not ordinals:
             next_value = isFractional_it(next_word, short_scale=short_scale)
             if next_value:
                 if not val:
                     val = 1
                 val = val * next_value
 
         # is this a negative number?
         if val and prev_word and prev_word in negatives:
             val = 0 - val
 
         if not val:
             val = extractnumber_long_it(word)
 
         # let's make sure it isn't a fraction
         if not val:
             # look for fractions like '2/3'
             all_pieces = word.split('/')
             if look_for_fractions(all_pieces):
                 val = float(all_pieces[0]) / float(all_pieces[1])
         else:
             prev_val = val
             # handle long numbers
             # six hundred sixty six
             # two million five hundred thousand
             if word in multiplies and next_word not in multiplies:
                 to_sum.append(val)
                 val = 0
                 prev_val = 0
             elif extractnumber_long_it(word) > 100 and \
                 extractnumber_long_it(next_word) and \
                     next_word not in multiplies:
                 to_sum.append(val)
                 val = 0
                 prev_val = 0
 
     if val is not None:
         for addend in to_sum:
             val = val + addend
     return val
 
 
 def normalize_it(text, remove_articles):
     """ IT string normalization """
     # replace ambiguous words
     text = text.replace('un paio', 'due')
 
     words = text.split()  # this also removed extra spaces
     # Contractions are not common in IT
     # Convert numbers into digits, e.g. 'quarantadue' -> '42'
     normalized = ''
     i = 0
 
     while i < len(words):
         word = words[i]
         # remove articles
         # Italian requires the article to define the grammatical gender
         if remove_articles and word in ARTICLES_IT:
             i += 1
             continue
 
         if word in STRING_NUM_ITA:
             word = str(STRING_NUM_ITA[word])
 
         val = int(extractnumber_it(word))    # era extractnumber_long_it
 
         if val:
             word = str(val)
 
         normalized += ' ' + word
         i += 1
     # indefinite articles in it-it can not be removed
 
     return normalized[1:]
 
 
 def extract_datetime_it(string, dateNow, default_time):
     def clean_string(s):
         """
             cleans the input string of unneeded punctuation and capitalization
             among other things.
             Normalize italian plurals
         """
         symbols = ['.', ',', ';', '?', '!', 'º', 'ª', '°', 'l\'']
 
         for word in symbols:
             s = s.replace(word, '')
 
         s = s.lower().replace('á', 'a').replace('à', 'a').replace('è', "e'")\
             .replace('é', "e'").replace('ì', 'i').replace('ù', 'u')\
             .replace('ò', 'o').replace('-', ' ').replace('_', '')
 
         # normalizza plurali per semplificare analisi
         s = s.replace('secondi', 'secondo').replace('minuti', 'minuto')\
             .replace('ore', 'ora').replace('giorni', 'giorno')\
             .replace('settimane', 'settimana').replace('mesi', 'mese')\
             .replace('anni', 'anno').replace('mattino', 'mattina')\
             .replace('prossima', 'prossimo').replace('questa', 'questo')\
             .replace('quarti', 'quarto').replace('in punto', 'in_punto')\
             .replace('decennio', 'decenni').replace('secoli', 'secolo')\
             .replace('millennio', 'millenni').replace(' un ', ' uno ')\
             .replace('scorsa', 'scorso').replace('passata', 'passato')\
             .replace('uno paio', 'due')
 
         noise_words = ['dello', 'la', 'del', 'al', 'il', 'di', 'tra', 'lo',
                        'le', 'alle', 'alla', 'dai', 'delle', 'della',
                        'a', 'e\'', 'era', 'questa', 'questo', 'e', 'nel',
                        'nello', 'dallo', '  ']
 
         word_list = s.split()
         word_list = [x for x in word_list if x not in noise_words]
         # normalizza alcuni formati orari
         for idx in range(0, len(word_list) - 1):
             if word_list[idx][0].isdigit() and word_list[idx+1][0].isdigit():
                 num0 = int(word_list[idx])
                 num1 = int(word_list[idx+1])
                 if 0 <= num0 <= 23 and 10 <= num1 <= 59:
                     word_list[idx] = str(num0) + ':' + str(num1)
                     word_list[idx+1] = ''
 
         word_list = [x for x in word_list if x]
 
         return word_list
 
     def date_found():
         return found or \
             (datestr != '' or time_str != '' or year_offset != 0 or
              month_offset != 0 or day_offset is True or hr_offset != 0 or
              hr_abs or min_offset != 0 or min_abs or sec_offset != 0)
 
     if string == '' or not dateNow:
         return None
 
     found = False
     day_specified = False
     day_offset = False
     month_offset = 0
     year_offset = 0
     today = dateNow.strftime('%w')
     current_year = dateNow.strftime('%Y')
     from_flag = False
     datestr = ''
     has_year = False
     time_qualifier = ''
     time_qualifiers_am = ['mattina', 'stamani', 'stamane']
     time_qualifiers_pm = ['pomeriggio', 'sera', 'stasera', 'stanotte']
     time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm)
     markers = ['alle', 'in', 'questo', 'per', 'di', 'tra', 'fra', 'entro']
     days = ['lunedi', 'martedi', 'mercoledi',
             'giovedi', 'venerdi', 'sabato', 'domenica']
     months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
               'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
               'dicembre']
     months_short = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago',
                     'set', 'ott', 'nov', 'dic']
     year_multiples = ['decenni', 'secolo', 'millenni']  # decennio <- decenni
     time_multiples = ['ora', 'minuto', 'secondo']
     day_multiples = ['settimana', 'mese', 'anno']
     noise_words_2 = ['tra', 'di', 'per', 'fra', 'un ', 'uno', 'lo', 'del',
                      'l', 'in_punto', ' ', 'nella', 'dell']
 
     words = clean_string(string)
 
     for idx, word in enumerate(words):
         if word == '':
             continue
         word_prev_prev = words[idx - 2] if idx > 1 else ''
         word_prev = words[idx - 1] if idx > 0 else ''
         word_next = words[idx + 1] if idx + 1 < len(words) else ''
         word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
         start = idx
         used = 0
         # save timequalifier for later
         if word == 'adesso' and not datestr:
             # word == 'ora' va in conflitto con 'tra un ora'
             words = [x for x in words if x != 'adesso']
             words = [x for x in words if x]
             result_str = ' '.join(words)
             extracted_date = dateNow.replace(microsecond=0)
             return [extracted_date, result_str]
 
         # un paio di  o  tra tre settimane --> secoli
         elif extractnumber_it(word) and (word_next in year_multiples or
                                          word_next in day_multiples):
             multiplier = int(extractnumber_it(word))
             used += 2
             if word_next == 'decenni':
                 year_offset = multiplier * 10
             elif word_next == 'secolo':
                 year_offset = multiplier * 100
             elif word_next == 'millenni':
                 year_offset = multiplier * 1000
             elif word_next == 'anno':
                 year_offset = multiplier
             elif word_next == 'mese':
                 month_offset = multiplier
             elif word_next == 'settimana':
                 day_offset = multiplier * 7
         elif word in time_qualifiers_list:
             time_qualifier = word
         # parse today, tomorrow, day after tomorrow
         elif word == 'oggi' and not from_flag:
             day_offset = 0
             used += 1
         elif word == 'domani' and not from_flag:
             day_offset = 1
             used += 1
         elif word == 'ieri' and not from_flag:
             day_offset -= 1
             used += 1
         elif word == 'dopodomani' and not from_flag:  # after tomorrow
             day_offset += 2
             used += 1
         elif word == 'dopo' and word_next == 'domani' and not from_flag:
             day_offset += 1
             used += 2
         elif word == 'giorno':
             if word_prev[0].isdigit():
                 day_offset += int(word_prev)
                 start -= 1
                 used = 2
                 if word_next == 'dopo' and word_next_next == 'domani':
                     day_offset += 1
                     used += 2
         elif word == 'settimana' and not from_flag:
             if word_prev == 'prossimo':
                 day_offset = 7
                 start -= 1
                 used = 2
             elif word_prev == 'passato' or word_prev == 'scorso':
                 day_offset = -7
                 start -= 1
                 used = 2
             elif word_next == 'prossimo':
                 day_offset = 7
                 used += 2
             elif word_next == 'passato' or word_next == 'scorso':
                 day_offset = -7
                 used += 2
         # parse next month, last month
         elif word == 'mese' and not from_flag:
             if word_prev == 'prossimo':
                 month_offset = 1
                 start -= 1
                 used = 2
             elif word_prev == 'passato' or word_prev == 'scorso':
                 month_offset = -1
                 start -= 1
                 used = 2
             elif word_next == 'prossimo':
                 month_offset = 1
                 used += 2
             elif word_next == 'passato' or word_next == 'scorso':
                 month_offset = -1
                 used += 2
         # parse next year, last year
         elif word == 'anno' and not from_flag:
             if word_prev == 'prossimo':  # prossimo anno
                 year_offset = 1
                 start -= 1
                 used = 2
             elif word_next == 'prossimo':  # anno prossimo
                 year_offset = 1
                 used = 2
             elif word_prev == 'passato' or word_prev == 'scorso':
                 year_offset = -1
                 start -= 1
                 used = 2
             elif word_next == 'passato' or word_next == 'scorso':
                 year_offset = -1
                 used = 2
         elif word == 'decenni' and not from_flag:
             if word_prev == 'prossimo':  # prossimo mese
                 year_offset = 10
                 start -= 1
                 used = 2
             elif word_next == 'prossimo':  # mese prossimo
                 year_offset = 10
                 used = 2
             elif word_prev == 'passato' or word_prev == 'scorso':
                 year_offset = -10
                 start -= 1
                 used = 2
             elif word_next == 'passato' or word_next == 'scorso':
                 year_offset = -10
                 used = 2
         # parse Monday, Tuesday, etc., and next Monday,
         # last Tuesday, etc.
         elif word in days and not from_flag:
             ddd = days.index(word)
             day_offset = (ddd + 1) - int(today)
             used = 1
             if day_offset < 0:
                 day_offset += 7
             if word_prev == 'prossimo':
                 day_offset += 7
                 start -= 1
                 used += 1
             elif word_prev == 'passato' or word_prev == 'scorso':
                 day_offset -= 7
                 start -= 1
                 used += 1
             if word_next == 'prossimo':
                 day_offset += 7
                 used += 1
             elif word_next == 'passato' or word_next == 'scorso':
                 day_offset -= 7
                 used += 1
         # parse 15 of July, June 20th, Feb 18, 19 of February
         elif word in months or word in months_short and not from_flag:
             try:
                 mmm = months.index(word)
             except ValueError:
                 mmm = months_short.index(word)
             used += 1
             datestr = months[mmm]
             if word_prev and extractnumber_it(word_prev):
                 datestr += ' ' + str(int(extractnumber_it(word_prev)))
                 start -= 1
                 used += 1
                 if word_next and extractnumber_it(word_next):
                     datestr += ' ' + str(int(extractnumber_it(word_next)))
                     used += 1
                     has_year = True
                 else:
                     has_year = False
             elif word_next and word_next[0].isdigit():
                 datestr += ' ' + word_next
                 used += 1
                 if word_next_next and word_next_next[0].isdigit():
                     datestr += ' ' + word_next_next
                     used += 1
                     has_year = True
                 else:
                     has_year = False
         # parse 5 days from tomorrow, 10 weeks from next thursday,
         # 2 months from July
         validFollowups = days + months + months_short
         validFollowups.append('oggi')
         validFollowups.append('domani')
         validFollowups.append('prossimo')
         validFollowups.append('passato')
         validFollowups.append('adesso')
 
         if (word == 'da' or word == 'dopo') and word_next in validFollowups:
             used = 0
             from_flag = True
             if word_next == 'domani':
                 day_offset += 1
                 used += 2
             elif word_next == 'oggi' or word_next == 'adesso':
                 used += 2
             elif word_next in days:
                 ddd = days.index(word_next)
                 tmp_offset = (ddd + 1) - int(today)
                 used += 2
                 if tmp_offset < 0:
                     tmp_offset += 7
                 if word_next_next == 'prossimo':
                     tmp_offset += 7
                     used += 1
                 elif word_next_next == 'passato' or word_next_next == 'scorso':
                     tmp_offset = (ddd + 1) - int(today)
                     used += 1
                 day_offset += tmp_offset
             elif word_next_next and word_next_next in days:
                 ddd = days.index(word_next_next)
                 tmp_offset = (ddd + 1) - int(today)
                 if word_next == 'prossimo':
                     tmp_offset += 7
                 # elif word_next == 'passato' or word_next == 'scorso':
                 #    tmp_offset -= 7
                 day_offset += tmp_offset
                 used += 3
 
         if used > 0:
             if start - 1 > 0 and words[start - 1] == 'questo':
                 start -= 1
                 used += 1
 
             for i in range(0, used):
                 words[i + start] = ''
 
             if start - 1 >= 0 and words[start - 1] in markers:
                 words[start - 1] = ''
             found = True
             day_specified = True
 
     # parse time
     time_str = ''
     hr_offset = 0
     min_offset = 0
     sec_offset = 0
     hr_abs = None
     min_abs = None
     military = False
 
     for idx, word in enumerate(words):
         if word == '':
             continue
         word_prev_prev = words[idx - 2] if idx > 1 else ''
         word_prev = words[idx - 1] if idx > 0 else ''
         word_next = words[idx + 1] if idx + 1 < len(words) else ''
         word_next_next = words[idx + 2] if idx + 2 < len(words) else ''
         # parse noon, midnight, morning, afternoon, evening
         used = 0
         if word == 'mezzogiorno':
             hr_abs = 12
             used += 1
         elif word == 'mezzanotte':
             hr_abs = 24
             used += 1
         if word == 'mezzo' and word_next == 'giorno':
             hr_abs = 12
             used += 2
         elif word == 'mezza' and word_next == 'notte':
             hr_abs = 24
             used += 2
         elif word == 'mattina':
             if not hr_abs:
                 hr_abs = 8
             used += 1
             if word_next and word_next[0].isdigit():  # mattina alle 5
                 hr_abs = int(word_next)
                 used += 1
         elif word == 'pomeriggio':
             if not hr_abs:
                 hr_abs = 15
             used += 1
             if word_next and word_next[0].isdigit():  # pomeriggio alle 5
                 hr_abs = int(word_next)
                 used += 1
                 if (hr_abs or 0) < 12:
                     hr_abs = (hr_abs or 0) + 12
         elif word == 'sera':
             if not hr_abs:
                 hr_abs = 19
             used += 1
             if word_next and word_next[0].isdigit() \
                and ':' not in word_next:
                 hr_abs = int(word_next)
                 used += 1
                 if (hr_abs or 0) < 12:
                     hr_abs = (hr_abs or 0) + 12
         # da verificare più a fondo
         elif word == 'presto':
             hr_abs -= 1
             used += 1
         elif word == 'tardi':
             hr_abs += 1
             used += 1
         # un paio di minuti  tra cinque minuti tra 5 ore
         elif extractnumber_it(word) and (word_next in time_multiples):
             d_time = int(extractnumber_it(word))
             used += 2
             if word_next == 'ora':
                 hr_offset = d_time
                 isTime = False
                 hr_abs = -1
                 min_abs = -1
             elif word_next == 'minuto':
                 min_offset = d_time
                 isTime = False
                 hr_abs = -1
                 min_abs = -1
             elif word_next == 'secondo':
                 sec_offset = d_time
                 isTime = False
                 hr_abs = -1
                 min_abs = -1
         elif word == 'mezzora':
             min_offset = 30
             used = 1
             isTime = False
             hr_abs = -1
             min_abs = -1
             # if word_prev == 'uno' or word_prev == 'una':
             #    start -= 1
             #    used += 1
         elif extractnumber_it(word) and word_next and \
                 word_next == 'quarto' and word_next_next == 'ora':
             if int(extractnumber_it(word)) == 1 \
                or int(extractnumber_it(word)) == 3:
                 min_offset = 15 * int(extractnumber_it(word))
             else:  # elimina eventuali errori
                 min_offset = 15
             used = 3
             start -= 1
             isTime = False
             hr_abs = -1
             min_abs = -1
         elif word[0].isdigit():
             isTime = True
             str_hh = ''
             str_mm = ''
             remainder = ''
             if ':' in word:
                 # parse colons
                 # '3:00 in the morning'
                 components = word.split(':')
                 if len(components) == 2:
                     num0 = int(extractnumber_it(components[0]))
                     num1 = int(extractnumber_it(components[1]))
                     if num0 is not False and num1 is not False \
                             and 0 <= num0 <= 23 and 0 <= num1 <= 59:
                         str_hh = str(num0)
                         str_mm = str(num1)
             elif 0 < int(extractnumber_it(word)) < 24 \
                     and word_next != 'quarto':
                 str_hh = str(int(word))
                 str_mm = '00'
             elif 100 <= int(word) <= 2400:
                 str_hh = int(word) / 100
                 str_mm = int(word) - str_hh * 100
                 military = True
                 isTime = False
             if extractnumber_it(word) and word_next \
                and word_next == 'quarto' and word_next_next != 'ora':
                 if int(extractnumber_it(word)) == 1 \
                    or int(extractnumber_it(word)) == 3:
                     str_mm = str(15 * int(extractnumber_it(word)))
                 else:  # elimina eventuali errori
                     str_mm = '0'
                 str_hh = str(hr_abs)
                 used = 2
                 words[idx + 1] = ''
                 isTime = False
             if extractnumber_it(word) and word_next \
                and word_next == 'in_punto':
                 str_hh = str(int(extractnumber_it(word)))
                 used = 2
             if word_next == 'pm':
                 remainder = 'pm'
                 hr_abs = int(str_hh)
                 min_abs = int(str_mm)
                 if hr_abs <= 12:
                     hr_abs = hr_abs + 12
                 used = 2
             elif word_next == 'am':
                 remainder = 'am'
                 hr_abs = int(str_hh)
                 min_abs = int(str_mm)
                 used = 2
             elif word_next == 'mattina':
                 # ' 11 del mattina'
                 hh = int(str_hh)
                 mm = int(str_mm)
                 used = 2
                 remainder = 'am'
                 isTime = False
                 hr_abs = hh
                 min_abs = mm
             elif word_next == 'pomeriggio':
                 # ' 2 del pomeriggio'
                 hh = int(str_hh)
                 mm = int(str_mm)
                 if hh < 12:
                     hh += 12
                 used = 2
                 remainder = 'pm'
                 isTime = False
                 hr_abs = hh
                 min_abs = mm
             elif word_next == 'sera':
                 # 'alle 8 di sera'
                 hh = int(str_hh)
                 mm = int(str_mm)
                 if hh < 12:
                     hh += 12
                 used = 2
                 remainder = 'pm'
                 isTime = False
                 hr_abs = hh
                 min_abs = mm
             elif word_next == 'notte':
                 hh = int(str_hh)
                 mm = int(str_mm)
                 if hh > 5:
                     remainder = 'pm'
                 else:
                     remainder = 'am'
                 used = 2
                 isTime = False
                 hr_abs = hh
                 min_abs = mm
             # parse half an hour : undici e mezza
             elif word_next and word_next == 'mezza':
                 hr_abs = int(str_hh)
                 min_abs = 30
                 used = 2
                 isTime = False
             elif word_next and word_next == 'in_punto':
                 hr_abs = int(str_hh)
                 min_abs = 0
                 str_mm = '0'
                 used = 2
                 isTime = False
             else:
                 # 17:30
                 remainder = ''
                 hr_abs = int(str_hh)
                 min_abs = int(str_mm)
                 used = 1
                 isTime = False
                 if word_prev == 'ora':
                     words[idx - 1] = ''
 
             if time_qualifier != '':
                 # military = True
                 if str_hh and int(str_hh) <= 12 and \
                    (time_qualifier in time_qualifiers_pm):
                     str_hh = str(int(str_hh) + 12)
             else:
                 isTime = False
 
             str_hh = int(str_hh) if str_hh else 0
             str_mm = int(str_mm) if str_mm else 0
 
             str_hh = str_hh + 12 if remainder == 'pm' \
                 and str_hh < 12 else str_hh
             str_hh = str_hh - 12 if remainder == 'am' \
                 and str_hh >= 12 else str_hh
 
             if (not military and
                     remainder not in ['am', 'pm'] and
                     ((not day_specified) or day_offset < 1)):
                 # ambiguous time, detect whether they mean this evening or
                 # the next morning based on whether it has already passed
                 hr_abs = str_hh
                 if dateNow.hour < str_hh:
                     pass  # No modification needed
                 elif dateNow.hour < str_hh + 12:
                     str_hh += 12
                     hr_abs = str_hh
                 else:
                     # has passed, assume the next morning
                     day_offset += 1
 
             if time_qualifier in time_qualifiers_pm and str_hh < 12:
                 str_hh += 12
 
             if str_hh > 24 or str_mm > 59:
                 isTime = False
                 used = 0
             if isTime:
                 hr_abs = str_hh * 1
                 min_abs = str_mm * 1
                 used += 1
 
             if (hr_abs or 0) <= 12 and (time_qualifier == 'sera' or
                                         time_qualifier == 'pomeriggio'):
                 hr_abs = (hr_abs or 0) + 12
 
         if used > 0:
             # removed parsed words from the sentence
             for i in range(used):
                 words[idx + i] = ''
 
             if word_prev == 'o' or word_prev == 'oh':
                 words[words.index(word_prev)] = ''
 
             if idx > 0 and word_prev in markers:
                 words[idx - 1] = ''
             if idx > 1 and word_prev_prev in markers:
                 words[idx - 2] = ''
 
             idx += used - 1
             found = True
 
     # check that we found a date
     if not date_found:
         return None
 
     if day_offset is False:
         day_offset = 0
 
     # perform date manipulation
 
     extracted_date = dateNow.replace(microsecond=0)
 
     if datestr != '':
         en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                      'july', 'august', 'september', 'october', 'november',
                      'december']
         en_months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
                            'aug', 'sept', 'oct', 'nov', 'dec']
 
         for idx, en_month in enumerate(en_months):
             datestr = datestr.replace(months[idx], en_month)
 
         for idx, en_month in enumerate(en_months_short):
             datestr = datestr.replace(months_short[idx], en_month)
 
         try:
             temp = datetime.strptime(datestr, '%B %d')
         except ValueError:
             # Try again, allowing the year
             temp = datetime.strptime(datestr, '%B %d %Y')
         extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
         if not has_year:
             temp = temp.replace(year=extracted_date.year,
                                 tzinfo=extracted_date.tzinfo)
             if extracted_date < temp:
                 extracted_date = extracted_date.replace(
                     year=int(current_year),
                     month=int(temp.strftime('%m')),
                     day=int(temp.strftime('%d')),
                     tzinfo=extracted_date.tzinfo)
             else:
                 extracted_date = extracted_date.replace(
                     year=int(current_year) + 1,
                     month=int(temp.strftime('%m')),
                     day=int(temp.strftime('%d')),
                     tzinfo=extracted_date.tzinfo)
         else:
             extracted_date = extracted_date.replace(
                 year=int(temp.strftime('%Y')),
                 month=int(temp.strftime('%m')),
                 day=int(temp.strftime('%d')),
                 tzinfo=extracted_date.tzinfo)
     else:
         # ignore the current HH:MM:SS if relative using days or greater
         if hr_offset == 0 and min_offset == 0 and sec_offset == 0:
             extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
 
     if year_offset != 0:
         extracted_date = extracted_date + relativedelta(years=year_offset)
     if month_offset != 0:
         extracted_date = extracted_date + relativedelta(months=month_offset)
     if day_offset != 0:
         extracted_date = extracted_date + relativedelta(days=day_offset)
     if hr_abs != -1 and min_abs != -1:
         # If no time was supplied in the string set the time to default
         # time if it's available
         if hr_abs is None and min_abs is None and default_time is not None:
             hr_abs, min_abs = default_time.hour, default_time.minute
         else:
             hr_abs = hr_abs or 0
             min_abs = min_abs or 0
 
         extracted_date = extracted_date + relativedelta(hours=hr_abs,
                                                         minutes=min_abs)
         if (hr_abs != 0 or min_abs != 0) and datestr == '':
             if not day_specified and dateNow > extracted_date:
                 extracted_date = extracted_date + relativedelta(days=1)
     if hr_offset != 0:
         extracted_date = extracted_date + relativedelta(hours=hr_offset)
     if min_offset != 0:
         extracted_date = extracted_date + relativedelta(minutes=min_offset)
     if sec_offset != 0:
         extracted_date = extracted_date + relativedelta(seconds=sec_offset)
 
     words = [x for x in words if x not in noise_words_2]
     words = [x for x in words if x]
     result_str = ' '.join(words)
 
     return [extracted_date, result_str]
 
 
 def get_gender_it(word, raw_string=""):
     """
     In Italian to define the grammatical gender of a word is necessary
     analyze the article that precedes the word and not only the last
     letter of the word.
 
     TODO: check if useful
     """
 
     gender = None
     words = raw_string.split(' ')
     for idx, w in enumerate(words):
         if w == word and idx != 0:
             previous = words[idx - 1]
             gender = get_gender_it(previous)
             break
 
     if not gender:
         if word[-1] == 'a' or word[-1] == 'e':
             gender = 'f'
         if word[-1] == 'o' or word[-1] == 'n' \
                 or word[-1] == 'l' or word[-1] == 'i':
             gender = 'm'
 
     return gender
 
 
 def extract_numbers_it(text, short_scale=False, ordinals=False):
     """
         Takes in a string and extracts a list of numbers.
 
     Args:
         text (str): the string to extract a number from
         short_scale (bool): Use "short scale" or "long scale" for large
             numbers -- over a million.  The default is short scale, which
             is now common in most English speaking countries.
             See https://en.wikipedia.org/wiki/Names_of_large_numbers
         ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
     Returns:
         list: list of extracted numbers as floats
     """
     return extract_numbers_generic(text, pronounce_number_it, extractnumber_it,
                                    short_scale=short_scale, ordinals=ordinals)