18 Parse functions for Italian (IT-IT) 23 from datetime
import datetime
24 from dateutil.relativedelta
import relativedelta
26 extract_numbers_generic
30 SHORT_ORDINAL_STRING_IT = {
44 14:
'quattordicesimo',
47 17:
'diciassettesimo',
63 1e15:
'quadrilionesimo',
64 1e18:
'quintilionesim',
65 1e21:
'sestilionesimo',
66 1e24:
'settilionesimo',
67 1e27:
'ottilionesimo',
68 1e30:
'nonilionesimo',
74 LONG_ORDINAL_STRING_IT = {
88 14:
'quattordicesimo',
91 17:
'diciassettesimo',
106 1e18:
'trilionesimo',
107 1e24:
'quadrilionesimo',
108 1e30:
'quintilionesimo',
109 1e36:
'sestilionesimo',
110 1e42:
'settilionesimo',
111 1e48:
'ottilionesimo',
112 1e54:
'nonilionesimo',
113 1e60:
'decilionesimo' 119 ARTICLES_IT = [
'il',
'lo',
'la',
'i',
'gli',
'le']
174 'miliardo': 1000000000,
193 This function takes the given text and checks if it is a fraction. 194 Updated to italian from en version 18.8.9 197 input_str (str): the string to check if fractional 198 short_scale (bool): use short scale if True, long scale if False 200 (bool) or (float): False if not a fraction, otherwise the fraction 203 input_str = input_str.lower()
204 if input_str.endswith(
'i', -1)
and len(input_str) > 2:
205 input_str = input_str[:-1] +
"o" 207 fracts_it = {
"intero": 1,
"mezza": 2,
"mezzo": 2}
210 for num
in SHORT_ORDINAL_STRING_IT:
212 fracts_it[SHORT_ORDINAL_STRING_IT[num]] = num
214 for num
in LONG_ORDINAL_STRING_IT:
216 fracts_it[LONG_ORDINAL_STRING_IT[num]] = num
218 if input_str
in fracts_it:
219 return 1.0 / fracts_it[input_str]
225 This function converts a long textual number like 226 milleventisette -> 1027 or diecimilaquarantuno -> 10041 into an 227 integer value, covers from 0 to 999999999999999 228 for now limited to 999_e21 but ready for 999_e63 230 milleventisette -> 1027 231 diecimilaquarantuno -> 10041 232 centottomiladuecentotredici -> 108213 234 word (str): the word to convert into a number 236 (bool) or (int): The extracted number or False if no number 240 units = {
'zero': 0,
'uno': 1,
'due': 2,
'tre': 3,
'quattro': 4,
241 'cinque': 5,
'sei': 6,
'sette': 7,
'otto': 8,
'nove': 9}
243 tens = {
'dieci': 10,
'venti': 20,
'trenta': 30,
'quaranta': 40,
244 'cinquanta': 50,
'sessanta': 60,
'settanta': 70,
'ottanta': 80,
247 tens_short = {
'vent': 20,
'trent': 30,
'quarant': 40,
'cinquant': 50,
248 'sessant': 60,
'settant': 70,
'ottant': 80,
'novant': 90}
250 nums_long = {
'undici': 11,
'dodici': 12,
'tredici': 13,
'quattordici': 14,
251 'quindici': 15,
'sedici': 16,
'diciassette': 17,
252 'diciotto': 18,
'diciannove': 19}
254 multipli_it = collections.OrderedDict([
280 for num
in multipli_it:
281 if num > 1000
and num <= 1e21:
283 multiplier[multipli_it[num]] = int(num)
285 if multipli_it[num][-5:-1] ==
'iard':
286 un_multiplier[
'un' + multipli_it[num][:-1] +
'o'] = int(num)
288 un_multiplier[
'un' + multipli_it[num][:-1] +
'e'] = int(num)
293 if word[-5:-1] ==
'esim':
295 normalize_ita3 = {
'tre':
'',
'ttr':
'o',
'sei':
'',
'ott':
'o'}
296 normalize_ita2 = {
'un':
'o',
'du':
'e',
'qu':
'e',
'tt':
'e',
299 if base[-3:]
in normalize_ita3:
300 base += normalize_ita3[base[-3:]]
301 elif base[-2:]
in normalize_ita2:
302 base += normalize_ita2[base[-2:]]
306 for item
in un_multiplier:
307 components = word.split(item, 1)
308 if len(components) == 2:
309 if not components[0]:
310 if not components[1]:
311 word = str(int(un_multiplier[item]))
313 word = str(int(un_multiplier[item]) +
316 for item
in multiplier:
317 components = word.split(item, 1)
318 if len(components) == 2:
319 if not components[0]:
320 word = str(int(multiplier[item]) +
323 if not components[1]:
325 + str(int(multiplier[item]))
328 + str(int(multiplier[item])) +
'+' \
332 word = word.replace(item,
'+' + str(tens[item]))
334 for item
in tens_short:
335 word = word.replace(item,
'+' + str(tens_short[item]))
337 for item
in nums_long:
338 word = word.replace(item,
'+' + str(nums_long[item]))
340 word = word.replace(
'cento',
'+1xx')
341 word = word.replace(
'cent',
'+1xx')
342 word = word.replace(
'mille',
'+1000')
343 word = word.replace(
'mila',
'*1000')
346 word = word.replace(item,
'+' + str(units[item]))
349 occorrenze = word.count(
'+1xx')
350 for _
in range(0, occorrenze):
351 components = word.rsplit(
'+1xx', 1)
352 if len(components[0]) > 1
and components[0].endswith(
'0'):
353 word = components[0] +
'+100' + components[1]
355 word = components[0] +
'*100' + components[1]
357 components = word.rsplit(
'*1000', 1)
358 if len(components) == 2:
359 if components[0].startswith(
'*'):
360 components[0] = components[0][1:]
362 '*1000' + str(components[1])
365 if word.startswith(
'*')
or word.startswith(
'+'):
368 addends = word.split(
'+')
369 for c, _
in enumerate(addends):
370 if '*' in addends[c]:
371 factors = addends[c].split(
'*')
372 result = int(factors[0]) * int(factors[1])
373 if len(factors) == 3:
374 result *= int(factors[2])
375 addends[c] = str(result)
378 if all([s.isdecimal()
for s
in addends]):
379 value = sum([int(s)
for s
in addends])
387 This function extracts a number from a text string, 388 handles pronunciations in long scale and short scale 390 https://en.wikipedia.org/wiki/Names_of_large_numbers 393 text (str): the string to normalize 394 short_scale (bool): use short scale if True, long scale if False 395 ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 397 (int) or (float) or False: The extracted number or False if no number 402 string_num_ordinal_it = {}
406 for num
in SHORT_ORDINAL_STRING_IT:
407 num_string = SHORT_ORDINAL_STRING_IT[num]
408 string_num_ordinal_it[num_string] = num
409 STRING_NUM_ITA[num_string] = num
411 for num
in LONG_ORDINAL_STRING_IT:
412 num_string = LONG_ORDINAL_STRING_IT[num]
413 string_num_ordinal_it[num_string] = num
414 STRING_NUM_ITA[num_string] = num
420 multiplies = [
'decina',
'decine',
'dozzina',
'dozzine',
421 'centinaia',
'centinaio',
'migliaia',
'migliaio',
'mila']
424 fraction_marker = [
' e ']
427 decimal_marker = [
' punto ',
' virgola ']
430 for num
in SHORT_SCALE_IT:
431 num_string = SHORT_SCALE_IT[num]
432 STRING_NUM_ITA[num_string] = num
433 multiplies.append(num_string)
435 for num
in LONG_SCALE_IT:
436 num_string = LONG_SCALE_IT[num]
437 STRING_NUM_ITA[num_string] = num
438 multiplies.append(num_string)
441 for separator
in fraction_marker:
442 components = text.split(separator)
445 if len(components) == 2:
447 sub_components = components[1].split(
' ')
448 for element
in sub_components:
449 if element ==
'zero' or element ==
'0':
456 if num1
is not None and num2
is not None \
457 and num1 >= 1
and 0 < num2 < 1:
460 elif num1
is not None and num2
is not None \
461 and num1 >= 1
and num2 > 1:
462 return num1 + num2 / pow(10, len(str(num2)) + zeros)
465 for separator
in decimal_marker:
468 components = text.split(separator)
470 if len(components) == 2:
471 sub_components = components[1].split(
' ')
472 for element
in sub_components:
473 if element ==
'zero' or element ==
'0':
480 if number
is not None and decimal
is not None:
481 if '.' not in str(decimal):
482 return number + decimal / pow(10,
483 len(str(decimal)) + zeros)
485 all_words = text.split()
489 for idx, word
in enumerate(all_words):
493 prev_word = all_words[idx - 1]
if idx > 0
else '' 494 next_word = all_words[idx + 1]
if idx + 1 < len(all_words)
else '' 501 if word
in STRING_NUM_ITA:
502 val = STRING_NUM_ITA[word]
506 if word[:-1] ==
'second' and not ordinals:
513 if word
in multiplies:
529 val = val * next_value
532 if val
and prev_word
and prev_word
in negatives:
541 all_pieces = word.split(
'/')
543 val = float(all_pieces[0]) / float(all_pieces[1])
549 if word
in multiplies
and next_word
not in multiplies:
555 next_word
not in multiplies:
561 for addend
in to_sum:
567 """ IT string normalization """ 569 text = text.replace(
'un paio',
'due')
577 while i < len(words):
581 if remove_articles
and word
in ARTICLES_IT:
585 if word
in STRING_NUM_ITA:
586 word = str(STRING_NUM_ITA[word])
593 normalized +=
' ' + word
597 return normalized[1:]
603 cleans the input string of unneeded punctuation and capitalization 605 Normalize italian plurals 607 symbols = [
'.',
',',
';',
'?',
'!',
'Āŗ',
'ĀŖ',
'Ā°',
'l\'']
610 s = s.replace(word,
'')
612 s = s.lower().replace(
'Ć”',
'a').replace(
'Ć ',
'a').replace(
'ĆØ',
"e'")\
613 .replace(
'Ć©',
"e'").replace(
'Ƭ',
'i').replace(
'Ć¹',
'u')\ 614 .replace('Ć²', 'o').replace('-', ' ').replace('_', '')
617 s = s.replace(
'secondi',
'secondo').replace(
'minuti',
'minuto')\
618 .replace(
'ore',
'ora').replace(
'giorni',
'giorno')\
619 .replace(
'settimane',
'settimana').replace(
'mesi',
'mese')\
620 .replace(
'anni',
'anno').replace(
'mattino',
'mattina')\
621 .replace(
'prossima',
'prossimo').replace(
'questa',
'questo')\
622 .replace(
'quarti',
'quarto').replace(
'in punto',
'in_punto')\
623 .replace(
'decennio',
'decenni').replace(
'secoli',
'secolo')\
624 .replace(
'millennio',
'millenni').replace(
' un ',
' uno ')\
625 .replace(
'scorsa',
'scorso').replace(
'passata',
'passato')\
626 .replace(
'uno paio',
'due')
628 noise_words = [
'dello',
'la',
'del',
'al',
'il',
'di',
'tra',
'lo',
629 'le',
'alle',
'alla',
'dai',
'delle',
'della',
630 'a',
'e\'',
'era',
'questa',
'questo',
'e',
'nel',
631 'nello',
'dallo',
' ']
633 word_list = s.split()
634 word_list = [x
for x
in word_list
if x
not in noise_words]
636 for idx
in range(0, len(word_list) - 1):
637 if word_list[idx][0].isdigit()
and word_list[idx+1][0].isdigit():
638 num0 = int(word_list[idx])
639 num1 = int(word_list[idx+1])
640 if 0 <= num0 <= 23
and 10 <= num1 <= 59:
641 word_list[idx] = str(num0) +
':' + str(num1)
642 word_list[idx+1] =
'' 644 word_list = [x
for x
in word_list
if x]
650 (datestr !=
'' or time_str !=
'' or year_offset != 0
or 651 month_offset != 0
or day_offset
is True or hr_offset != 0
or 652 hr_abs
or min_offset != 0
or min_abs
or sec_offset != 0)
654 if string ==
'' or not dateNow:
658 day_specified =
False 662 today = dateNow.strftime(
'%w')
663 current_year = dateNow.strftime(
'%Y')
668 time_qualifiers_am = [
'mattina',
'stamani',
'stamane']
669 time_qualifiers_pm = [
'pomeriggio',
'sera',
'stasera',
'stanotte']
670 time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm)
671 markers = [
'alle',
'in',
'questo',
'per',
'di',
'tra',
'fra',
'entro']
672 days = [
'lunedi',
'martedi',
'mercoledi',
673 'giovedi',
'venerdi',
'sabato',
'domenica']
674 months = [
'gennaio',
'febbraio',
'marzo',
'aprile',
'maggio',
'giugno',
675 'luglio',
'agosto',
'settembre',
'ottobre',
'novembre',
677 months_short = [
'gen',
'feb',
'mar',
'apr',
'mag',
'giu',
'lug',
'ago',
678 'set',
'ott',
'nov',
'dic']
679 year_multiples = [
'decenni',
'secolo',
'millenni']
680 time_multiples = [
'ora',
'minuto',
'secondo']
681 day_multiples = [
'settimana',
'mese',
'anno']
682 noise_words_2 = [
'tra',
'di',
'per',
'fra',
'un ',
'uno',
'lo',
'del',
683 'l',
'in_punto',
' ',
'nella',
'dell']
685 words = clean_string(string)
687 for idx, word
in enumerate(words):
690 word_prev_prev = words[idx - 2]
if idx > 1
else '' 691 word_prev = words[idx - 1]
if idx > 0
else '' 692 word_next = words[idx + 1]
if idx + 1 < len(words)
else '' 693 word_next_next = words[idx + 2]
if idx + 2 < len(words)
else '' 697 if word ==
'adesso' and not datestr:
699 words = [x
for x
in words
if x !=
'adesso']
700 words = [x
for x
in words
if x]
701 result_str =
' '.join(words)
702 extracted_date = dateNow.replace(microsecond=0)
703 return [extracted_date, result_str]
707 word_next
in day_multiples):
710 if word_next ==
'decenni':
711 year_offset = multiplier * 10
712 elif word_next ==
'secolo':
713 year_offset = multiplier * 100
714 elif word_next ==
'millenni':
715 year_offset = multiplier * 1000
716 elif word_next ==
'anno':
717 year_offset = multiplier
718 elif word_next ==
'mese':
719 month_offset = multiplier
720 elif word_next ==
'settimana':
721 day_offset = multiplier * 7
722 elif word
in time_qualifiers_list:
723 time_qualifier = word
725 elif word ==
'oggi' and not from_flag:
728 elif word ==
'domani' and not from_flag:
731 elif word ==
'ieri' and not from_flag:
734 elif word ==
'dopodomani' and not from_flag:
737 elif word ==
'dopo' and word_next ==
'domani' and not from_flag:
740 elif word ==
'giorno':
741 if word_prev[0].isdigit():
742 day_offset += int(word_prev)
745 if word_next ==
'dopo' and word_next_next ==
'domani':
748 elif word ==
'settimana' and not from_flag:
749 if word_prev ==
'prossimo':
753 elif word_prev ==
'passato' or word_prev ==
'scorso':
757 elif word_next ==
'prossimo':
760 elif word_next ==
'passato' or word_next ==
'scorso':
764 elif word ==
'mese' and not from_flag:
765 if word_prev ==
'prossimo':
769 elif word_prev ==
'passato' or word_prev ==
'scorso':
773 elif word_next ==
'prossimo':
776 elif word_next ==
'passato' or word_next ==
'scorso':
780 elif word ==
'anno' and not from_flag:
781 if word_prev ==
'prossimo':
785 elif word_next ==
'prossimo':
788 elif word_prev ==
'passato' or word_prev ==
'scorso':
792 elif word_next ==
'passato' or word_next ==
'scorso':
795 elif word ==
'decenni' and not from_flag:
796 if word_prev ==
'prossimo':
800 elif word_next ==
'prossimo':
803 elif word_prev ==
'passato' or word_prev ==
'scorso':
807 elif word_next ==
'passato' or word_next ==
'scorso':
812 elif word
in days
and not from_flag:
813 ddd = days.index(word)
814 day_offset = (ddd + 1) - int(today)
818 if word_prev ==
'prossimo':
822 elif word_prev ==
'passato' or word_prev ==
'scorso':
826 if word_next ==
'prossimo':
829 elif word_next ==
'passato' or word_next ==
'scorso':
833 elif word
in months
or word
in months_short
and not from_flag:
835 mmm = months.index(word)
837 mmm = months_short.index(word)
839 datestr = months[mmm]
850 elif word_next
and word_next[0].isdigit():
851 datestr +=
' ' + word_next
853 if word_next_next
and word_next_next[0].isdigit():
854 datestr +=
' ' + word_next_next
861 validFollowups = days + months + months_short
862 validFollowups.append(
'oggi')
863 validFollowups.append(
'domani')
864 validFollowups.append(
'prossimo')
865 validFollowups.append(
'passato')
866 validFollowups.append(
'adesso')
868 if (word ==
'da' or word ==
'dopo')
and word_next
in validFollowups:
871 if word_next ==
'domani':
874 elif word_next ==
'oggi' or word_next ==
'adesso':
876 elif word_next
in days:
877 ddd = days.index(word_next)
878 tmp_offset = (ddd + 1) - int(today)
882 if word_next_next ==
'prossimo':
885 elif word_next_next ==
'passato' or word_next_next ==
'scorso':
886 tmp_offset = (ddd + 1) - int(today)
888 day_offset += tmp_offset
889 elif word_next_next
and word_next_next
in days:
890 ddd = days.index(word_next_next)
891 tmp_offset = (ddd + 1) - int(today)
892 if word_next ==
'prossimo':
896 day_offset += tmp_offset
900 if start - 1 > 0
and words[start - 1] ==
'questo':
904 for i
in range(0, used):
905 words[i + start] =
'' 907 if start - 1 >= 0
and words[start - 1]
in markers:
908 words[start - 1] =
'' 921 for idx, word
in enumerate(words):
924 word_prev_prev = words[idx - 2]
if idx > 1
else '' 925 word_prev = words[idx - 1]
if idx > 0
else '' 926 word_next = words[idx + 1]
if idx + 1 < len(words)
else '' 927 word_next_next = words[idx + 2]
if idx + 2 < len(words)
else '' 930 if word ==
'mezzogiorno':
933 elif word ==
'mezzanotte':
936 if word ==
'mezzo' and word_next ==
'giorno':
939 elif word ==
'mezza' and word_next ==
'notte':
942 elif word ==
'mattina':
946 if word_next
and word_next[0].isdigit():
947 hr_abs = int(word_next)
949 elif word ==
'pomeriggio':
953 if word_next
and word_next[0].isdigit():
954 hr_abs = int(word_next)
956 if (hr_abs
or 0) < 12:
957 hr_abs = (hr_abs
or 0) + 12
962 if word_next
and word_next[0].isdigit() \
963 and ':' not in word_next:
964 hr_abs = int(word_next)
966 if (hr_abs
or 0) < 12:
967 hr_abs = (hr_abs
or 0) + 12
969 elif word ==
'presto':
972 elif word ==
'tardi':
979 if word_next ==
'ora':
984 elif word_next ==
'minuto':
989 elif word_next ==
'secondo':
994 elif word ==
'mezzora':
1004 word_next ==
'quarto' and word_next_next ==
'ora':
1015 elif word[0].isdigit():
1023 components = word.split(
':')
1024 if len(components) == 2:
1027 if num0
is not False and num1
is not False \
1028 and 0 <= num0 <= 23
and 0 <= num1 <= 59:
1032 and word_next !=
'quarto':
1033 str_hh = str(int(word))
1035 elif 100 <= int(word) <= 2400:
1036 str_hh = int(word) / 100
1037 str_mm = int(word) - str_hh * 100
1041 and word_next ==
'quarto' and word_next_next !=
'ora':
1047 str_hh = str(hr_abs)
1052 and word_next ==
'in_punto':
1055 if word_next ==
'pm':
1057 hr_abs = int(str_hh)
1058 min_abs = int(str_mm)
1060 hr_abs = hr_abs + 12
1062 elif word_next ==
'am':
1064 hr_abs = int(str_hh)
1065 min_abs = int(str_mm)
1067 elif word_next ==
'mattina':
1076 elif word_next ==
'pomeriggio':
1087 elif word_next ==
'sera':
1098 elif word_next ==
'notte':
1110 elif word_next
and word_next ==
'mezza':
1111 hr_abs = int(str_hh)
1115 elif word_next
and word_next ==
'in_punto':
1116 hr_abs = int(str_hh)
1124 hr_abs = int(str_hh)
1125 min_abs = int(str_mm)
1128 if word_prev ==
'ora':
1131 if time_qualifier !=
'':
1133 if str_hh
and int(str_hh) <= 12
and \
1134 (time_qualifier
in time_qualifiers_pm):
1135 str_hh = str(int(str_hh) + 12)
1139 str_hh = int(str_hh)
if str_hh
else 0
1140 str_mm = int(str_mm)
if str_mm
else 0
1142 str_hh = str_hh + 12
if remainder ==
'pm' \
1143 and str_hh < 12
else str_hh
1144 str_hh = str_hh - 12
if remainder ==
'am' \
1145 and str_hh >= 12
else str_hh
1147 if (
not military
and 1148 remainder
not in [
'am',
'pm']
and 1149 ((
not day_specified)
or day_offset < 1)):
1153 if dateNow.hour < str_hh:
1155 elif dateNow.hour < str_hh + 12:
1162 if time_qualifier
in time_qualifiers_pm
and str_hh < 12:
1165 if str_hh > 24
or str_mm > 59:
1170 min_abs = str_mm * 1
1173 if (hr_abs
or 0) <= 12
and (time_qualifier ==
'sera' or 1174 time_qualifier ==
'pomeriggio'):
1175 hr_abs = (hr_abs
or 0) + 12
1179 for i
in range(used):
1182 if word_prev ==
'o' or word_prev ==
'oh':
1183 words[words.index(word_prev)] =
'' 1185 if idx > 0
and word_prev
in markers:
1187 if idx > 1
and word_prev_prev
in markers:
1197 if day_offset
is False:
1202 extracted_date = dateNow.replace(microsecond=0)
1205 en_months = [
'january',
'february',
'march',
'april',
'may',
'june',
1206 'july',
'august',
'september',
'october',
'november',
1208 en_months_short = [
'jan',
'feb',
'mar',
'apr',
'may',
'june',
'july',
1209 'aug',
'sept',
'oct',
'nov',
'dec']
1211 for idx, en_month
in enumerate(en_months):
1212 datestr = datestr.replace(months[idx], en_month)
1214 for idx, en_month
in enumerate(en_months_short):
1215 datestr = datestr.replace(months_short[idx], en_month)
1218 temp = datetime.strptime(datestr,
'%B %d')
1221 temp = datetime.strptime(datestr,
'%B %d %Y')
1222 extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
1224 temp = temp.replace(year=extracted_date.year,
1225 tzinfo=extracted_date.tzinfo)
1226 if extracted_date < temp:
1227 extracted_date = extracted_date.replace(
1228 year=int(current_year),
1229 month=int(temp.strftime(
'%m')),
1230 day=int(temp.strftime(
'%d')),
1231 tzinfo=extracted_date.tzinfo)
1233 extracted_date = extracted_date.replace(
1234 year=int(current_year) + 1,
1235 month=int(temp.strftime(
'%m')),
1236 day=int(temp.strftime(
'%d')),
1237 tzinfo=extracted_date.tzinfo)
1239 extracted_date = extracted_date.replace(
1240 year=int(temp.strftime(
'%Y')),
1241 month=int(temp.strftime(
'%m')),
1242 day=int(temp.strftime(
'%d')),
1243 tzinfo=extracted_date.tzinfo)
1246 if hr_offset == 0
and min_offset == 0
and sec_offset == 0:
1247 extracted_date = extracted_date.replace(hour=0, minute=0, second=0)
1249 if year_offset != 0:
1250 extracted_date = extracted_date + relativedelta(years=year_offset)
1251 if month_offset != 0:
1252 extracted_date = extracted_date + relativedelta(months=month_offset)
1254 extracted_date = extracted_date + relativedelta(days=day_offset)
1255 if hr_abs != -1
and min_abs != -1:
1258 if hr_abs
is None and min_abs
is None and default_time
is not None:
1259 hr_abs, min_abs = default_time.hour, default_time.minute
1261 hr_abs = hr_abs
or 0
1262 min_abs = min_abs
or 0
1264 extracted_date = extracted_date + relativedelta(hours=hr_abs,
1266 if (hr_abs != 0
or min_abs != 0)
and datestr ==
'':
1267 if not day_specified
and dateNow > extracted_date:
1268 extracted_date = extracted_date + relativedelta(days=1)
1270 extracted_date = extracted_date + relativedelta(hours=hr_offset)
1272 extracted_date = extracted_date + relativedelta(minutes=min_offset)
1274 extracted_date = extracted_date + relativedelta(seconds=sec_offset)
1276 words = [x
for x
in words
if x
not in noise_words_2]
1277 words = [x
for x
in words
if x]
1278 result_str =
' '.join(words)
1280 return [extracted_date, result_str]
1285 In Italian, to determine the grammatical gender of a word it is necessary 1286 to analyze the article that precedes the word and not only the last 1289 TODO: check if useful 1293 words = raw_string.split(
' ')
1294 for idx, w
in enumerate(words):
1295 if w == word
and idx != 0:
1296 previous = words[idx - 1]
1301 if word[-1] ==
'a' or word[-1] ==
'e':
1303 if word[-1] ==
'o' or word[-1] ==
'n' \
1304 or word[-1] ==
'l' or word[-1] ==
'i':
1312 Takes in a string and extracts a list of numbers. 1315 text (str): the string to extract a number from 1316 short_scale (bool): Use "short scale" or "long scale" for large 1317 numbers -- over a million. The default is long scale (short_scale=False); short scale 1318 is now common in most English speaking countries. 1319 See https://en.wikipedia.org/wiki/Names_of_large_numbers 1320 ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 1322 list: list of extracted numbers as floats 1325 short_scale=short_scale, ordinals=ordinals)
def extractnumber_it(text, short_scale=False, ordinals=False)
def normalize_it(text, remove_articles)
def extract_datetime_it(string, dateNow, default_time)
def isFractional_it(input_str, short_scale=False)
def extractnumber_long_it(word)
def extract_numbers_generic(text, pronounce_handler, extract_handler, short_scale=True, ordinals=False)
def is_numeric(input_str)
def extract_numbers_it(text, short_scale=False, ordinals=False)
def look_for_fractions(split_list)
def get_gender_it(word, raw_string="")