18 Parse functions for Portuguese (PT-PT) 20 TODO: numbers greater than 999999 24 from datetime
import datetime
25 from dateutil.relativedelta
import relativedelta
28 _PT_ARTICLES, _PT_NUMBERS
33 This function takes the given text and checks if it is a fraction. 36 text (str): the string to check if fractional 38 (bool) or (float): False if not a fraction, otherwise the fraction 41 if input_str.endswith(
's', -1):
42 input_str = input_str[:len(input_str) - 1]
44 aFrac = [
"meio",
u"terço",
"quarto",
"quinto",
"sexto",
45 "setimo",
"oitavo",
"nono",
u"décimo"]
47 if input_str.lower()
in aFrac:
48 return 1.0 / (aFrac.index(input_str) + 2)
49 if input_str ==
u"vigésimo":
51 if input_str ==
u"trigésimo":
53 if input_str ==
u"centésimo":
55 if input_str ==
u"milésimo":
57 if (input_str ==
u"sétimo" or input_str ==
"septimo" or 58 input_str ==
u"séptimo"):
66 This function prepares the given text for parsing by making 67 numbers consistent, getting rid of contractions, etc. 69 text (str): the string to normalize 71 (int) or (float): The value of extracted number 77 while count < len(aWords):
81 if count + 1 < len(aWords):
82 next_word = aWords[count + 1]
83 if count + 2 < len(aWords):
84 next_next_word = aWords[count + 2]
89 if word
in _PT_NUMBERS:
90 val = _PT_NUMBERS[word]
104 aPieces = word.split(
'/')
108 val = float(aPieces[0]) / float(aPieces[1])
114 if next_word !=
"avos":
117 result = float(result) / float(val)
119 if next_word
is None:
124 if next_word
in ands:
129 newWords = aWords[count + 2:]
131 for word
in newWords:
132 newText += word +
" " 136 if result < afterAndVal
or result < 20:
137 while afterAndVal > 1:
138 afterAndVal = afterAndVal / 10.0
139 for word
in newWords:
140 if word ==
"zero" or word ==
"0":
144 for _
in range(0, zeros):
145 afterAndVal = afterAndVal / 10.0
146 result += afterAndVal
148 elif next_next_word
is not None:
149 if next_next_word
in ands:
150 newWords = aWords[count + 3:]
152 for word
in newWords:
153 newText += word +
" " 158 result += afterAndVal
161 decimals = [
"ponto",
"virgula",
"vírgula",
".",
","]
162 if next_word
in decimals:
164 newWords = aWords[count + 2:]
166 for word
in newWords:
167 newText += word +
" " 168 for word
in newWords:
169 if word ==
"zero" or word ==
"0":
174 afterDotVal = zeros *
"0" + afterDotVal
175 result = float(str(result) +
"." + afterDotVal)
186 if "." in str(result):
187 integer, dec = str(result).split(
".")
190 result = int(integer)
197 if i < len(words)
and s == words[i]:
201 def pt_number_word(i, mi, ma):
203 v = _PT_NUMBERS.get(words[i])
204 if v
and v >= mi
and v <= ma:
208 def pt_number_1_99(i):
209 r1 = pt_number_word(i, 1, 29)
213 r1 = pt_number_word(i, 30, 90)
219 r3 = pt_number_word(i2, 1, 9)
226 def pt_number_1_999(i):
228 r1 = pt_number_word(i, 100, 900)
231 r2 = pt_number_1_99(i1)
239 r1 = pt_number_1_99(i)
247 r1 = pt_number_word(i, 0, 0)
252 r1 = pt_number_1_999(i)
255 r2 = pt_cte(i1,
"mil")
258 r3 = pt_number_1_999(i2)
261 return v1 * 1000 + v3, i3
272 """ PT string normalization """ 281 while i < len(words):
284 if remove_articles
and word
in _PT_ARTICLES:
292 normalized +=
" " + str(v)
296 if word
in _PT_NUMBERS:
297 word = str(_PT_NUMBERS[word])
300 normalized +=
" " + word
306 return pt_pruning(normalized[1:], agressive=remove_articles)
313 symbols = [
".",
",",
";",
"?",
"!",
u"º",
u"ª"]
314 noise_words = [
"o",
"os",
"a",
"as",
"do",
"da",
"dos",
"das",
"de",
318 s = s.replace(word,
"")
319 for word
in noise_words:
320 s = s.replace(
" " + word +
" ",
" ")
321 s = s.lower().replace(
345 synonims = {
"manha": [
"manhazinha",
"cedo",
"cedinho"],
346 "tarde": [
"tardinha",
"tarde"],
347 "noite": [
"noitinha",
"anoitecer"],
348 "todos": [
"ao",
"aos"],
349 "em": [
"do",
"da",
"dos",
"das",
"de"]}
351 for word
in synonims[syn]:
352 s = s.replace(
" " + word +
" ",
" " + syn +
" ")
354 wordlist = [
"manhas",
"noites",
"tardes",
"dias",
"semanas",
"anos",
355 "minutos",
"segundos",
"nas",
"nos",
"proximas",
356 "seguintes",
"horas"]
357 for _, word
in enumerate(wordlist):
358 s = s.replace(word, word.rstrip(
's'))
359 s = s.replace(
"meses",
"mes").replace(
"anteriores",
"anterior")
365 datestr !=
"" or timeStr !=
"" or 366 yearOffset != 0
or monthOffset != 0
or 367 dayOffset
is True or hrOffset != 0
or 368 hrAbs
or minOffset != 0
or 369 minAbs
or secOffset != 0
372 if input_str ==
"" or not currentDate:
380 dateNow = currentDate
381 today = dateNow.strftime(
"%w")
382 currentYear = dateNow.strftime(
"%Y")
388 words = clean_string(input_str).split(
" ")
389 timeQualifiersList = [
'manha',
'tarde',
'noite']
390 time_indicators = [
"em",
"as",
"nas",
"pelas",
"volta",
"depois",
"estas",
392 days = [
'segunda',
'terca',
'quarta',
393 'quinta',
'sexta',
'sabado',
'domingo']
394 months = [
'janeiro',
'febreiro',
'marco',
'abril',
'maio',
'junho',
395 'julho',
'agosto',
'setembro',
'outubro',
'novembro',
397 monthsShort = [
'jan',
'feb',
'mar',
'abr',
'mai',
'jun',
'jul',
'ag',
398 'set',
'out',
'nov',
'dec']
399 nexts = [
"proximo",
"proxima"]
400 suffix_nexts = [
"seguinte",
"subsequente",
"seguir"]
401 lasts = [
"ultimo",
"ultima"]
402 suffix_lasts = [
"passada",
"passado",
"anterior",
"antes"]
403 nxts = [
"depois",
"seguir",
"seguida",
"seguinte",
"proxima",
"proximo"]
404 prevs = [
"antes",
"ante",
"previa",
"previamente",
"anterior"]
405 froms = [
"partir",
"em",
"para",
"na",
"no",
"daqui",
"seguir",
406 "depois",
"por",
"proxima",
"proximo",
"da",
"do",
"de"]
407 thises = [
"este",
"esta",
"deste",
"desta",
"neste",
"nesta",
"nesse",
410 lists = nxts + prevs + froms + time_indicators
411 for idx, word
in enumerate(words):
414 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 415 wordPrev = words[idx - 1]
if idx > 0
else "" 416 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 417 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 418 wordNextNextNext = words[idx + 3]
if idx + 3 < len(words)
else "" 423 if word
in timeQualifiersList:
427 elif word ==
"hoje" and not fromFlag:
430 elif word ==
"amanha" and not fromFlag:
433 elif word ==
"ontem" and not fromFlag:
437 elif (word ==
"anteontem" or 438 (word ==
"ante" and wordNext ==
"ontem"))
and not fromFlag:
441 if wordNext ==
"ontem":
443 elif word ==
"ante" and wordNext ==
"ante" and wordNextNext == \
444 "ontem" and not fromFlag:
447 elif word ==
"anteanteontem" and not fromFlag:
451 elif word ==
"depois" and wordNext ==
"amanha" and not fromFlag:
455 elif word ==
"antes" and wordNext ==
"ontem" and not fromFlag:
460 if wordNext ==
"depois" or wordNext ==
"antes":
462 if wordPrev
and wordPrev[0].isdigit():
463 dayOffset += int(wordPrev)
466 elif (wordPrev
and wordPrev[0].isdigit()
and 467 wordNext
not in months
and 468 wordNext
not in monthsShort):
469 dayOffset += int(wordPrev)
472 elif wordNext
and wordNext[0].isdigit()
and wordNextNext
not in \
473 months
and wordNextNext
not in monthsShort:
474 dayOffset += int(wordNext)
478 elif word ==
"semana" and not fromFlag:
479 if wordPrev[0].isdigit():
480 dayOffset += int(wordPrev) * 7
493 for w
in suffix_nexts:
498 for w
in suffix_lasts:
504 elif word ==
"mes" and not fromFlag:
505 if wordPrev[0].isdigit():
506 monthOffset = int(wordPrev)
519 for w
in suffix_nexts:
524 for w
in suffix_lasts:
530 elif word ==
"ano" and not fromFlag:
531 if wordPrev[0].isdigit():
532 yearOffset = int(wordPrev)
545 for w
in suffix_nexts:
550 for w
in suffix_lasts:
557 elif word
in days
and not fromFlag:
560 dayOffset = (d + 1) - int(today)
574 for w
in suffix_nexts:
579 for w
in suffix_lasts:
584 if wordNext ==
"feira":
587 elif word
in months
or word
in monthsShort:
589 m = months.index(word)
591 m = monthsShort.index(word)
594 if wordPrev
and wordPrev[0].isdigit():
596 datestr +=
" " + wordPrev
599 if wordNext
and wordNext[0].isdigit():
600 datestr +=
" " + wordNext
606 elif wordNext
and wordNext[0].isdigit():
608 datestr +=
" " + wordNext
610 if wordNextNext
and wordNextNext[0].isdigit():
611 datestr +=
" " + wordNextNext
617 elif wordPrevPrev
and wordPrevPrev[0].isdigit():
619 datestr +=
" " + wordPrevPrev
623 if wordNext
and word[0].isdigit():
624 datestr +=
" " + wordNext
630 elif wordNextNext
and wordNextNext[0].isdigit():
632 datestr +=
" " + wordNextNext
634 if wordNextNextNext
and wordNextNextNext[0].isdigit():
635 datestr +=
" " + wordNextNextNext
641 if datestr
in months:
646 validFollowups = days + months + monthsShort
647 validFollowups.append(
"hoje")
648 validFollowups.append(
"amanha")
649 validFollowups.append(
"ontem")
650 validFollowups.append(
"anteontem")
651 validFollowups.append(
"agora")
652 validFollowups.append(
"ja")
653 validFollowups.append(
"ante")
656 if word
in froms
and wordNext
in validFollowups:
658 if not (wordNext ==
"amanha" and wordNext ==
"ontem")
and not (
659 word ==
"depois" or word ==
"antes" or word ==
"em"):
662 if wordNext ==
"amanha" and word !=
"depois":
664 elif wordNext ==
"ontem":
666 elif wordNext ==
"anteontem":
668 elif wordNext ==
"ante" and wordNextNext ==
"ontem":
670 elif (wordNext ==
"ante" and wordNext ==
"ante" and 671 wordNextNextNext ==
"ontem"):
673 elif wordNext
in days:
674 d = days.index(wordNext)
675 tmpOffset = (d + 1) - int(today)
677 if wordNextNext ==
"feira":
682 if wordNextNext
in nxts:
685 elif wordNextNext
in prevs:
688 dayOffset += tmpOffset
689 elif wordNextNext
and wordNextNext
in days:
690 d = days.index(wordNextNext)
691 tmpOffset = (d + 1) - int(today)
694 if wordNextNextNext
in nxts:
697 elif wordNextNextNext
in prevs:
700 dayOffset += tmpOffset
701 if wordNextNextNext ==
"feira":
703 if wordNext
in months:
707 if start - 1 > 0
and words[start - 1]
in lists:
711 for i
in range(0, used):
712 words[i + start] =
"" 714 if start - 1 >= 0
and words[start - 1]
in lists:
715 words[start - 1] =
"" 728 for idx, word
in enumerate(words):
732 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 733 wordPrev = words[idx - 1]
if idx > 0
else "" 734 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 735 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 736 wordNextNextNext = words[idx + 3]
if idx + 3 < len(words)
else "" 739 if word ==
"meio" and wordNext ==
"dia":
742 elif word ==
"meia" and wordNext ==
"noite":
745 elif word ==
"manha":
749 elif word ==
"tarde":
753 elif word ==
"meio" and wordNext ==
"tarde":
757 elif word ==
"meio" and wordNext ==
"manha":
761 elif word ==
"fim" and wordNext ==
"tarde":
765 elif word ==
"fim" and wordNext ==
"manha":
769 elif word ==
"tantas" and wordNext ==
"manha":
773 elif word ==
"noite":
778 elif word ==
"hora" and \
779 (wordPrev
in time_indicators
or wordPrevPrev
in 781 if wordPrev ==
"meia":
783 elif wordPrev ==
"quarto":
785 elif wordPrevPrev ==
"quarto":
787 if idx > 2
and words[idx - 3]
in time_indicators:
792 if wordPrevPrev
in time_indicators:
799 elif word[0].isdigit():
809 for i
in range(length):
811 if word[i].isdigit():
819 if word[i].isdigit():
825 remainder = word[i:].replace(
".",
"")
828 nextWord = wordNext.replace(
".",
"")
829 if nextWord ==
"am" or nextWord ==
"pm":
832 elif wordNext ==
"manha":
835 elif wordNext ==
"tarde":
838 elif wordNext ==
"noite":
839 if 0 < int(word[0]) < 6:
844 elif wordNext
in thises
and wordNextNext ==
"manha":
847 elif wordNext
in thises
and wordNextNext ==
"tarde":
850 elif wordNext
in thises
and wordNextNext ==
"noite":
854 if timeQualifier !=
"":
857 (timeQualifier ==
"manha" or 858 timeQualifier ==
"tarde"):
867 for i
in range(length):
868 if word[i].isdigit():
874 remainder = wordNext.replace(
".",
"").lstrip().rstrip()
879 remainder ==
"p.m." or 887 remainder ==
"a.m." or 893 if (wordNext ==
"pm" or 894 wordNext ==
"p.m." or 895 wordNext ==
"tarde"):
899 elif (wordNext ==
"am" or 900 wordNext ==
"a.m." or 901 wordNext ==
"manha"):
905 elif (int(word) > 100
and 912 strHH = int(word) / 100
913 strMM = int(word) - strHH * 100
915 if wordNext ==
"hora":
918 wordNext ==
"hora" and 932 elif wordNext ==
"minuto":
934 minOffset = int(word)
939 elif wordNext ==
"segundo":
941 secOffset = int(word)
946 elif int(word) > 100:
947 strHH = int(word) / 100
948 strMM = int(word) - strHH * 100
950 if wordNext ==
"hora":
953 elif wordNext ==
"" or (
954 wordNext ==
"em" and wordNextNext ==
"ponto"):
957 if wordNext ==
"em" and wordNextNext ==
"ponto":
959 if wordNextNextNext ==
"tarde":
962 elif wordNextNextNext ==
"manha":
965 elif wordNextNextNext ==
"noite":
966 if 0 > int(strHH) > 6:
972 elif wordNext[0].isdigit():
977 if wordNextNext ==
"hora":
982 strHH = int(strHH)
if strHH
else 0
983 strMM = int(strMM)
if strMM
else 0
984 strHH = strHH + 12
if (remainder ==
"pm" and 985 0 < strHH < 12)
else strHH
986 strHH = strHH - 12
if (remainder ==
"am" and 987 0 < strHH >= 12)
else strHH
988 if strHH > 24
or strMM > 59:
998 for i
in range(used):
1001 if wordPrev ==
"em" or wordPrev ==
"ponto":
1002 words[words.index(wordPrev)] =
"" 1004 if idx > 0
and wordPrev
in time_indicators:
1006 if idx > 1
and wordPrevPrev
in time_indicators:
1016 if dayOffset
is False:
1021 extractedDate = dateNow
1022 extractedDate = extractedDate.replace(microsecond=0,
1027 en_months = [
'january',
'february',
'march',
'april',
'may',
'june',
1028 'july',
'august',
'september',
'october',
'november',
1030 en_monthsShort = [
'jan',
'feb',
'mar',
'apr',
'may',
'june',
'july',
1032 'sept',
'oct',
'nov',
'dec']
1033 for idx, en_month
in enumerate(en_months):
1034 datestr = datestr.replace(months[idx], en_month)
1035 for idx, en_month
in enumerate(en_monthsShort):
1036 datestr = datestr.replace(monthsShort[idx], en_month)
1038 temp = datetime.strptime(datestr,
"%B %d")
1040 temp = temp.replace(year=extractedDate.year)
1041 if extractedDate < temp:
1042 extractedDate = extractedDate.replace(year=int(currentYear),
1046 day=int(temp.strftime(
1049 extractedDate = extractedDate.replace(
1050 year=int(currentYear) + 1,
1051 month=int(temp.strftime(
"%m")),
1052 day=int(temp.strftime(
"%d")))
1054 extractedDate = extractedDate.replace(
1055 year=int(temp.strftime(
"%Y")),
1056 month=int(temp.strftime(
"%m")),
1057 day=int(temp.strftime(
"%d")))
1060 temp = datetime(timeStr)
1061 extractedDate = extractedDate.replace(hour=temp.strftime(
"%H"),
1062 minute=temp.strftime(
"%M"),
1063 second=temp.strftime(
"%S"))
1066 extractedDate = extractedDate + relativedelta(years=yearOffset)
1067 if monthOffset != 0:
1068 extractedDate = extractedDate + relativedelta(months=monthOffset)
1070 extractedDate = extractedDate + relativedelta(days=dayOffset)
1071 if (hrAbs
or 0) != -1
and (minAbs
or 0) != -1:
1072 if hrAbs
is None and minAbs
is None and default_time:
1073 hrAbs = default_time.hour
1074 minAbs = default_time.minute
1075 extractedDate = extractedDate + relativedelta(hours=hrAbs
or 0,
1076 minutes=minAbs
or 0)
1077 if (hrAbs
or minAbs)
and datestr ==
"":
1078 if not daySpecified
and dateNow > extractedDate:
1079 extractedDate = extractedDate + relativedelta(days=1)
1081 extractedDate = extractedDate + relativedelta(hours=hrOffset)
1083 extractedDate = extractedDate + relativedelta(minutes=minOffset)
1085 extractedDate = extractedDate + relativedelta(seconds=secOffset)
1087 resultStr =
" ".join(words)
1088 resultStr =
' '.join(resultStr.split())
1090 return [extractedDate, resultStr]
1093 def pt_pruning(text, symbols=True, accents=True, agressive=True):
1095 words = [
"a",
"o",
"os",
"as",
"de",
"dos",
"das",
1096 "lhe",
"lhes",
"me",
"e",
"no",
"nas",
"na",
"nos",
"em",
"para",
1098 "esta",
"deste",
"desta",
"neste",
"nesta",
"nesse",
1099 "nessa",
"foi",
"que"]
1101 symbols = [
".",
",",
";",
":",
"!",
"?",
u"�",
u"�"]
1102 for symbol
in symbols:
1103 text = text.replace(symbol,
"")
1104 text = text.replace(
"-",
" ").replace(
"_",
" ")
1106 accents = {
"a": [
u"á",
u"à",
u"ã",
u"â"],
1107 "e": [
u"ê",
u"è",
u"é"],
1112 for char
in accents:
1113 for acc
in accents[char]:
1114 text = text.replace(acc, char)
1116 text_words = text.split(
" ")
1117 for idx, word
in enumerate(text_words):
1119 text_words[idx] =
"" 1120 text =
" ".join(text_words)
1121 text =
' '.join(text.split())
1126 word = word.rstrip(
"s")
1128 words = raw_string.split(
" ")
1129 for idx, w
in enumerate(words):
1130 if w == word
and idx != 0:
1131 previous = words[idx - 1]
1137 if word[-1] ==
"o" or word[-1] ==
"e":
def pt_number_parse(words, i)
def isFractional_pt(input_str)
def get_gender_pt(word, raw_string="")
def extract_datetime_pt(input_str, currentDate, default_time)
def is_numeric(input_str)
def look_for_fractions(split_list)
def extractnumber_pt(text)
def pt_pruning(text, symbols=True, accents=True, agressive=True)
def normalize_pt(text, remove_articles)