18 Parse functions for spanish (es) 19 TODO: numbers greater than 999999 21 from datetime
import datetime
22 from dateutil.relativedelta
import relativedelta
27 es_articles = [
"el",
"la",
"los",
"las"]
98 This function takes the given text and checks if it is a fraction. 101 text (str): the string to check if fractional 103 (bool) or (float): False if not a fraction, otherwise the fraction 106 if input_str.endswith(
's', -1):
107 input_str = input_str[:len(input_str) - 1]
109 aFrac = [
"medio",
"media",
"tercio",
"cuarto",
"cuarta",
"quinto",
110 "quinta",
"sexto",
"sexta",
u"séptimo",
u"séptima",
"octavo",
111 "octava",
"noveno",
"novena",
u"décimo",
u"décima",
u"onceavo",
112 u"onceava",
u"doceavo",
u"doceava"]
114 if input_str.lower()
in aFrac:
115 return 1.0 / (aFrac.index(input_str) + 2)
116 if (input_str ==
"cuarto" or input_str ==
"cuarta"):
118 if (input_str ==
u"vigésimo" or input_str ==
u"vigésima"):
120 if (input_str ==
u"trigésimo" or input_str ==
u"trigésima"):
122 if (input_str ==
u"centésimo" or input_str ==
u"centésima"):
124 if (input_str ==
u"milésimo" or input_str ==
u"milésima"):
131 This function prepares the given text for parsing by making 132 numbers consistent, getting rid of contractions, etc. 134 text (str): the string to normalize 136 (int) or (float): The value of extracted number 139 aWords = text.split()
142 while count < len(aWords):
145 next_next_word =
None 146 if count + 1 < len(aWords):
147 next_word = aWords[count + 1]
148 if count + 2 < len(aWords):
149 next_next_word = aWords[count + 2]
154 if word
in es_numbers:
155 val = es_numbers[word]
169 aPieces = word.split(
'/')
173 val = float(aPieces[0]) / float(aPieces[1])
179 if next_word !=
"avos":
182 result = float(result) / float(val)
184 if next_word
is None:
189 if next_word
in ands:
194 newWords = aWords[count + 2:]
196 for word
in newWords:
197 newText += word +
" " 201 if result < afterAndVal
or result < 20:
202 while afterAndVal > 1:
203 afterAndVal = afterAndVal / 10.0
204 for word
in newWords:
205 if word ==
"cero" or word ==
"0":
209 for _
in range(0, zeros):
210 afterAndVal = afterAndVal / 10.0
211 result += afterAndVal
213 elif next_next_word
is not None:
214 if next_next_word
in ands:
215 newWords = aWords[count + 3:]
217 for word
in newWords:
218 newText += word +
" " 223 result += afterAndVal
226 decimals = [
"punto",
"coma",
".",
","]
227 if next_word
in decimals:
229 newWords = aWords[count + 2:]
231 for word
in newWords:
232 newText += word +
" " 233 for word
in newWords:
234 if word ==
"cero" or word ==
"0":
239 afterDotVal = zeros *
"0" + afterDotVal
240 result = float(str(result) +
"." + afterDotVal)
251 if "." in str(result):
252 integer, dec = str(result).split(
".")
255 result = int(integer)
262 if i < len(words)
and s == words[i]:
266 def es_number_word(i, mi, ma):
268 v = es_numbers.get(words[i])
269 if v
and v >= mi
and v <= ma:
273 def es_number_1_99(i):
274 r1 = es_number_word(i, 1, 29)
278 r1 = es_number_word(i, 30, 90)
284 r3 = es_number_word(i2, 1, 9)
291 def es_number_1_999(i):
293 r1 = es_number_word(i, 100, 900)
296 r2 = es_number_1_99(i1)
304 r1 = es_number_1_99(i)
312 r1 = es_number_word(i, 0, 0)
317 r1 = es_number_1_999(i)
320 r2 = es_cte(i1,
"mil")
323 r3 = es_number_1_999(i2)
326 return v1 * 1000 + v3, i3
337 """ Spanish string normalization """ 343 while i < len(words):
346 if remove_articles
and word
in es_articles:
354 normalized +=
" " + str(v)
357 normalized +=
" " + word
360 return normalized[1:]
367 symbols = [
".",
",",
";",
"?",
"!",
u"º",
u"ª"]
368 noise_words = [
"entre",
"la",
"del",
"al",
"el",
"de",
369 "por",
"para",
"una",
"cualquier",
"a",
370 "e'",
"esta",
"este"]
373 s = s.replace(word,
"")
374 for word
in noise_words:
375 s = s.replace(
" " + word +
" ",
" ")
376 s = s.lower().replace(
388 synonims = {
u"mañana": [
"amanecer",
"temprano",
"muy temprano"],
389 "tarde": [
"media tarde",
"atardecer"],
390 "noche": [
"anochecer",
"tarde"]}
392 for word
in synonims[syn]:
393 s = s.replace(
" " + word +
" ",
" " + syn +
" ")
395 wordlist = [
u"mañanas",
"tardes",
"noches",
u"días",
"semanas",
396 u"años",
"minutos",
"segundos",
"las",
"los",
"siguientes",
397 u"próximas",
u"próximos",
"horas"]
398 for _, word
in enumerate(wordlist):
399 s = s.replace(word, word.rstrip(
's'))
400 s = s.replace(
"meses",
"mes").replace(
"anteriores",
"anterior")
407 yearOffset != 0
or monthOffset != 0
or 408 dayOffset
is True or hrOffset != 0
or 409 hrAbs
or minOffset != 0
or 410 minAbs
or secOffset != 0
415 if currentDate
is None:
416 currentDate = datetime.now()
423 dateNow = currentDate
424 today = dateNow.strftime(
"%w")
425 currentYear = dateNow.strftime(
"%Y")
431 words = clean_string(input_str).split(
" ")
432 timeQualifiersList = [
u'mañana',
'tarde',
'noche']
433 time_indicators = [
"en",
"la",
"al",
"por",
"pasados",
434 "pasadas",
u"día",
"hora"]
435 days = [
'lunes',
'martes',
u'miércoles',
436 'jueves',
'viernes',
u'sábado',
'domingo']
437 months = [
'enero',
'febrero',
'marzo',
'abril',
'mayo',
'junio',
438 'julio',
'agosto',
'septiembre',
'octubre',
'noviembre',
440 monthsShort = [
'ene',
'feb',
'mar',
'abr',
'may',
'jun',
'jul',
'ago',
441 'sep',
'oct',
'nov',
'dic']
442 nexts = [
"siguiente",
u"próximo",
u"próxima"]
443 suffix_nexts = [
"siguientes",
"subsecuentes"]
444 lasts = [
u"último",
u"última"]
445 suffix_lasts = [
"pasada",
"pasado",
"anterior",
"antes"]
446 nxts = [
u"después",
"siguiente",
u"próximo",
u"próxima"]
447 prevs = [
"antes",
"previa",
"previo",
"anterior"]
448 froms = [
"desde",
"en",
"para",
u"después de",
"por",
u"próximo",
450 thises = [
"este",
"esta"]
452 lists = nxts + prevs + froms + time_indicators
453 for idx, word
in enumerate(words):
456 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 457 wordPrev = words[idx - 1]
if idx > 0
else "" 458 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 459 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 460 wordNextNextNext = words[idx + 3]
if idx + 3 < len(words)
else "" 465 if word
in timeQualifiersList:
469 elif word ==
"hoy" and not fromFlag:
472 elif word ==
u"mañana" and not fromFlag:
475 elif word ==
"ayer" and not fromFlag:
479 elif (word ==
"anteayer" or 480 (word ==
"ante" and wordNext ==
"ayer"))
and not fromFlag:
483 if wordNext ==
"ayer":
485 elif word ==
"ante" and wordNext ==
"ante" and wordNextNext == \
486 "ayer" and not fromFlag:
489 elif word ==
"ante anteayer" and not fromFlag:
493 elif word ==
"pasado" and wordNext ==
u"mañana" and not fromFlag:
497 elif word ==
"ante" and wordNext ==
"ayer" and not fromFlag:
502 if wordNext ==
"pasado" or wordNext ==
"ante":
504 if wordPrev
and wordPrev[0].isdigit():
505 dayOffset += int(wordPrev)
508 elif (wordPrev
and wordPrev[0].isdigit()
and 509 wordNext
not in months
and 510 wordNext
not in monthsShort):
511 dayOffset += int(wordPrev)
514 elif wordNext
and wordNext[0].isdigit()
and wordNextNext
not in \
515 months
and wordNextNext
not in monthsShort:
516 dayOffset += int(wordNext)
520 elif word ==
"semana" and not fromFlag:
521 if wordPrev[0].isdigit():
522 dayOffset += int(wordPrev) * 7
535 for w
in suffix_nexts:
540 for w
in suffix_lasts:
546 elif word ==
"mes" and not fromFlag:
547 if wordPrev[0].isdigit():
548 monthOffset = int(wordPrev)
561 for w
in suffix_nexts:
566 for w
in suffix_lasts:
572 elif word ==
u"año" and not fromFlag:
573 if wordPrev[0].isdigit():
574 yearOffset = int(wordPrev)
587 for w
in suffix_nexts:
592 for w
in suffix_lasts:
599 elif word
in days
and not fromFlag:
601 dayOffset = (d + 1) - int(today)
605 if wordPrev ==
"siguiente":
609 elif wordPrev ==
"pasado":
613 if wordNext ==
"siguiente":
616 elif wordNext ==
"pasado":
620 elif word
in months
or word
in monthsShort:
622 m = months.index(word)
624 m = monthsShort.index(word)
627 if wordPrev
and wordPrev[0].isdigit():
629 datestr +=
" " + wordPrev
632 if wordNext
and wordNext[0].isdigit():
633 datestr +=
" " + wordNext
639 elif wordNext
and wordNext[0].isdigit():
641 datestr +=
" " + wordNext
643 if wordNextNext
and wordNextNext[0].isdigit():
644 datestr +=
" " + wordNextNext
650 elif wordPrevPrev
and wordPrevPrev[0].isdigit():
652 datestr +=
" " + wordPrevPrev
656 if wordNext
and word[0].isdigit():
657 datestr +=
" " + wordNext
663 elif wordNextNext
and wordNextNext[0].isdigit():
665 datestr +=
" " + wordNextNext
667 if wordNextNextNext
and wordNextNextNext[0].isdigit():
668 datestr +=
" " + wordNextNextNext
674 if datestr
in months:
679 validFollowups = days + months + monthsShort
680 validFollowups.append(
"hoy")
681 validFollowups.append(
u"mañana")
682 validFollowups.append(
"ayer")
683 validFollowups.append(
"anteayer")
684 validFollowups.append(
"ahora")
685 validFollowups.append(
"ya")
686 validFollowups.append(
"ante")
689 if word
in froms
and wordNext
in validFollowups:
691 if not (wordNext ==
u"mañana" and wordNext ==
"ayer")
and not (
692 word ==
"pasado" or word ==
"antes"):
695 if wordNext ==
u"mañana" and word !=
"pasado":
697 elif wordNext ==
"ayer":
699 elif wordNext ==
"anteayer":
701 elif wordNext ==
"ante" and wordNextNext ==
"ayer":
703 elif (wordNext ==
"ante" and wordNext ==
"ante" and 704 wordNextNextNext ==
"ayer"):
706 elif wordNext
in days:
707 d = days.index(wordNext)
708 tmpOffset = (d + 1) - int(today)
715 if wordNextNext
in nxts:
718 elif wordNextNext
in prevs:
721 dayOffset += tmpOffset
722 elif wordNextNext
and wordNextNext
in days:
723 d = days.index(wordNextNext)
724 tmpOffset = (d + 1) - int(today)
727 if wordNextNextNext
in nxts:
730 elif wordNextNextNext
in prevs:
733 dayOffset += tmpOffset
736 if wordNext
in months:
740 if start - 1 > 0
and words[start - 1]
in lists:
744 for i
in range(0, used):
745 words[i + start] =
"" 747 if start - 1 >= 0
and words[start - 1]
in lists:
748 words[start - 1] =
"" 759 for idx, word
in enumerate(words):
763 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 764 wordPrev = words[idx - 1]
if idx > 0
else "" 765 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 766 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 767 wordNextNextNext = words[idx + 3]
if idx + 3 < len(words)
else "" 770 if word ==
"medio" and wordNext ==
u"día":
773 elif word ==
"media" and wordNext ==
"noche":
776 elif word ==
u"mañana":
780 elif word ==
"tarde":
784 elif word ==
"media" and wordNext ==
"tarde":
788 elif word ==
"tarde" and wordNext ==
"noche":
792 elif word ==
"media" and wordNext ==
u"mañana":
804 elif word ==
"madrugada":
808 elif word ==
"noche":
813 elif word ==
"hora" and \
814 (wordPrev
in time_indicators
or wordPrevPrev
in 816 if wordPrev ==
"media":
818 elif wordPrev ==
"cuarto":
820 elif wordPrevPrev ==
"cuarto":
822 if idx > 2
and words[idx - 3]
in time_indicators:
827 if wordPrevPrev
in time_indicators:
834 elif word[0].isdigit():
844 for i
in range(length):
846 if word[i].isdigit():
854 if word[i].isdigit():
860 remainder = word[i:].replace(
".",
"")
863 nextWord = wordNext.replace(
".",
"")
864 if nextWord ==
"am" or nextWord ==
"pm":
867 elif wordNext ==
u"mañana" or wordNext ==
"madrugada":
870 elif wordNext ==
"tarde":
873 elif wordNext ==
"noche":
874 if 0 < int(word[0]) < 6:
879 elif wordNext
in thises
and wordNextNext ==
u"mañana":
882 elif wordNext
in thises
and wordNextNext ==
"tarde":
885 elif wordNext
in thises
and wordNextNext ==
"noche":
889 if timeQualifier !=
"":
891 (timeQualifier ==
u"mañana" or 892 timeQualifier ==
"tarde"):
901 for i
in range(length):
902 if word[i].isdigit():
908 remainder = wordNext.replace(
".",
"").lstrip().rstrip()
913 remainder ==
"p.m." or 921 remainder ==
"a.m." or 927 if (wordNext ==
"pm" or 928 wordNext ==
"p.m." or 929 wordNext ==
"tarde"):
933 elif (wordNext ==
"am" or 934 wordNext ==
"a.m." or 935 wordNext ==
u"mañana"):
939 elif (int(word) > 100
and 946 strHH = int(word) / 100
947 strMM = int(word) - strHH * 100
948 if wordNext ==
"hora":
951 wordNext ==
"hora" and 965 elif wordNext ==
"minuto":
967 minOffset = int(word)
972 elif wordNext ==
"segundo":
974 secOffset = int(word)
979 elif int(word) > 100:
980 strHH = int(word) / 100
981 strMM = int(word) - strHH * 100
982 if wordNext ==
"hora":
985 elif wordNext ==
"" or (
986 wordNext ==
"en" and wordNextNext ==
"punto"):
989 if wordNext ==
"en" and wordNextNext ==
"punto":
991 if wordNextNextNext ==
"tarde":
994 elif wordNextNextNext ==
u"mañana":
997 elif wordNextNextNext ==
"noche":
1004 elif wordNext[0].isdigit():
1008 if wordNextNext ==
"hora":
1013 strHH = int(strHH)
if strHH
else 0
1014 strMM = int(strMM)
if strMM
else 0
1015 strHH = strHH + 12
if (remainder ==
"pm" and 1016 0 < strHH < 12)
else strHH
1017 strHH = strHH - 12
if (remainder ==
"am" and 1018 0 < strHH >= 12)
else strHH
1019 if strHH > 24
or strMM > 59:
1029 for i
in range(used):
1032 if wordPrev ==
"en" or wordPrev ==
"punto":
1033 words[words.index(wordPrev)] =
"" 1035 if idx > 0
and wordPrev
in time_indicators:
1037 if idx > 1
and wordPrevPrev
in time_indicators:
1047 if dayOffset
is False:
1052 extractedDate = dateNow
1053 extractedDate = extractedDate.replace(microsecond=0,
1058 en_months = [
'january',
'february',
'march',
'april',
'may',
'june',
1059 'july',
'august',
'september',
'october',
'november',
1061 en_monthsShort = [
'jan',
'feb',
'mar',
'apr',
'may',
'june',
'july',
1063 'sept',
'oct',
'nov',
'dec']
1064 for idx, en_month
in enumerate(en_months):
1065 datestr = datestr.replace(months[idx], en_month)
1066 for idx, en_month
in enumerate(en_monthsShort):
1067 datestr = datestr.replace(monthsShort[idx], en_month)
1069 temp = datetime.strptime(datestr,
"%B %d")
1071 temp = temp.replace(year=extractedDate.year)
1072 if extractedDate < temp:
1073 extractedDate = extractedDate.replace(year=int(currentYear),
1077 day=int(temp.strftime(
1080 extractedDate = extractedDate.replace(
1081 year=int(currentYear) + 1,
1082 month=int(temp.strftime(
"%m")),
1083 day=int(temp.strftime(
"%d")))
1085 extractedDate = extractedDate.replace(
1086 year=int(temp.strftime(
"%Y")),
1087 month=int(temp.strftime(
"%m")),
1088 day=int(temp.strftime(
"%d")))
1091 extractedDate = extractedDate + relativedelta(years=yearOffset)
1092 if monthOffset != 0:
1093 extractedDate = extractedDate + relativedelta(months=monthOffset)
1095 extractedDate = extractedDate + relativedelta(days=dayOffset)
1097 if hrAbs
is None and minAbs
is None and default_time:
1098 hrAbs = default_time.hour
1099 minAbs = default_time.minute
1101 if hrAbs != -1
and minAbs != -1:
1102 extractedDate = extractedDate + relativedelta(hours=hrAbs
or 0,
1103 minutes=minAbs
or 0)
1104 if (hrAbs
or minAbs)
and datestr ==
"":
1105 if not daySpecified
and dateNow > extractedDate:
1106 extractedDate = extractedDate + relativedelta(days=1)
1108 extractedDate = extractedDate + relativedelta(hours=hrOffset)
1110 extractedDate = extractedDate + relativedelta(minutes=minOffset)
1112 extractedDate = extractedDate + relativedelta(seconds=secOffset)
1114 resultStr =
" ".join(words)
1115 resultStr =
' '.join(resultStr.split())
1117 return [extractedDate, resultStr]
1124 word = word.rstrip(
"s")
1126 words = raw_string.split(
" ")
1127 for idx, w
in enumerate(words):
1128 if w == word
and idx != 0:
1129 previous = words[idx - 1]
1135 if word[-1] ==
"o" or word[-1] ==
"e":
def es_number_parse(words, i)
def extractnumber_es(text)
def isFractional_es(input_str)
def extract_datetime_es(input_str, currentDate=None, default_time=None)
def is_numeric(input_str)
def look_for_fractions(split_list)
def get_gender_es(word, raw_string="")
def normalize_es(text, remove_articles)