17 """ Parse functions for french (fr) 20 * extractnumber_fr: ordinal numbers ("cinquième") 21 * extractnumber_fr: numbers greater than 999 999 ("cinq millions") 22 * extract_datetime_fr: "quatrième lundi de janvier" 26 from datetime
import datetime
27 from dateutil.relativedelta
import relativedelta
29 extract_numbers_generic
# French articles and article-like function words. Elsewhere in this module
# words are tested for membership in this list (number extraction and
# normalization with remove_articles).
articles_fr = ["le", "la", "du", "de", "les", "des"]
66 "quatre-vingt-dix": 90,
76 "milliard": 1000000000,
77 "milliards": 1000000000}
# Suffixes that mark an ordinal written with digits, e.g. "1er", "2nde",
# "3ième", "4ème", "5e". The ordinal helper in this module iterates over
# this tuple and strips a matching suffix from words that start with a digit.
#
# NOTE: every entry must be followed by a comma. Two adjacent string
# literals are silently concatenated by Python, which previously turned
# "nde" and "ième" into the single bogus suffix "ndeième" and left both
# real suffixes out of the tuple.
ordinals_fr = (
    "er",
    "re",
    "ère",
    "nd",
    "nde",
    "ième",
    "ème",
    "e",
)
83 """ Parses a list of words to find a number 84 Takes in a list of words (strings without whitespace) and 85 extracts a number that starts at the given index. 87 words (array): the list to extract a number from 88 i (int): the index in words where to look for the number 90 tuple with number, index of next word after the number. 92 Returns None if no number was found. 99 if i < len(words)
and s == words[i]:
103 def number_word_fr(i, mi, ma):
108 val = numbers_fr.get(words[i])
111 if val >= mi
and val <= ma:
116 splitWord = words[i].split(
'-')
117 if len(splitWord) > 1:
118 val1 = numbers_fr.get(splitWord[0])
123 if val1 < 10
and splitWord[1] ==
"cents":
128 if len(splitWord) > i1
and splitWord[0] ==
"quatre" and \
129 splitWord[1] ==
"vingt":
137 if len(splitWord) > i1:
139 if len(splitWord) > i1 + 1
and splitWord[i1] ==
"et":
140 val2 = numbers_fr.get(splitWord[i1 + 1])
144 elif splitWord[i1] ==
"dix" and \
145 len(splitWord) > i1 + 1:
146 val2 = numbers_fr.get(splitWord[i1 + 1])
151 val2 = numbers_fr.get(splitWord[i1])
154 if len(splitWord) > i1:
155 val3 = numbers_fr.get(splitWord[i1])
161 val = val1 + val2 + val3
166 if i1 == len(splitWord)
and val
and ma >= val >= mi:
171 def number_1_99_fr(i):
177 result1 = number_word_fr(i, 1, 16)
182 result1 = number_word_fr(i, 10, 99)
185 result2 = cte_fr(i1,
"et")
189 result3 = number_word_fr(i2, 1, 11)
192 return val1 + val3, i3
198 def number_1_999_fr(i):
204 result = number_word_fr(i, 100, 100)
208 resultH1 = number_word_fr(i, 2, 9)
210 valH1, iH1 = resultH1
211 resultH2 = number_word_fr(iH1, 100, 100)
214 result = valH1 * 100, iH2
218 result2 = number_1_99_fr(i1)
221 return val1 + val2, i2
226 result = number_word_fr(i, 101, 999)
231 result = number_1_99_fr(i)
237 def number_1_999999_fr(i):
238 """ Find a number in a list of words 239 Checks if words[i] is a number between 1 and 999,999. 242 i (int): the index in words where to look for the number 244 tuple with number, index of next word after the number. 246 Returns None if no number was found. 250 result1 = number_word_fr(i, 0, 0)
255 result1 = number_1_999_fr(i)
262 result2 = number_word_fr(i1, 1000, 1000)
267 result3 = number_1_999_fr(i2)
270 return val1 * 1000 + val3, i3
272 return val1 * 1000, i2
277 return number_1_999999_fr(i)
281 """ Get the ordinal number 282 Takes in a word (string without whitespace) and 283 extracts the ordinal number. 285 word (string): the word to extract the number from 289 Returns None if no ordinal number was found. 292 for ordinal
in ordinals_fr:
293 if word[0].isdigit()
and ordinal
in word:
294 result = word.replace(ordinal,
"")
302 """ Find an ordinal number in a list of words 303 Takes in a list of words (strings without whitespace) and 304 extracts an ordinal number that starts at the given index. 306 words (array): the list to extract a number from 307 i (int): the index in words where to look for the ordinal number 309 tuple with ordinal number (str), 310 index of next word after the number (int). 312 Returns None if no ordinal number was found. 322 strOrd = str(val1) +
"e" 334 if word
in [
"premier",
"première"]:
336 elif word ==
"second":
338 elif word.endswith(
"ième"):
344 strOrd = str(val1 * 100) +
"e" 350 strOrd = str(val1 * 1000) +
"e" 355 if word.endswith(
"cinqu"):
358 elif word.endswith(
"neuv"):
359 word = word[:-1] +
"f" 368 strOrd = str(val1 + val2) +
"e" 376 """Takes in a string and extracts a number. 378 text (str): the string to extract a number from 380 (str): The number extracted or the original text. 385 aWords = text.split()
389 while count < len(aWords):
394 if count < (len(aWords) - 1):
395 wordNext = aWords[count + 1]
397 wordPrev = aWords[count - 1]
399 if word
in articles_fr:
402 if word
in [
"et",
"plus",
"+"]:
426 val = float(val) * valNext
432 aPieces = word.split(
'/')
436 val = float(aPieces[0]) / float(aPieces[1])
439 if wordNext ==
"virgule":
441 newWords = aWords[count + 1:]
443 for word
in newWords:
444 if word ==
"zéro" or word ==
"0":
450 if newWords[zeros].isdigit():
451 afterDotVal = newWords[zeros]
452 countDot = count + zeros + 2
460 afterDotString = zeros *
"0" + afterDotVal
461 val = float(str(val) +
"." + afterDotString)
479 cleans the input string of unneeded punctuation and capitalization 484 for idx, word
in enumerate(wordList):
486 if word[-1]
in [
",",
"."]:
496 yearOffset != 0
or monthOffset != 0
or dayOffset
or 497 (isTime
and (hrAbs
or minAbs))
or 498 hrOffset != 0
or minOffset != 0
or secOffset != 0
501 if string ==
"" or not currentDate:
509 dateNow = currentDate
510 today = dateNow.strftime(
"%w")
511 currentYear = dateNow.strftime(
"%Y")
517 timeQualifiersList = [
"matin",
"après-midi",
"soir",
"nuit"]
518 words_in = [
"dans",
"après"]
519 markers = [
"à",
"dès",
"autour",
"vers",
"environs",
"ce",
521 days = [
"lundi",
"mardi",
"mercredi",
522 "jeudi",
"vendredi",
"samedi",
"dimanche"]
523 months = [
"janvier",
"février",
"mars",
"avril",
"mai",
"juin",
524 "juillet",
"août",
"septembre",
"octobre",
"novembre",
526 monthsShort = [
"jan",
"fév",
"mar",
"avr",
"mai",
"juin",
"juil",
"aoû",
527 "sept",
"oct",
"nov",
"déc"]
529 months_en = [
'january',
'february',
'march',
'april',
'may',
'june',
530 'july',
'august',
'september',
'october',
'november',
533 words = clean_string(string)
535 for idx, word
in enumerate(words):
538 wordPrevPrevPrev = words[idx - 3]
if idx > 2
else "" 539 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 540 wordPrev = words[idx - 1]
if idx > 0
else "" 541 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 542 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 547 if word
in timeQualifiersList:
550 if wordPrev
in [
"ce",
"cet",
"cette"]:
554 elif word ==
"aujourd'hui" and not fromFlag:
557 elif word ==
"demain" and not fromFlag:
560 elif word ==
"après-demain" and not fromFlag:
564 elif word
in [
"jour",
"jours"]:
565 if wordPrev.isdigit():
566 dayOffset += int(wordPrev)
574 elif word
in [
"semaine",
"semaines"]
and not fromFlag:
575 if wordPrev[0].isdigit():
576 dayOffset += int(wordPrev) * 7
579 elif wordNext
in [
"prochaine",
"suivante"]:
582 elif wordNext
in [
"dernière",
"précédente"]:
586 elif word ==
"mois" and not fromFlag:
587 if wordPrev[0].isdigit():
588 monthOffset = int(wordPrev)
591 elif wordNext
in [
"prochain",
"suivant"]:
594 elif wordNext
in [
"dernier",
"précédent"]:
598 elif word
in [
"an",
"ans",
"année",
"années"]
and not fromFlag:
599 if wordPrev[0].isdigit():
600 yearOffset = int(wordPrev)
603 elif wordNext
in [
"prochain",
"prochaine",
"suivant",
"suivante"]:
606 elif wordNext
in [
"dernier",
"dernière",
"précédent",
611 elif word
in days
and not fromFlag:
613 dayOffset = (d + 1) - int(today)
617 if wordNext
in [
"prochain",
"suivant"]:
620 elif wordNext
in [
"dernier",
"précédent"]:
624 elif word
in months
or word
in monthsShort
and not fromFlag:
626 m = months.index(word)
628 m = monthsShort.index(word)
630 datestr = months_en[m]
631 if wordPrev
and (wordPrev[0].isdigit()):
632 datestr +=
" " + wordPrev
637 if wordNext
and wordNext[0].isdigit():
638 datestr +=
" " + wordNext
645 validFollowups = days + months + monthsShort
646 validFollowups.append(
"aujourd'hui")
647 validFollowups.append(
"demain")
648 validFollowups.append(
"prochain")
649 validFollowups.append(
"prochaine")
650 validFollowups.append(
"suivant")
651 validFollowups.append(
"suivante")
652 validFollowups.append(
"dernier")
653 validFollowups.append(
"dernière")
654 validFollowups.append(
"précédent")
655 validFollowups.append(
"précédente")
656 validFollowups.append(
"maintenant")
657 if word
in [
"après",
"depuis"]
and wordNext
in validFollowups:
660 if wordNext ==
"demain":
662 elif wordNext
in days:
663 d = days.index(wordNext)
664 tmpOffset = (d + 1) - int(today)
666 if wordNextNext ==
"prochain":
669 elif wordNextNext ==
"dernier":
674 dayOffset += tmpOffset
676 if start - 1 > 0
and words[start - 1]
in [
"ce",
"cette"]:
680 for i
in range(0, used):
681 words[i + start] =
"" 683 if start - 1 >= 0
and words[start - 1]
in markers:
684 words[start - 1] =
"" 697 for idx, word
in enumerate(words):
701 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 702 wordPrev = words[idx - 1]
if idx > 0
else "" 703 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 704 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 709 if word
in [
"midi",
"minuit"]:
714 elif word ==
"minuit":
717 if wordNext.isdigit():
718 minAbs = int(wordNext)
720 elif wordNext ==
"et":
721 if wordNextNext ==
"quart":
724 elif wordNextNext ==
"demi":
727 elif wordNext ==
"moins":
728 if wordNextNext.isdigit():
729 minAbs = 60 - int(wordNextNext)
735 if wordNextNext ==
"quart":
743 elif word ==
"demi-heure" or word ==
"heure" and \
744 (wordPrevPrev
in markers
or wordPrevPrevPrev
in markers):
747 if word ==
"demi-heure":
749 elif wordPrev ==
"quart":
753 elif wordPrev ==
"quarts" and wordPrevPrev.isdigit():
754 minOffset = int(wordPrevPrev) * 15
757 if wordPrev.isdigit()
or wordPrevPrev.isdigit():
763 if ":" in word
or "h" in word
or "min" in word:
770 for i
in range(length):
772 if word[i].isdigit():
775 elif word[i]
in [
":",
"h",
"m"]:
781 if word[i].isdigit():
786 if word[i:i + 3] ==
"min":
790 if wordPrev
in words_in:
791 hrOffset = int(strHH)
if strHH
else 0
792 minOffset = int(strMM)
if strMM
else 0
794 hrAbs = int(strHH)
if strHH
else 0
795 minAbs = int(strMM)
if strMM
else 0
803 wordNext
in [
"heures",
"heure"]
and word !=
"0" and 809 if wordPrev
in words_in:
816 if idxHr < len(words):
818 if words[idxHr].isdigit():
819 if wordPrev
in words_in:
820 minOffset = int(words[idxHr])
822 minAbs = int(words[idxHr])
826 elif words[idxHr] ==
"et" and idxHr + 1 < len(words):
827 if words[idxHr + 1] ==
"quart":
828 if wordPrev
in words_in:
834 elif words[idxHr + 1] ==
"demi":
835 if wordPrev
in words_in:
842 elif words[idxHr] ==
"moins" and \
843 idxHr + 1 < len(words):
844 if words[idxHr + 1].isdigit():
845 if wordPrev
in words_in:
847 minOffset = 60 - int(words[idxHr + 1])
850 minAbs = 60 - int(words[idxHr + 1])
853 elif words[idxHr + 1] ==
"quart":
854 if wordPrev
in words_in:
863 if idxHr < len(words)
and \
864 words[idxHr]
in [
"minutes",
"minute"]:
867 elif wordNext ==
"minutes":
869 if wordPrev
in words_in:
870 minOffset = int(word)
874 elif wordNext ==
"secondes":
876 secOffset = int(word)
878 elif int(word) > 100:
880 hrAbs = int(word) / 100
881 minAbs = int(word) - hrAbs * 100
883 if wordNext ==
"heures":
888 if timeQualifier ==
"matin":
890 elif timeQualifier ==
"après-midi":
892 elif timeQualifier ==
"soir":
894 elif timeQualifier ==
"nuit":
899 hrAbs = ((hrAbs
or 0) + 12
if ampm ==
"pm" and (hrAbs
or 0) < 12
901 hrAbs = ((hrAbs
or 0) - 12
if ampm ==
"am" and (hrAbs
or 0) >= 12
903 if (hrAbs
or 0) > 24
or ((minAbs
or 0) > 59):
906 elif wordPrev
in words_in:
911 elif not hrAbs
and timeQualifier:
912 if timeQualifier ==
"matin":
914 elif timeQualifier ==
"après-midi":
916 elif timeQualifier ==
"soir":
918 elif timeQualifier ==
"nuit":
924 for i
in range(0, used):
925 words[i + start] =
"" 927 if start - 1 >= 0
and words[start - 1]
in markers:
928 words[start - 1] =
"" 937 if dayOffset
is False:
941 extractedDate = dateNow
942 extractedDate = extractedDate.replace(microsecond=0,
948 temp = datetime.strptime(datestr,
"%B %d")
949 temp = temp.replace(year=extractedDate.year)
950 if extractedDate < temp:
951 extractedDate = extractedDate.replace(year=int(currentYear),
955 day=int(temp.strftime(
958 extractedDate = extractedDate.replace(
959 year=int(currentYear) + 1,
960 month=int(temp.strftime(
"%m")),
961 day=int(temp.strftime(
"%d")))
963 temp = datetime.strptime(datestr,
"%B %d %Y")
964 extractedDate = extractedDate.replace(
965 year=int(temp.strftime(
"%Y")),
966 month=int(temp.strftime(
"%m")),
967 day=int(temp.strftime(
"%d")))
970 extractedDate = extractedDate + relativedelta(years=yearOffset)
972 extractedDate = extractedDate + relativedelta(months=monthOffset)
974 extractedDate = extractedDate + relativedelta(days=dayOffset)
976 if hrAbs
is None and minAbs
is None and default_time:
977 hrAbs = default_time.hour
978 minAbs = default_time.minute
979 if hrAbs != -1
and minAbs != -1:
980 extractedDate = extractedDate + relativedelta(hours=hrAbs
or 0,
982 if (hrAbs
or minAbs)
and datestr ==
"":
983 if not daySpecified
and dateNow > extractedDate:
984 extractedDate = extractedDate + relativedelta(days=1)
986 extractedDate = extractedDate + relativedelta(hours=hrOffset)
988 extractedDate = extractedDate + relativedelta(minutes=minOffset)
990 extractedDate = extractedDate + relativedelta(seconds=secOffset)
991 for idx, word
in enumerate(words):
992 if words[idx] ==
"et" and words[idx - 1] ==
"" and \
993 words[idx + 1] ==
"":
996 resultStr =
" ".join(words)
997 resultStr =
' '.join(resultStr.split())
998 return [extractedDate, resultStr]
1003 This function takes the given text and checks if it is a fraction. 1005 input_str (str): the string to check if fractional 1007 (bool) or (float): False if not a fraction, otherwise the fraction 1009 input_str = input_str.lower()
1011 if input_str !=
"tiers" and input_str.endswith(
's', -1):
1012 input_str = input_str[:len(input_str) - 1]
1014 aFrac = [
"entier",
"demi",
"tiers",
"quart",
"cinquième",
"sixième",
1015 "septième",
"huitième",
"neuvième",
"dixième",
"onzième",
1016 "douzième",
"treizième",
"quatorzième",
"quinzième",
"seizième",
1017 "dix-septième",
"dix-huitième",
"dix-neuvième",
"vingtième"]
1019 if input_str
in aFrac:
1020 return 1.0 / (aFrac.index(input_str) + 1)
1023 if input_str ==
"trentième":
1025 if input_str ==
"centième":
1027 if input_str ==
"millième":
1034 """ French string normalization """ 1036 words = text.split()
1039 while i < len(words):
1041 if remove_articles
and words[i]
in articles_fr:
1044 if remove_articles
and words[i][:2]
in [
"l'",
"d'"]:
1045 words[i] = words[i][2:]
1047 if words[i]
in [
"?",
"!",
";",
"…"]:
1051 if i > 0
and words[i - 1]
in articles_fr:
1053 if result
is not None:
1055 normalized +=
" " + str(val)
1059 if result
is not None:
1061 normalized +=
" " + str(val)
1064 normalized +=
" " + words[i]
1067 return normalized[1:]
1072 Takes in a string and extracts a list of numbers. 1075 text (str): the string to extract a number from 1076 short_scale (bool): Use "short scale" or "long scale" for large 1077 numbers -- over a million. The default is short scale, which 1078 is now common in most English speaking countries. 1079 See https://en.wikipedia.org/wiki/Names_of_large_numbers 1080 ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 1082 list: list of extracted numbers as floats 1085 short_scale=short_scale, ordinals=ordinals)
def extract_datetime_fr(string, currentDate, default_time)
def number_parse_fr(words, i)
def number_ordinal_fr(words, i)
def extract_numbers_generic(text, pronounce_handler, extract_handler, short_scale=True, ordinals=False)
def is_numeric(input_str)
def look_for_fractions(split_list)
def isFractional_fr(input_str)
def normalize_fr(text, remove_articles)
def extract_numbers_fr(text, short_scale=True, ordinals=False)
def extractnumber_fr(text)