17 from datetime
import datetime
18 from dateutil.relativedelta
import relativedelta
20 extract_numbers_generic
55 u'fünfundzwanzig': 25,
56 'sechsundzwanzig': 26,
57 'siebenundzwanzig': 27,
84 This function prepares the given text for parsing by making 85 numbers consistent, getting rid of contractions, etc. 87 text (str): the string to normalize 89 (int) or (float): The value of extracted number 92 undefined articles cannot be suppressed in German: 93 'ein Pferd' means 'one horse' and 'a horse' 97 aWords = [word
for word
in aWords
if 98 word
not in [
"der",
"die",
"das",
"des",
"den",
"dem"]]
103 while count < len(aWords):
113 if word
in de_numbers:
114 val = de_numbers[word]
115 if count < (len(aWords) - 1):
116 wordNext = aWords[count + 1]
123 aWords[count + 1] =
"" 127 aPieces = word.split(
'/')
131 val = float(aPieces[0]) / float(aPieces[1])
143 aWords[count - 1] =
'' 145 elif count + 1 < len(aWords)
and aWords[count + 1] ==
'und':
151 elif count + 2 < len(aWords)
and aWords[count + 2] ==
'und':
169 cleans the input string of unneeded punctuation 170 and capitalization among other things. 172 'am' is a preposition, so cannot currently be used 173 for 12 hour date format 176 s = s.lower().replace(
'?',
'').replace(
'.',
'').replace(
',',
'') \
177 .replace(
' der ',
' ').replace(
' den ',
' ').replace(
' an ',
180 .replace(
' auf ',
' ').replace(
' um ',
' ')
183 for idx, word
in enumerate(wordList):
193 datestr !=
"" or timeStr !=
"" or 194 yearOffset != 0
or monthOffset != 0
or 195 dayOffset
is True or hrOffset != 0
or 196 hrAbs
or minOffset != 0
or 197 minAbs
or secOffset != 0
200 if string ==
"" or not currentDate:
208 dateNow = currentDate
209 today = dateNow.strftime(
"%w")
210 currentYear = dateNow.strftime(
"%Y")
216 timeQualifiersList = [
u'früh',
'morgens',
'vormittag',
'vormittags',
217 'nachmittag',
'nachmittags',
'abend',
'abends',
219 markers = [
'in',
'am',
'gegen',
'bis',
u'für']
220 days = [
'montag',
'dienstag',
'mittwoch',
221 'donnerstag',
'freitag',
'samstag',
'sonntag']
222 months = [
'januar',
'februar',
u'märz',
'april',
'mai',
'juni',
223 'juli',
'august',
'september',
'october',
'november',
225 monthsShort = [
'jan',
'feb',
u'mär',
'apr',
'mai',
'juni',
'juli',
'aug',
226 'sept',
'oct',
'nov',
'dez']
228 validFollowups = days + months + monthsShort
229 validFollowups.append(
"heute")
230 validFollowups.append(
"morgen")
231 validFollowups.append(
u"nächste")
232 validFollowups.append(
u"nächster")
233 validFollowups.append(
u"nächstes")
234 validFollowups.append(
u"nächsten")
235 validFollowups.append(
u"nächstem")
236 validFollowups.append(
"letzte")
237 validFollowups.append(
"letzter")
238 validFollowups.append(
"letztes")
239 validFollowups.append(
"letzten")
240 validFollowups.append(
"letztem")
241 validFollowups.append(
"jetzt")
243 words = clean_string(string)
245 for idx, word
in enumerate(words):
248 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 249 wordPrev = words[idx - 1]
if idx > 0
else "" 250 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 251 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 255 if word !=
'morgen' and word !=
u'übermorgen':
256 if word[-2:] ==
"en":
265 if word
in timeQualifiersList:
268 elif word ==
"heute" and not fromFlag:
271 elif word ==
"morgen" and not fromFlag
and wordPrev !=
"am" and \
272 wordPrev
not in days:
276 elif word ==
u"übermorgen" and not fromFlag:
280 elif word ==
"tag" or word ==
"tage":
281 if wordPrev[0].isdigit():
282 dayOffset += int(wordPrev)
285 elif word ==
"woch" and not fromFlag:
286 if wordPrev[0].isdigit():
287 dayOffset += int(wordPrev) * 7
290 elif wordPrev[:6] ==
u"nächst":
294 elif wordPrev[:5] ==
"letzt":
299 elif word ==
"monat" and not fromFlag:
300 if wordPrev[0].isdigit():
301 monthOffset = int(wordPrev)
304 elif wordPrev[:6] ==
u"nächst":
308 elif wordPrev[:5] ==
"letzt":
313 elif word ==
"jahr" and not fromFlag:
314 if wordPrev[0].isdigit():
315 yearOffset = int(wordPrev)
318 elif wordPrev[:6] ==
u"nächst":
322 elif wordPrev[:6] ==
u"nächst":
328 elif word
in days
and not fromFlag:
330 dayOffset = (d + 1) - int(today)
334 if wordNext ==
"morgen":
336 words[idx + 1] =
u"früh" 337 if wordPrev[:6] ==
u"nächst":
341 elif wordPrev[:5] ==
"letzt":
346 elif word
in months
or word
in monthsShort
and not fromFlag:
348 m = months.index(word)
350 m = monthsShort.index(word)
353 if wordPrev
and (wordPrev[0].isdigit()
or 354 (wordPrev ==
"of" and wordPrevPrev[0].isdigit())):
355 if wordPrev ==
"of" and wordPrevPrev[0].isdigit():
356 datestr +=
" " + words[idx - 2]
360 datestr +=
" " + wordPrev
363 if wordNext
and wordNext[0].isdigit():
364 datestr +=
" " + wordNext
370 elif wordNext
and wordNext[0].isdigit():
371 datestr +=
" " + wordNext
373 if wordNextNext
and wordNextNext[0].isdigit():
374 datestr +=
" " + wordNextNext
383 word ==
"von" or word ==
"nach" or word ==
"ab")
and wordNext \
387 if wordNext ==
"morgen" and wordPrev !=
"am" and \
388 wordPrev
not in days:
391 elif wordNext
in days:
392 d = days.index(wordNext)
393 tmpOffset = (d + 1) - int(today)
397 dayOffset += tmpOffset
398 elif wordNextNext
and wordNextNext
in days:
399 d = days.index(wordNextNext)
400 tmpOffset = (d + 1) - int(today)
402 if wordNext[:6] ==
u"nächst":
406 elif wordNext[:5] ==
"letzt":
410 dayOffset += tmpOffset
412 if start - 1 > 0
and words[start - 1].startswith(
"diese"):
416 for i
in range(0, used):
417 words[i + start] =
"" 419 if start - 1 >= 0
and words[start - 1]
in markers:
420 words[start - 1] =
"" 432 for idx, word
in enumerate(words):
436 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 437 wordPrev = words[idx - 1]
if idx > 0
else "" 438 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 439 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 440 wordNextNextNext = words[idx + 3]
if idx + 3 < len(words)
else "" 441 wordNextNextNextNext = words[idx + 4]
if idx + 4 < len(words)
else "" 445 if word[:6] ==
"mittag":
448 elif word[:11] ==
"mitternacht":
451 elif word ==
"morgens" or (
452 wordPrev ==
"am" and word ==
"morgen")
or word ==
u"früh":
456 elif word[:10] ==
"nachmittag":
460 elif word[:5] ==
"abend":
465 elif word ==
"stunde" and \
466 (wordPrev
in markers
or wordPrevPrev
in markers):
467 if wordPrev[:4] ==
"halb":
469 elif wordPrev ==
"viertel":
471 elif wordPrev ==
"dreiviertel":
475 if wordPrevPrev
in markers:
482 elif word[0].isdigit():
492 for i
in range(length):
494 if word[i].isdigit():
502 if word[i].isdigit():
508 remainder = word[i:].replace(
".",
"")
511 nextWord = wordNext.replace(
".",
"")
512 if nextWord ==
"am" or nextWord ==
"pm":
515 elif nextWord ==
"abends":
518 elif wordNext ==
"am" and wordNextNext ==
"morgen":
521 elif wordNext ==
"am" and wordNextNext ==
"nachmittag":
524 elif wordNext ==
"am" and wordNextNext ==
"abend":
527 elif wordNext ==
"morgens":
530 elif wordNext ==
"nachmittags":
533 elif wordNext ==
"abends":
536 elif wordNext ==
"heute" and wordNextNext ==
"morgen":
539 elif wordNext ==
"heute" and wordNextNext ==
"nachmittag":
542 elif wordNext ==
"heute" and wordNextNext ==
"abend":
545 elif wordNext ==
"nachts":
552 if timeQualifier !=
"":
554 (timeQualifier ==
"abends" or 555 timeQualifier ==
"nachmittags"):
563 for i
in range(length):
564 if word[i].isdigit():
570 remainder = wordNext.replace(
".",
"").lstrip().rstrip()
575 remainder ==
"p.m." or 583 remainder ==
"a.m." or 589 if wordNext ==
"stund" and int(word) < 100:
596 elif wordNext ==
"minut":
598 minOffset = int(word)
603 elif wordNext ==
"sekund":
605 secOffset = int(word)
611 elif wordNext ==
"uhr":
615 if wordNextNext == timeQualifier:
617 if wordNextNext[:10] ==
"nachmittag":
620 elif wordNextNext ==
"am" and wordNextNextNext == \
624 elif wordNextNext[:5] ==
"abend":
627 elif wordNextNext ==
"am" and wordNextNextNext == \
631 elif wordNextNext[:7] ==
"morgens":
634 elif wordNextNext ==
"am" and wordNextNextNext == \
638 elif wordNextNext ==
"nachts":
640 if 8 <= int(word) <= 12:
648 if wordNextNextNext == timeQualifier:
649 if wordNextNextNext[:10] ==
"nachmittag":
652 elif wordNextNextNext ==
"am" and \
653 wordNextNextNextNext ==
"nachmittag":
656 elif wordNextNextNext[:5] ==
"abend":
659 elif wordNextNextNext ==
"am" and \
660 wordNextNextNextNext ==
"abend":
663 elif wordNextNextNext[:7] ==
"morgens":
666 elif wordNextNextNext ==
"am" and \
667 wordNextNextNextNext ==
"morgen":
670 elif wordNextNextNext ==
"nachts":
672 if 8 <= int(word) <= 12:
677 elif wordNext == timeQualifier:
681 if wordNext[:10] ==
"nachmittag":
684 elif wordNext ==
"am" and wordNextNext ==
"nachmittag":
687 elif wordNext[:5] ==
"abend":
690 elif wordNext ==
"am" and wordNextNext ==
"abend":
693 elif wordNext[:7] ==
"morgens":
696 elif wordNext ==
"am" and wordNextNext ==
"morgen":
699 elif wordNext ==
"nachts":
701 if 8 <= int(word) <= 12:
711 strHH = int(strHH)
if strHH
else 0
712 strMM = int(strMM)
if strMM
else 0
713 strHH = strHH + 12
if remainder ==
"pm" and strHH < 12
else strHH
714 strHH = strHH - 12
if remainder ==
"am" and strHH >= 12
else strHH
715 if strHH > 24
or strMM > 59:
724 for i
in range(used):
727 if wordPrev ==
"Uhr":
728 words[words.index(wordPrev)] =
"" 730 if wordPrev ==
u"früh":
734 elif wordPrev ==
u"spät":
738 if idx > 0
and wordPrev
in markers:
740 if idx > 1
and wordPrevPrev
in markers:
750 if dayOffset
is False:
755 extractedDate = dateNow
756 extractedDate = extractedDate.replace(microsecond=0,
761 en_months = [
'january',
'february',
'march',
'april',
'may',
'june',
762 'july',
'august',
'september',
'october',
'november',
764 en_monthsShort = [
'jan',
'feb',
'mar',
'apr',
'may',
'june',
'july',
766 'sept',
'oct',
'nov',
'dec']
767 for idx, en_month
in enumerate(en_months):
768 datestr = datestr.replace(months[idx], en_month)
769 for idx, en_month
in enumerate(en_monthsShort):
770 datestr = datestr.replace(monthsShort[idx], en_month)
772 temp = datetime.strptime(datestr,
"%B %d")
774 temp = temp.replace(year=extractedDate.year)
775 if extractedDate < temp:
776 extractedDate = extractedDate.replace(year=int(currentYear),
780 day=int(temp.strftime(
783 extractedDate = extractedDate.replace(
784 year=int(currentYear) + 1,
785 month=int(temp.strftime(
"%m")),
786 day=int(temp.strftime(
"%d")))
788 extractedDate = extractedDate.replace(
789 year=int(temp.strftime(
"%Y")),
790 month=int(temp.strftime(
"%m")),
791 day=int(temp.strftime(
"%d")))
794 temp = datetime(timeStr)
795 extractedDate = extractedDate.replace(hour=temp.strftime(
"%H"),
796 minute=temp.strftime(
"%M"),
797 second=temp.strftime(
"%S"))
800 extractedDate = extractedDate + relativedelta(years=yearOffset)
802 extractedDate = extractedDate + relativedelta(months=monthOffset)
804 extractedDate = extractedDate + relativedelta(days=dayOffset)
806 if hrAbs
is None and minAbs
is None and default_time:
807 hrAbs = default_time.hour
808 minAbs = default_time.minute
810 if hrAbs != -1
and minAbs != -1:
812 extractedDate = extractedDate + relativedelta(hours=hrAbs
or 0,
814 if (hrAbs
or minAbs)
and datestr ==
"":
815 if not daySpecified
and dateNow > extractedDate:
816 extractedDate = extractedDate + relativedelta(days=1)
818 extractedDate = extractedDate + relativedelta(hours=hrOffset)
820 extractedDate = extractedDate + relativedelta(minutes=minOffset)
822 extractedDate = extractedDate + relativedelta(seconds=secOffset)
823 for idx, word
in enumerate(words):
824 if words[idx] ==
"und" and words[idx - 1] ==
"" \
825 and words[idx + 1] ==
"":
828 resultStr =
" ".join(words)
829 resultStr =
' '.join(resultStr.split())
831 return [extractedDate, resultStr]
836 This function takes the given text and checks if it is a fraction. 839 input_str (str): the string to check if fractional 841 (bool) or (float): False if not a fraction, otherwise the fraction 844 if input_str.lower().startswith(
"halb"):
847 if input_str.lower() ==
"drittel":
849 elif input_str.endswith(
'tel'):
850 if input_str.endswith(
'stel'):
851 input_str = input_str[:len(input_str) - 4]
853 input_str = input_str[:len(input_str) - 3]
854 if input_str.lower()
in de_numbers:
855 return 1.0 / (de_numbers[input_str.lower()])
862 This function takes the given text and checks if it is an ordinal number. 865 input_str (str): the string to check if ordinal 867 (bool) or (float): False if not an ordinal, otherwise the number 868 corresponding to the ordinal 870 ordinals for 1, 3, 7 and 8 are irregular 872 only works for ordinals corresponding to the numbers in de_numbers 876 lowerstr = input_str.lower()
878 if lowerstr.startswith(
"erste"):
880 if lowerstr.startswith(
"dritte"):
882 if lowerstr.startswith(
"siebte"):
884 if lowerstr.startswith(
"achte"):
887 if lowerstr[-3:] ==
"ste":
888 lowerstr = lowerstr[:-3]
889 if lowerstr
in de_numbers:
890 return de_numbers[lowerstr]
892 if lowerstr[-4:]
in [
"ster",
"stes",
"sten",
"stem"]:
893 lowerstr = lowerstr[:-4]
894 if lowerstr
in de_numbers:
895 return de_numbers[lowerstr]
897 if lowerstr[-2:] ==
"te":
898 lowerstr = lowerstr[:-2]
899 if lowerstr
in de_numbers:
900 return de_numbers[lowerstr]
902 if lowerstr[-3:]
in [
"ter",
"tes",
"ten",
"tem"]:
903 lowerstr = lowerstr[:-3]
904 if lowerstr
in de_numbers:
905 return de_numbers[lowerstr]
911 """ German string normalization """ 916 if remove_articles
and word
in [
"der",
"die",
"das",
"des",
"den",
921 contraction = [
"net",
"nett"]
922 if word
in contraction:
923 expansion = [
"nicht",
"nicht"]
924 word = expansion[contraction.index(word)]
928 if word
in de_numbers:
929 word = str(de_numbers[word])
931 normalized +=
" " + word
933 return normalized[1:]
938 Takes in a string and extracts a list of numbers. 941 text (str): the string to extract a number from 942 short_scale (bool): Use "short scale" or "long scale" for large 943 numbers -- over a million. The default is short scale, which 944 is now common in most English speaking countries. 945 See https://en.wikipedia.org/wiki/Names_of_large_numbers 946 ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 948 list: list of extracted numbers as floats 951 short_scale=short_scale, ordinals=ordinals)
def extract_numbers_de(text, short_scale=True, ordinals=False)
def isFractional_de(input_str)
def extract_numbers_generic(text, pronounce_handler, extract_handler, short_scale=True, ordinals=False)
def normalize_de(text, remove_articles)
def is_numeric(input_str)
def look_for_fractions(split_list)
def extract_datetime_de(string, currentDate, default_time)
def extractnumber_de(text)
def isOrdinal_de(input_str)