17 from datetime
import datetime
18 from dateutil.relativedelta
import relativedelta
20 extract_numbers_generic
79 This function prepares the given text for parsing by making 80 numbers consistent, getting rid of contractions, etc. 82 text (str): the string to normalize 84 (int) or (float): The value of extracted number 87 undefined articles cannot be suppressed in German: 88 'ein Pferd' means 'one horse' and 'a horse' 92 aWords = [word
for word
in aWords
if 93 word
not in [
"den",
"det"]]
98 while count < len(aWords):
108 if word
in da_numbers:
109 val = da_numbers[word]
110 if count < (len(aWords) - 1):
111 wordNext = aWords[count + 1]
118 aWords[count + 1] =
"" 122 aPieces = word.split(
'/')
126 val = float(aPieces[0]) / float(aPieces[1])
138 aWords[count - 1] =
'' 140 elif count + 1 < len(aWords)
and aWords[count + 1] ==
'og':
146 elif count + 2 < len(aWords)
and aWords[count + 2] ==
'og':
164 cleans the input string of unneeded punctuation 165 and capitalization among other things. 167 'am' is a preposition, so cannot currently be used 168 for 12 hour date format 171 s = s.lower().replace(
'?',
'').replace(
'.',
'').replace(
',',
'') \
172 .replace(
' den ',
' ').replace(
' det ',
' ').replace(
' om ',
175 .replace(
' på ',
' ').replace(
' om ',
' ')
178 for idx, word
in enumerate(wordList):
188 datestr !=
"" or timeStr !=
"" or 189 yearOffset != 0
or monthOffset != 0
or 190 dayOffset
is True or hrOffset != 0
or 191 hrAbs
or minOffset != 0
or 192 minAbs
or secOffset != 0
195 if string ==
"" or not currentDate:
203 dateNow = currentDate
204 today = dateNow.strftime(
"%w")
205 currentYear = dateNow.strftime(
"%Y")
211 timeQualifiersList = [
'tidlig',
222 markers = [
'i',
'om',
'på',
'klokken',
'ved']
223 days = [
'mandag',
'tirsdag',
'onsdag',
224 'torsdag',
'fredag',
'lørdag',
'søndag']
225 months = [
'januar',
'februar',
'marts',
'april',
'maj',
'juni',
226 'juli',
'august',
'september',
'oktober',
'november',
228 monthsShort = [
'jan',
'feb',
'mar',
'apr',
'maj',
'juni',
'juli',
'aug',
229 'sep',
'okt',
'nov',
'des']
231 validFollowups = days + months + monthsShort
232 validFollowups.append(
"i dag")
233 validFollowups.append(
"morgen")
234 validFollowups.append(
"næste")
235 validFollowups.append(
"forige")
236 validFollowups.append(
"nu")
238 words = clean_string(string)
240 for idx, word
in enumerate(words):
243 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 244 wordPrev = words[idx - 1]
if idx > 0
else "" 245 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 246 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 251 if word
in timeQualifiersList:
254 elif word ==
"dag" and not fromFlag:
257 elif word ==
"morgen" and not fromFlag
and wordPrev !=
"om" and \
258 wordPrev
not in days:
262 elif word ==
"overmorgen" and not fromFlag:
266 elif word ==
"dag" or word ==
"dage":
267 if wordPrev[0].isdigit():
268 dayOffset += int(wordPrev)
271 elif word ==
"uge" or word ==
"uger" and not fromFlag:
272 if wordPrev[0].isdigit():
273 dayOffset += int(wordPrev) * 7
276 elif wordPrev[:6] ==
"næste":
280 elif wordPrev[:5] ==
"forige":
285 elif word ==
"måned" and not fromFlag:
286 if wordPrev[0].isdigit():
287 monthOffset = int(wordPrev)
290 elif wordPrev[:6] ==
"næste":
294 elif wordPrev[:5] ==
"forige":
299 elif word ==
"år" and not fromFlag:
300 if wordPrev[0].isdigit():
301 yearOffset = int(wordPrev)
304 elif wordPrev[:6] ==
" næste":
308 elif wordPrev[:6] ==
"næste":
314 elif word
in days
and not fromFlag:
316 dayOffset = (d + 1) - int(today)
320 if wordNext ==
"morgen":
323 words[idx + 1] =
"tidlig" 324 if wordPrev[:6] ==
"næste":
328 elif wordPrev[:5] ==
"forige":
333 elif word
in months
or word
in monthsShort
and not fromFlag:
335 m = months.index(word)
337 m = monthsShort.index(word)
340 if wordPrev
and (wordPrev[0].isdigit()
or 341 (wordPrev ==
"of" and wordPrevPrev[0].isdigit())):
342 if wordPrev ==
"of" and wordPrevPrev[0].isdigit():
343 datestr +=
" " + words[idx - 2]
347 datestr +=
" " + wordPrev
350 if wordNext
and wordNext[0].isdigit():
351 datestr +=
" " + wordNext
357 elif wordNext
and wordNext[0].isdigit():
358 datestr +=
" " + wordNext
360 if wordNextNext
and wordNextNext[0].isdigit():
361 datestr +=
" " + wordNextNext
370 word ==
"fra" or word ==
"til" or word ==
"om")
and wordNext \
374 if wordNext ==
"morgenen" and \
375 wordPrev !=
"om" and \
376 wordPrev
not in days:
380 elif wordNext
in days:
381 d = days.index(wordNext)
382 tmpOffset = (d + 1) - int(today)
386 dayOffset += tmpOffset
387 elif wordNextNext
and wordNextNext
in days:
388 d = days.index(wordNextNext)
389 tmpOffset = (d + 1) - int(today)
391 if wordNext[:6] ==
"næste":
395 elif wordNext[:5] ==
"forige":
399 dayOffset += tmpOffset
401 if start - 1 > 0
and words[start - 1].startswith(
"denne"):
405 for i
in range(0, used):
406 words[i + start] =
"" 408 if start - 1 >= 0
and words[start - 1]
in markers:
409 words[start - 1] =
"" 421 for idx, word
in enumerate(words):
425 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 426 wordPrev = words[idx - 1]
if idx > 0
else "" 427 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 428 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 429 wordNextNextNext = words[idx + 3]
if idx + 3 < len(words)
else "" 430 wordNextNextNextNext = words[idx + 4]
if idx + 4 < len(words)
else "" 434 if word[:6] ==
"middag":
437 elif word[:11] ==
"midnat":
440 elif word ==
"morgenen" or (
441 wordPrev ==
"om" and word ==
"morgenen")
or word ==
"tidlig":
445 elif word[:11] ==
"eftermiddag":
449 elif word[:5] ==
"aften":
454 elif word ==
"time" and \
455 (wordPrev
in markers
or wordPrevPrev
in markers):
456 if wordPrev[:4] ==
"halv":
458 elif wordPrev ==
"kvarter":
460 elif wordPrev ==
"trekvarter":
464 if wordPrevPrev
in markers:
471 elif word[0].isdigit():
481 for i
in range(length):
483 if word[i].isdigit():
491 if word[i].isdigit():
497 remainder = word[i:].replace(
".",
"")
500 nextWord = wordNext.replace(
".",
"")
501 if nextWord ==
"am" or nextWord ==
"pm":
504 elif nextWord ==
"aften":
507 elif wordNext ==
"om" and wordNextNext ==
"morgenen":
510 elif wordNext ==
"om" and wordNextNext ==
"eftermiddagen":
513 elif wordNext ==
"om" and wordNextNext ==
"aftenen":
516 elif wordNext ==
"morgen":
519 elif wordNext ==
"eftermiddag":
522 elif wordNext ==
"aften":
525 elif wordNext ==
"i" and wordNextNext ==
"morgen":
528 elif wordNext ==
"i" and wordNextNext ==
"eftermiddag":
531 elif wordNext ==
"i" and wordNextNext ==
"aften":
534 elif wordNext ==
"natten":
541 if timeQualifier !=
"":
543 (timeQualifier ==
"aftenen" or 544 timeQualifier ==
"eftermiddagen"):
552 for i
in range(length):
553 if word[i].isdigit():
559 remainder = wordNext.replace(
".",
"").lstrip().rstrip()
564 remainder ==
"p.m." or 572 remainder ==
"a.m." or 578 if wordNext ==
"time" and int(word) < 100:
585 elif wordNext ==
"minut":
587 minOffset = int(word)
592 elif wordNext ==
"sekund":
594 secOffset = int(word)
600 elif wordNext ==
"time":
604 if wordNextNext == timeQualifier:
606 if wordNextNext[:11] ==
"eftermiddag":
609 elif wordNextNext ==
"om" and wordNextNextNext == \
613 elif wordNextNext[:5] ==
"aften":
616 elif wordNextNext ==
"om" and wordNextNextNext == \
620 elif wordNextNext[:6] ==
"morgen":
623 elif wordNextNext ==
"om" and wordNextNextNext == \
627 elif wordNextNext ==
"natten":
629 if 8 <= int(word) <= 12:
637 if wordNextNextNext == timeQualifier:
638 if wordNextNextNext[:11] ==
"eftermiddag":
641 elif wordNextNextNext ==
"om" and \
642 wordNextNextNextNext == \
646 elif wordNextNextNext[:6] ==
"natten":
649 elif wordNextNextNext ==
"am" and \
650 wordNextNextNextNext ==
"natten":
653 elif wordNextNextNext[:7] ==
"morgenen":
656 elif wordNextNextNext ==
"om" and \
657 wordNextNextNextNext ==
"morgenen":
660 elif wordNextNextNext ==
"natten":
662 if 8 <= int(word) <= 12:
667 elif wordNext == timeQualifier:
671 if wordNext[:10] ==
"eftermidag":
674 elif wordNext ==
"om" and \
675 wordNextNext ==
"eftermiddanen":
678 elif wordNext[:7] ==
"aftenen":
681 elif wordNext ==
"om" and wordNextNext ==
"aftenen":
684 elif wordNext[:7] ==
"morgenen":
687 elif wordNext ==
"ao" and wordNextNext ==
"morgenen":
690 elif wordNext ==
"natten":
692 if 8 <= int(word) <= 12:
702 strHH = int(strHH)
if strHH
else 0
703 strMM = int(strMM)
if strMM
else 0
704 strHH = strHH + 12
if remainder ==
"pm" and strHH < 12
else strHH
705 strHH = strHH - 12
if remainder ==
"am" and strHH >= 12
else strHH
706 if strHH > 24
or strMM > 59:
715 for i
in range(used):
718 if wordPrev ==
"tidlig":
722 elif wordPrev ==
"sen":
726 if idx > 0
and wordPrev
in markers:
728 if idx > 1
and wordPrevPrev
in markers:
738 if dayOffset
is False:
743 extractedDate = dateNow
744 extractedDate = extractedDate.replace(microsecond=0,
749 en_months = [
'january',
'february',
'march',
'april',
'may',
'june',
750 'july',
'august',
'september',
'october',
'november',
752 en_monthsShort = [
'jan',
'feb',
'mar',
'apr',
'may',
'june',
'july',
754 'sept',
'oct',
'nov',
'dec']
755 for idx, en_month
in enumerate(en_months):
756 datestr = datestr.replace(months[idx], en_month)
757 for idx, en_month
in enumerate(en_monthsShort):
758 datestr = datestr.replace(monthsShort[idx], en_month)
760 temp = datetime.strptime(datestr,
"%B %d")
762 temp = temp.replace(year=extractedDate.year)
763 if extractedDate < temp:
764 extractedDate = extractedDate.replace(year=int(currentYear),
768 day=int(temp.strftime(
771 extractedDate = extractedDate.replace(
772 year=int(currentYear) + 1,
773 month=int(temp.strftime(
"%m")),
774 day=int(temp.strftime(
"%d")))
776 extractedDate = extractedDate.replace(
777 year=int(temp.strftime(
"%Y")),
778 month=int(temp.strftime(
"%m")),
779 day=int(temp.strftime(
"%d")))
782 temp = datetime(timeStr)
783 extractedDate = extractedDate.replace(hour=temp.strftime(
"%H"),
784 minute=temp.strftime(
"%M"),
785 second=temp.strftime(
"%S"))
788 extractedDate = extractedDate + relativedelta(years=yearOffset)
790 extractedDate = extractedDate + relativedelta(months=monthOffset)
792 extractedDate = extractedDate + relativedelta(days=dayOffset)
794 if hrAbs
is None and minAbs
is None and default_time:
795 hrAbs = default_time.hour
796 minAbs = default_time.minute
798 if hrAbs != -1
and minAbs != -1:
800 extractedDate = extractedDate + relativedelta(hours=hrAbs
or 0,
802 if (hrAbs
or minAbs)
and datestr ==
"":
803 if not daySpecified
and dateNow > extractedDate:
804 extractedDate = extractedDate + relativedelta(days=1)
806 extractedDate = extractedDate + relativedelta(hours=hrOffset)
808 extractedDate = extractedDate + relativedelta(minutes=minOffset)
810 extractedDate = extractedDate + relativedelta(seconds=secOffset)
811 for idx, word
in enumerate(words):
812 if words[idx] ==
"og" and words[idx - 1] ==
"" \
813 and words[idx + 1] ==
"":
816 resultStr =
" ".join(words)
817 resultStr =
' '.join(resultStr.split())
819 return [extractedDate, resultStr]
824 This function takes the given text and checks if it is a fraction. 827 input_str (str): the string to check if fractional 829 (bool) or (float): False if not a fraction, otherwise the fraction 832 if input_str.lower().startswith(
"halv"):
835 if input_str.lower() ==
"trediedel":
837 elif input_str.endswith(
'del'):
838 input_str = input_str[:len(input_str) - 3]
839 if input_str.lower()
in da_numbers:
840 return 1.0 / (da_numbers[input_str.lower()])
847 This function takes the given text and checks if it is an ordinal number. 850 input_str (str): the string to check if ordinal 852 (bool) or (float): False if not an ordinal, otherwise the number 853 corresponding to the ordinal 855 ordinals for 1, 3, 7 and 8 are irregular 857 only works for ordinals corresponding to the numbers in da_numbers 861 lowerstr = input_str.lower()
863 if lowerstr.startswith(
"første"):
865 if lowerstr.startswith(
"anden"):
867 if lowerstr.startswith(
"tredie"):
869 if lowerstr.startswith(
"fjerde"):
871 if lowerstr.startswith(
"femte"):
873 if lowerstr.startswith(
"sjette"):
875 if lowerstr.startswith(
"elfte"):
877 if lowerstr.startswith(
"tolvfte"):
880 if lowerstr[-3:] ==
"nde":
882 lowerstr = lowerstr[:-3]
883 if lowerstr
in da_numbers:
884 return da_numbers[lowerstr]
886 if lowerstr[-4:]
in [
"ende"]:
887 lowerstr = lowerstr[:-4]
888 if lowerstr
in da_numbers:
889 return da_numbers[lowerstr]
891 if lowerstr[-2:] ==
"te":
892 lowerstr = lowerstr[:-2]
893 if lowerstr
in da_numbers:
894 return da_numbers[lowerstr]
900 """ German string normalization """ 905 if remove_articles
and word
in [
"den",
"det"]:
910 if word
in da_numbers:
911 word = str(da_numbers[word])
913 normalized +=
" " + word
915 return normalized[1:]
920 Takes in a string and extracts a list of numbers. 923 text (str): the string to extract a number from 924 short_scale (bool): Use "short scale" or "long scale" for large 925 numbers -- over a million. The default is short scale, which 926 is now common in most English speaking countries. 927 See https://en.wikipedia.org/wiki/Names_of_large_numbers 928 ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 930 list: list of extracted numbers as floats 933 short_scale=short_scale, ordinals=ordinals)
def extractnumber_da(text)
def isOrdinal_da(input_str)
def extract_datetime_da(string, currentDate, default_time)
def extract_numbers_generic(text, pronounce_handler, extract_handler, short_scale=True, ordinals=False)
def is_numeric(input_str)
def look_for_fractions(split_list)
def isFractional_da(input_str)
def normalize_da(text, remove_articles)
def extract_numbers_da(text, short_scale=True, ordinals=False)