17 from datetime
import datetime
18 from dateutil.relativedelta
import relativedelta
24 This function prepares the given text for parsing by making 25 numbers consistent, getting rid of contractions, etc. 27 text (str): the string to normalize 29 (int) or (float): The value of extracted number 36 while count < len(aWords):
40 elif word ==
"första":
44 elif word ==
"tredje":
46 elif word ==
"fjärde":
50 elif word ==
"sjätte":
78 if count < (len(aWords) - 1):
79 wordNext = aWords[count + 1]
86 aWords[count + 1] =
"" 90 aPieces = word.split(
'/')
92 val = float(aPieces[0]) / float(aPieces[1])
104 aWords[count - 1] =
'' 106 elif count + 1 < len(aWords)
and aWords[count + 1] ==
'och':
112 elif count + 2 < len(aWords)
and aWords[count + 2] ==
'och':
130 cleans the input string of unneeded punctuation and capitalization 133 s = s.lower().replace(
'?',
'').replace(
'.',
'').replace(
',',
'') \
134 .replace(
' den ',
' ').replace(
' en ',
' ')
136 for idx, word
in enumerate(wordList):
137 word = word.replace(
"'s",
"")
139 ordinals = [
"rd",
"st",
"nd",
"th"]
140 if word[0].isdigit():
141 for ordinal
in ordinals:
143 word = word.replace(ordinal,
"")
151 datestr !=
"" or timeStr !=
"" or 152 yearOffset != 0
or monthOffset != 0
or 153 dayOffset
is True or hrOffset != 0
or 154 hrAbs
or minOffset != 0
or 155 minAbs
or secOffset != 0
158 if string ==
"" or not currentDate:
166 dateNow = currentDate
167 today = dateNow.strftime(
"%w")
168 currentYear = dateNow.strftime(
"%Y")
174 timeQualifiersList = [
'morgon',
'förmiddag',
'eftermiddag',
'kväll']
175 markers = [
'på',
'i',
'den här',
'kring',
'efter']
176 days = [
'måndag',
'tisdag',
'onsdag',
'torsdag',
177 'fredag',
'lördag',
'söndag']
178 months = [
'januari',
'februari',
'mars',
'april',
'maj',
'juni',
179 'juli',
'augusti',
'september',
'oktober',
'november',
181 monthsShort = [
'jan',
'feb',
'mar',
'apr',
'may',
'june',
'july',
'aug',
182 'sept',
'oct',
'nov',
'dec']
184 words = clean_string(string)
186 for idx, word
in enumerate(words):
189 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 190 wordPrev = words[idx - 1]
if idx > 0
else "" 191 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 192 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 195 word = word.rstrip(
's')
199 if word
in timeQualifiersList:
202 elif word ==
"idag" and not fromFlag:
205 elif word ==
"imorgon" and not fromFlag:
208 elif word ==
"morgondagen" or word ==
"morgondagens" and not fromFlag:
211 elif word ==
"övermorgon" and not fromFlag:
215 elif word ==
"dag" or word ==
"dagar":
216 if wordPrev[0].isdigit():
217 dayOffset += int(wordPrev)
220 elif word ==
"vecka" or word ==
"veckor" and not fromFlag:
221 if wordPrev[0].isdigit():
222 dayOffset += int(wordPrev) * 7
225 elif wordPrev ==
"nästa":
229 elif wordPrev ==
"förra":
234 elif word ==
"månad" and not fromFlag:
235 if wordPrev[0].isdigit():
236 monthOffset = int(wordPrev)
239 elif wordPrev ==
"nästa":
243 elif wordPrev ==
"förra":
248 elif word ==
"år" and not fromFlag:
249 if wordPrev[0].isdigit():
250 yearOffset = int(wordPrev)
253 elif wordPrev ==
"nästa":
257 elif wordPrev ==
"förra":
263 elif word
in days
and not fromFlag:
265 dayOffset = (d + 1) - int(today)
269 if wordPrev ==
"nästa":
273 elif wordPrev ==
"förra":
278 elif word
in months
or word
in monthsShort
and not fromFlag:
280 m = months.index(word)
282 m = monthsShort.index(word)
285 if wordPrev
and (wordPrev[0].isdigit()
or 286 (wordPrev ==
"of" and wordPrevPrev[0].isdigit())):
287 if wordPrev ==
"of" and wordPrevPrev[0].isdigit():
288 datestr +=
" " + words[idx - 2]
292 datestr +=
" " + wordPrev
295 if wordNext
and wordNext[0].isdigit():
296 datestr +=
" " + wordNext
302 elif wordNext
and wordNext[0].isdigit():
303 datestr +=
" " + wordNext
305 if wordNextNext
and wordNextNext[0].isdigit():
306 datestr +=
" " + wordNextNext
313 validFollowups = days + months + monthsShort
314 validFollowups.append(
"idag")
315 validFollowups.append(
"imorgon")
316 validFollowups.append(
"nästa")
317 validFollowups.append(
"förra")
318 validFollowups.append(
"nu")
319 if (word ==
"från" or word ==
"efter")
and wordNext
in validFollowups:
322 if wordNext ==
"imorgon":
324 elif wordNext
in days:
325 d = days.index(wordNext)
326 tmpOffset = (d + 1) - int(today)
330 dayOffset += tmpOffset
331 elif wordNextNext
and wordNextNext
in days:
332 d = days.index(wordNextNext)
333 tmpOffset = (d + 1) - int(today)
335 if wordNext ==
"nästa":
339 elif wordNext ==
"förra":
343 dayOffset += tmpOffset
345 if start - 1 > 0
and words[start - 1] ==
"denna":
349 for i
in range(0, used):
350 words[i + start] =
"" 352 if start - 1 >= 0
and words[start - 1]
in markers:
353 words[start - 1] =
"" 365 for idx, word
in enumerate(words):
369 wordPrevPrev = words[idx - 2]
if idx > 1
else "" 370 wordPrev = words[idx - 1]
if idx > 0
else "" 371 wordNext = words[idx + 1]
if idx + 1 < len(words)
else "" 372 wordNextNext = words[idx + 2]
if idx + 2 < len(words)
else "" 378 elif word ==
"midnatt":
381 elif word ==
"morgon":
385 elif word ==
"förmiddag":
389 elif word ==
"eftermiddag":
393 elif word ==
"kväll":
398 elif wordPrev
in markers
or wordPrevPrev
in markers:
399 if word ==
"halvtimme" or word ==
"halvtimma":
401 elif word ==
"kvart":
403 elif word ==
"timme" or word ==
"timma":
410 elif word[0].isdigit():
420 for i
in range(length):
422 if word[i].isdigit():
430 if word[i].isdigit():
436 remainder = word[i:].replace(
".",
"")
439 nextWord = wordNext.replace(
".",
"")
440 if nextWord ==
"am" or nextWord ==
"pm":
443 elif nextWord ==
"tonight":
446 elif wordNext ==
"in" and wordNextNext ==
"the" and \
447 words[idx + 3] ==
"morning":
450 elif wordNext ==
"in" and wordNextNext ==
"the" and \
451 words[idx + 3] ==
"afternoon":
454 elif wordNext ==
"in" and wordNextNext ==
"the" and \
455 words[idx + 3] ==
"evening":
458 elif wordNext ==
"in" and wordNextNext ==
"morning":
461 elif wordNext ==
"in" and wordNextNext ==
"afternoon":
464 elif wordNext ==
"in" and wordNextNext ==
"evening":
467 elif wordNext ==
"this" and wordNextNext ==
"morning":
470 elif wordNext ==
"this" and wordNextNext ==
"afternoon":
473 elif wordNext ==
"this" and wordNextNext ==
"evening":
476 elif wordNext ==
"at" and wordNextNext ==
"night":
483 if timeQualifier !=
"":
485 (timeQualifier ==
"evening" or 486 timeQualifier ==
"afternoon"):
494 for i
in range(length):
495 if word[i].isdigit():
501 remainder = wordNext.replace(
".",
"").lstrip().rstrip()
506 remainder ==
"p.m." or 514 remainder ==
"a.m." or 520 if wordNext ==
"pm" or wordNext ==
"p.m.":
524 elif wordNext ==
"am" or wordNext ==
"a.m.":
535 strHH = int(word) / 100
536 strMM = int(word) - strHH * 100
537 if wordNext ==
"hours":
540 wordNext ==
"hours" and 553 elif wordNext ==
"minutes":
555 minOffset = int(word)
560 elif wordNext ==
"seconds":
562 secOffset = int(word)
567 elif int(word) > 100:
568 strHH = int(word) / 100
569 strMM = int(word) - strHH * 100
570 if wordNext ==
"hours":
572 elif wordNext[0].isdigit():
576 if wordNextNext ==
"hours":
579 wordNext ==
"" or wordNext ==
"o'clock" or 583 wordNextNext ==
"the" or 584 wordNextNext == timeQualifier
589 if wordNext ==
"o'clock":
591 if wordNext ==
"in" or wordNextNext ==
"in":
592 used += (1
if wordNext ==
"in" else 2)
594 wordNextNext
in timeQualifier
or 595 (words[words.index(wordNextNext) + 1]
and 596 words[words.index(wordNextNext) + 1]
in 598 if (wordNextNext ==
"afternoon" or 600 words.index(wordNextNext) + 1
and 602 wordNextNext) + 1] ==
"afternoon")):
604 if (wordNextNext ==
"evening" or 606 (words.index(wordNextNext) + 1)
and 608 wordNextNext) + 1] ==
"evening")):
610 if (wordNextNext ==
"morning" or 612 words.index(wordNextNext) + 1
and 614 wordNextNext) + 1] ==
"morning")):
619 strHH = int(strHH)
if strHH
else 0
620 strMM = int(strMM)
if strMM
else 0
621 strHH = strHH + 12
if remainder ==
"pm" and strHH < 12
else strHH
622 strHH = strHH - 12
if remainder ==
"am" and strHH >= 12
else strHH
623 if strHH > 24
or strMM > 59:
632 for i
in range(used):
635 if wordPrev ==
"o" or wordPrev ==
"oh":
636 words[words.index(wordPrev)] =
"" 638 if wordPrev ==
"early":
642 elif wordPrev ==
"late":
646 if idx > 0
and wordPrev
in markers:
648 if idx > 1
and wordPrevPrev
in markers:
658 if dayOffset
is False:
663 extractedDate = dateNow
664 extractedDate = extractedDate.replace(microsecond=0,
669 temp = datetime.strptime(datestr,
"%B %d")
671 temp = temp.replace(year=extractedDate.year)
672 if extractedDate < temp:
673 extractedDate = extractedDate.replace(year=int(currentYear),
677 day=int(temp.strftime(
680 extractedDate = extractedDate.replace(
681 year=int(currentYear) + 1,
682 month=int(temp.strftime(
"%m")),
683 day=int(temp.strftime(
"%d")))
685 extractedDate = extractedDate.replace(
686 year=int(temp.strftime(
"%Y")),
687 month=int(temp.strftime(
"%m")),
688 day=int(temp.strftime(
"%d")))
691 temp = datetime(timeStr)
692 extractedDate = extractedDate.replace(hour=temp.strftime(
"%H"),
693 minute=temp.strftime(
"%M"),
694 second=temp.strftime(
"%S"))
697 extractedDate = extractedDate + relativedelta(years=yearOffset)
699 extractedDate = extractedDate + relativedelta(months=monthOffset)
701 extractedDate = extractedDate + relativedelta(days=dayOffset)
703 if hrAbs
is None and minAbs
is None and default_time:
704 hrAbs = default_time.hour
705 minAbs = default_time.minute
706 if hrAbs != -1
and minAbs != -1:
707 extractedDate = extractedDate + relativedelta(hours=hrAbs
or 0,
709 if (hrAbs
or minAbs)
and datestr ==
"":
710 if not daySpecified
and dateNow > extractedDate:
711 extractedDate = extractedDate + relativedelta(days=1)
713 extractedDate = extractedDate + relativedelta(hours=hrOffset)
715 extractedDate = extractedDate + relativedelta(minutes=minOffset)
717 extractedDate = extractedDate + relativedelta(seconds=secOffset)
718 for idx, word
in enumerate(words):
719 if words[idx] ==
"and" and words[idx - 1] ==
"" and words[
723 resultStr =
" ".join(words)
724 resultStr =
' '.join(resultStr.split())
725 return [extractedDate, resultStr]
730 This function takes the given text and checks if it is a fraction. 733 input_str (str): the string to check if fractional 735 (bool) or (float): False if not a fraction, otherwise the fraction 738 if input_str.endswith(
'ars', -3):
739 input_str = input_str[:len(input_str) - 3]
740 if input_str.endswith(
'ar', -2):
741 input_str = input_str[:len(input_str) - 2]
742 if input_str.endswith(
'a', -1):
743 input_str = input_str[:len(input_str) - 1]
744 if input_str.endswith(
's', -1):
745 input_str = input_str[:len(input_str) - 1]
747 aFrac = [
"hel",
"halv",
"tredjedel",
"fjärdedel",
"femtedel",
"sjättedel",
748 "sjundedel",
"åttondel",
"niondel",
"tiondel",
"elftedel",
750 if input_str.lower()
in aFrac:
751 return 1.0 / (aFrac.index(input_str) + 1)
752 if input_str ==
"kvart":
754 if input_str ==
"trekvart":
761 """ English string normalization """ 769 textNumbers = [
"noll",
"ett",
"två",
"tre",
"fyra",
"fem",
"sex",
770 "sju",
"åtta",
"nio",
"tio",
"elva",
"tolv",
771 "tretton",
"fjorton",
"femton",
"sexton",
772 "sjutton",
"arton",
"nitton",
"tjugo"]
773 if word
in textNumbers:
774 word = str(textNumbers.index(word))
776 normalized +=
" " + word
778 return normalized[1:]
def extractnumber_sv(text)
def normalize_sv(text, remove_articles)
def is_numeric(input_str)
def extract_datetime_sv(string, currentDate, default_time)
def look_for_fractions(split_list)
def is_fractional_sv(input_str)