parse_da.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 from datetime import datetime
18 from dateutil.relativedelta import relativedelta
19 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
20  extract_numbers_generic
21 from mycroft.util.lang.format_da import pronounce_number_da
22 
# Danish number words mapped to the integer they denote.
# extractnumber_da, isFractional_da, isOrdinal_da and normalize_da all look
# words up in this table, so a misspelled key makes that number unparseable.
da_numbers = {
    'nul': 0,
    'en': 1,
    'et': 1,
    'to': 2,
    'tre': 3,
    'fire': 4,
    'fem': 5,
    'seks': 6,
    'syv': 7,
    'otte': 8,
    'ni': 9,
    'ti': 10,
    'elve': 11,
    'tolv': 12,
    'tretten': 13,
    'fjorten': 14,
    'femten': 15,
    'seksten': 16,
    'sytten': 17,
    'atten': 18,
    'nitten': 19,
    'tyve': 20,
    'enogtyve': 21,
    'toogtyve': 22,
    'treogtyve': 23,
    'fireogtyve': 24,
    'femogtyve': 25,
    'seksogtyve': 26,
    'syvogtyve': 27,
    'otteogtyve': 28,
    'niogtyve': 29,
    'tredive': 30,
    'enogtredive': 31,
    'fyrre': 40,
    'halvtreds': 50,
    'halvtres': 50,  # spelling variant kept so existing lookups still work
    'tres': 60,
    'halvfjerds': 70,
    'halvfjers': 70,  # spelling variant kept so existing lookups still work
    'firs': 80,
    'halvfems': 90,
    'hundrede': 100,
    'tohundrede': 200,
    'trehundrede': 300,
    'firehundrede': 400,
    'femhundrede': 500,
    'sekshundrede': 600,
    'syvhundrede': 700,
    'ottehundrede': 800,
    'nihundrede': 900,
    'tusinde': 1000,
    'million': 1000000,
}
75 
76 
def extractnumber_da(text):
    """
    Extract the first number mentioned in a Danish text.

    Recognised are digit strings (including simple decimals such as
    "2.5"), the number words in da_numbers, fraction words accepted by
    isFractional_da, ordinals accepted by isOrdinal_da and written
    fractions such as "2/3".  A number word directly followed by a fraction
    word is multiplied with it, and two values joined by "og" are added.

    The articles "den" and "det" are dropped before parsing.  "en"/"et"
    cannot be dropped because they also mean "one".

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float): The value of the extracted number, or False when
            no number was found.  A value of zero is falsy and is therefore
            also reported as False.
    """
    aWords = text.split()
    aWords = [word for word in aWords if
              word not in ["den", "det"]]
    and_pass = False
    valPreAnd = False
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            # plain digits and digits with one decimal point ("2.5");
            # the previous isdigit() test silently dropped decimals
            if word.replace('.', '', 1).isdigit():
                val = float(word)
        elif isFractional_da(word):
            val = isFractional_da(word)
        elif isOrdinal_da(word):
            val = isOrdinal_da(word)
        else:
            if word in da_numbers:
                val = da_numbers[word]
                if count < (len(aWords) - 1):
                    wordNext = aWords[count + 1]
                else:
                    wordNext = ""
                valNext = isFractional_da(wordNext)

                if valNext:
                    # number word followed by a fraction word: multiply
                    val = val * valNext
                    aWords[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
            elif and_pass:
                # nothing numeric after "og": keep the value found before it
                val = valPreAnd
                break
            else:
                count += 1
                continue

        aWords[count] = ""

        if and_pass:
            aWords[count - 1] = ''  # remove "og"
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'og':
            # "<value> og <value>": remember this value, parse the next one
            and_pass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'og':
            # same, with one word between the value and "og"
            and_pass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    if not val:
        return False

    return val
159 
160 
def extract_datetime_da(string, currentDate, default_time):
    """
    Extract a date and/or time from a Danish sentence.

    The sentence is scanned twice: first for date expressions (weekdays,
    month names, "dag", "morgen", "næste uge", "5 dage" ...), then for time
    expressions (clock times, parts of the day, offsets such as "3 timer").
    Every word that was understood is blanked out so that the rest of the
    sentence can be handed back to the caller.

    Args:
        string (str): the sentence to parse
        currentDate (datetime): the moment relative expressions refer to
        default_time: object with ``hour`` and ``minute`` attributes, used
            when no time of day was found; may be None
    Returns:
        list or None: ``[extracted datetime, remaining sentence]``, or None
            if ``string`` is empty, ``currentDate`` is falsy or nothing
            date-like was found
    Raises:
        ValueError: if the recognised day/month/year do not form a valid
            calendar date
    """

    def clean_string(s):
        """
        Lower-case the sentence, strip punctuation and filler words and
        replace ordinal words by their digits.  Returns the word list.
        """
        # the replacement sequence is kept exactly as before: repeating
        # ' om ' also removes consecutive occurrences
        s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
            .replace(' den ', ' ').replace(' det ', ' ') \
            .replace(' om ', ' ').replace(' om ', ' ') \
            .replace(' på ', ' ').replace(' om ', ' ')
        wordList = s.split()

        for idx, word in enumerate(wordList):
            ordinal = isOrdinal_da(word)
            if ordinal is not False:
                wordList[idx] = str(ordinal)

        return wordList

    def date_found():
        return found or \
            (
                datestr != "" or timeStr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is True or hrOffset != 0 or
                hrAbs or minOffset != 0 or
                minAbs or secOffset != 0
            )

    def meridiem_for(dayPart, hour):
        """Map a part-of-day word following a clock time to "am"/"pm"."""
        if dayPart[:11] == "eftermiddag" or dayPart[:5] == "aften":
            return "pm"
        if dayPart[:6] == "morgen":
            return "am"
        if dayPart == "natten":
            # 8-12 at night is the late evening, anything else the small
            # hours
            return "pm" if 8 <= hour <= 12 else "am"
        return ""

    if string == "" or not currentDate:
        return None

    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['tidlig',
                          'morgen',
                          'morgenen',
                          'formiddag',
                          'formiddagen',
                          'eftermiddag',
                          'eftermiddagen',
                          'aften',
                          'aftenen',
                          'nat',
                          'natten']
    markers = ['i', 'om', 'på', 'klokken', 'ved']
    days = ['mandag', 'tirsdag', 'onsdag',
            'torsdag', 'fredag', 'lørdag', 'søndag']
    months = ['januar', 'februar', 'marts', 'april', 'maj', 'juni',
              'juli', 'august', 'september', 'oktober', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'maj', 'juni', 'juli', 'aug',
                   'sep', 'okt', 'nov', 'dec']
    nextWord = "næste"
    # "forige" is the spelling this parser used so far; keep accepting it
    lastWords = ("forrige", "forige")

    validFollowups = days + months + monthsShort
    validFollowups.append("i dag")
    validFollowups.append("morgen")
    validFollowups.append(nextWord)
    validFollowups.extend(lastWords)
    validFollowups.append("nu")

    words = clean_string(string)

    # ---- first pass: dates -------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # isdecimal() is False for "" and only True for strings that
        # int() accepts, so the conversions below cannot raise
        prevIsNumber = wordPrev.isdecimal()

        start = idx
        used = 0
        # "morgen" means tomorrow unless it follows "om" or a weekday
        isTomorrow = word == "morgen" and not fromFlag and \
            wordPrev != "om" and wordPrev not in days

        # save timequalifier for later ("morgen" is also in that list, so
        # the tomorrow case has to be excluded here to stay reachable)
        if word in timeQualifiersList and not isTomorrow:
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "dag" and not fromFlag:
            dayOffset = 0
            used += 1
        elif isTomorrow:
            dayOffset = 1
            used += 1
        elif word == "overmorgen" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "dag" or word == "dage":
            if prevIsNumber:
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word in ("uge", "uger") and not fromFlag:
            if prevIsNumber:
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == nextWord:
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev in lastWords:
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word in ("måned", "måneder") and not fromFlag:
            if prevIsNumber:
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == nextWord:
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev in lastWords:
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "år" and not fromFlag:
            if prevIsNumber:
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == nextWord:
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev in lastWords:
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            # strftime("%w") counts from Sunday = 0, the list from Monday
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext == "morgen":
                # morgen means morning if preceded by
                # the day of the week
                words[idx + 1] = "tidlig"
            if wordPrev == nextWord:
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev in lastWords:
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif (word in months or word in monthsShort) and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            # datestr is always "<full month name> [day [year]]"
            datestr = months[m]
            if prevIsNumber or \
                    (wordPrev == "of" and wordPrevPrev.isdecimal()):
                if wordPrev == "of":
                    datestr += " " + words[idx - 2]
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext.isdecimal():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext.isdecimal():
                datestr += " " + wordNext
                used += 1
                if wordNextNext.isdecimal():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False
        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July

        if (
                word == "fra" or word == "til" or word == "om") and wordNext \
                in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "morgen" and \
                    wordPrev != "om" and \
                    wordPrev not in days:
                # morgen means tomorrow if not "om morgen" and not
                # [day of the week] morgen
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNext == nextWord:
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext in lastWords:
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 >= 0 and words[start - 1].startswith("denne"):
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: time of day ------------------------------------
    timeStr = ""  # never filled in; only read by date_found()
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    # None: no time found, -1: only a relative offset was found
    hrAbs = None
    minAbs = None

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""

        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word[:6] == "middag":
            hrAbs = 12
            used += 1
        elif word[:6] == "midnat":
            hrAbs = 0
            used += 1
        elif word == "morgenen" or word == "tidlig":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word[:11] == "eftermiddag":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word[:5] == "aften":
            if not hrAbs:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "time" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev[:4] == "halv":
                minOffset = 30
            elif wordPrev == "kvarter":
                minOffset = 15
            elif wordPrev == "trekvarter":
                minOffset = 45
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # "HH:MM" optionally followed by a glued suffix ("3:00pm")
                inMinutes = False
                for i, char in enumerate(word):
                    if char.isdigit():
                        if inMinutes:
                            strMM += char
                        else:
                            strHH += char
                    elif char == ":" and not inMinutes:
                        inMinutes = True
                    else:
                        # everything from the first other character on
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord_ = wordNext.replace(".", "")
                    if nextWord_ == "am" or nextWord_ == "pm":
                        remainder = nextWord_
                        used += 1
                    elif nextWord_ == "aften":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "om" and wordNextNext == "morgenen":
                        remainder = "am"
                        used += 2
                    elif wordNext == "om" and \
                            wordNextNext == "eftermiddagen":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "om" and wordNextNext == "aftenen":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "morgen":
                        remainder = "am"
                        used += 1
                    elif wordNext == "eftermiddag":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "i" and wordNextNext == "morgen":
                        remainder = "am"
                        used = 2
                    elif wordNext == "i" and wordNextNext == "eftermiddag":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "i" and wordNextNext == "aften":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "natten":
                        # strHH is text here and has at least one digit
                        if int(strHH) > 4:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 1
                    elif timeQualifier == "aftenen" or \
                            timeQualifier == "eftermiddagen":
                        # a qualifier seen earlier in the sentence; the
                        # 12 hour shift itself is applied below
                        remainder = "pm"
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                remainder = ""
                for char in word:
                    if char.isdigit():
                        strNum += char
                    else:
                        remainder += char
                # word starts with a digit, so strNum is never empty
                number = int(strNum)

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                else:
                    if wordNext in ("time", "timer") and number < 100:
                        # "in 3 hours"
                        hrOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext in ("minut", "minutter"):
                        # "in 10 minutes"
                        minOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext in ("sekund", "sekunder"):
                        # in 5 seconds
                        secOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1

                    elif wordNext == "time":
                        strHH = strNum
                        used += 1
                        isTime = True
                        if wordNextNext == timeQualifier:
                            strMM = ""
                            dayPart = meridiem_for(wordNextNext, number)
                            if dayPart:
                                used += 1
                                remainder = dayPart
                        elif wordNextNext.isdecimal():
                            strMM = wordNextNext
                            used += 1
                            if wordNextNextNext == timeQualifier:
                                dayPart = meridiem_for(wordNextNextNext,
                                                       number)
                                if dayPart:
                                    used += 1
                                    remainder = dayPart

                    elif wordNext == timeQualifier:
                        # number directly followed by the part of the day,
                        # or a trailing number when no qualifier was seen
                        strHH = strNum
                        isTime = True
                        dayPart = meridiem_for(wordNext, number)
                        if dayPart:
                            used += 1
                            remainder = dayPart

            if strHH == "":
                # nothing marked this number as a clock time; leave it in
                # the sentence instead of reading it as 00:00
                isTime = False
            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1
        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                if idx + i < len(words):
                    words[idx + i] = ""

            if wordPrev == "tidlig":
                hrOffset = -1
                words[idx - 1] = ""
                idx -= 1
            elif wordPrev == "sen":
                hrOffset = 1
                words[idx - 1] = ""
                idx -= 1
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        # datestr is "<month> [day [year]]"; build the date directly
        # instead of translating the month name and calling strptime
        dateParts = datestr.split()
        monthNum = months.index(dateParts[0]) + 1
        # a month without a day refers to its first day
        dayNum = int(dateParts[1]) if len(dateParts) > 1 else 1
        if hasYear and len(dateParts) > 2:
            extractedDate = extractedDate.replace(year=int(dateParts[2]),
                                                  month=monthNum,
                                                  day=dayNum)
        else:
            temp = extractedDate.replace(month=monthNum, day=dayNum)
            if extractedDate < temp:
                extractedDate = temp
            else:
                # that day is today or already over: use next year
                extractedDate = temp.replace(year=int(currentYear) + 1)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute

    if hrAbs != -1 and minAbs != -1:

        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare time that has already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "og" left between two parsed expressions; only inner
    # positions have two neighbours to look at
    for idx in range(1, len(words) - 1):
        if words[idx] == "og" and words[idx - 1] == "" \
                and words[idx + 1] == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())

    return [extractedDate, resultStr]
820 
821 
def isFractional_da(input_str):
    """
    This function takes the given text and checks if it is a fraction.

    Recognised are words starting with "halv" (0.5), "tredjedel" /
    "trediedel" (1/3) and words ending in "del" whose stem is a number word
    from da_numbers or an ordinal accepted by isOrdinal_da.

    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    lowerstr = input_str.lower()

    # the tens 50, 70 and 90 start with "halv" but are whole numbers
    if lowerstr.startswith(("halvtreds", "halvtres", "halvfjerds",
                            "halvfjers", "halvfems")):
        return False
    if lowerstr.startswith("halv"):
        return 0.5

    if lowerstr in ("trediedel", "tredjedel"):
        return 1.0 / 3
    if lowerstr.endswith('del'):
        stem = lowerstr[:-3]
        # number word + "del" is what this parser accepted so far
        denominator = da_numbers.get(stem, False)
        if not denominator:
            # ordinal + "del", e.g. "fjerde" + "del"
            denominator = isOrdinal_da(stem)
        if denominator:  # also guards against dividing by zero
            return 1.0 / denominator

    return False
843 
844 
def isOrdinal_da(input_str):
    """
    This function takes the given text and checks if it is an ordinal number.

    Irregular ordinals are matched by prefix.  Regular ordinals are a
    number word from da_numbers followed by "-nde", "-ende" or "-te"; the
    teens are the number word followed by "-de" (e.g. "femten" + "de").

    Args:
        input_str (str): the string to check if ordinal
    Returns:
        (bool) or (float): False if not an ordinal, otherwise the number
        corresponding to the ordinal

    only works for ordinals corresponding to the numbers in da_numbers

    """

    lowerstr = input_str.lower()

    # prefix -> value; "tredie", "elfte" and "tolvfte" are the spellings
    # this parser accepted so far and are kept next to the Danish ones
    irregular = (
        ("første", 1),
        ("anden", 2),
        ("tredje", 3),
        ("tredie", 3),
        ("fjerde", 4),
        ("femte", 5),
        ("sjette", 6),
        ("ellevte", 11),
        ("elvte", 11),
        ("elfte", 11),
        ("tolvte", 12),
        ("tolvfte", 12),
    )

    isTeen = lowerstr.endswith("tende")
    if not isTeen:
        # a teen such as "femtende" starts with "femte" and must not be
        # caught by the prefix table
        for prefix, value in irregular:
            if lowerstr.startswith(prefix):
                return value

    if isTeen and lowerstr[:-2] in da_numbers:
        return da_numbers[lowerstr[:-2]]

    # every suffix is tried against the unmodified word
    for suffix in ("nde", "ende", "te"):
        if lowerstr.endswith(suffix):
            stem = lowerstr[:-len(suffix)]
            if stem in da_numbers:
                return da_numbers[stem]

    return False
897 
898 
def normalize_da(text, remove_articles):
    """
    Danish string normalization.

    Collapses runs of whitespace, optionally drops the articles "den" and
    "det", and replaces every word found in da_numbers by its digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether "den" and "det" are removed
    Returns:
        (str): the normalized string
    """
    articles = ["den", "det"]
    kept = []
    # split() without arguments also swallows repeated and outer spaces
    for token in text.split():
        if remove_articles and token in articles:
            continue
        if token in da_numbers:
            # number word -> digits
            token = str(da_numbers[token])
        kept.append(token)

    return " ".join(kept)
916 
917 
def extract_numbers_da(text, short_scale=True, ordinals=False):
    """
    Extract all numbers mentioned in a Danish string.

    The work is delegated to extract_numbers_generic, which is handed the
    Danish pronunciation and extraction helpers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use "short scale" (True, the default) or
            "long scale" (False) for large numbers -- over a million.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead
            of 1/3
    Returns:
        list: the extracted numbers, as returned by
            extract_numbers_generic
    """
    options = {"short_scale": short_scale, "ordinals": ordinals}
    return extract_numbers_generic(text, pronounce_number_da,
                                   extractnumber_da, **options)
def extractnumber_da(text)
Definition: parse_da.py:77
def isOrdinal_da(input_str)
Definition: parse_da.py:845
def extract_datetime_da(string, currentDate, default_time)
Definition: parse_da.py:161
def extract_numbers_generic(text, pronounce_handler, extract_handler, short_scale=True, ordinals=False)
Definition: parse_common.py:55
def look_for_fractions(split_list)
Definition: parse_common.py:36
def isFractional_da(input_str)
Definition: parse_da.py:822
def normalize_da(text, remove_articles)
Definition: parse_da.py:899
def extract_numbers_da(text, short_scale=True, ordinals=False)
Definition: parse_da.py:918


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40