parse_de.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 from datetime import datetime
18 from dateutil.relativedelta import relativedelta
19 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
20  extract_numbers_generic
21 from mycroft.util.lang.format_de import pronounce_number_de
22 
# Spelled-out German numbers understood by the parsers in this module,
# mapped to their integer value.
de_numbers = {
    # zero, and every inflected form of "one" (also the indefinite article)
    'null': 0,
    'ein': 1, 'eins': 1, 'eine': 1, 'einer': 1,
    'einem': 1, 'einen': 1, 'eines': 1,

    # 2 - 12
    'zwei': 2, 'drei': 3, 'vier': 4, u'fünf': 5, 'sechs': 6,
    'sieben': 7, 'acht': 8, 'neun': 9, 'zehn': 10, 'elf': 11,
    u'zwölf': 12,

    # 13 - 19
    'dreizehn': 13, 'vierzehn': 14, u'fünfzehn': 15, 'sechzehn': 16,
    'siebzehn': 17, 'achtzehn': 18, 'neunzehn': 19,

    # 20 - 31 are listed one by one (enough for days of the month)
    'zwanzig': 20,
    'einundzwanzig': 21, 'zweiundzwanzig': 22, 'dreiundzwanzig': 23,
    'vierundzwanzig': 24, u'fünfundzwanzig': 25, 'sechsundzwanzig': 26,
    'siebenundzwanzig': 27, 'achtundzwanzig': 28, 'neunundzwanzig': 29,
    u'dreißig': 30, u'einunddreißig': 31,

    # remaining tens
    'vierzig': 40, u'fünfzig': 50, 'sechzig': 60, 'siebzig': 70,
    'achtzig': 80, 'neunzig': 90,

    # hundreds
    'hundert': 100, 'zweihundert': 200, 'dreihundert': 300,
    'vierhundert': 400, u'fünfhundert': 500, 'sechshundert': 600,
    'siebenhundert': 700, 'achthundert': 800, 'neunhundert': 900,

    # large units
    'tausend': 1000,
    'million': 1000000,
}
80 
81 
def extractnumber_de(text):
    """
    Extract the first number found in a German text.

    Understands digits ("3", "2.5"), number words from ``de_numbers``,
    fraction words ("viertel"), ordinals ("dritte"), "2/3" style fractions,
    a number word followed by a fraction word ("drei viertel") and two
    numbers joined by "und", which are added together.

    Args:
        text (str): the string to search for a number
    Returns:
        (int) or (float): the value of the extracted number, or False if no
            number was found. A value of zero is falsy and is therefore
            also reported as False.

    Indefinite articles cannot be suppressed in German:
    'ein Pferd' means 'one horse' and 'a horse'
    """
    articles = ["der", "die", "das", "des", "den", "dem"]
    aWords = [word for word in text.split() if word not in articles]
    and_pass = False    # True once "<number> und" has been seen
    valPreAnd = False   # the number that preceded "und"
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            # word.isdigit() would not work with decimals
            val = float(word)
        elif isFractional_de(word):
            val = isFractional_de(word)
        elif isOrdinal_de(word):
            val = isOrdinal_de(word)
        else:
            if word in de_numbers:
                val = de_numbers[word]
                if count < (len(aWords) - 1):
                    wordNext = aWords[count + 1]
                else:
                    wordNext = ""
                valNext = isFractional_de(wordNext)

                if valNext:
                    # "drei viertel" -> 3 * 0.25
                    val = val * valNext
                    aWords[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
            elif and_pass:
                # nothing to add after "und": keep the first number
                val = valPreAnd
                break
            else:
                count += 1
                continue

        aWords[count] = ""

        if and_pass:
            aWords[count - 1] = ''  # remove "und"
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'und':
            and_pass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'und':
            # the word in between was blanked above (consumed fraction)
            and_pass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    if and_pass and not val:
        # the text ended right after "und": the number in front of it
        # must not be lost
        val = valPreAnd

    if not val:
        return False

    return val
164 
165 
def extract_datetime_de(string, currentDate, default_time):
    """
    Extract a date and/or time from a German sentence.

    The sentence is scanned twice. The first pass consumes date expressions
    (today/tomorrow, "5 tage", weekdays, months, "... von heute"), the
    second pass consumes times of day and hour/minute/second offsets.
    Every word that contributed to the result is removed from the sentence.

    Args:
        string (str): the sentence to parse
        currentDate (datetime): reference date used to resolve relative
            expressions
        default_time: object with ``hour`` and ``minute`` attributes that
            supplies the time of day when the sentence names none; may be
            falsy
    Returns:
        list: ``[datetime, str]`` -- the extracted date and the words of
            the sentence that were not consumed, joined by single spaces.
            None if ``string`` is empty, ``currentDate`` is falsy or no
            date/time expression was found.
    """

    def clean_string(s):
        """
        Lower-case the sentence, drop punctuation and filler words and
        return it as a list of words with ordinals replaced by digits.

        'am' is a preposition, so cannot currently be used
        for 12 hour date format
        """
        s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
            .replace(' der ', ' ').replace(' den ', ' ') \
            .replace(' an ', ' ').replace(' am ', ' ') \
            .replace(' auf ', ' ').replace(' um ', ' ')
        wordList = s.split()

        for idx, word in enumerate(wordList):
            ordinal = isOrdinal_de(word)
            if ordinal is not False:
                wordList[idx] = str(ordinal)

        return wordList

    def blank(first, count):
        """Mark ``count`` words starting at index ``first`` as consumed."""
        for pos in range(first, first + count):
            if 0 <= pos < len(words):
                words[pos] = ""

    def day_part(first, second, hour):
        """
        Interpret one or two words following a clock time as a part of the
        day. Returns ("am" | "pm" | "", number of words consumed).
        """
        if first[:10] == "nachmittag" or first[:5] == "abend":
            return "pm", 1
        if first == "am" and second in ("nachmittag", "abend"):
            return "pm", 2
        if first[:7] == "morgens":
            return "am", 1
        if first == "am" and second == "morgen":
            return "am", 2
        if first == "nachts":
            return ("pm" if 8 <= hour <= 12 else "am"), 1
        return "", 0

    def date_found():
        """True if either pass recognised a date or time expression."""
        return bool(
            found or
            datestr != "" or
            yearOffset != 0 or monthOffset != 0 or
            dayOffset is not False or hrOffset != 0 or
            hrAbs or minOffset != 0 or
            minAbs or secOffset != 0
        )

    if string == "" or not currentDate:
        return None

    found = False
    daySpecified = False
    dayOffset = False   # False (not 0) means "no day expression seen yet"
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")          # 0 = Sunday ... 6 = Saturday
    currentYear = dateNow.strftime("%Y")
    fromFlag = False    # set once a "von/nach/ab <date>" phrase was parsed
    datestr = ""        # "<english month> <day> [<year>]" for strptime
    hasYear = False
    timeQualifier = ""  # last part-of-day word seen in the first pass

    timeQualifiersList = [u'früh', 'morgens', 'vormittag', 'vormittags',
                          'nachmittag', 'nachmittags', 'abend', 'abends',
                          'nachts']
    markers = ['in', 'am', 'gegen', 'bis', u'für']
    days = ['montag', 'dienstag', 'mittwoch',
            'donnerstag', 'freitag', 'samstag', 'sonntag']
    months = ['januar', 'februar', u'märz', 'april', 'mai', 'juni',
              'juli', 'august', 'september', 'oktober', 'november',
              'dezember']
    monthsShort = ['jan', 'feb', u'mär', 'apr', 'mai', 'juni', 'juli', 'aug',
                   'sept', 'okt', 'nov', 'dez']
    # strptime("%B") expects month names of the process locale; English
    # names are used here, as this function always did
    en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                 'july', 'august', 'september', 'october', 'november',
                 'december']
    # English spellings of October used to be the only ones accepted here;
    # keep them working next to the German ones
    legacyMonths = {'october': 9, 'oct': 9}

    monthIndex = {}
    for number, name in enumerate(monthsShort):
        monthIndex[name] = number
    for number, name in enumerate(months):
        monthIndex[name] = number
    monthIndex.update(legacyMonths)

    validFollowups = days + months + monthsShort + sorted(legacyMonths) + [
        "heute", "morgen",
        u"nächste", u"nächster", u"nächstes", u"nächsten", u"nächstem",
        "letzte", "letzter", "letztes", "letzten", "letztem",
        "jetzt"]

    words = clean_string(string)

    # ---- first pass: dates -------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # crude stemming; not done in clean_string because the stemmed
        # form must not be written back to words
        if word != 'morgen' and word != u'übermorgen':
            if word[-2:] == "en":
                word = word[:-2]  # remove en
        if word != 'heute':
            if word[-1:] == "e":
                word = word[:-1]  # remove plural for most nouns

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "heute" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "morgen" and not fromFlag and wordPrev != "am" and \
                wordPrev not in days:
            # morgen means tomorrow if not "am Morgen" and not
            # [day of the week] morgen
            dayOffset = 1
            used += 1
        elif word == u"übermorgen" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "tag":  # "tage"/"tagen" were stemmed to "tag" above
            if wordPrev.isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "woch" and not fromFlag:
            if wordPrev.isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev[:6] == u"nächst":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "monat" and not fromFlag:
            if wordPrev.isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev[:6] == u"nächst":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "jahr" and not fromFlag:
            if wordPrev.isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev[:6] == u"nächst":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext == "morgen":
                # morgen means morning if preceded by the day of the week
                words[idx + 1] = u"früh"
            if wordPrev[:6] == u"nächst":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev[:5] == "letzt":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse "15 juli", "juli 15", optionally followed by a year
        elif word in monthIndex and not fromFlag:
            m = monthIndex[word]
            used += 1
            datestr = en_months[m]
            if wordPrev and (wordPrev[0].isdigit() or
                             (wordPrev == "of" and
                              wordPrevPrev[:1].isdigit())):
                if wordPrev == "of" and wordPrevPrev[:1].isdigit():
                    datestr += " " + words[idx - 2]
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == "von" or word == "nach" or word == "ab") and \
                wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "morgen" and wordPrev != "am" and \
                    wordPrev not in days:
                # morgen means tomorrow if not "am Morgen" and not
                # [day of the week] morgen
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNext[:6] == u"nächst":
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext[:5] == "letzt":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset

        if used > 0:
            if start - 1 >= 0 and words[start - 1].startswith("diese"):
                start -= 1
                used += 1

            blank(start, used)

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: time -------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None    # None: no time seen, -1: only a relative offset seen
    minAbs = None

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else ""

        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word[:6] == "mittag":
            hrAbs = 12
            used += 1
        elif word[:11] == "mitternacht":
            hrAbs = 0
            used += 1
        elif word == "morgens" or (
                wordPrev == "am" and word == "morgen") or word == u"früh":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word[:10] == "nachmittag":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word[:5] == "abend":
            if not hrAbs:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "stunde" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev[:4] == "halb":
                minOffset = 30
            elif wordPrev == "viertel":
                minOffset = 15
            elif wordPrev == "dreiviertel":
                minOffset = 45
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 pm, 7 uhr, 3 stunden, etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # split "HH:MM<suffix>" into its three parts
                pos = 0
                while pos < len(word) and word[pos].isdigit():
                    strHH += word[pos]
                    pos += 1
                if pos < len(word) and word[pos] == ":":
                    pos += 1
                    while pos < len(word) and word[pos].isdigit():
                        strMM += word[pos]
                        pos += 1
                remainder = word[pos:].replace(".", "")

                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "abends":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "morgens":
                        remainder = "am"
                        used += 1
                    elif wordNext == "nachmittags":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "heute" and wordNextNext == "morgen":
                        remainder = "am"
                        used = 2
                    elif wordNext == "heute" and \
                            wordNextNext == "nachmittag":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "heute" and wordNextNext == "abend":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "nachts":
                        # strHH is text here; compare its numeric value
                        if int(strHH) > 4:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 1
                    elif (timeQualifier[:5] == "abend" or
                          timeQualifier[:10] == "nachmittag") and \
                            int(strHH) <= 12:
                        # a part of the day named elsewhere in the
                        # sentence; the am/pm conversion below leaves
                        # 12 untouched
                        remainder = "pm"
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                for char in word:
                    if char.isdigit():
                        strNum += char
                    else:
                        remainder += char

                nextToken = wordNext.replace(".", "").strip()
                if remainder == "":
                    remainder = nextToken
                number = int(strNum)

                if remainder in ("pm", "p.m.") or \
                        wordNext in ("pm", "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    # only consume the next word if it really is the
                    # suffix ("3 pm"), not when it is glued on ("3pm")
                    used = 1 if nextToken == "pm" else 0
                elif remainder in ("am", "a.m.") or \
                        wordNext in ("am", "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1 if nextToken == "am" else 0
                else:
                    # words are not stemmed in this pass, so match on
                    # the stem ("stunde", "stunden", ...)
                    if wordNext.startswith("stund") and number < 100:
                        # "in 3 stunden"
                        hrOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext.startswith("minut"):
                        # "in 10 minuten"
                        minOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext.startswith("sekund"):
                        # "in 5 sekunden"
                        secOffset = number
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "uhr":
                        strHH = strNum
                        used += 1
                        isTime = True
                        if wordNextNext == timeQualifier:
                            strMM = ""
                            meridiem, consumed = day_part(
                                wordNextNext, wordNextNextNext, number)
                            if meridiem:
                                remainder = meridiem
                                used += consumed
                        elif is_numeric(wordNextNext):
                            # "7 uhr 30"
                            strMM = wordNextNext
                            used += 1
                            if wordNextNextNext == timeQualifier:
                                meridiem, consumed = day_part(
                                    wordNextNextNext,
                                    wordNextNextNextNext, number)
                                if meridiem:
                                    remainder = meridiem
                                    used += consumed
                    elif wordNext == timeQualifier:
                        # NOTE(review): with no time qualifier in the
                        # sentence this also matches a number that is
                        # the very last word ("" == ""); kept as is
                        strHH = strNum
                        strMM = 0
                        isTime = True
                        meridiem, consumed = day_part(
                            wordNext, wordNextNext, number)
                        if meridiem:
                            remainder = meridiem
                            used += consumed
                    # NOTE(review): any other number falls through with
                    # isTime still True and is consumed as 00:00, as in
                    # the original implementation -- confirm intent

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1
        if used > 0:
            # remove parsed words from the sentence
            blank(idx, used)

            if wordPrev == "uhr":  # words are lower-cased by clean_string
                words[idx - 1] = ""

            if wordPrev == u"früh":
                hrOffset = -1
                words[idx - 1] = ""
                idx -= 1
            elif wordPrev == u"spät":
                hrOffset = 1
                words[idx - 1] = ""
                idx -= 1
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # ---- perform date manipulation -------------------------------------
    # NOTE(review): offsets such as "in 3 stunden" are applied to midnight
    # of the reference day, not to the current time -- confirm intent
    extractedDate = dateNow.replace(microsecond=0,
                                    second=0,
                                    minute=0,
                                    hour=0)
    if datestr != "":
        dateFormat = "%B %d %Y" if hasYear else "%B %d"
        temp = datetime.strptime(datestr, dateFormat)
        if not hasYear:
            # carry the tzinfo over so naive and aware datetimes are
            # never compared with each other
            temp = temp.replace(year=extractedDate.year,
                                tzinfo=extractedDate.tzinfo)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=temp.month,
                    day=temp.day)
            else:
                # that day has already passed (or is today): next year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=temp.month,
                    day=temp.day)
        else:
            extractedDate = extractedDate.replace(
                year=temp.year,
                month=temp.month,
                day=temp.day)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute

    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare time that has already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "und" that only joined two consumed expressions
    for idx, word in enumerate(words):
        if word != "und":
            continue
        before = words[idx - 1] if idx > 0 else ""
        after = words[idx + 1] if idx + 1 < len(words) else ""
        if before == "" and after == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())

    return [extractedDate, resultStr]
832 
833 
def isFractional_de(input_str):
    """
    This function takes the given text and checks if it is a fraction.

    Recognised are words starting with "halb", the irregular forms
    "drittel", "siebtel" and "achtel", and a number word from
    ``de_numbers`` followed by "-tel" ("viertel", "sechstel") or "-stel"
    ("zwanzigstel", "hundertstel"). The check is case-insensitive.

    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    lowerstr = input_str.lower()

    if lowerstr.startswith("halb"):
        return 0.5

    # the stem is shortened in these words, so stripping "-tel" does not
    # leave a number word
    irregular = {"drittel": 3, "siebtel": 7, "achtel": 8}
    if lowerstr in irregular:
        return 1.0 / irregular[lowerstr]

    if lowerstr.endswith("tel"):
        stems = []
        if lowerstr.endswith("stel"):
            stems.append(lowerstr[:-4])  # e.g. "hundertstel"
        stems.append(lowerstr[:-3])      # e.g. "fünftel", "sechstel"
        for stem in stems:
            denominator = de_numbers.get(stem)
            if denominator:  # also rejects "null": no division by zero
                return 1.0 / denominator

    return False
858 
859 
def isOrdinal_de(input_str):
    """
    This function takes the given text and checks if it is an ordinal number.

    Args:
        input_str (str): the string to check if ordinal
    Returns:
        (bool) or (int): False if not an ordinal, otherwise the number
            corresponding to the ordinal

    ordinals for 1, 3, 7 and 8 are irregular

    only works for ordinals corresponding to the numbers in de_numbers

    """
    lowerstr = input_str.lower()

    # Irregular ordinals: accept the stem plus an adjective ending only.
    # A plain prefix test would also turn "erstellen" or "achtel" into
    # numbers.
    irregular = (("erste", 1), ("dritte", 3), ("siebte", 7), ("achte", 8))
    for stem, value in irregular:
        if lowerstr in (stem, stem + "r", stem + "s", stem + "n",
                        stem + "m"):
            return value

    # Regular ordinals: from 20 the suffix is -ste*, below 20 it is -te*.
    # Every suffix is tried against the unmodified word, so "sechste"
    # ("sechs" + "te") is found although it also ends in "ste".
    suffixes = ("ste", "ster", "stes", "sten", "stem",
                "te", "ter", "tes", "ten", "tem")
    for suffix in suffixes:
        if lowerstr.endswith(suffix):
            stem = lowerstr[:-len(suffix)]
            if stem in de_numbers:
                return de_numbers[stem]

    return False
908 
909 
def normalize_de(text, remove_articles):
    """
    German string normalization.

    Collapses whitespace, optionally drops definite articles, expands the
    colloquial "net"/"nett" to "nicht" and replaces number words found in
    ``de_numbers`` by digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): drop "der", "die", "das", "des", "den"
            and "dem" when true
    Returns:
        (str): the normalized words joined by single spaces
    """
    articles = ["der", "die", "das", "des", "den", "dem"]
    expansions = {"net": "nicht", "nett": "nicht"}

    kept = []
    for token in text.split():  # split() also removes extra spaces
        if remove_articles and token in articles:
            continue

        # expand colloquial contractions
        token = expansions.get(token, token)

        # convert number words into digits, e.g. "zwei" -> "2"
        if token in de_numbers:
            token = str(de_numbers[token])

        kept.append(token)

    return " ".join(kept)
934 
935 
def extract_numbers_de(text, short_scale=True, ordinals=False):
    """
    Extract every number contained in a German string.

    Thin wrapper that hands the German handlers ``pronounce_number_de``
    and ``extractnumber_de`` to ``extract_numbers_generic``.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): passed through unchanged; selects "short scale"
            or "long scale" naming for large numbers.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): passed through unchanged; consider ordinal
            numbers, e.g. third=3 instead of 1/3
    Returns:
        list: whatever ``extract_numbers_generic`` returns -- presumably
            the list of extracted numbers; verify against parse_common
    """
    options = {"short_scale": short_scale, "ordinals": ordinals}
    return extract_numbers_generic(text, pronounce_number_de,
                                   extractnumber_de, **options)
def extract_numbers_de(text, short_scale=True, ordinals=False)
Definition: parse_de.py:936
def isFractional_de(input_str)
Definition: parse_de.py:834
def extract_numbers_generic(text, pronounce_handler, extract_handler, short_scale=True, ordinals=False)
Definition: parse_common.py:55
def normalize_de(text, remove_articles)
Definition: parse_de.py:910
def look_for_fractions(split_list)
Definition: parse_common.py:36
def extract_datetime_de(string, currentDate, default_time)
Definition: parse_de.py:166
def extractnumber_de(text)
Definition: parse_de.py:82
def isOrdinal_de(input_str)
Definition: parse_de.py:860


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40