parse_sv.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 from datetime import datetime
18 from dateutil.relativedelta import relativedelta
19 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
20 
21 
def extractnumber_sv(text):
    """
    Extract the first number expressed in a Swedish text.

    Understands digits, the ordinals "första".."sjätte", the cardinals
    "en"/"ett".."tio", fraction words recognised by is_fractional_sv,
    "a/b" style fractions, a cardinal followed by a fraction word
    ("två tredjedelar") and two values joined by "och", which are added.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float): The value of the extracted number, or False
        if no number was found.  A value of zero is falsy and is
        therefore also reported as False.
    """
    ordinals = {"första": 1, "andra": 2, "tredje": 3,
                "fjärde": 4, "femte": 5, "sjätte": 6}
    cardinals = {"en": 1, "ett": 1, "två": 2, "tre": 3, "fyra": 4,
                 "fem": 5, "sex": 6, "sju": 7, "åtta": 8, "nio": 9,
                 "tio": 10}

    aWords = text.split()
    and_pass = False
    valPreAnd = False
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            val = float(word)
        elif word in ordinals:
            val = ordinals[word]
        elif is_fractional_sv(word):
            val = is_fractional_sv(word)
        elif word in cardinals:
            val = cardinals[word]
            # a cardinal may scale a following fraction word,
            # e.g. "två tredjedelar"
            if count < (len(aWords) - 1):
                wordNext = aWords[count + 1]
            else:
                wordNext = ""
            valNext = is_fractional_sv(wordNext)

            if valNext:
                val = val * valNext
                aWords[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
            elif and_pass:
                # nothing numeric follows "och": keep the value before it
                val = valPreAnd
                break
            else:
                count += 1
                continue

        aWords[count] = ""

        if and_pass:
            aWords[count - 1] = ''  # remove "och"
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'och':
            and_pass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'och':
            and_pass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    if not val and and_pass:
        # the text ended right after "och" (e.g. "två och"); the value
        # collected before it must not be dropped
        val = valPreAnd

    if not val:
        return False

    return val
125 
126 
def extract_datetime_sv(string, currentDate, default_time):
    """
    Extract a date and/or time from a Swedish utterance.

    The text is tokenised and scanned twice.  The first pass recognises
    date expressions (idag/imorgon/övermorgon, "N dagar", weeks, months,
    years, weekday names, month names with day/year and "från"/"efter"
    follow-ups).  The second pass recognises times of day and time
    offsets.  Every word that was consumed is blanked so that the rest
    of the utterance can be handed back to the caller.

    Args:
        string (str): the utterance to parse
        currentDate (datetime): reference date for relative expressions
        default_time: object exposing ``hour`` and ``minute``, applied
            when the utterance names no time of day; may be None
    Returns:
        list or None: ``[extractedDate, resultStr]`` where extractedDate
        is the resulting datetime and resultStr the leftover words
        joined by single spaces.  None is returned when ``string`` is
        empty, ``currentDate`` is falsy, or nothing date/time related
        was recognised.
    """
    def clean_string(s):
        """
        cleans the input string of unneeded punctuation and capitalization
        among other things.
        """
        s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
            .replace(' den ', ' ').replace(' en ', ' ')
        wordList = s.split()
        for idx, word in enumerate(wordList):
            word = word.replace("'s", "")

            # word can be empty here (a bare "'s"), hence the slice
            if word[:1].isdigit():
                for ordinal in ("rd", "st", "nd", "th"):
                    word = word.replace(ordinal, "")
            wordList[idx] = word

        return wordList

    def date_found():
        """True when either pass recognised something."""
        return bool(
            found or
            datestr != "" or
            yearOffset != 0 or monthOffset != 0 or
            dayOffset is not False or hrOffset != 0 or
            hrAbs or minOffset != 0 or
            minAbs or secOffset != 0)

    if string == "" or not currentDate:
        return None

    found = False
    daySpecified = False
    dayOffset = False  # False means "no day offset seen"; 0 means today
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['morgon', 'förmiddag', 'eftermiddag', 'kväll']
    markers = ['på', 'i', 'den här', 'kring', 'efter']
    days = ['måndag', 'tisdag', 'onsdag', 'torsdag',
            'fredag', 'lördag', 'söndag']
    months = ['januari', 'februari', 'mars', 'april', 'maj', 'juni',
              'juli', 'augusti', 'september', 'oktober', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
                   'sept', 'oct', 'nov', 'dec']
    # words that may follow "från"/"efter"; loop invariant, so built once
    validFollowups = days + months + monthsShort + [
        "idag", "imorgon", "nästa", "förra", "nu"]

    words = clean_string(string)

    # ---- pass 1: dates -------------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # this isn't in clean string because I don't want to save back to words
        word = word.rstrip('s')
        start = idx
        used = 0
        # NOTE(review): in the conditions of the form
        # "a or b and not fromFlag" below, "and" binds tighter than "or",
        # so fromFlag only guards the last alternative.  Kept as is
        # because the intended grouping is not certain.
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "idag" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "imorgon" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "morgondagen" or word == "morgondagens" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "övermorgon" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 days, 10 weeks, last week, next week
        # (slices instead of [0] so an empty neighbour cannot raise)
        elif word == "dag" or word == "dagar":
            if wordPrev[:1].isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "vecka" or word == "veckor" and not fromFlag:
            if wordPrev[:1].isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "nästa":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev == "förra":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "månad" and not fromFlag:
            if wordPrev[:1].isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "nästa":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "förra":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "år" and not fromFlag:
            if wordPrev[:1].isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "nästa":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "förra":
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "nästa":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "förra":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif word in months or word in monthsShort and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            prevIsOf = wordPrev == "of" and wordPrevPrev[:1].isdigit()
            if wordPrev[:1].isdigit() or prevIsOf:
                if prevIsOf:
                    datestr += " " + words[idx - 2]
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext[:1].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext[:1].isdigit():
                datestr += " " + wordNext
                used += 1
                if wordNextNext[:1].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False
        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == "från" or word == "efter") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "imorgon":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNext == "nästa":
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext == "förra":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1] == "denna":
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- pass 2: times -------------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None   # None: no time seen, -1: offset-only expression
    minAbs = None

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNext3 = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "middag":
            hrAbs = 12
            used += 1
        elif word == "midnatt":
            hrAbs = 0
            used += 1
        elif word == "morgon":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word == "förmiddag":
            if not hrAbs:
                hrAbs = 10
            used += 1
        elif word == "eftermiddag":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word == "kväll":
            if not hrAbs:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        # NOTE(review): nesting reconstructed from a listing without
        # indentation.  As written, every word that has a marker one or
        # two positions before it is consumed here, including numbers,
        # which then never reach the digit branch below -- confirm.
        elif wordPrev in markers or wordPrevPrev in markers:
            if word == "halvtimme" or word == "halvtimma":
                minOffset = 30
            elif word == "kvart":
                minOffset = 15
            elif word == "timme" or word == "timma":
                hrOffset = 1
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                # digits before ":" are hours, digits after are minutes,
                # everything from the first other character is a suffix
                inMinutes = False
                for i, ch in enumerate(word):
                    if ch.isdigit():
                        if inMinutes:
                            strMM += ch
                        else:
                            strHH += ch
                    elif ch == ":" and not inMinutes:
                        inMinutes = True
                    else:
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "tonight":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "in" and wordNextNext == "the" and \
                            wordNext3 == "morning":
                        remainder = "am"
                        used += 3
                    elif wordNext == "in" and wordNextNext == "the" and \
                            wordNext3 == "afternoon":
                        remainder = "pm"
                        used += 3
                    elif wordNext == "in" and wordNextNext == "the" and \
                            wordNext3 == "evening":
                        remainder = "pm"
                        used += 3
                    elif wordNext == "in" and wordNextNext == "morning":
                        remainder = "am"
                        used += 2
                    elif wordNext == "in" and wordNextNext == "afternoon":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "in" and wordNextNext == "evening":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "this" and wordNextNext == "morning":
                        remainder = "am"
                        used = 2
                    elif wordNext == "this" and wordNextNext == "afternoon":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "this" and wordNextNext == "evening":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "at" and wordNextNext == "night":
                        # strHH is still text here; compare numerically
                        if int(strHH) > 5:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 2
                    else:
                        # NOTE(review): timeQualifier only ever holds a
                        # Swedish word from timeQualifiersList, so these
                        # English names never match -- confirm intent.
                        if timeQualifier != "":
                            if int(strHH) <= 12 and \
                                    (timeQualifier == "evening" or
                                     timeQualifier == "afternoon"):
                                strHH = int(strHH) + 12
            else:
                # try to parse # s without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                remainder = ""
                for ch in word:
                    if ch.isdigit():
                        strNum += ch
                    else:
                        remainder += ch

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()

                # a word with a non-digit tail ("5x") cannot be read as
                # a plain number; treat it as "not a time" below
                try:
                    number = int(word)
                except ValueError:
                    number = None

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                elif number is None:
                    isTime = False
                elif number > 100 and (wordPrev == "o" or wordPrev == "oh"):
                    # 0800 hours (pronounced oh-eight-hundred)
                    # integer split so the minutes survive
                    strHH, strMM = divmod(number, 100)
                    if wordNext == "hours":
                        used += 1
                elif (
                        wordNext == "hours" and
                        word[0] != '0' and
                        (number < 100 or number > 2400)):
                    # "in 3 hours": anything outside the HHMM range
                    hrOffset = number
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "minutes":
                    # "in 10 minutes"
                    minOffset = number
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "seconds":
                    # in 5 seconds
                    secOffset = number
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif number > 100:
                    strHH, strMM = divmod(number, 100)
                    if wordNext == "hours":
                        used += 1
                elif wordNext[:1].isdigit():
                    strHH = word
                    strMM = wordNext
                    used += 1
                    if wordNextNext == "hours":
                        used += 1
                elif (
                        wordNext == "" or wordNext == "o'clock" or
                        (
                            wordNext == "in" and
                            (
                                wordNextNext == "the" or
                                wordNextNext == timeQualifier
                            )
                        )):
                    strHH = word
                    strMM = 0
                    if wordNext == "o'clock":
                        used += 1
                    if wordNext == "in" or wordNextNext == "in":
                        used += (1 if wordNext == "in" else 2)
                        # word following the first occurrence of
                        # wordNextNext, or "" when there is none
                        try:
                            followIdx = words.index(wordNextNext) + 1
                        except ValueError:
                            followIdx = len(words)
                        wordAfter = words[followIdx] \
                            if followIdx < len(words) else ""
                        if (wordNextNext and
                                wordNextNext in timeQualifier or
                                (wordAfter and
                                 wordAfter in timeQualifier)):
                            if (wordNextNext == "afternoon" or
                                    wordAfter == "afternoon"):
                                remainder = "pm"
                            if (wordNextNext == "evening" or
                                    wordAfter == "evening"):
                                remainder = "pm"
                            if (wordNextNext == "morning" or
                                    wordAfter == "morning"):
                                remainder = "am"
                else:
                    isTime = False

            try:
                strHH = int(strHH) if strHH else 0
                strMM = int(strMM) if strMM else 0
            except ValueError:
                # e.g. "5 10:30": the follow-up token is not plain minutes
                strHH = 0
                strMM = 0
                isTime = False
                used = 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1
        if used > 0:
            # removed parsed words from the sentence (never past the end)
            for i in range(idx, min(idx + used, len(words))):
                words[i] = ""

            if wordPrev == "o" or wordPrev == "oh":
                words[words.index(wordPrev)] = ""

            if wordPrev == "early":
                hrOffset = -1
                words[idx - 1] = ""
                idx -= 1
            elif wordPrev == "late":
                hrOffset = 1
                words[idx - 1] = ""
                idx -= 1
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        # datestr is "<month name> [day [year]]".  The date is built
        # from the month's position in ``months`` because
        # strptime("%B") only understands month names of the active
        # locale and "%B %d" rejects a trailing year.
        dateParts = datestr.split()
        month = months.index(dateParts[0]) + 1
        # a month named without a day resolves to its first day
        day = int(dateParts[1]) if len(dateParts) > 1 else 1
        if hasYear and len(dateParts) > 2:
            extractedDate = extractedDate.replace(
                year=int(dateParts[2]), month=month, day=day)
        else:
            temp = extractedDate.replace(month=month, day=day)
            if extractedDate < temp:
                extractedDate = temp
            else:
                # that day has already passed (or is today): next year
                extractedDate = temp.replace(year=int(currentYear) + 1)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute
    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare time that has already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)
    for idx, word in enumerate(words):
        if (word == "and" and words[idx - 1] == "" and
                idx + 1 < len(words) and words[idx + 1] == ""):
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]
726 
727 
def is_fractional_sv(input_str):
    """
    This function takes the given text and checks if it is a fraction.

    The word is lower-cased, then the endings "ars", "ar", "a" and "s"
    are stripped (each at most once, in that order) before it is looked
    up, so "femtedelar", "halva" and "Halv" are all recognised.

    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    # Normalise case once so the membership test and the index lookup
    # below always see the same string.
    word = input_str.lower()

    if word.endswith('ars'):
        word = word[:-3]  # e.g. "femtedelars"
    if word.endswith('ar'):
        word = word[:-2]  # e.g. "femtedelar"
    if word.endswith('a'):
        word = word[:-1]  # e.g. "halva"
    if word.endswith('s'):
        word = word[:-1]  # e.g. "halvs"

    # position n in this list stands for the fraction 1 / (n + 1)
    aFrac = ["hel", "halv", "tredjedel", "fjärdedel", "femtedel", "sjättedel",
             "sjundedel", "åttondel", "niondel", "tiondel", "elftedel",
             "tolftedel"]
    if word in aFrac:
        return 1.0 / (aFrac.index(word) + 1)
    if word == "kvart":
        return 1.0 / 4
    if word == "trekvart":
        return 3.0 / 4

    return False
758 
759 
def normalize_sv(text, remove_articles):
    """
    Swedish string normalization.

    Collapses runs of whitespace and replaces the number words
    "noll".."tjugo" (with "en" treated like "ett") by their digits.

    Args:
        text (str): the string to normalize
        remove_articles: accepted for interface compatibility; this
            implementation does not use it
    Returns:
        (str): the normalized words joined by single spaces
    """
    textNumbers = ["noll", "ett", "två", "tre", "fyra", "fem", "sex",
                   "sju", "åtta", "nio", "tio", "elva", "tolv",
                   "tretton", "fjorton", "femton", "sexton",
                   "sjutton", "arton", "nitton", "tjugo"]
    # Convert numbers into digits, e.g. "två" -> "2"
    digits = {word: str(value) for value, word in enumerate(textNumbers)}
    digits['en'] = digits['ett']

    # split() without arguments also removes extra spaces
    return " ".join(digits.get(word, word) for word in text.split())
def extractnumber_sv(text)
Definition: parse_sv.py:22
def normalize_sv(text, remove_articles)
Definition: parse_sv.py:760
def extract_datetime_sv(string, currentDate, default_time)
Definition: parse_sv.py:127
def look_for_fractions(split_list)
Definition: parse_common.py:36
def is_fractional_sv(input_str)
Definition: parse_sv.py:728


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40