parse.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2017 Mycroft AI Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 from difflib import SequenceMatcher
18 from mycroft.util.time import now_local
19 from mycroft.util.lang import get_primary_lang_code
20 
21 from mycroft.util.lang.parse_en import *
22 from mycroft.util.lang.parse_pt import *
23 from mycroft.util.lang.parse_es import *
24 from mycroft.util.lang.parse_it import *
25 from mycroft.util.lang.parse_sv import *
26 
27 from mycroft.util.lang.parse_de import extractnumber_de
28 from mycroft.util.lang.parse_de import extract_numbers_de
29 from mycroft.util.lang.parse_de import extract_datetime_de
30 from mycroft.util.lang.parse_de import normalize_de
31 from mycroft.util.lang.parse_fr import extractnumber_fr
32 from mycroft.util.lang.parse_fr import extract_numbers_fr
33 from mycroft.util.lang.parse_fr import extract_datetime_fr
34 from mycroft.util.lang.parse_fr import normalize_fr
35 from mycroft.util.lang.parse_da import extractnumber_da
36 from mycroft.util.lang.parse_da import extract_numbers_da
37 from mycroft.util.lang.parse_da import extract_datetime_da
38 from mycroft.util.lang.parse_da import normalize_da
39 
40 from .log import LOG
41 
42 
def _log_unsupported_language(language, supported_languages):
    """
    Emit a warning that the requested language is not handled.

    Arguments:
        language: str
            The language code that was asked for.
        supported_languages: [str]
            The language codes that are handled; they are listed in the
            warning, separated by single spaces.
    """
    known = ' '.join(supported_languages)
    template = ('Language "{language}" not recognized! Please make sure your '
                'language is one of the following: {supported}.')
    LOG.warning(template.format(language=language, supported=known))
57 
58 
def fuzzy_match(x, against):
    """Score how similar two strings are.

    The score is the ratio reported by difflib.SequenceMatcher, created
    without a junk filter.

    Args:
        x: first sequence to compare
        against: second sequence to compare

    Returns:
        float: similarity ratio -- 1.0 for identical input,
               down to 0.0 when nothing matches.
    """
    matcher = SequenceMatcher(None, x, against)
    return matcher.ratio()
66 
67 
def match_one(query, choices):
    """
    Pick the choice that most closely resembles the query.

    Arguments:
        query: string to test
        choices: list of candidates, or dictionary whose keys are the
                 candidates

    Returns: tuple (best match, score). For a dictionary the first item
             is the value stored under the best matching key.

    Raises:
        ValueError: if choices is neither a list nor a dict.
        IndexError: if choices is empty.
    """
    from_mapping = isinstance(choices, dict)
    if from_mapping:
        candidates = list(choices.keys())
    elif isinstance(choices, list):
        candidates = choices
    else:
        raise ValueError('a list or dict of choices must be provided')

    # Start from the first candidate; a later one only takes over when it
    # scores strictly higher, so the earliest candidate wins ties.
    winner = candidates[0]
    top_score = fuzzy_match(query, winner)
    for candidate in candidates[1:]:
        candidate_score = fuzzy_match(query, candidate)
        if candidate_score > top_score:
            winner, top_score = candidate, candidate_score

    if from_mapping:
        return (choices[winner], top_score)
    return (winner, top_score)
95 
96 
def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
    """
    Extract every number found in a string.

    The work is delegated to the extractor for the primary language code
    of `lang`. Only "en", "de", "fr", "it" and "da" are handled here;
    any other language yields an empty list without a warning.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million. The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
        lang (str): the BCP-47 code for the language to use, None uses default
    Returns:
        list: whatever the language-specific extractor returns, or an
              empty list when the language has no extractor
    """
    lang_code = get_primary_lang_code(lang)

    # Choose the extractor first; every one takes the same three arguments.
    if lang_code == "en":
        extractor = extract_numbers_en
    elif lang_code == "de":
        extractor = extract_numbers_de
    elif lang_code == "fr":
        extractor = extract_numbers_fr
    elif lang_code == "it":
        extractor = extract_numbers_it
    elif lang_code == "da":
        extractor = extract_numbers_da
    else:
        return []

    return extractor(text, short_scale, ordinals)
124 
125 
def extract_number(text, short_scale=True, ordinals=False, lang=None):
    """Takes in a string and extracts a number.

    The work is delegated to the extractor for the primary language code
    of `lang`. `short_scale` and `ordinals` are only forwarded to the
    English and Italian extractors; the other extractors receive the
    text alone.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million. The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
        lang (str): the BCP-47 code for the language to use, None uses default
    Returns:
        (int, float or False): The number extracted or False if the input
                               text contains no numbers (as reported by
                               the language-specific extractor).
        For an unsupported language a warning is logged and the input
        text is returned unchanged.
    """
    lang_code = get_primary_lang_code(lang)
    if lang_code == "en":
        return extractnumber_en(text, short_scale=short_scale,
                                ordinals=ordinals)
    elif lang_code == "es":
        return extractnumber_es(text)
    elif lang_code == "pt":
        return extractnumber_pt(text)
    elif lang_code == "it":
        return extractnumber_it(text, short_scale=short_scale,
                                ordinals=ordinals)
    elif lang_code == "fr":
        return extractnumber_fr(text)
    elif lang_code == "sv":
        return extractnumber_sv(text)
    elif lang_code == "de":
        return extractnumber_de(text)
    elif lang_code == "da":
        return extractnumber_da(text)
    # TODO: extractnumber_xx for other languages
    # Report the resolved language code. The previous code referenced an
    # undefined name here and raised NameError instead of logging.
    _log_unsupported_language(lang_code,
                              ['en', 'es', 'pt', 'it', 'fr', 'sv', 'de', 'da'])
    return text
164 
165 
def extract_duration(text, lang=None):
    """ Convert a spoken-style phrase into a duration

    Intended for phrases such as:
        "10 minute"
        "2 and a half hours"
        "3 days 8 hours 10 minutes and 49 seconds"

    The words used in the duration are consumed and the remainder is
    returned, e.g. "set a timer for 5 minutes" leaves "set a timer for".
    The actual parsing is done by the English parser, which is the only
    language supported here.

    Args:
        text (str): string containing a duration
        lang (str): the BCP-47 code for the language to use, None uses default

    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found. The text returned
            will have whitespace stripped from the ends.
        For an unsupported language a warning is logged and a plain
        None (not a tuple) is returned.
    """
    lang_code = get_primary_lang_code(lang)

    if lang_code != "en":
        # TODO: extract_duration for other languages
        _log_unsupported_language(lang_code, ['en'])
        return None

    return extract_duration_en(text)
200 
201 
def extract_datetime(text, anchorDate=None, lang=None, default_time=None):
    """
    Extract date and time information from a sentence.

    The work is delegated to the parser for the primary language code of
    `lang`. The parsers are meant to handle many of the common ways that
    humans express dates and times, including relative dates like
    "5 days from today", "tomorrow", and "Tuesday".

    Vague terminology is given arbitrary values by the parsers, like:
        - morning = 8 AM
        - afternoon = 3 PM
        - evening = 7 PM

    If a time isn't supplied or implied, the result defaults to 12 AM

    Args:
        text (str): the text to be interpreted
        anchorDate (:obj:`datetime`, optional): the date to be used for
            relative dating (for example, what does "tomorrow" mean?).
            Defaults to the current local date/time.
        lang (str): the BCP-47 code for the language to use, None uses default
        default_time (datetime.time): time to use if none was found in
            the input string.

    Returns:
        [:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date
            as a datetime object in the user's local timezone.
            'leftover_string' is the original phrase with all date and time
            related keywords stripped out. See examples for further
            clarification

            Returns 'None' if no date or time related text is found.

            For an unsupported language a warning is logged and the
            input text is returned unchanged.

    Examples:

        >>> extract_datetime(
        ... "What is the weather like the day after tomorrow?",
        ... datetime(2017, 6, 30, 0, 0)
        ... )
        [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like']

        >>> extract_datetime(
        ... "Set up an appointment 2 weeks from Sunday at 5 pm",
        ... datetime(2016, 2, 19, 0, 0)
        ... )
        [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment']

        >>> extract_datetime(
        ... "Set up an appointment",
        ... datetime(2016, 2, 19, 0, 0)
        ... )
        None
    """

    lang_code = get_primary_lang_code(lang)

    # A missing (falsy) anchor means "relative to right now".
    anchorDate = anchorDate or now_local()

    # Choose the parser first; every one takes the same three arguments.
    if lang_code == "en":
        parser = extract_datetime_en
    elif lang_code == "es":
        parser = extract_datetime_es
    elif lang_code == "pt":
        parser = extract_datetime_pt
    elif lang_code == "it":
        parser = extract_datetime_it
    elif lang_code == "fr":
        parser = extract_datetime_fr
    elif lang_code == "sv":
        parser = extract_datetime_sv
    elif lang_code == "de":
        parser = extract_datetime_de
    elif lang_code == "da":
        parser = extract_datetime_da
    else:
        # TODO: extract_datetime for other languages
        _log_unsupported_language(lang_code,
                                  ['en', 'es', 'pt', 'it', 'fr', 'sv', 'de',
                                   'da'])
        return text

    return parser(text, anchorDate, default_time)
279 
280 
def normalize(text, lang=None, remove_articles=True):
    """Prepare a string for parsing

    The work is delegated to the normalizer for the primary language
    code of `lang`, which is meant to make numbers consistent, get rid
    of contractions, etc.

    Args:
        text (str): the string to normalize
        lang (str): the BCP-47 code for the language to use, None uses default
        remove_articles (bool): whether to remove articles (like 'a', or
                                'the'). True by default.

    Returns:
        (str): The normalized string. For an unsupported language a
               warning is logged and the input text is returned unchanged.
    """

    lang_code = get_primary_lang_code(lang)

    # Choose the normalizer first; every one takes the same two arguments.
    if lang_code == "en":
        normalizer = normalize_en
    elif lang_code == "es":
        normalizer = normalize_es
    elif lang_code == "pt":
        normalizer = normalize_pt
    elif lang_code == "it":
        normalizer = normalize_it
    elif lang_code == "fr":
        normalizer = normalize_fr
    elif lang_code == "sv":
        normalizer = normalize_sv
    elif lang_code == "de":
        normalizer = normalize_de
    elif lang_code == "da":
        normalizer = normalize_da
    else:
        # TODO: Normalization for other languages
        _log_unsupported_language(lang_code,
                                  ['en', 'es', 'pt', 'it', 'fr', 'sv', 'de',
                                   'da'])
        return text

    return normalizer(text, remove_articles)
319 
320 
def get_gender(word, context="", lang=None):
    """ Guess the grammatical gender of a word

    Some languages assign genders to specific words. The lookup is
    delegated to the Italian helper for "it" and to the Portuguese helper
    for both "pt" and "es"; every other language yields None.

    Args:
        word (str): The word to look up
        context (str, optional): String containing word, for context
        lang (str): the BCP-47 code for the language to use, None uses default

    Returns:
        str: The code "m" (male), "f" (female) or "n" (neutral) for the gender,
             or None if unknown/or unused in the given language.
    """

    lang_code = get_primary_lang_code(lang)

    if lang_code == "it":
        return get_gender_it(word, context)
    if lang_code in ("pt", "es"):
        # spanish follows same rules
        return get_gender_pt(word, context)
    return None
def extractnumber_it(text, short_scale=False, ordinals=False)
Definition: parse_it.py:385
def now_local(tz=None)
Definition: time.py:54
def normalize_en(text, remove_articles)
Definition: parse_en.py:1495
def extract_number(text, short_scale=True, ordinals=False, lang=None)
Definition: parse.py:126
def normalize_it(text, remove_articles)
Definition: parse_it.py:566
def _log_unsupported_language(language, supported_languages)
Definition: parse.py:43
def extract_datetime_fr(string, currentDate, default_time)
Definition: parse_fr.py:476
def extract_numbers_de(text, short_scale=True, ordinals=False)
Definition: parse_de.py:936
def extract_datetime_en(string, dateNow, default_time)
Definition: parse_en.py:667
def extract_datetime_it(string, dateNow, default_time)
Definition: parse_it.py:600
def extractnumber_da(text)
Definition: parse_da.py:77
def extractnumber_sv(text)
Definition: parse_sv.py:22
def match_one(query, choices)
Definition: parse.py:68
def extractnumber_en(text, short_scale=True, ordinals=False)
Definition: parse_en.py:592
def extract_datetime_da(string, currentDate, default_time)
Definition: parse_da.py:161
def extract_datetime_es(input_str, currentDate=None, default_time=None)
Definition: parse_es.py:363
def extract_numbers(text, short_scale=True, ordinals=False, lang=None)
Definition: parse.py:97
def extract_numbers_en(text, short_scale=True, ordinals=False)
Definition: parse_en.py:1476
def normalize_de(text, remove_articles)
Definition: parse_de.py:910
def get_gender_pt(word, raw_string="")
Definition: parse_pt.py:1125
def extract_duration_en(text)
Definition: parse_en.py:612
def extract_datetime_pt(input_str, currentDate, default_time)
Definition: parse_pt.py:309
def normalize_sv(text, remove_articles)
Definition: parse_sv.py:760
def extract_duration(text, lang=None)
Definition: parse.py:166
def extract_datetime_sv(string, currentDate, default_time)
Definition: parse_sv.py:127
def extract_numbers_it(text, short_scale=False, ordinals=False)
Definition: parse_it.py:1310
def extract_datetime_de(string, currentDate, default_time)
Definition: parse_de.py:166
def extractnumber_pt(text)
Definition: parse_pt.py:64
def normalize_pt(text, remove_articles)
Definition: parse_pt.py:271
def get_gender_it(word, raw_string="")
Definition: parse_it.py:1283
def normalize_fr(text, remove_articles)
Definition: parse_fr.py:1033
def extractnumber_de(text)
Definition: parse_de.py:82
def get_gender(word, context="", lang=None)
Definition: parse.py:321
def normalize_da(text, remove_articles)
Definition: parse_da.py:899
def extract_numbers_fr(text, short_scale=True, ordinals=False)
Definition: parse_fr.py:1070
def extract_datetime(text, anchorDate=None, lang=None, default_time=None)
Definition: parse.py:202
def normalize_es(text, remove_articles)
Definition: parse_es.py:336
def fuzzy_match(x, against)
Definition: parse.py:59
def extract_numbers_da(text, short_scale=True, ordinals=False)
Definition: parse_da.py:918
def normalize(text, lang=None, remove_articles=True)
Definition: parse.py:281


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40