recognize_google_cloud.py
Go to the documentation of this file.
1 # file to override recognize_google_cloud (https://github.com/Uberi/speech_recognition/blob/3.8.1/speech_recognition/__init__.py)
2 #
3 # we need this to pass more config params, like enable_speaker_diarization=True
4 # see https://cloud.google.com/speech-to-text/docs/multiple-voices
5 
6 import speech_recognition as SR
7 from speech_recognition import *
8 
class RecognizerEx(SR.Recognizer):
    """``Recognizer`` subclass whose Google Cloud method accepts extra ``RecognitionConfig`` fields.

    The only override is ``recognize_google_cloud``, which adds a ``user_config`` parameter so that
    options such as speaker diarization can be sent along with the recognition request.
    """

    def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, user_config=None):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API.

        This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart <https://cloud.google.com/speech/docs/getting-started>`__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.

        The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation <https://cloud.google.com/speech/docs/languages>`__.

        If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.

        ``user_config`` is an optional dictionary that is merged into the request's recognition config on top of the default ``encoding``, ``sampleRateHertz`` and ``languageCode`` entries (so it may override them). The ``preferred_phrases`` and ``show_all`` options are applied afterwards. When ``user_config["diarizationConfig"]["enableSpeakerDiarization"]`` is set, the returned transcript is built from the per-word results, and a ``[<speakerTag>]`` marker is inserted every time the speaker tag changes.

        Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary.

        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
        """
        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
        if credentials_json is not None:
            try:
                json.loads(credentials_json)
            except Exception:
                raise AssertionError("``credentials_json`` must be ``None`` or a valid JSON string")
        assert isinstance(language, str), "``language`` must be a string"
        # use a dedicated loop variable instead of re-binding the parameter name inside the generator
        assert preferred_phrases is None or all(isinstance(phrase, (type(""), type(u""))) for phrase in preferred_phrases), "``preferred_phrases`` must be a list of strings"

        # See https://cloud.google.com/speech/reference/rest/v1/RecognitionConfig
        flac_data = audio_data.get_flac_data(
            # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
            convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)),
            convert_width=2  # audio samples must be 16-bit
        )

        try:
            from oauth2client.client import GoogleCredentials
            from googleapiclient.discovery import build
            import googleapiclient.errors

            # cannot simply use 'http = httplib2.Http(timeout=self.operation_timeout)'
            # because discovery.build() says 'Arguments http and credentials are mutually exclusive'
            import socket
            import googleapiclient.http
            if self.operation_timeout and socket.getdefaulttimeout() is None:
                # override constant (used by googleapiclient.http.build_http())
                googleapiclient.http.DEFAULT_HTTP_TIMEOUT_SEC = self.operation_timeout

            if credentials_json is None:
                api_credentials = GoogleCredentials.get_application_default()
            else:
                # the credentials can only be read from a file, so we'll make a temp file and write in the contents to work around that
                with PortableNamedTemporaryFile("w") as f:
                    f.write(credentials_json)
                    f.flush()
                    api_credentials = GoogleCredentials.from_stream(f.name)

            speech_service = build("speech", "v1", credentials=api_credentials, cache_discovery=False)
        except ImportError:
            raise RequestError("missing google-api-python-client module: ensure that google-api-python-client is set up correctly.")

        speech_config = {"encoding": "FLAC", "sampleRateHertz": audio_data.sample_rate, "languageCode": language}

        # Support user defined configs: caller-supplied entries are layered over the defaults above.
        # ``None`` (the default) means "no extra configuration".
        if user_config:
            speech_config.update(user_config)

        if preferred_phrases is not None:
            # NOTE(review): the v1 RecognitionConfig reference appears to name this field
            # ``speechContexts`` (a list); the key is kept exactly as before - confirm against the API.
            speech_config["speechContext"] = {"phrases": preferred_phrases}
        if show_all:
            speech_config["enableWordTimeOffsets"] = True  # some useful extra options for when we want all the output
        request = speech_service.speech().recognize(body={"audio": {"content": base64.b64encode(flac_data).decode("utf8")}, "config": speech_config})

        try:
            response = request.execute()
        except googleapiclient.errors.HttpError as e:
            raise RequestError(e)
        except URLError as e:
            raise RequestError("recognition connection failed: {0}".format(e.reason))

        if show_all:
            return response
        if "results" not in response or len(response["results"]) == 0:
            raise UnknownValueError()

        # The diarization switch does not depend on the individual result, so evaluate it once.
        # ``in`` / ``.get`` replace ``dict.has_key``, which does not exist on Python 3, and a
        # ``diarizationConfig`` without ``enableSpeakerDiarization`` no longer raises ``KeyError``.
        diarization_config = speech_config.get("diarizationConfig") or {}
        diarization_enabled = bool(diarization_config.get("enableSpeakerDiarization"))

        transcript = ""
        for result in response["results"]:
            best_alternative = result["alternatives"][0]
            if diarization_enabled:
                # when diarization is enabled, use the words that carry a speakerTag:
                # emit "[<tag>]" whenever the speaker changes, then the word itself
                speaker_tag = None
                for word in best_alternative.get("words", []):  # a result without words contributes nothing
                    if "speakerTag" in word:
                        if speaker_tag != word["speakerTag"]:
                            speaker_tag = word["speakerTag"]
                            transcript += "[{}]".format(speaker_tag)
                        transcript += " " + word["word"]
            elif "transcript" in best_alternative:
                transcript += best_alternative["transcript"].strip() + " "

        return transcript
def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, user_config={})


ros_speech_recognition
Author(s): Yuki Furuta
autogenerated on Tue May 11 2021 02:55:47