speech_recognition_node.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Copyright: Yuki Furuta <furushchev@jsk.imi.i.u-tokyo.ac.jp>
4 
5 import actionlib
6 import rospy
7 import speech_recognition as SR
9 import json
10 import array
11 import sys
12 from threading import Lock
13 
14 from audio_common_msgs.msg import AudioData
15 from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
16 
17 from speech_recognition_msgs.msg import SpeechRecognitionCandidates
18 from speech_recognition_msgs.srv import SpeechRecognition
19 from speech_recognition_msgs.srv import SpeechRecognitionResponse
20 
21 from dynamic_reconfigure.server import Server
22 from ros_speech_recognition.cfg import SpeechRecognitionConfig as Config
23 
24 
25 class ROSAudio(SR.AudioSource):
26  def __init__(self, topic_name="audio", depth=16, n_channel=1,
27  sample_rate=16000, chunk_size=1024, buffer_size=10240):
28  assert buffer_size > chunk_size
29 
30  self.topic_name = topic_name
31  self.buffer_size = buffer_size
32 
33  if depth == 8:
34  self.SAMPLE_WIDTH = 1
35  elif depth == 16:
36  self.SAMPLE_WIDTH = 2
37  elif depth == 32:
38  self.SAMPLE_WIDTH = 4
39  else:
40  raise ValueError("depth must be 8, 16 or 32")
41 
42  self.SAMPLE_RATE = sample_rate
43  self.CHUNK = chunk_size
44  self.n_channel = n_channel
45 
46  self.stream = None
47 
48  def open(self):
49  if self.stream is not None:
50  self.stream.close()
51  self.stream = None
53  self.topic_name, self.buffer_size, depth=self.SAMPLE_WIDTH*8,
54  n_channel=self.n_channel)
55  return self
56 
57  def close(self):
58  self.stream.close()
59  self.stream = None
60 
61  def __enter__(self):
62  return self.open()
63 
64  def __exit__(self, exc_type, exc_value, traceback):
65  self.close()
66 
67  class AudioStream(object):
68  def __init__(self, topic_name, buffer_size=10240, depth=16,
69  n_channel=1, target_channel=0):
70  self.buffer_size = buffer_size
71  self.lock = Lock()
72  self.buffer = bytes()
73  self.depth = depth
74  self.n_channel = n_channel
75  self.target_channel = min(self.n_channel - 1, max(0, target_channel))
76  self.sub_audio = rospy.Subscriber(
77  topic_name, AudioData, self.audio_cb)
78  self.type_code = {}
79  for code in ['b', 'h', 'i', 'l']:
80  self.type_code[array.array(code).itemsize] = code
81 
82  def read_once(self, size):
83  with self.lock:
84  buf = self.buffer[:size]
85  self.buffer = self.buffer[size:]
86  return buf
87 
88  def read(self, size):
89  while not rospy.is_shutdown() and len(self.buffer) < size:
90  rospy.sleep(0.001)
91  return self.read_once(size)
92 
93  def close(self):
94  try:
95  self.sub_audio.unregister()
96  except:
97  pass
98  self.buffer = bytes()
99 
100  def audio_cb(self, msg):
101  with self.lock:
102  dtype = self.type_code[self.depth/8]
103  # take out target_channel channel data from multi channel data
104  data = array.array(dtype, bytes(msg.data)).tolist()
105  chan_data = data[self.target_channel::self.n_channel]
106  self.buffer += array.array(dtype, chan_data).tostring()
107  overflow = len(self.buffer) - self.buffer_size
108  if overflow > 0:
109  self.buffer = self.buffer[overflow:]
110 
111 
112 class ROSSpeechRecognition(object):
113  def __init__(self):
114  self.default_duration = rospy.get_param("~duration", 10.0)
115  self.engine = None
116  self.recognizer = RecognizerEx() # Use custom Recognizer to support exteded API
117  self.audio = ROSAudio(topic_name=rospy.get_param("~audio_topic", "audio"),
118  depth=rospy.get_param("~depth", 16),
119  n_channel=rospy.get_param("~n_channel", 1),
120  sample_rate=rospy.get_param("~sample_rate", 16000),
121  buffer_size=rospy.get_param("~buffer_size", 10240))
122 
123  # initialize sound play client
124  self.act_sound = actionlib.SimpleActionClient("sound_play", SoundRequestAction)
125  if not self.act_sound.wait_for_server(rospy.Duration(5.0)):
126  rospy.logwarn("Failed to find sound_play action. Disabled audio alert")
127  self.act_sound = None
128  self.signals = {
129  "start": rospy.get_param("~start_signal",
130  "/usr/share/sounds/ubuntu/stereo/bell.ogg"),
131  "recognized": rospy.get_param("~recognized_signal",
132  "/usr/share/sounds/ubuntu/stereo/button-toggle-on.ogg"),
133  "success": rospy.get_param("~success_signal",
134  "/usr/share/sounds/ubuntu/stereo/message-new-instant.ogg"),
135  "timeout": rospy.get_param("~timeout_signal",
136  "/usr/share/sounds/ubuntu/stereo/window-slide.ogg"),
137  }
138 
139  self.dyn_srv = Server(Config, self.config_callback)
140 
141  self.stop_fn = None
142  self.continuous = rospy.get_param("~continuous", False)
143  if self.continuous:
144  rospy.loginfo("Enabled continuous mode")
145  self.pub = rospy.Publisher("/Tablet/voice",
146  SpeechRecognitionCandidates,
147  queue_size=1)
148  else:
149  self.srv = rospy.Service("speech_recognition",
150  SpeechRecognition,
152 
153  def config_callback(self, config, level):
154  # config for engine
155  self.language = config.language
156  if self.engine != config.engine:
157  self.args = {}
158  self.engine = config.engine
159 
160  # config for adaptive thresholding
161  self.dynamic_energy_threshold = config.dynamic_energy_threshold
162  if self.dynamic_energy_threshold:
163  config.energy_threshold = self.recognizer.energy_threshold
164  else:
165  self.recognizer.energy_threshold = config.energy_threshold
166  self.recognizer.dynamic_energy_adjustment_damping = config.dynamic_energy_adjustment_damping
167  self.recognizer.dynamic_energy_ratio = config.dynamic_energy_ratio
168 
169  # config for timeout
170  if config.listen_timeout > 0.0:
171  self.listen_timeout = config.listen_timeout
172  else:
173  self.listen_timeout = None
174  if config.phrase_time_limit > 0.0:
175  self.phrase_time_limit = config.phrase_time_limit
176  else:
177  self.phrase_time_limit = None
178  if config.operation_timeout > 0.0:
179  self.recognizer.operation_timeout = config.operation_timeout
180  else:
181  self.recognizer.operation_timeout = None
182 
183  # config for VAD
184  if config.pause_threshold < config.non_speaking_duration:
185  config.pause_threshold = config.non_speaking_duration
186  self.recognizer.pause_threshold = config.pause_threshold
187  self.recognizer.non_speaking_duration = config.non_speaking_duration
188  self.recognizer.phrase_threshold = config.phrase_threshold
189 
190  return config
191 
192  def play_sound(self, key, timeout=5.0):
193  if self.act_sound is None:
194  return
195  req = SoundRequest()
196  req.sound = SoundRequest.PLAY_FILE
197  req.command = SoundRequest.PLAY_ONCE
198  if hasattr(SoundRequest, 'volume'): # volume is added from 0.3.0 https://github.com/ros-drivers/audio_common/commit/da9623414f381642e52f59701c09928c72a54be7#diff-fe2d85580f1ccfed4e23a608df44a7f7
199  req.volume = 1.0
200  req.arg = self.signals[key]
201  goal = SoundRequestGoal(sound_request=req)
202  self.act_sound.send_goal_and_wait(goal, rospy.Duration(timeout))
203 
204  def recognize(self, audio):
205  recog_func = None
206  if self.engine == Config.SpeechRecognition_Google:
207  if not self.args:
208  self.args = {'key': rospy.get_param("~google_key", None)}
209  recog_func = self.recognizer.recognize_google
210  elif self.engine == Config.SpeechRecognition_GoogleCloud:
211  if not self.args:
212  credentials_path = rospy.get_param("~google_cloud_credentials_json", None)
213  if credentials_path is not None:
214  with open(credentials_path) as j:
215  credentials_json = j.read()
216  else:
217  credentials_json = None
218  self.args = {'credentials_json': credentials_json,
219  'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None)}
220  if rospy.has_param('~diarizationConfig') :
221  self.args.update({'user_config': {'diarizationConfig': rospy.get_param('~diarizationConfig') }})
222  recog_func = self.recognizer.recognize_google_cloud
223  elif self.engine == Config.SpeechRecognition_Sphinx:
224  recog_func = self.recognizer.recognize_sphinx
225  elif self.engine == Config.SpeechRecognition_Wit:
226  recog_func = self.recognizer.recognize_wit
227  elif self.engine == Config.SpeechRecognition_Bing:
228  if not self.args:
229  self.args = {'key': rospy.get_param("~bing_key")}
230  recog_func = self.recognizer.recognize_bing
231  elif self.engine == Config.SpeechRecognition_Houndify:
232  recog_func = self.recognizer.recognize_houndify
233  elif self.engine == Config.SpeechRecognition_IBM:
234  recog_func = self.recognizer.recognize_ibm
235 
236  return recog_func(audio_data=audio, language=self.language, **self.args)
237 
238  def audio_cb(self, _, audio):
239  try:
240  rospy.logdebug("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
241  result = self.recognize(audio)
242  rospy.loginfo("Result: %s" % result.encode('utf-8'))
243  msg = SpeechRecognitionCandidates(transcript=[result])
244  self.pub.publish(msg)
245  except SR.UnknownValueError as e:
246  if self.dynamic_energy_threshold:
247  self.recognizer.adjust_for_ambient_noise(self.audio)
248  rospy.loginfo("Updated energy threshold to %f" % self.recognizer.energy_threshold)
249  except SR.RequestError as e:
250  rospy.logerr("Failed to recognize: %s" % str(e))
251  except SR.UnknownValueError as e:
252  rospy.logerr("Failed to recognize: %s" % str(e))
253  except:
254  rospy.logerr("Unexpected error: %s" % str(sys.exc_info()))
255 
257  if self.dynamic_energy_threshold:
258  with self.audio as src:
259  self.recognizer.adjust_for_ambient_noise(src)
260  rospy.loginfo("Set minimum energy threshold to {}".format(
261  self.recognizer.energy_threshold))
262  self.stop_fn = self.recognizer.listen_in_background(
263  self.audio, self.audio_cb, phrase_time_limit=self.phrase_time_limit)
264  rospy.on_shutdown(self.on_shutdown)
265 
266  def on_shutdown(self):
267  if self.stop_fn is not None:
268  self.stop_fn()
269 
271  res = SpeechRecognitionResponse()
272 
273  duration = req.duration
274  if duration <= 0.0:
275  duration = self.default_duration
276 
277  with self.audio as src:
278  if self.dynamic_energy_threshold:
279  self.recognizer.adjust_for_ambient_noise(src)
280  rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
281 
282  if not req.quiet:
283  self.play_sound("start", 0.1)
284 
285  start_time = rospy.Time.now()
286  while (rospy.Time.now() - start_time).to_sec() < duration:
287  rospy.loginfo("Waiting for speech...")
288  try:
289  audio = self.recognizer.listen(
290  src, timeout=self.listen_timeout, phrase_time_limit=self.phrase_time_limit)
291  except SR.WaitTimeoutError as e:
292  rospy.logwarn(e)
293  break
294  if not req.quiet:
295  self.play_sound("recognized", 0.05)
296  rospy.loginfo("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
297 
298  try:
299  result = self.recognize(audio)
300  rospy.loginfo("Result: %s" % result.encode('utf-8'))
301  if not req.quiet:
302  self.play_sound("success", 0.1)
303  res.result = SpeechRecognitionCandidates(transcript=[result])
304  return res
305  except SR.UnknownValueError:
306  if self.dynamic_energy_threshold:
307  self.recognizer.adjust_for_ambient_noise(src)
308  rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
309  except SR.RequestError as e:
310  rospy.logerr("Failed to recognize: %s" % str(e))
311  rospy.sleep(0.1)
312  if rospy.is_shutdown():
313  break
314 
315  # Timeout
316  if not req.quiet:
317  self.play_sound("timeout", 0.1)
318  return res
319 
320  def spin(self):
321  if self.continuous:
323  rospy.spin()
324 
325 
326 if __name__ == '__main__':
327  rospy.init_node("speech_recognition")
329  rec.spin()
def __exit__(self, exc_type, exc_value, traceback)
def __init__(self, topic_name="audio", depth=16, n_channel=1, sample_rate=16000, chunk_size=1024, buffer_size=10240)
def __init__(self, topic_name, buffer_size=10240, depth=16, n_channel=1, target_channel=0)


ros_speech_recognition
Author(s): Yuki Furuta
autogenerated on Tue May 11 2021 02:55:47