speech_recognition_node.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Copyright: Yuki Furuta <furushchev@jsk.imi.i.u-tokyo.ac.jp>
4 
5 import actionlib
6 import rospy
7 import speech_recognition as SR
8 import json
9 from threading import Lock
10 
11 from audio_common_msgs.msg import AudioData
12 from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
13 
14 from speech_recognition_msgs.msg import SpeechRecognitionCandidates
15 from speech_recognition_msgs.srv import SpeechRecognition
16 from speech_recognition_msgs.srv import SpeechRecognitionResponse
17 
18 from dynamic_reconfigure.server import Server
19 from ros_speech_recognition.cfg import SpeechRecognitionConfig as Config
20 
21 
22 class ROSAudio(SR.AudioSource):
23  def __init__(self, topic_name="audio", depth=16, sample_rate=16000, chunk_size=1024, buffer_size=10240):
24  assert buffer_size > chunk_size
25 
26  self.topic_name = topic_name
27  self.buffer_size = buffer_size
28 
29  if depth == 8:
30  self.SAMPLE_WIDTH = 1L
31  elif depth == 16:
32  self.SAMPLE_WIDTH = 2L
33  elif depth == 32:
34  self.SAMPLE_WIDTH = 4L
35  else:
36  raise ValueError("depth must be 8, 16 or 32")
37 
38  self.SAMPLE_RATE = sample_rate
39  self.CHUNK = chunk_size
40 
41  self.stream = None
42 
43  def open(self):
44  if self.stream is not None:
45  self.stream.close()
46  self.stream = None
48  return self
49 
50  def close(self):
51  self.stream.close()
52  self.stream = None
53 
54  def __enter__(self):
55  return self.open()
56 
57  def __exit__(self, exc_type, exc_value, traceback):
58  self.close()
59 
60  class AudioStream(object):
61  def __init__(self, topic_name, buffer_size=10240):
62  self.buffer_size = buffer_size
63  self.lock = Lock()
64  self.buffer = bytes()
65  self.sub_audio = rospy.Subscriber(
66  topic_name, AudioData, self.audio_cb)
67 
68  def read_once(self, size):
69  with self.lock:
70  buf = self.buffer[:size]
71  self.buffer = self.buffer[size:]
72  return buf
73 
74  def read(self, size):
75  while not rospy.is_shutdown() and len(self.buffer) < size:
76  rospy.sleep(0.001)
77  return self.read_once(size)
78 
79  def close(self):
80  try:
81  self.sub_audio.unregister()
82  except:
83  pass
84  self.buffer = bytes()
85 
86  def audio_cb(self, msg):
87  with self.lock:
88  self.buffer += bytes(msg.data)
89  overflow = len(self.buffer) - self.buffer_size
90  if overflow > 0:
91  self.buffer = self.buffer[overflow:]
92 
93 
94 class ROSSpeechRecognition(object):
95  def __init__(self):
96  self.default_duration = rospy.get_param("~duration", 10.0)
97  self.engine = None
98  self.recognizer = SR.Recognizer()
99  self.audio = ROSAudio(topic_name="audio",
100  depth=rospy.get_param("~depth", 16),
101  sample_rate=rospy.get_param("~sample_rate", 16000))
102 
103  # initialize sound play client
104  self.act_sound = actionlib.SimpleActionClient("sound_play", SoundRequestAction)
105  if not self.act_sound.wait_for_server(rospy.Duration(5.0)):
106  rospy.logwarn("Failed to find sound_play action. Disabled audio alert")
107  self.act_sound = None
108  self.signals = {
109  "start": rospy.get_param("~start_signal",
110  "/usr/share/sounds/ubuntu/stereo/bell.ogg"),
111  "recognized": rospy.get_param("~recognized_signal",
112  "/usr/share/sounds/ubuntu/stereo/button-toggle-on.ogg"),
113  "success": rospy.get_param("~success_signal",
114  "/usr/share/sounds/ubuntu/stereo/message-new-instant.ogg"),
115  "timeout": rospy.get_param("~timeout_signal",
116  "/usr/share/sounds/ubuntu/stereo/window-slide.ogg"),
117  }
118 
119  self.dyn_srv = Server(Config, self.config_callback)
120 
121  self.stop_fn = None
122  self.continuous = rospy.get_param("~continuous", False)
123  if self.continuous:
124  rospy.loginfo("Enabled continuous mode")
125  self.pub = rospy.Publisher("/Tablet/voice",
126  SpeechRecognitionCandidates,
127  queue_size=1)
128  else:
129  self.srv = rospy.Service("speech_recognition",
130  SpeechRecognition,
132 
133  def config_callback(self, config, level):
134  # config for engine
135  self.language = config.language
136  if self.engine != config.engine:
137  self.args = {}
138  self.engine = config.engine
139 
140  # config for adaptive thresholding
141  self.dynamic_energy_threshold = config.dynamic_energy_threshold
142  if self.dynamic_energy_threshold:
143  config.energy_threshold = self.recognizer.energy_threshold
144  else:
145  self.recognizer.energy_threshold = config.energy_threshold
146  self.recognizer.dynamic_energy_adjustment_damping = config.dynamic_energy_adjustment_damping
147  self.recognizer.dynamic_energy_ratio = config.dynamic_energy_ratio
148 
149  # config for timeout
150  if config.listen_timeout > 0.0:
151  self.listen_timeout = config.listen_timeout
152  else:
153  self.listen_timeout = None
154  if config.phrase_time_limit > 0.0:
155  self.phrase_time_limit = config.phrase_time_limit
156  else:
157  self.phrase_time_limit = None
158  if config.operation_timeout > 0.0:
159  self.recognizer.operation_timeout = config.operation_timeout
160  else:
161  self.recognizer.operation_timeout = None
162 
163  # config for VAD
164  if config.pause_threshold < config.non_speaking_duration:
165  config.pause_threshold = config.non_speaking_duration
166  self.recognizer.pause_threshold = config.pause_threshold
167  self.recognizer.non_speaking_duration = config.non_speaking_duration
168  self.recognizer.phrase_threshold = config.phrase_threshold
169 
170  return config
171 
172  def play_sound(self, key, timeout=5.0):
173  if self.act_sound is None:
174  return
175  req = SoundRequest()
176  req.sound = SoundRequest.PLAY_FILE
177  req.command = SoundRequest.PLAY_ONCE
178  req.arg = self.signals[key]
179  goal = SoundRequestGoal(sound_request=req)
180  self.act_sound.send_goal_and_wait(goal, rospy.Duration(timeout))
181 
182  def recognize(self, audio):
183  recog_func = None
184  if self.engine == Config.SpeechRecognition_Google:
185  if not self.args:
186  self.args = {'key': rospy.get_param("~google_key", None)}
187  recog_func = self.recognizer.recognize_google
188  elif self.engine == Config.SpeechRecognition_GoogleCloud:
189  if not self.args:
190  credentials_path = rospy.get_param("~google_cloud_credentials_json", None)
191  if credentials_path is not None:
192  with open(credentials_path) as j:
193  credentials_json = j.read()
194  else:
195  credentials_json = None
196  self.args = {'credentials_json': credentials_json,
197  'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None)}
198  recog_func = self.recognizer.recognize_google_cloud
199  elif self.engine == Config.SpeechRecognition_Sphinx:
200  recog_func = self.recognizer.recognize_sphinx
201  elif self.engine == Config.SpeechRecognition_Wit:
202  recog_func = self.recognizer.recognize_wit
203  elif self.engine == Config.SpeechRecognition_Bing:
204  if not self.args:
205  self.args = {'key': rospy.get_param("~bing_key")}
206  recog_func = self.recognizer.recognize_bing
207  elif self.engine == Config.SpeechRecognition_Houndify:
208  recog_func = self.recognizer.recognize_houndify
209  elif self.engine == Config.SpeechRecognition_IBM:
210  recog_func = self.recognizer.recognize_ibm
211 
212  return recog_func(audio_data=audio, language=self.language, **self.args)
213 
214  def audio_cb(self, _, audio):
215  try:
216  rospy.logdebug("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
217  result = self.recognize(audio)
218  rospy.loginfo("Result: %s" % result.encode('utf-8'))
219  msg = SpeechRecognitionCandidates(transcript=[result])
220  self.pub.publish(msg)
221  except SR.UnknownValueError as e:
222  if self.dynamic_energy_threshold:
223  self.recognizer.adjust_for_ambient_noise(self.audio)
224  rospy.loginfo("Updated energy threshold to %f" % self.recognizer.energy_threshold)
225  except SR.RequestError as e:
226  rospy.logerr("Failed to recognize: %s" % str(e))
227 
229  if self.dynamic_energy_threshold:
230  with self.audio as src:
231  self.recognizer.adjust_for_ambient_noise(src)
232  rospy.loginfo("Set minimum energy threshold to {}".format(
233  self.recognizer.energy_threshold))
234  self.stop_fn = self.recognizer.listen_in_background(
235  self.audio, self.audio_cb, phrase_time_limit=self.phrase_time_limit)
236  rospy.on_shutdown(self.on_shutdown)
237 
238  def on_shutdown(self):
239  if self.stop_fn is not None:
240  self.stop_fn()
241 
243  res = SpeechRecognitionResponse()
244 
245  duration = req.duration
246  if duration <= 0.0:
247  duration = self.default_duration
248 
249  with self.audio as src:
250  if self.dynamic_energy_threshold:
251  self.recognizer.adjust_for_ambient_noise(src)
252  rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
253 
254  if not req.quiet:
255  self.play_sound("start", 0.1)
256 
257  start_time = rospy.Time.now()
258  while (rospy.Time.now() - start_time).to_sec() < duration:
259  rospy.loginfo("Waiting for speech...")
260  try:
261  audio = self.recognizer.listen(
262  src, timeout=self.listen_timeout, phrase_time_limit=self.phrase_time_limit)
263  except SR.WaitTimeoutError as e:
264  rospy.logwarn(e)
265  break
266  if not req.quiet:
267  self.play_sound("recognized", 0.05)
268  rospy.loginfo("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
269 
270  try:
271  result = self.recognize(audio)
272  rospy.loginfo("Result: %s" % result.encode('utf-8'))
273  if not req.quiet:
274  self.play_sound("success", 0.1)
275  res.result = SpeechRecognitionCandidates(transcript=[result])
276  return res
277  except SR.UnknownValueError:
278  if self.dynamic_energy_threshold:
279  self.recognizer.adjust_for_ambient_noise(src)
280  rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
281  except SR.RequestError as e:
282  rospy.logerr("Failed to recognize: %s" % str(e))
283  rospy.sleep(0.1)
284  if rospy.is_shutdown():
285  break
286 
287  # Timeout
288  if not req.quiet:
289  self.play_sound("timeout", 0.1)
290  return res
291 
292  def spin(self):
293  if self.continuous:
295  rospy.spin()
296 
297 
298 if __name__ == '__main__':
299  rospy.init_node("speech_recognition")
301  rec.spin()
def __exit__(self, exc_type, exc_value, traceback)
def __init__(self, topic_name, buffer_size=10240)
def __init__(self, topic_name="audio", depth=16, sample_rate=16000, chunk_size=1024, buffer_size=10240)


ros_speech_recognition
Author(s): Yuki Furuta
autogenerated on Wed Jul 10 2019 03:47:14