00001
00002
00003
00004
00005 import actionlib
00006 import rospy
00007 import speech_recognition as SR
00008 import json
00009 from threading import Lock
00010
00011 from audio_common_msgs.msg import AudioData
00012 from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
00013
00014 from speech_recognition_msgs.msg import SpeechRecognitionCandidates
00015 from speech_recognition_msgs.srv import SpeechRecognition
00016 from speech_recognition_msgs.srv import SpeechRecognitionResponse
00017
00018 from dynamic_reconfigure.server import Server
00019 from ros_speech_recognition.cfg import SpeechRecognitionConfig as Config
00020
00021
00022 class ROSAudio(SR.AudioSource):
00023 def __init__(self, topic_name="audio", depth=16, sample_rate=16000, chunk_size=1024, buffer_size=10240):
00024 assert buffer_size > chunk_size
00025
00026 self.topic_name = topic_name
00027 self.buffer_size = buffer_size
00028
00029 if depth == 8:
00030 self.SAMPLE_WIDTH = 1L
00031 elif depth == 16:
00032 self.SAMPLE_WIDTH = 2L
00033 elif depth == 32:
00034 self.SAMPLE_WIDTH = 4L
00035 else:
00036 raise ValueError("depth must be 8, 16 or 32")
00037
00038 self.SAMPLE_RATE = sample_rate
00039 self.CHUNK = chunk_size
00040
00041 self.stream = None
00042
00043 def open(self):
00044 if self.stream is not None:
00045 self.stream.close()
00046 self.stream = None
00047 self.stream = ROSAudio.AudioStream(self.topic_name, self.buffer_size)
00048 return self
00049
00050 def close(self):
00051 self.stream.close()
00052 self.stream = None
00053
00054 def __enter__(self):
00055 return self.open()
00056
00057 def __exit__(self, exc_type, exc_value, traceback):
00058 self.close()
00059
00060 class AudioStream(object):
00061 def __init__(self, topic_name, buffer_size=10240):
00062 self.buffer_size = buffer_size
00063 self.lock = Lock()
00064 self.buffer = bytes()
00065 self.sub_audio = rospy.Subscriber(
00066 topic_name, AudioData, self.audio_cb)
00067
00068 def read_once(self, size):
00069 with self.lock:
00070 buf = self.buffer[:size]
00071 self.buffer = self.buffer[size:]
00072 return buf
00073
00074 def read(self, size):
00075 while not rospy.is_shutdown() and len(self.buffer) < size:
00076 rospy.sleep(0.001)
00077 return self.read_once(size)
00078
00079 def close(self):
00080 try:
00081 self.sub_audio.unregister()
00082 except:
00083 pass
00084 self.buffer = bytes()
00085
00086 def audio_cb(self, msg):
00087 with self.lock:
00088 self.buffer += bytes(msg.data)
00089 overflow = len(self.buffer) - self.buffer_size
00090 if overflow > 0:
00091 self.buffer = self.buffer[overflow:]
00092
00093
00094 class ROSSpeechRecognition(object):
00095 def __init__(self):
00096 self.default_duration = rospy.get_param("~duration", 10.0)
00097 self.engine = None
00098 self.recognizer = SR.Recognizer()
00099 self.audio = ROSAudio(topic_name="audio",
00100 depth=rospy.get_param("~depth", 16),
00101 sample_rate=rospy.get_param("~sample_rate", 16000))
00102
00103
00104 self.act_sound = actionlib.SimpleActionClient("sound_play", SoundRequestAction)
00105 if not self.act_sound.wait_for_server(rospy.Duration(5.0)):
00106 rospy.logwarn("Failed to find sound_play action. Disabled audio alert")
00107 self.act_sound = None
00108 self.signals = {
00109 "start": rospy.get_param("~start_signal",
00110 "/usr/share/sounds/ubuntu/stereo/bell.ogg"),
00111 "recognized": rospy.get_param("~recognized_signal",
00112 "/usr/share/sounds/ubuntu/stereo/button-toggle-on.ogg"),
00113 "success": rospy.get_param("~success_signal",
00114 "/usr/share/sounds/ubuntu/stereo/message-new-instant.ogg"),
00115 "timeout": rospy.get_param("~timeout_signal",
00116 "/usr/share/sounds/ubuntu/stereo/window-slide.ogg"),
00117 }
00118
00119 self.dyn_srv = Server(Config, self.config_callback)
00120
00121 self.stop_fn = None
00122 self.continuous = rospy.get_param("~continuous", False)
00123 if self.continuous:
00124 rospy.loginfo("Enabled continuous mode")
00125 self.pub = rospy.Publisher("/Tablet/voice",
00126 SpeechRecognitionCandidates,
00127 queue_size=1)
00128 else:
00129 self.srv = rospy.Service("speech_recognition",
00130 SpeechRecognition,
00131 self.speech_recognition_srv_cb)
00132
00133 def config_callback(self, config, level):
00134
00135 self.language = config.language
00136 if self.engine != config.engine:
00137 self.args = {}
00138 self.engine = config.engine
00139
00140
00141 self.dynamic_energy_threshold = config.dynamic_energy_threshold
00142 if self.dynamic_energy_threshold:
00143 config.energy_threshold = self.recognizer.energy_threshold
00144 else:
00145 self.recognizer.energy_threshold = config.energy_threshold
00146 self.recognizer.dynamic_energy_adjustment_damping = config.dynamic_energy_adjustment_damping
00147 self.recognizer.dynamic_energy_ratio = config.dynamic_energy_ratio
00148
00149
00150 if config.listen_timeout > 0.0:
00151 self.listen_timeout = config.listen_timeout
00152 else:
00153 self.listen_timeout = None
00154 if config.phrase_time_limit > 0.0:
00155 self.phrase_time_limit = config.phrase_time_limit
00156 else:
00157 self.phrase_time_limit = None
00158 if config.operation_timeout > 0.0:
00159 self.recognizer.operation_timeout = config.operation_timeout
00160 else:
00161 self.recognizer.operation_timeout = None
00162
00163
00164 if config.pause_threshold < config.non_speaking_duration:
00165 config.pause_threshold = config.non_speaking_duration
00166 self.recognizer.pause_threshold = config.pause_threshold
00167 self.recognizer.non_speaking_duration = config.non_speaking_duration
00168 self.recognizer.phrase_threshold = config.phrase_threshold
00169
00170 return config
00171
00172 def play_sound(self, key, timeout=5.0):
00173 if self.act_sound is None:
00174 return
00175 req = SoundRequest()
00176 req.sound = SoundRequest.PLAY_FILE
00177 req.command = SoundRequest.PLAY_ONCE
00178 req.arg = self.signals[key]
00179 goal = SoundRequestGoal(sound_request=req)
00180 self.act_sound.send_goal_and_wait(goal, rospy.Duration(timeout))
00181
00182 def recognize(self, audio):
00183 recog_func = None
00184 if self.engine == Config.SpeechRecognition_Google:
00185 if not self.args:
00186 self.args = {'key': rospy.get_param("~google_key", None)}
00187 recog_func = self.recognizer.recognize_google
00188 elif self.engine == Config.SpeechRecognition_GoogleCloud:
00189 if not self.args:
00190 credentials_path = rospy.get_param("~google_cloud_credentials_json", None)
00191 if credentials_path is not None:
00192 with open(credentials_path) as j:
00193 credentials_json = j.read()
00194 else:
00195 credentials_json = None
00196 self.args = {'credentials_json': credentials_json,
00197 'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None)}
00198 recog_func = self.recognizer.recognize_google_cloud
00199 elif self.engine == Config.SpeechRecognition_Sphinx:
00200 recog_func = self.recognizer.recognize_sphinx
00201 elif self.engine == Config.SpeechRecognition_Wit:
00202 recog_func = self.recognizer.recognize_wit
00203 elif self.engine == Config.SpeechRecognition_Bing:
00204 if not self.args:
00205 self.args = {'key': rospy.get_param("~bing_key")}
00206 recog_func = self.recognizer.recognize_bing
00207 elif self.engine == Config.SpeechRecognition_Houndify:
00208 recog_func = self.recognizer.recognize_houndify
00209 elif self.engine == Config.SpeechRecognition_IBM:
00210 recog_func = self.recognizer.recognize_ibm
00211
00212 return recog_func(audio_data=audio, language=self.language, **self.args)
00213
00214 def audio_cb(self, _, audio):
00215 try:
00216 rospy.logdebug("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
00217 result = self.recognize(audio)
00218 rospy.loginfo("Result: %s" % result.encode('utf-8'))
00219 msg = SpeechRecognitionCandidates(transcript=[result])
00220 self.pub.publish(msg)
00221 except SR.UnknownValueError as e:
00222 if self.dynamic_energy_threshold:
00223 self.recognizer.adjust_for_ambient_noise(self.audio)
00224 rospy.loginfo("Updated energy threshold to %f" % self.recognizer.energy_threshold)
00225 except SR.RequestError as e:
00226 rospy.logerr("Failed to recognize: %s" % str(e))
00227
00228 def start_speech_recognition(self):
00229 if self.dynamic_energy_threshold:
00230 with self.audio as src:
00231 self.recognizer.adjust_for_ambient_noise(src)
00232 rospy.loginfo("Set minimum energy threshold to {}".format(
00233 self.recognizer.energy_threshold))
00234 self.stop_fn = self.recognizer.listen_in_background(
00235 self.audio, self.audio_cb, phrase_time_limit=self.phrase_time_limit)
00236 rospy.on_shutdown(self.on_shutdown)
00237
00238 def on_shutdown(self):
00239 if self.stop_fn is not None:
00240 self.stop_fn()
00241
00242 def speech_recognition_srv_cb(self, req):
00243 res = SpeechRecognitionResponse()
00244
00245 duration = req.duration
00246 if duration <= 0.0:
00247 duration = self.default_duration
00248
00249 with self.audio as src:
00250 if self.dynamic_energy_threshold:
00251 self.recognizer.adjust_for_ambient_noise(src)
00252 rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
00253
00254 if not req.quiet:
00255 self.play_sound("start", 0.1)
00256
00257 start_time = rospy.Time.now()
00258 while (rospy.Time.now() - start_time).to_sec() < duration:
00259 rospy.loginfo("Waiting for speech...")
00260 try:
00261 audio = self.recognizer.listen(
00262 src, timeout=self.listen_timeout, phrase_time_limit=self.phrase_time_limit)
00263 except SR.WaitTimeoutError as e:
00264 rospy.logwarn(e)
00265 break
00266 if not req.quiet:
00267 self.play_sound("recognized", 0.05)
00268 rospy.loginfo("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
00269
00270 try:
00271 result = self.recognize(audio)
00272 rospy.loginfo("Result: %s" % result.encode('utf-8'))
00273 if not req.quiet:
00274 self.play_sound("success", 0.1)
00275 res.result = SpeechRecognitionCandidates(transcript=[result])
00276 return res
00277 except SR.UnknownValueError:
00278 if self.dynamic_energy_threshold:
00279 self.recognizer.adjust_for_ambient_noise(src)
00280 rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
00281 except SR.RequestError as e:
00282 rospy.logerr("Failed to recognize: %s" % str(e))
00283 rospy.sleep(0.1)
00284 if rospy.is_shutdown():
00285 break
00286
00287
00288 if not req.quiet:
00289 self.play_sound("timeout", 0.1)
00290 return res
00291
00292 def spin(self):
00293 if self.continuous:
00294 self.start_speech_recognition()
00295 rospy.spin()
00296
00297
00298 if __name__ == '__main__':
00299 rospy.init_node("speech_recognition")
00300 rec = ROSSpeechRecognition()
00301 rec.spin()