00001
00002
00003
00004
00005 import actionlib
00006 import rospy
00007 import speech_recognition as SR
00008 from threading import Lock
00009
00010 from audio_common_msgs.msg import AudioData
00011 from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
00012
00013 from speech_recognition_msgs.msg import SpeechRecognitionCandidates
00014 from speech_recognition_msgs.srv import SpeechRecognition
00015 from speech_recognition_msgs.srv import SpeechRecognitionResponse
00016
00017 from dynamic_reconfigure.server import Server
00018 from ros_speech_recognition.cfg import SpeechRecognitionConfig as Config
00019
00020
00021 class ROSAudio(SR.AudioSource):
00022 def __init__(self, topic_name="audio", depth=16, sample_rate=16000, chunk_size=1024, buffer_size=10240):
00023 assert buffer_size > chunk_size
00024
00025 self.topic_name = topic_name
00026 self.buffer_size = buffer_size
00027
00028 if depth == 8:
00029 self.SAMPLE_WIDTH = 1L
00030 elif depth == 16:
00031 self.SAMPLE_WIDTH = 2L
00032 elif depth == 32:
00033 self.SAMPLE_WIDTH = 4L
00034 else:
00035 raise ValueError("depth must be 8, 16 or 32")
00036
00037 self.SAMPLE_RATE = sample_rate
00038 self.CHUNK = chunk_size
00039
00040 self.stream = None
00041
00042 def __enter__(self):
00043 if self.stream is not None:
00044 self.close()
00045 self.stream = ROSAudio.AudioStream(self.topic_name, self.buffer_size)
00046 return self
00047
00048 def __exit__(self, exc_type, exc_value, traceback):
00049 self.stream.close()
00050 self.stream = None
00051
00052 class AudioStream(object):
00053 def __init__(self, topic_name, buffer_size=10240):
00054 self.buffer_size = buffer_size
00055 self.lock = Lock()
00056 self.buffer = bytes()
00057 self.sub_audio = rospy.Subscriber(
00058 topic_name, AudioData, self.audio_cb)
00059
00060 def read_once(self, size):
00061 with self.lock:
00062 buf = self.buffer[:size]
00063 self.buffer = self.buffer[size:]
00064 return buf
00065
00066 def read(self, size):
00067 while not rospy.is_shutdown() and len(self.buffer) < size:
00068 rospy.sleep(0.001)
00069 return self.read_once(size)
00070
00071 def close(self):
00072 self.sub_audio.unregister()
00073 self.buffer = bytes()
00074
00075 def audio_cb(self, msg):
00076 with self.lock:
00077 self.buffer += bytes(msg.data)
00078 overflow = len(self.buffer) - self.buffer_size
00079 if overflow > 0:
00080 self.buffer = self.buffer[overflow:]
00081
00082
00083 class ROSSpeechRecognition(object):
00084 def __init__(self):
00085 self.default_duration = rospy.get_param("~duration", 10.0)
00086 self.engine = None
00087 self.recognizer = SR.Recognizer()
00088 self.audio = ROSAudio(topic_name="audio",
00089 depth=rospy.get_param("~depth", 16),
00090 sample_rate=rospy.get_param("~sample_rate", 16000))
00091
00092
00093 self.act_sound = actionlib.SimpleActionClient("sound_play", SoundRequestAction)
00094 if not self.act_sound.wait_for_server(rospy.Duration(5.0)):
00095 rospy.logwarn("Failed to find sound_play action. Disabled audio alert")
00096 self.act_sound = None
00097 self.signals = {
00098 "start": rospy.get_param("~start_signal",
00099 "/usr/share/sounds/ubuntu/stereo/bell.ogg"),
00100 "recognized": rospy.get_param("~recognized_signal",
00101 "/usr/share/sounds/ubuntu/stereo/button-toggle-on.ogg"),
00102 "success": rospy.get_param("~success_signal",
00103 "/usr/share/sounds/ubuntu/stereo/message-new-instant.ogg"),
00104 "timeout": rospy.get_param("~timeout_signal",
00105 "/usr/share/sounds/ubuntu/stereo/window-slide.ogg"),
00106 }
00107
00108 self.srv = Server(Config, self.config_callback)
00109
00110 self.initialized = False
00111
00112 self.srv_speech_recognition = rospy.Service("speech_recognition",
00113 SpeechRecognition,
00114 self.speech_recognition_cb)
00115
00116 def config_callback(self, config, level):
00117
00118 self.language = config.language
00119 if self.engine != config.engine:
00120 self.args = {}
00121 self.engine = config.engine
00122
00123
00124 self.dynamic_energy_threshold = config.dynamic_energy_threshold
00125 if self.dynamic_energy_threshold:
00126 config.energy_threshold = self.recognizer.energy_threshold
00127 else:
00128 self.recognizer.energy_threshold = config.energy_threshold
00129 self.recognizer.dynamic_energy_adjustment_damping = config.dynamic_energy_adjustment_damping
00130 self.recognizer.dynamic_energy_ratio = config.dynamic_energy_ratio
00131
00132
00133 if config.operation_timeout > 0.0:
00134 self.recognizer.operation_timeout = config.operation_timeout
00135 else:
00136 self.recognizer.operation_timeout = None
00137
00138
00139 if config.pause_threshold < config.non_speaking_duration:
00140 config.pause_threshold = config.non_speaking_duration
00141 self.recognizer.pause_threshold = config.pause_threshold
00142 self.recognizer.non_speaking_duration = config.non_speaking_duration
00143 self.recognizer.phrase_threshold = config.phrase_threshold
00144
00145 return config
00146
00147 def play_sound(self, key, timeout=5.0):
00148 if self.act_sound is None:
00149 return
00150 req = SoundRequest()
00151 req.sound = SoundRequest.PLAY_FILE
00152 req.command = SoundRequest.PLAY_ONCE
00153 req.arg = self.signals[key]
00154 goal = SoundRequestGoal(sound_request=req)
00155 self.act_sound.send_goal_and_wait(goal, rospy.Duration(timeout))
00156
00157 def recognize(self, audio):
00158 recog_func = None
00159 if self.engine == Config.SpeechRecognition_Google:
00160 if not self.args:
00161 self.args = {'key': rospy.get_param("~google_key", None)}
00162 recog_func = self.recognizer.recognize_google
00163 elif self.engine == Config.SpeechRecognition_GoogleCloud:
00164 if not self.args:
00165 self.args = {'credential_json': rospy.get_param("~google_cloud_credentials_json", None),
00166 'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None)}
00167 recog_func = self.recognizer.recognize_google_cloud
00168 elif self.engine == Config.SpeechRecognition_Sphinx:
00169 recog_func = self.recognizer.recognize_sphinx
00170 elif self.engine == Config.SpeechRecognition_Wit:
00171 recog_func = self.recognizer.recognize_wit
00172 elif self.engine == Config.SpeechRecognition_Bing:
00173 if not self.args:
00174 self.args = {'key': rospy.get_param("~bing_key")}
00175 recog_func = self.recognizer.recognize_bing
00176 elif self.engine == Config.SpeechRecognition_Houndify:
00177 recog_func = self.recognizer.recognize_houndify
00178 elif self.engine == Config.SpeechRecognition_IBM:
00179 recog_func = self.recognizer.recognize_ibm
00180
00181 return recog_func(audio_data=audio, language=self.language, **self.args)
00182
00183 def speech_recognition_cb(self, req):
00184 res = SpeechRecognitionResponse()
00185
00186 duration = req.duration
00187 if duration <= 0.0:
00188 duration = self.default_duration
00189
00190 with self.audio as src:
00191 if self.dynamic_energy_threshold:
00192 self.recognizer.adjust_for_ambient_noise(src)
00193 rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
00194
00195 if not req.quiet:
00196 self.play_sound("start", 0.1)
00197
00198 start_time = rospy.Time.now()
00199 while (rospy.Time.now() - start_time).to_sec() < duration:
00200 rospy.loginfo("Waiting for speech...")
00201 audio = self.recognizer.listen(src)
00202 if not req.quiet:
00203 self.play_sound("recognized", 0.05)
00204 rospy.loginfo("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
00205
00206 try:
00207 result = self.recognize(audio)
00208 rospy.loginfo("Result: %s" % result.encode('utf-8'))
00209 if not req.quiet:
00210 self.play_sound("success", 0.1)
00211 res.result = SpeechRecognitionCandidates(transcript=[result])
00212 return res
00213 except SR.UnknownValueError:
00214 if self.dynamic_energy_threshold:
00215 self.recognizer.adjust_for_ambient_noise(src)
00216 rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
00217 except SR.RequestError as e:
00218 rospy.logerr("Failed to recognize: %s" % str(e))
00219 rospy.sleep(0.1)
00220 if rospy.is_shutdown():
00221 break
00222
00223
00224 if not req.quiet:
00225 self.play_sound("timeout", 0.1)
00226 return res
00227
00228
00229 if __name__ == '__main__':
00230 rospy.init_node("speech_recognition")
00231 rec = ROSSpeechRecognition()
00232 rospy.spin()