speech_recognition_node.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # -*- coding: utf-8 -*-
00003 # Copyright: Yuki Furuta <furushchev@jsk.imi.i.u-tokyo.ac.jp>
00004 
00005 import actionlib
00006 import rospy
00007 import speech_recognition as SR
00008 from threading import Lock
00009 
00010 from audio_common_msgs.msg import AudioData
00011 from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
00012 
00013 from speech_recognition_msgs.msg import SpeechRecognitionCandidates
00014 from speech_recognition_msgs.srv import SpeechRecognition
00015 from speech_recognition_msgs.srv import SpeechRecognitionResponse
00016 
00017 from dynamic_reconfigure.server import Server
00018 from ros_speech_recognition.cfg import SpeechRecognitionConfig as Config
00019 
00020 
00021 class ROSAudio(SR.AudioSource):
00022     def __init__(self, topic_name="audio", depth=16, sample_rate=16000, chunk_size=1024, buffer_size=10240):
00023         assert buffer_size > chunk_size
00024 
00025         self.topic_name = topic_name
00026         self.buffer_size = buffer_size
00027 
00028         if depth == 8:
00029             self.SAMPLE_WIDTH = 1L
00030         elif depth == 16:
00031             self.SAMPLE_WIDTH = 2L
00032         elif depth == 32:
00033             self.SAMPLE_WIDTH = 4L
00034         else:
00035             raise ValueError("depth must be 8, 16 or 32")
00036 
00037         self.SAMPLE_RATE = sample_rate
00038         self.CHUNK = chunk_size
00039 
00040         self.stream = None
00041 
00042     def __enter__(self):
00043         if self.stream is not None:
00044             self.close()
00045         self.stream = ROSAudio.AudioStream(self.topic_name, self.buffer_size)
00046         return self
00047 
00048     def __exit__(self, exc_type, exc_value, traceback):
00049         self.stream.close()
00050         self.stream = None
00051 
00052     class AudioStream(object):
00053         def __init__(self, topic_name, buffer_size=10240):
00054             self.buffer_size = buffer_size
00055             self.lock = Lock()
00056             self.buffer = bytes()
00057             self.sub_audio = rospy.Subscriber(
00058                 topic_name, AudioData, self.audio_cb)
00059 
00060         def read_once(self, size):
00061             with self.lock:
00062                 buf = self.buffer[:size]
00063                 self.buffer = self.buffer[size:]
00064                 return buf
00065 
00066         def read(self, size):
00067             while not rospy.is_shutdown() and len(self.buffer) < size:
00068                 rospy.sleep(0.001)
00069             return self.read_once(size)
00070 
00071         def close(self):
00072             self.sub_audio.unregister()
00073             self.buffer = bytes()
00074 
00075         def audio_cb(self, msg):
00076             with self.lock:
00077                 self.buffer += bytes(msg.data)
00078                 overflow = len(self.buffer) - self.buffer_size
00079                 if overflow > 0:
00080                     self.buffer = self.buffer[overflow:]
00081 
00082 
00083 class ROSSpeechRecognition(object):
00084     def __init__(self):
00085         self.default_duration = rospy.get_param("~duration", 10.0)
00086         self.engine = None
00087         self.recognizer = SR.Recognizer()
00088         self.audio = ROSAudio(topic_name="audio",
00089                               depth=rospy.get_param("~depth", 16),
00090                               sample_rate=rospy.get_param("~sample_rate", 16000))
00091 
00092         # initialize sound play client
00093         self.act_sound = actionlib.SimpleActionClient("sound_play", SoundRequestAction)
00094         if not self.act_sound.wait_for_server(rospy.Duration(5.0)):
00095             rospy.logwarn("Failed to find sound_play action. Disabled audio alert")
00096             self.act_sound = None
00097         self.signals = {
00098             "start": rospy.get_param("~start_signal",
00099                                      "/usr/share/sounds/ubuntu/stereo/bell.ogg"),
00100             "recognized": rospy.get_param("~recognized_signal",
00101                                           "/usr/share/sounds/ubuntu/stereo/button-toggle-on.ogg"),
00102             "success": rospy.get_param("~success_signal",
00103                                        "/usr/share/sounds/ubuntu/stereo/message-new-instant.ogg"),
00104             "timeout": rospy.get_param("~timeout_signal",
00105                                        "/usr/share/sounds/ubuntu/stereo/window-slide.ogg"),
00106         }
00107 
00108         self.srv = Server(Config, self.config_callback)
00109 
00110         self.initialized = False
00111 
00112         self.srv_speech_recognition = rospy.Service("speech_recognition",
00113                                                     SpeechRecognition,
00114                                                     self.speech_recognition_cb)
00115 
00116     def config_callback(self, config, level):
00117         # config for engine
00118         self.language = config.language
00119         if self.engine != config.engine:
00120             self.args = {}
00121             self.engine = config.engine
00122 
00123         # config for adaptive thresholding
00124         self.dynamic_energy_threshold = config.dynamic_energy_threshold
00125         if self.dynamic_energy_threshold:
00126             config.energy_threshold = self.recognizer.energy_threshold
00127         else:
00128             self.recognizer.energy_threshold = config.energy_threshold
00129         self.recognizer.dynamic_energy_adjustment_damping = config.dynamic_energy_adjustment_damping
00130         self.recognizer.dynamic_energy_ratio = config.dynamic_energy_ratio
00131 
00132         # config for timeout
00133         if config.operation_timeout > 0.0:
00134             self.recognizer.operation_timeout = config.operation_timeout
00135         else:
00136             self.recognizer.operation_timeout = None
00137 
00138         # config for VAD
00139         if config.pause_threshold < config.non_speaking_duration:
00140             config.pause_threshold = config.non_speaking_duration
00141         self.recognizer.pause_threshold = config.pause_threshold
00142         self.recognizer.non_speaking_duration = config.non_speaking_duration
00143         self.recognizer.phrase_threshold = config.phrase_threshold
00144 
00145         return config
00146 
00147     def play_sound(self, key, timeout=5.0):
00148         if self.act_sound is None:
00149             return
00150         req = SoundRequest()
00151         req.sound = SoundRequest.PLAY_FILE
00152         req.command = SoundRequest.PLAY_ONCE
00153         req.arg = self.signals[key]
00154         goal = SoundRequestGoal(sound_request=req)
00155         self.act_sound.send_goal_and_wait(goal, rospy.Duration(timeout))
00156 
00157     def recognize(self, audio):
00158         recog_func = None
00159         if self.engine == Config.SpeechRecognition_Google:
00160             if not self.args:
00161                 self.args = {'key': rospy.get_param("~google_key", None)}
00162             recog_func = self.recognizer.recognize_google
00163         elif self.engine == Config.SpeechRecognition_GoogleCloud:
00164             if not self.args:
00165                 self.args = {'credential_json': rospy.get_param("~google_cloud_credentials_json", None),
00166                              'preferred_phrases': rospy.get_param('~google_cloud_preferred_phrases', None)}
00167             recog_func = self.recognizer.recognize_google_cloud
00168         elif self.engine == Config.SpeechRecognition_Sphinx:
00169             recog_func = self.recognizer.recognize_sphinx
00170         elif self.engine == Config.SpeechRecognition_Wit:
00171             recog_func = self.recognizer.recognize_wit
00172         elif self.engine == Config.SpeechRecognition_Bing:
00173             if not self.args:
00174                 self.args = {'key': rospy.get_param("~bing_key")}
00175             recog_func = self.recognizer.recognize_bing
00176         elif self.engine == Config.SpeechRecognition_Houndify:
00177             recog_func = self.recognizer.recognize_houndify
00178         elif self.engine == Config.SpeechRecognition_IBM:
00179             recog_func = self.recognizer.recognize_ibm
00180 
00181         return recog_func(audio_data=audio, language=self.language, **self.args)
00182 
00183     def speech_recognition_cb(self, req):
00184         res = SpeechRecognitionResponse()
00185 
00186         duration = req.duration
00187         if duration <= 0.0:
00188             duration = self.default_duration
00189 
00190         with self.audio as src:
00191             if self.dynamic_energy_threshold:
00192                 self.recognizer.adjust_for_ambient_noise(src)
00193                 rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
00194 
00195             if not req.quiet:
00196                 self.play_sound("start", 0.1)
00197 
00198             start_time = rospy.Time.now()
00199             while (rospy.Time.now() - start_time).to_sec() < duration:
00200                 rospy.loginfo("Waiting for speech...")
00201                 audio = self.recognizer.listen(src)
00202                 if not req.quiet:
00203                     self.play_sound("recognized", 0.05)
00204                 rospy.loginfo("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data()))
00205 
00206                 try:
00207                     result = self.recognize(audio)
00208                     rospy.loginfo("Result: %s" % result.encode('utf-8'))
00209                     if not req.quiet:
00210                         self.play_sound("success", 0.1)
00211                     res.result = SpeechRecognitionCandidates(transcript=[result])
00212                     return res
00213                 except SR.UnknownValueError:
00214                     if self.dynamic_energy_threshold:
00215                         self.recognizer.adjust_for_ambient_noise(src)
00216                         rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold)
00217                 except SR.RequestError as e:
00218                     rospy.logerr("Failed to recognize: %s" % str(e))
00219                 rospy.sleep(0.1)
00220                 if rospy.is_shutdown():
00221                     break
00222 
00223             # Timeout
00224             if not req.quiet:
00225                 self.play_sound("timeout", 0.1)
00226             return res
00227 
00228 
00229 if __name__ == '__main__':
00230     rospy.init_node("speech_recognition")
00231     rec = ROSSpeechRecognition()
00232     rospy.spin()


ros_speech_recognition
Author(s): Yuki Furuta
autogenerated on Sat Sep 9 2017 02:33:40