respeaker_ros: speech_to_text.py Source File

Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # -*- coding: utf-8 -*-
00003 # Author: Yuki Furuta <furushchev@jsk.imi.i.u-tokyo.ac.jp>
00004 
00005 import actionlib
00006 import rospy
00007 try:
00008     import speech_recognition as SR
00009 except ImportError as e:
00010     raise ImportError(str(e) + '\nplease try "pip install speechrecognition"')
00011 
00012 from actionlib_msgs.msg import GoalStatus, GoalStatusArray
00013 from audio_common_msgs.msg import AudioData
00014 from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
00015 from speech_recognition_msgs.msg import SpeechRecognitionCandidates
00016 
00017 
00018 class SpeechToText(object):
00019     def __init__(self):
00020         # format of input audio data
00021         self.sample_rate = rospy.get_param("~sample_rate", 16000)
00022         self.sample_width = rospy.get_param("~sample_width", 2L)
00023         # language of STT service
00024         self.language = rospy.get_param("~language", "ja-JP")
00025         # ignore voice input while the robot is speaking
00026         self.self_cancellation = rospy.get_param("~self_cancellation", True)
00027         # time to assume as SPEAKING after tts service is finished
00028         self.tts_tolerance = rospy.Duration.from_sec(
00029             rospy.get_param("~tts_tolerance", 1.0))
00030         tts_action_names = rospy.get_param(
00031             '~tts_action_names', ['soundplay'])
00032 
00033         self.recognizer = SR.Recognizer()
00034 
00035         self.tts_action = None
00036         self.last_tts = None
00037         self.is_canceling = False
00038         self.tts_actions = []
00039         if self.self_cancellation:
00040             for tts_action_name in tts_action_names:
00041                 tts_action = actionlib.SimpleActionClient(
00042                     tts_action_name, SoundRequestAction)
00043                 if tts_action.wait_for_server(rospy.Duration(5.0)):
00044                     self.tts_actions.append(tts_action)
00045                 else:
00046                     rospy.logerr(
00047                         "action '{}' is not initialized."
00048                         .format(tts_action_name))
00049                 self.tts_timer = rospy.Timer(rospy.Duration(0.1), self.tts_timer_cb)
00050 
00051         self.pub_speech = rospy.Publisher(
00052             "speech_to_text", SpeechRecognitionCandidates, queue_size=1)
00053         self.sub_audio = rospy.Subscriber("audio", AudioData, self.audio_cb)
00054 
00055     def tts_timer_cb(self, event):
00056         stamp = event.current_real
00057         active = False
00058         for tts_action in self.tts_actions:
00059             for st in tts_action.action_client.last_status_msg.status_list:
00060                 if st.status == GoalStatus.ACTIVE:
00061                     active = True
00062                     break
00063             if active:
00064                 break
00065         if active:
00066             if not self.is_canceling:
00067                 rospy.logdebug("START CANCELLATION")
00068                 self.is_canceling = True
00069                 self.last_tts = None
00070         elif self.is_canceling:
00071             if self.last_tts is None:
00072                 self.last_tts = stamp
00073             if stamp - self.last_tts > self.tts_tolerance:
00074                 rospy.logdebug("END CANCELLATION")
00075                 self.is_canceling = False
00076 
00077     def audio_cb(self, msg):
00078         if self.is_canceling:
00079             rospy.loginfo("Speech is cancelled")
00080             return
00081         data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
00082         try:
00083             rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
00084             result = self.recognizer.recognize_google(
00085                 data, language=self.language)
00086             msg = SpeechRecognitionCandidates(transcript=[result])
00087             self.pub_speech.publish(msg)
00088         except SR.UnknownValueError as e:
00089             rospy.logerr("Failed to recognize: %s" % str(e))
00090         except SR.RequestError as e:
00091             rospy.logerr("Failed to recognize: %s" % str(e))
00092 
00093 
00094 if __name__ == '__main__':
00095     rospy.init_node("speech_to_text")
00096     stt = SpeechToText()
00097     rospy.spin()