julius_ros: julius_client.py Source File

Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # -*- coding: utf-8 -*-
00003 # Copyright: Yuki Furuta <furushchev@jsk.imi.i.u-tokyo.ac.jp>
00004 
00005 import actionlib
00006 from threading import Lock
00007 import os
00008 import rospy
00009 from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
00010 from julius_ros.utils import make_phonemes_from_words
00011 from julius_ros.utils import make_grammar_from_rules
00012 from julius_ros.utils import make_voca_from_categories
00013 from julius_ros.utils import make_dfa
00014 from julius_ros.audio_transport import AudioTransport
00015 from julius_ros.module_client import ModuleClient
00016 from speech_recognition_msgs.msg import Grammar
00017 from speech_recognition_msgs.msg import SpeechRecognitionCandidates
00018 from speech_recognition_msgs.msg import Vocabulary
00019 from speech_recognition_msgs.srv import SpeechRecognition
00020 from speech_recognition_msgs.srv import SpeechRecognitionResponse
00021 from std_srvs.srv import Empty, EmptyResponse
00022 import lxml.etree
00023 
00024 
00025 class JuliusClient(object):
00026     start_signal = "/usr/share/sounds/ubuntu/stereo/bell.ogg"
00027     success_signal = "/usr/share/sounds/ubuntu/stereo/message-new-instant.ogg"
00028     timeout_signal = "/usr/share/sounds/ubuntu/stereo/window-slide.ogg"
00029 
00030     def __init__(self):
00031         # load parameters
00032         self.encoding = rospy.get_param("~encoding", "utf-8")
00033         self.default_duration = rospy.get_param("~duration", 3.0)
00034         self.default_threshold = rospy.get_param("~threshold", 0.8)
00035         self.use_isolated_word = rospy.get_param("~use_isolated_word", True)
00036         self.start_signal_action_timeout = rospy.get_param("~start_signal_action_timeout", 0.3)
00037 
00038         self.pub_speech_recognition = rospy.Publisher("speech_to_text",
00039                                                       SpeechRecognitionCandidates,
00040                                                       queue_size=1)
00041 
00042         # initialize sound play
00043         self.act_sound = actionlib.SimpleActionClient("sound_play", SoundRequestAction)
00044         if not self.act_sound.wait_for_server(rospy.Duration(5.0)):
00045             rospy.logwarn("Failed to find sound_play action. Disabled audio alert")
00046             self.act_sound = None
00047 
00048         # setup julius
00049         host = rospy.get_param("~host", "localhost")
00050         module_port = rospy.get_param("~module_port", 10500)
00051         audio_port = rospy.get_param("~audio_port", 10501)
00052         max_retry = rospy.get_param("~max_connection_retry", 0)
00053 
00054         self.module = ModuleClient(host, module_port, max_retry, self.encoding)
00055         self.audio = AudioTransport(host, audio_port, max_retry, "audio")
00056 
00057         rospy.on_shutdown(self.shutdown_cb)
00058         self.module.on_received_data(self.julius_cb)
00059 
00060         self.module.start()
00061         self.audio.start()
00062 
00063         self.status()
00064         self.start()
00065 
00066         # start subscribe
00067         self.lock = Lock()
00068         self.last_speech = SpeechRecognitionCandidates()
00069         self.vocabularies = {}
00070 
00071         self.srv_show_status = rospy.Service("show_julius_status",
00072                                              Empty, self.status)
00073 
00074         if self.use_isolated_word:
00075             self.sub_vocabulary = rospy.Subscriber("vocabulary", Vocabulary,
00076                                                    self.vocabulary_cb)
00077         else:
00078             self.sub_grammar = rospy.Subscriber("grammar", Grammar,
00079                                                 self.grammar_cb)
00080         self.srv_speech_recognition = rospy.Service("speech_recognition",
00081                                                     SpeechRecognition,
00082                                                     self.speech_recognition_cb)
00083 
00084     def start(self):
00085         self.grammar_changed = None
00086         self.module.send_command(["INPUTONCHANGE", "PAUSE"])
00087 
00088     def status(self, args=None):
00089         self.module.send_command(["VERSION"])
00090         self.module.send_command(["STATUS"])
00091         self.module.send_command(["GRAMINFO"])
00092         return EmptyResponse()
00093 
00094     def play_sound(self, path, timeout=5.0):
00095         if self.act_sound is None:
00096             return
00097         req = SoundRequest()
00098         req.sound = SoundRequest.PLAY_FILE
00099         req.command = SoundRequest.PLAY_ONCE
00100         req.arg = path
00101         goal = SoundRequestGoal(sound_request=req)
00102         self.act_sound.send_goal_and_wait(goal, rospy.Duration(timeout))
00103 
00104     def send_grammar_cmd(self, cmd):
00105         self.grammar_changed = None
00106         self.module.send_command(cmd)
00107         while self.grammar_changed is None:
00108             rospy.sleep(0.01)
00109         return self.grammar_changed
00110 
00111     def activate_gram(self, name):
00112         with self.lock:
00113             cmd = ["ACTIVATEGRAM", name]
00114             self.module.send_command(cmd)
00115             ok = True
00116             if ok:
00117                 rospy.loginfo("Successfully activated grammar")
00118             else:
00119                 rospy.logerr("Failed to activate grammar")
00120             return ok
00121 
00122     def deactivate_gram(self, name):
00123         with self.lock:
00124             cmd = ["DEACTIVATEGRAM", name]
00125             rospy.loginfo("Deactivating %s" % name)
00126             self.module.send_command(cmd)
00127             ok = True
00128             if ok:
00129                 rospy.loginfo("Successfully deactivated grammar")
00130             else:
00131                 rospy.logerr("Failed to deactivate grammar")
00132             return ok
00133 
00134     def do_gram(self, cmd_name, name, dfa, dic):
00135         with self.lock:
00136             assert cmd_name
00137             assert name
00138             assert dic
00139             cmd = ["%s %s" % (cmd_name, name)]
00140             if dfa is not None:
00141                 cmd += [' ' + l.strip() for l in dfa if l.strip()]
00142                 cmd += ["DFAEND"]
00143             cmd += [' ' + l.strip() for l in dic if l.strip()]
00144             cmd += ["DICEND"]
00145             ok = self.send_grammar_cmd(cmd)
00146             if ok:
00147                 rospy.loginfo("%s Success" % cmd_name)
00148             else:
00149                 rospy.logerr("%s Failed" % cmd_name)
00150             return ok
00151 
00152     def add_gram(self, name, dfa, dic):
00153         ok = self.do_gram("ADDGRAM", name, dfa, dic)
00154         ok &= self.deactivate_gram(name)
00155         return ok
00156 
00157     def change_gram(self, name, dfa, dic):
00158         return self.do_gram("CHANGEGRAM", name, dfa, dic)
00159 
00160     def grammar_cb(self, msg):
00161         if msg.name:
00162             name = msg.name
00163         else:
00164             rospy.logwarn("Name of grammar is empty. Use 'unknown'")
00165             name = 'unknown'
00166         grammar = make_grammar_from_rules(msg.rules)
00167         voca = make_voca_from_categories(msg.categories, msg.vocabularies)
00168         result = make_dfa(grammar, voca)
00169         if result is None:
00170             rospy.logerr("Failed to make dfa from grammar message")
00171             return
00172         dfa, dic = result
00173         ok = self.add_gram(name, dfa.split(os.linesep), dic.split(os.linesep))
00174         if ok:
00175             self.vocabularies[name] = list(set(e for v in msg.vocabularies for e in v.words))
00176         else:
00177             rospy.logerr("Failed to change vocabulary")
00178 
00179     def vocabulary_cb(self, msg):
00180         if msg.name:
00181             name = msg.name
00182         else:
00183             rospy.logwarn("Name of grammar is empty. Use 'unknown'")
00184             name = 'unknown'
00185         if len(msg.phonemes) == 0 or not msg.phonemes[0]:
00186             phonemes = make_phonemes_from_words(msg.words)
00187         else:
00188             phonemes = msg.phonemes
00189         dic = [" %s\t%s" % (w, p) for w, p in zip(msg.words, phonemes)]
00190         ok = self.add_gram(name, None, dic)
00191         if ok:
00192             self.vocabularies[name] = list(set(msg.words))
00193         else:
00194             rospy.logerr("Failed to change vocabulary")
00195 
00196     def speech_recognition_cb(self, req):
00197         res = SpeechRecognitionResponse()
00198 
00199         # change grammar
00200         candidate_words = []
00201         if req.grammar_name:
00202             ok = self.activate_gram(req.grammar_name)
00203             if not ok:
00204                 rospy.logerr("failed to activate grammar %s" % req.grammar_name)
00205                 return res
00206             if req.grammar_name in self.vocabularies:
00207                 candidate_words = self.vocabularies[req.grammar_name]
00208         elif req.grammar.rules:
00209             g = req.grammar
00210             if not g.name:
00211                 g.name = 'unknown'
00212             grammar = make_grammar_from_rules(g.rules)
00213             voca = make_voca_from_categories(g.categories, g.vocabularies)
00214             result = make_dfa(grammar, voca)
00215             if result is None:
00216                 rospy.logerr("Failed to make dfa from grammar message")
00217                 return res
00218             dfa, dic = result
00219             ok = self.change_gram(g.name, dfa.split(os.linesep), dic.split(os.linesep))
00220             if not ok:
00221                 rospy.logerr("Failed to change grammar")
00222                 return res
00223             self.vocabularies[g.name] = list(set(e for v in msg.vocabularies for e in v.words))
00224             candidate_words = self.vocabularies[g.name]
00225         elif req.vocabulary.words:
00226             v = req.vocabulary
00227             if not v.name:
00228                 v.name = 'unknown'
00229             if len(v.phonemes) == 0 or not v.phonemes[0]:
00230                 v.phonemes = make_phonemes_from_words(v.words)
00231             dic = [" %s\t%s" % (w, p) for w, p in zip(v.words, v.phonemes)]
00232             ok = self.change_gram(v.name, None, dic)
00233             if not ok:
00234                 rospy.logerr("Failed to change vocabulary")
00235                 return res
00236             self.vocabularies[v.name] = list(set(v.words))
00237             candidate_words = self.vocabularies[v.name]
00238         else:
00239             rospy.logerr("Invalid request: 'grammar_name', 'grammar' or 'vocabulary' must be filled")
00240             return res
00241 
00242         duration = req.duration
00243         if duration <= 0.0:
00244             duration = self.default_duration
00245 
00246         threshold = req.threshold
00247         if threshold <= 0.0 or threshold > 1.0:
00248             threshold = self.default_threshold
00249 
00250         if not req.quiet:
00251             self.play_sound(self.start_signal, self.start_signal_action_timeout)
00252         start_time = rospy.Time.now()
00253         self.last_speech = SpeechRecognitionCandidates()
00254         while (rospy.Time.now() - start_time).to_sec() < duration:
00255             rospy.sleep(0.1)
00256             speech = self.last_speech
00257             if not self.last_speech.transcript:
00258                 continue
00259             if candidate_words:
00260                 ok = speech.transcript[0] in candidate_words and speech.confidence[0] >= threshold
00261             else:
00262                 ok = speech.confidence[0] >= threshold
00263             if ok:
00264                 t0 = speech.transcript[0]
00265                 c0 = speech.confidence[0]
00266                 rospy.loginfo("Recognized %s (%f)..." % (t0, c0))
00267                 if not req.quiet:
00268                     self.play_sound(self.success_signal, 0.1)
00269                 res.result = speech
00270                 return res
00271 
00272         # timeout
00273         rospy.logerr("Timed out")
00274         if not req.quiet:
00275             self.play_sound(self.timeout_signal, 0.1)
00276         return res
00277 
00278     def process_result(self, data):
00279         results = {}
00280         for shypo in data.xpath("//SHYPO"):
00281             transcript = []
00282             confidence = 0.0
00283             for whypo in shypo.xpath("./WHYPO"):
00284                 word = whypo.attrib["WORD"].encode(self.encoding)
00285                 if word.startswith("<"):
00286                     continue
00287                 transcript.append(word)
00288                 confidence += float(whypo.attrib["CM"])
00289             confidence /= len(transcript)
00290             transcript = " ".join(transcript)
00291             results[confidence] = transcript
00292 
00293         msg = SpeechRecognitionCandidates()
00294         debug_str = ["Recognized:"]
00295         for i, result in enumerate(sorted(results.items(), reverse=True)):
00296             c, t = result
00297             debug_str += ["%d: %s (%.2f)" % (i+1, t, c)]
00298             msg.transcript.append(t)
00299             msg.confidence.append(c)
00300         self.pub_speech_recognition.publish(msg)
00301         self.last_speech = msg
00302         rospy.logdebug(os.linesep.join(debug_str))
00303 
00304     def shutdown_cb(self):
00305         self.module.join()
00306         self.audio.join()
00307 
00308     def julius_cb(self, data):
00309         status, detail = data
00310         rospy.logdebug("status: %s" % status)
00311         rospy.logdebug("detail: %s" % lxml.etree.tostring(detail))
00312         if status == 'ENGINEINFO':
00313             version = detail.attrib["VERSION"]
00314             conf = detail.attrib["CONF"]
00315             rospy.loginfo("Version: %s (%s)" % (version, conf))
00316         elif status == 'SYSINFO':
00317             rospy.loginfo("Status: %s" % detail.attrib["PROCESS"])
00318         elif status == 'GRAMINFO':
00319             rospy.loginfo("Grammar Information:\n%s" % detail.text.strip())
00320         elif status == 'STARTPROC':
00321             rospy.loginfo("Julius Engine initialized")
00322         elif status == 'ENDPROC':
00323             rospy.loginfo("Julius Engine stopped")
00324         elif status == 'STARTRECOG':
00325             rospy.logdebug("Start Recognize")
00326         elif status == 'ENDRECOG':
00327             rospy.logdebug("End Recognize")
00328         elif status == 'RECOGFAIL':
00329             rospy.logerr("Recognition Failed")
00330         elif status == 'RECOGOUT':
00331             self.process_result(detail)
00332         elif status == 'INPUT':
00333             substat = detail.attrib["STATUS"]
00334             if substat == 'STARTREC':
00335                 rospy.logdebug("Detect speak start")
00336             elif substat == 'ENDREC':
00337                 rospy.logdebug("Detect speak end")
00338         elif status == 'INPUTPARAM':
00339             input_frame = int(detail.attrib["FRAMES"])
00340             if input_frame >= 2000:
00341                 rospy.logwarn("Audio segment is too long!! Please volume down your microphone.")
00342         elif status == 'GRAMMAR':
00343             substat = detail.attrib["STATUS"]
00344             if substat == 'RECEIVED':
00345                 self.grammar_changed = True
00346             elif substat == "ERROR":
00347                 reason = detail.attrib["REASON"]
00348                 rospy.logerr("Failed to change grammar: %s" % reason)
00349                 self.grammar_changed = False
00350         else:
00351             rospy.logwarn("Received %s" % status)
00352             rospy.logwarn("%s", lxml.etree.tostring(detail))
00353 
00354 if __name__ == '__main__':
00355     rospy.init_node("julius_client")
00356     client = JuliusClient()
00357     rospy.spin()