00001 #include "speech_to_text_action_handler.h" 00002 00003 #include "third_party/gflags.h" 00004 #include "third_party/glog.h" 00005 #include "util/statusor.h" 00006 00007 namespace speech = ::cogrob::cloud::speech; 00008 00009 using SpeechToTextSimpleActionServer = 00010 actionlib::SimpleActionServer<gcloud_speech_msgs::SpeechToTextAction>; 00011 00012 DEFINE_int32(speech_fail_prematurely_retry_cutoff_msec, 300, 00013 "Cutoff duration to allow retrying if recognition failed prematurely."); 00014 00015 namespace gcloud_speech { 00016 00017 SpeechToTextActionHandler::SpeechToTextActionHandler( 00018 speech::GoogleSpeechRecognizerInterface* recognizer, 00019 SpeechToTextSimpleActionServer* simple_action_server) { 00020 recognizer_ = recognizer; 00021 simple_action_server_ = simple_action_server; 00022 is_active_.store(false); 00023 } 00024 00025 00026 void SpeechToTextActionHandler::AudioMsgCallback( 00027 const gcloud_speech_msgs::LinearPcm16Le16000Audio::ConstPtr& msg) { 00028 if (msg->data.size() & 1 != 0) { 00029 LOG(ERROR) << "Size of data in LinearPcm16Le16000Audio is not " 00030 << "multiple of 2. Discarding sample."; 00031 DCHECK(false); 00032 return; 00033 } 00034 if (is_active_.load()) { 00035 std::unique_ptr<speech::AudioSample> audio_sample( 00036 new speech::AudioSample(msg->data)); 00037 audio_queue_.push(std::move(audio_sample)); 00038 } 00039 } 00040 00041 void SpeechToTextActionHandler::ExecuteSpeechToTextAction( 00042 const gcloud_speech_msgs::SpeechToTextGoalConstPtr& goal) { 00043 audio_queue_.clear(); 00044 result_queue_.clear(); 00045 recognizer_->Stop(); 00046 is_active_.store(true); 00047 00048 int max_audio_seconds = goal->listen_duration_sec; 00049 if (max_audio_seconds == 0) { 00050 max_audio_seconds = 14; 00051 } 00052 int max_wait_seconds = goal->max_recognition_duration_sec; 00053 if (max_wait_seconds == 0) { 00054 max_wait_seconds = max_audio_seconds + 2; 00055 } 00056 00057 bool interim_results = !goal->suppress_interim_results; 00058 00059 // This is the result we will publish. 00060 gcloud_speech_msgs::SpeechToTextResult result_msg; 00061 00062 int retry_time_left = 2; 00063 std::chrono::system_clock::time_point retry_deadline = 00064 std::chrono::system_clock::now() + 00065 std::chrono::milliseconds(FLAGS_speech_fail_prematurely_retry_cutoff_msec); 00066 00067 while (retry_time_left > 0 00068 && std::chrono::system_clock::now() < retry_deadline) { 00069 LOG(INFO) << "Start recognize."; 00070 recognizer_->StartRecognize(&audio_queue_, &result_queue_, goal->hints, 00071 max_audio_seconds, max_wait_seconds, goal->max_alternatives); 00072 00073 while (recognizer_->IsRunning() && 00074 !simple_action_server_->isPreemptRequested()) { 00075 util::StatusOr<speech::RecognitionResult> result 00076 = result_queue_.blocking_pop(100); 00077 if (result.ok()) { 00078 // Processes the result and post some feedback. 00079 LOG(INFO) << "Result: " << result.ValueOrDie().ShortDebugString(); 00080 00081 if (result.ValueOrDie().is_final() or interim_results) { 00082 gcloud_speech_msgs::SpeechToTextFeedback feedback_msg; 00083 for (const auto& candidate: result.ValueOrDie().candidates()) { 00084 gcloud_speech_msgs::RecognitionHypothesis hypothesis; 00085 hypothesis.transcript = candidate.transcript(); 00086 hypothesis.confidence = candidate.confidence(); 00087 feedback_msg.hypotheses.push_back(hypothesis); 00088 } 00089 feedback_msg.is_portion_final = result.ValueOrDie().is_final(); 00090 feedback_msg.stability = result.ValueOrDie().stability(); 00091 simple_action_server_->publishFeedback(feedback_msg); 00092 } 00093 00094 if (result.ValueOrDie().is_final()) { 00095 if (result.ValueOrDie().candidates().size() > 0) { 00096 result_msg.transcript += 00097 " " + result.ValueOrDie().candidates()[0].transcript(); 00098 } 00099 } 00100 } 00101 } 00102 00103 recognizer_->Stop(); 00104 00105 if (recognizer_->GetLastResult().ok()) { 00106 // If there is no error, we can quit retrying. 00107 break; 00108 } 00109 // Decreate retry counter so we don't retry too many times. 00110 --retry_time_left; 00111 } 00112 00113 util::StatusOr<speech::RecognitionResult> last_result = 00114 recognizer_->GetLastResult(); 00115 if (!last_result.ok()) { 00116 result_msg.is_error = true; 00117 result_msg.error_info = last_result.status().error_message(); 00118 } 00119 00120 if (simple_action_server_->isPreemptRequested()) { 00121 simple_action_server_->setPreempted(result_msg); 00122 } else { 00123 simple_action_server_->setSucceeded(result_msg); 00124 } 00125 is_active_.store(false); 00126 } 00127 00128 } // namespace gcloud_speech