gcloud_speech: speech_to_text_action

Go to the documentation of this file.
00001 #include "speech_to_text_action_handler.h"
00002 
00003 #include "third_party/gflags.h"
00004 #include "third_party/glog.h"
00005 #include "util/statusor.h"
00006 
00007 namespace speech = ::cogrob::cloud::speech;
00008 
00009 using SpeechToTextSimpleActionServer =
00010     actionlib::SimpleActionServer<gcloud_speech_msgs::SpeechToTextAction>;
00011 
00012 DEFINE_int32(speech_fail_prematurely_retry_cutoff_msec, 300,
00013     "Cutoff duration to allow retrying if recognition failed prematurely.");
00014 
00015 namespace gcloud_speech {
00016 
00017 SpeechToTextActionHandler::SpeechToTextActionHandler(
00018     speech::GoogleSpeechRecognizerInterface* recognizer,
00019     SpeechToTextSimpleActionServer* simple_action_server) {
00020   recognizer_ = recognizer;
00021   simple_action_server_ = simple_action_server;
00022   is_active_.store(false);
00023 }
00024 
00025 
00026 void SpeechToTextActionHandler::AudioMsgCallback(
00027     const gcloud_speech_msgs::LinearPcm16Le16000Audio::ConstPtr& msg) {
00028   if (msg->data.size() & 1 != 0) {
00029       LOG(ERROR) << "Size of data in LinearPcm16Le16000Audio is not "
00030                  << "multiple of 2. Discarding sample.";
00031       DCHECK(false);
00032       return;
00033   }
00034   if (is_active_.load()) {
00035     std::unique_ptr<speech::AudioSample> audio_sample(
00036         new speech::AudioSample(msg->data));
00037     audio_queue_.push(std::move(audio_sample));
00038   }
00039 }
00040 
00041 void SpeechToTextActionHandler::ExecuteSpeechToTextAction(
00042     const gcloud_speech_msgs::SpeechToTextGoalConstPtr& goal) {
00043   audio_queue_.clear();
00044   result_queue_.clear();
00045   recognizer_->Stop();
00046   is_active_.store(true);
00047 
00048   int max_audio_seconds = goal->listen_duration_sec;
00049   if (max_audio_seconds == 0) {
00050     max_audio_seconds = 14;
00051   }
00052   int max_wait_seconds = goal->max_recognition_duration_sec;
00053   if (max_wait_seconds == 0) {
00054     max_wait_seconds = max_audio_seconds + 2;
00055   }
00056 
00057   bool interim_results = !goal->suppress_interim_results;
00058 
00059   // This is the result we will publish.
00060   gcloud_speech_msgs::SpeechToTextResult result_msg;
00061 
00062   int retry_time_left = 2;
00063   std::chrono::system_clock::time_point retry_deadline =
00064     std::chrono::system_clock::now() +
00065     std::chrono::milliseconds(FLAGS_speech_fail_prematurely_retry_cutoff_msec);
00066 
00067   while (retry_time_left > 0
00068          && std::chrono::system_clock::now() < retry_deadline) {
00069     LOG(INFO) << "Start recognize.";
00070     recognizer_->StartRecognize(&audio_queue_, &result_queue_, goal->hints,
00071         max_audio_seconds, max_wait_seconds, goal->max_alternatives);
00072 
00073     while (recognizer_->IsRunning() &&
00074         !simple_action_server_->isPreemptRequested()) {
00075       util::StatusOr<speech::RecognitionResult> result
00076           = result_queue_.blocking_pop(100);
00077       if (result.ok()) {
00078         // Processes the result and post some feedback.
00079         LOG(INFO) << "Result: " << result.ValueOrDie().ShortDebugString();
00080 
00081         if (result.ValueOrDie().is_final() or interim_results) {
00082           gcloud_speech_msgs::SpeechToTextFeedback feedback_msg;
00083           for (const auto& candidate: result.ValueOrDie().candidates()) {
00084             gcloud_speech_msgs::RecognitionHypothesis hypothesis;
00085             hypothesis.transcript = candidate.transcript();
00086             hypothesis.confidence = candidate.confidence();
00087             feedback_msg.hypotheses.push_back(hypothesis);
00088           }
00089           feedback_msg.is_portion_final = result.ValueOrDie().is_final();
00090           feedback_msg.stability = result.ValueOrDie().stability();
00091           simple_action_server_->publishFeedback(feedback_msg);
00092         }
00093 
00094         if (result.ValueOrDie().is_final()) {
00095           if (result.ValueOrDie().candidates().size() > 0) {
00096             result_msg.transcript +=
00097                 " " + result.ValueOrDie().candidates()[0].transcript();
00098           }
00099         }
00100       }
00101     }
00102 
00103     recognizer_->Stop();
00104 
00105     if (recognizer_->GetLastResult().ok()) {
00106       // If there is no error, we can quit retrying.
00107       break;
00108     }
00109     // Decreate retry counter so we don't retry too many times.
00110     --retry_time_left;
00111   }
00112 
00113   util::StatusOr<speech::RecognitionResult> last_result =
00114       recognizer_->GetLastResult();
00115   if (!last_result.ok()) {
00116     result_msg.is_error = true;
00117     result_msg.error_info = last_result.status().error_message();
00118   }
00119 
00120   if (simple_action_server_->isPreemptRequested()) {
00121     simple_action_server_->setPreempted(result_msg);
00122   } else {
00123     simple_action_server_->setSucceeded(result_msg);
00124   }
00125   is_active_.store(false);
00126 }
00127 
00128 }  // namespace gcloud_speech