SpeechRecognizer.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 
3 from vosk import Model, KaldiRecognizer
4 import json
5 import wave
6 import numpy as np
7 
8 
10  # grammar format example = '["open", "bottle", "cup", "[unk]"]'
11  def __init__(
12  self,
13  vosk_model_path: str,
14  grammar: str,
15  listen_seconds=3,
16  wave_output_filename="",
17  ):
18 
19  self.LISTEN_SECONDS = listen_seconds # Duration to listen to audio in
20 
21  # nicla vision mic specs
22  self.CHANNELS = 1
23  self.RATE = 16000
24  self.CHUNK = 512
25 
26  # For saving wavs
27  self.WAVE_OUTPUT_FILENAME = wave_output_filename
29 
30  self.audio_buffer = np.array([], dtype="int16")
31  if self.WAVE_OUTPUT_FILENAME:
32  self.recording_frames = [] # Added for storing audio frames
33 
34  self.model = Model(vosk_model_path)
35  self.recognizer = KaldiRecognizer(self.model, self.RATE)
36  self.recognizer.SetGrammar(grammar)
37 
38  def process_audio(self, data):
39 
40  recognized_audio = ""
41 
42  tmp = np.frombuffer(data, dtype="int16")
43 
44  self.audio_buffer = np.concatenate((self.audio_buffer, tmp))
45  if self.WAVE_OUTPUT_FILENAME:
46  self.recording_frames.append(data)
47 
48  if (
49  len(self.audio_buffer) > self.RATE * self.LISTEN_SECONDS
50  ): # process in blocks of 1 second
51  chunk = self.audio_buffer[: self.RATE * self.LISTEN_SECONDS]
52  self.audio_buffer = self.audio_buffer[
53  self.RATE * self.LISTEN_SECONDS :
54  ]
55 
56  if self.recognizer.AcceptWaveform(chunk.tobytes()):
57  result = json.loads(self.recognizer.Result())
58  if result["text"]:
59  # print(f"Recognized: {result['text']}")
60  recognized_audio = result["text"]
61 
62  # print(len(self.recording_frames))
63  # print(int(self.RATE / self.CHUNK * self.LISTEN_SECONDS))
64  if self.WAVE_OUTPUT_FILENAME:
65  if len(self.recording_frames) >= int(
66  self.RATE / self.CHUNK * self.LISTEN_SECONDS
67  ):
68  self.save_audio_to_wav()
69 
70  return recognized_audio
71 
72  def save_audio_to_wav(self):
73  self.WAVE_OUTPUT_FILENAME_i += 1
74  if self.WAVE_OUTPUT_FILENAME_i == 21:
75  self.WAVE_OUTPUT_FILENAME_i = 1
76 
77  with wave.open(
80  + ".wav",
81  "wb",
82  ) as wf:
83  wf.setnchannels(self.CHANNELS)
84  wf.setsampwidth(2)
85  wf.setframerate(self.RATE)
86  wf.writeframes(b"".join(self.recording_frames))
87 
88  print(
89  f"Saved recording as {self.WAVE_OUTPUT_FILENAME+str(self.WAVE_OUTPUT_FILENAME_i)+'.wav'}"
90  )
91  self.recording_frames = []
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.WAVE_OUTPUT_FILENAME
WAVE_OUTPUT_FILENAME
Definition: SpeechRecognizer.py:21
nicla_receiver.int
int
Definition: nicla_receiver.py:14
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.audio_buffer
audio_buffer
Definition: SpeechRecognizer.py:24
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.CHUNK
CHUNK
Definition: SpeechRecognizer.py:18
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.save_audio_to_wav
def save_audio_to_wav(self)
Definition: SpeechRecognizer.py:72
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.CHANNELS
CHANNELS
Definition: SpeechRecognizer.py:16
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.LISTEN_SECONDS
LISTEN_SECONDS
Definition: SpeechRecognizer.py:13
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer
Definition: SpeechRecognizer.py:9
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.recognizer
recognizer
Definition: SpeechRecognizer.py:29
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.recording_frames
recording_frames
Definition: SpeechRecognizer.py:26
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.WAVE_OUTPUT_FILENAME_i
WAVE_OUTPUT_FILENAME_i
Definition: SpeechRecognizer.py:22
nicla_receiver.str
str
Definition: nicla_receiver.py:13
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.process_audio
def process_audio(self, data)
Definition: SpeechRecognizer.py:38
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.RATE
RATE
Definition: SpeechRecognizer.py:17
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.__init__
def __init__(self, str vosk_model_path, str grammar, listen_seconds=3, wave_output_filename="")
Definition: SpeechRecognizer.py:11
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.model
model
Definition: SpeechRecognizer.py:28


nicla_vision_ros
Author(s): Davide Torielli , Damiano Gasperini , Edoardo Del Bianco , Federico Rollo
autogenerated on Sat Nov 16 2024 03:38:18