src
nicla_vision_ros
SpeechRecognizer.py
Go to the documentation of this file.
1
#!/usr/bin/env python3
2
3
from
vosk
import
Model, KaldiRecognizer
4
import
json
5
import
wave
6
import
numpy
as
np
7
8
9
class
SpeechRecognizer
:
10
# grammar format example = '["open", "bottle", "cup", "[unk]"]'
11
def
__init__
(
12
self,
13
vosk_model_path: str,
14
grammar: str,
15
listen_seconds=3,
16
wave_output_filename=
""
,
17
):
18
19
self.
LISTEN_SECONDS
= listen_seconds
# Duration to listen to audio in
20
21
# nicla vision mic specs
22
self.
CHANNELS
= 1
23
self.
RATE
= 16000
24
self.
CHUNK
= 512
25
26
# For saving wavs
27
self.
WAVE_OUTPUT_FILENAME
= wave_output_filename
28
self.
WAVE_OUTPUT_FILENAME_i
= 0
29
30
self.
audio_buffer
= np.array([], dtype=
"int16"
)
31
if
self.
WAVE_OUTPUT_FILENAME
:
32
self.
recording_frames
= []
# Added for storing audio frames
33
34
self.
model
= Model(vosk_model_path)
35
self.
recognizer
= KaldiRecognizer(self.
model
, self.
RATE
)
36
self.
recognizer
.SetGrammar(grammar)
37
38
def
process_audio
(self, data):
39
40
recognized_audio =
""
41
42
tmp = np.frombuffer(data, dtype=
"int16"
)
43
44
self.
audio_buffer
= np.concatenate((self.
audio_buffer
, tmp))
45
if
self.
WAVE_OUTPUT_FILENAME
:
46
self.
recording_frames
.append(data)
47
48
if
(
49
len(self.
audio_buffer
) > self.
RATE
* self.
LISTEN_SECONDS
50
):
# process in blocks of 1 second
51
chunk = self.
audio_buffer
[: self.
RATE
* self.
LISTEN_SECONDS
]
52
self.
audio_buffer
= self.
audio_buffer
[
53
self.
RATE
* self.
LISTEN_SECONDS
:
54
]
55
56
if
self.
recognizer
.AcceptWaveform(chunk.tobytes()):
57
result = json.loads(self.
recognizer
.Result())
58
if
result[
"text"
]:
59
# print(f"Recognized: {result['text']}")
60
recognized_audio = result[
"text"
]
61
62
# print(len(self.recording_frames))
63
# print(int(self.RATE / self.CHUNK * self.LISTEN_SECONDS))
64
if
self.
WAVE_OUTPUT_FILENAME
:
65
if
len(self.
recording_frames
) >=
int
(
66
self.
RATE
/ self.
CHUNK
* self.
LISTEN_SECONDS
67
):
68
self.
save_audio_to_wav
()
69
70
return
recognized_audio
71
72
def
save_audio_to_wav
(self):
73
self.
WAVE_OUTPUT_FILENAME_i
+= 1
74
if
self.
WAVE_OUTPUT_FILENAME_i
== 21:
75
self.
WAVE_OUTPUT_FILENAME_i
= 1
76
77
with
wave.open(
78
self.
WAVE_OUTPUT_FILENAME
79
+
str
(self.
WAVE_OUTPUT_FILENAME_i
)
80
+
".wav"
,
81
"wb"
,
82
)
as
wf:
83
wf.setnchannels(self.
CHANNELS
)
84
wf.setsampwidth(2)
85
wf.setframerate(self.
RATE
)
86
wf.writeframes(b
""
.join(self.
recording_frames
))
87
88
print(
89
f
"Saved recording as {self.WAVE_OUTPUT_FILENAME+str(self.WAVE_OUTPUT_FILENAME_i)+'.wav'}"
90
)
91
self.
recording_frames
= []
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.WAVE_OUTPUT_FILENAME
WAVE_OUTPUT_FILENAME
Definition:
SpeechRecognizer.py:21
nicla_receiver.int
int
Definition:
nicla_receiver.py:14
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.audio_buffer
audio_buffer
Definition:
SpeechRecognizer.py:24
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.CHUNK
CHUNK
Definition:
SpeechRecognizer.py:18
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.save_audio_to_wav
def save_audio_to_wav(self)
Definition:
SpeechRecognizer.py:72
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.CHANNELS
CHANNELS
Definition:
SpeechRecognizer.py:16
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.LISTEN_SECONDS
LISTEN_SECONDS
Definition:
SpeechRecognizer.py:13
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer
Definition:
SpeechRecognizer.py:9
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.recognizer
recognizer
Definition:
SpeechRecognizer.py:29
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.recording_frames
recording_frames
Definition:
SpeechRecognizer.py:26
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.WAVE_OUTPUT_FILENAME_i
WAVE_OUTPUT_FILENAME_i
Definition:
SpeechRecognizer.py:22
nicla_receiver.str
str
Definition:
nicla_receiver.py:13
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.process_audio
def process_audio(self, data)
Definition:
SpeechRecognizer.py:38
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.RATE
RATE
Definition:
SpeechRecognizer.py:17
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.__init__
def __init__(self, str vosk_model_path, str grammar, listen_seconds=3, wave_output_filename="")
Definition:
SpeechRecognizer.py:11
nicla_vision_ros.SpeechRecognizer.SpeechRecognizer.model
model
Definition:
SpeechRecognizer.py:28
nicla_vision_ros
Author(s): Davide Torielli
, Damiano Gasperini
, Edoardo Del Bianco
, Federico Rollo
autogenerated on Sat Nov 16 2024 03:38:18