16 from time
import sleep, time
as get_time
22 from os.path
import isdir, join
25 import speech_recognition
26 from hashlib
import md5
27 from io
import BytesIO, StringIO
28 from speech_recognition
import (
33 from tempfile
import gettempdir
34 from threading
import Thread, Lock
42 resolve_resource_file,
49 def __init__(self, wrapped_stream, format, muted=False):
50 assert wrapped_stream
is not None 61 """ Stop the stream and set the muted flag """ 63 self.wrapped_stream.stop_stream()
66 """ Start the stream and clear the muted flag """ 68 self.wrapped_stream.start_stream()
70 def read(self, size, of_exc=False):
72 Read data from stream. 75 size (int): Number of bytes to read 76 of_exc (bool): flag determining if the audio producer thread 77 should throw IOError at overflows. 82 frames = collections.deque()
90 to_read = min(self.wrapped_stream.get_read_available(), remaining)
94 result = self.wrapped_stream.read(to_read,
95 exception_on_overflow=of_exc)
99 input_latency = self.wrapped_stream.get_input_latency()
100 if input_latency > 0.2:
101 LOG.warning(
"High input latency: %f" % input_latency)
102 audio = b
"".join(list(frames))
106 self.wrapped_stream.close()
110 return self.wrapped_stream.is_stopped()
113 return self.wrapped_stream.stop_stream()
117 def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024,
120 self, device_index=device_index, sample_rate=sample_rate,
121 chunk_size=chunk_size)
127 assert self.
stream is None, \
128 "This audio source is already inside a context manager" 131 input_device_index=self.device_index, channels=1,
132 format=self.format, rate=self.SAMPLE_RATE,
133 frames_per_buffer=self.CHUNK,
135 ), self.format, self.
muted)
138 def __exit__(self, exc_type, exc_value, traceback):
139 if not self.stream.is_stopped():
140 self.stream.stop_stream()
143 self.audio.terminate()
160 return b
'\0' * num_bytes
169 MIN_LOUD_SEC_PER_PHRASE = 0.5
173 MIN_SILENCE_AT_END = 0.25
177 RECORDING_TIMEOUT = 10.0
181 RECORDING_TIMEOUT_WITH_SILENCE = 3.0
184 SEC_BETWEEN_WW_CHECKS = 0.2
189 listener_config = self.config.get(
'listener')
196 speech_recognition.Recognizer.__init__(self)
203 if 'record_utterances' in listener_config:
220 num_phonemes = wake_word_recognizer.num_phonemes
221 len_phoneme = listener_config.get(
'phoneme_duration', 120) / 1000.0
227 except (requests.RequestException, AttributeError):
231 return source.stream.read(source.CHUNK, self.
overflow_exc)
235 return audioop.rms(sound_chunk, sample_width)
238 """Record an entire spoken phrase. 240 Essentially, this code waits for a period of silence and then returns 241 the audio. If silence isn't detected, it will terminate and return 242 a buffer of RECORDING_TIMEOUT duration. 245 source (AudioSource): Source producing the audio chunks 246 sec_per_buffer (float): Fractional number of seconds in each chunk 247 stream (AudioStreamHandler): Stream target that will receive chunks 248 of the utterance audio while it is 252 bytearray: complete audio buffer recorded, including any 253 silence at the end of the user's utterance 264 def increase_noise(level):
265 if level < max_noise:
266 return level + 200 * sec_per_buffer
269 def decrease_noise(level):
270 if level > min_noise:
271 return level - 100 * sec_per_buffer
289 stream.stream_start()
291 phrase_complete =
False 292 while num_chunks < max_chunks
and not phrase_complete:
298 stream.stream_chunk(chunk)
300 energy = self.
calc_energy(chunk, source.SAMPLE_WIDTH)
302 is_loud = energy > test_threshold
304 noise = increase_noise(noise)
307 noise = decrease_noise(noise)
310 if num_chunks % 10 == 0:
312 f.write(
"Energy: cur=" + str(energy) +
" thresh=" +
316 was_loud_enough = num_loud_chunks > min_loud_chunks
318 quiet_enough = noise <= min_noise
320 silence_duration += sec_per_buffer
325 recorded_too_much_silence = num_chunks > max_chunks_of_silence
326 if quiet_enough
and (was_loud_enough
or recorded_too_much_silence):
327 phrase_complete =
True 331 phrase_complete =
True 337 return int(sec * source.SAMPLE_RATE) * source.SAMPLE_WIDTH
354 LOG.debug(
"Button Pressed, wakeword not needed")
361 Signal stop and exit waiting state. 366 ww_module = self.wake_word_recognizer.__class__.__name__
367 if ww_module ==
'PreciseHotword':
368 model_path = self.wake_word_recognizer.precise_model
369 with open(model_path,
'rb')
as f:
370 model_hash = md5(f.read()).hexdigest()
375 'name': self.wake_word_name.replace(
' ',
'-'),
376 'engine': md5(ww_module.encode(
'utf-8')).hexdigest(),
377 'time': str(int(1000 * get_time())),
378 'sessionId': SessionManager.get().session_id,
380 'model': str(model_hash)
386 'audio': BytesIO(audio.get_wav_data()),
387 'metadata': StringIO(json.dumps(metadata))
392 """Listen continuously on source until a wake word is spoken 395 source (AudioSource): Source producing the audio chunks 396 sec_per_buffer (float): Fractional number of seconds in each chunk 398 num_silent_bytes = int(self.
SILENCE_SEC * source.SAMPLE_RATE *
407 buffers_since_check = 0.0
413 said_wake_word =
False 421 energy_avg_samples = int(5 / sec_per_buffer)
429 energy = self.
calc_energy(chunk, source.SAMPLE_WIDTH)
433 if len(energies) < energy_avg_samples:
435 energies.append(energy)
436 avg_energy += float(energy) / energy_avg_samples
439 avg_energy -= float(energies[idx_energy]) / energy_avg_samples
440 avg_energy += float(energy) / energy_avg_samples
441 energies[idx_energy] = energy
442 idx_energy = (idx_energy + 1) % energy_avg_samples
445 if energy < avg_energy * 1.5:
454 f.write(
"Energy: cur=" + str(energy) +
" thresh=" +
461 needs_to_grow = len(byte_data) < max_size
465 byte_data = byte_data[len(chunk):] + chunk
467 buffers_since_check += 1.0
468 self.wake_word_recognizer.update(chunk)
469 if buffers_since_check > buffers_per_check:
470 buffers_since_check -= buffers_per_check
471 chopped = byte_data[-test_size:] \
472 if test_size < len(byte_data)
else byte_data
473 audio_data = chopped + silence
475 self.wake_word_recognizer.found_wake_word(audio_data)
487 module = self.wake_word_recognizer.__class__.__name__
490 '_'.join([str(mtd[k])
for k
in sorted(mtd)])
492 with open(fn,
'wb')
as f:
493 f.write(audio.get_wav_data())
507 Constructs an AudioData instance with the same parameters 508 as the source and the specified frame_data 510 return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
512 def listen(self, source, emitter, stream=None):
513 """Listens for chunks of audio that Mycroft should perform STT on. 515 This will listen continuously for a wake-up-word, then return the 516 audio chunk containing the spoken phrase that comes immediately 520 source (AudioSource): Source producing the audio chunks 521 emitter (EventEmitter): Emitter for notifications of when recording 523 stream (AudioStreamHandler): Stream target that will receive chunks 524 of the utterance audio while it is 528 AudioData: audio with the user's utterance, minus the wake-up-word 530 assert isinstance(source, AudioSource),
"Source must be an AudioSource" 533 sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
540 self.adjust_for_ambient_noise(source, 1.0)
542 LOG.debug(
"Waiting for wake word...")
547 LOG.debug(
"Recording...")
548 emitter.emit(
"recognizer_loop:record_begin")
552 if self.config.get(
'confirm_listening'):
554 self.config.get(
'sounds').
get(
'start_listening'))
562 emitter.emit(
"recognizer_loop:record_end")
564 LOG.info(
"Recording utterance")
565 stamp = str(datetime.datetime.now())
566 filename =
"/tmp/mycroft_utterance%s.wav" % stamp
567 with open(filename,
'wb')
as filea:
568 filea.write(audio_data.get_wav_data())
569 LOG.debug(
"Thinking...")
574 if self.dynamic_energy_threshold
and energy > 0:
577 self.dynamic_energy_adjustment_damping ** seconds_per_buffer)
581 target_energy * (1 - damping))
def _wait_until_wake_word(self, source, sec_per_buffer)
def resolve_resource_file(res_name)
float RECORDING_TIMEOUT_WITH_SILENCE
def __init__(self, wake_word_recognizer)
def _create_audio_data(raw_data, source)
float SEC_BETWEEN_WW_CHECKS
def _adjust_threshold(self, energy, seconds_per_buffer)
def get_silence(num_bytes)
def check_for_signal(signal_name, sec_lifetime=0)
def calc_energy(sound_chunk, sample_width)
def read(self, size, of_exc=False)
def _skip_wake_word(self)
def listen(self, source, emitter, stream=None)
def _upload_wake_word(self, audio, metadata)
def record_sound_chunk(self, source)
def sec_to_bytes(sec, source)
float MIN_LOUD_SEC_PER_PHRASE
def __init__(self, wrapped_stream, format, muted=False)
def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024, mute=False)
def get_ipc_directory(domain=None)
def __exit__(self, exc_type, exc_value, traceback)
def _compile_metadata(self)
def get(phrase, lang=None, context=None)
def _record_phrase(self, source, sec_per_buffer, stream=None)