mic.py
Go to the documentation of this file.
1 # Copyright 2017 Mycroft AI Inc.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 #
15 import audioop
16 from time import sleep, time as get_time
17 
18 import collections
19 import datetime
20 import json
21 import os
22 from os.path import isdir, join
23 import pyaudio
24 import requests
25 import speech_recognition
26 from hashlib import md5
27 from io import BytesIO, StringIO
28 from speech_recognition import (
29  Microphone,
30  AudioSource,
31  AudioData
32 )
33 from tempfile import gettempdir
34 from threading import Thread, Lock
35 
36 from mycroft.api import DeviceApi
37 from mycroft.configuration import Configuration
38 from mycroft.session import SessionManager
39 from mycroft.util import (
40  check_for_signal,
41  get_ipc_directory,
42  resolve_resource_file,
43  play_wav
44 )
45 from mycroft.util.log import LOG
46 
47 
49  def __init__(self, wrapped_stream, format, muted=False):
50  assert wrapped_stream is not None
51  self.wrapped_stream = wrapped_stream
52 
53  self.muted = muted
54  if muted:
55  self.mute()
56 
57  self.SAMPLE_WIDTH = pyaudio.get_sample_size(format)
58  self.muted_buffer = b''.join([b'\x00' * self.SAMPLE_WIDTH])
59 
60  def mute(self):
61  """ Stop the stream and set the muted flag """
62  self.muted = True
63  self.wrapped_stream.stop_stream()
64 
65  def unmute(self):
66  """ Start the stream and clear the muted flag """
67  self.muted = False
68  self.wrapped_stream.start_stream()
69 
70  def read(self, size, of_exc=False):
71  """
72  Read data from stream.
73 
74  Arguments:
75  size (int): Number of bytes to read
76  of_exc (bool): flag determining if the audio producer thread
77  should throw IOError at overflows.
78 
79  Returns:
80  Data read from device
81  """
82  frames = collections.deque()
83  remaining = size
84  while remaining > 0:
85  # If muted during read return empty buffer. This ensures no
86  # reads occur while the stream is stopped
87  if self.muted:
88  return self.muted_buffer
89 
90  to_read = min(self.wrapped_stream.get_read_available(), remaining)
91  if to_read == 0:
92  sleep(.01)
93  continue
94  result = self.wrapped_stream.read(to_read,
95  exception_on_overflow=of_exc)
96  frames.append(result)
97  remaining -= to_read
98 
99  input_latency = self.wrapped_stream.get_input_latency()
100  if input_latency > 0.2:
101  LOG.warning("High input latency: %f" % input_latency)
102  audio = b"".join(list(frames))
103  return audio
104 
105  def close(self):
106  self.wrapped_stream.close()
107  self.wrapped_stream = None
108 
109  def is_stopped(self):
110  return self.wrapped_stream.is_stopped()
111 
112  def stop_stream(self):
113  return self.wrapped_stream.stop_stream()
114 
115 
class MutableMicrophone(Microphone):
    """Microphone source that can be muted/unmuted while in use.

    Wraps the speech_recognition Microphone, replacing its stream with a
    MutableStream so the input can be silenced (e.g. while Mycroft plays
    a confirmation sound).
    """

    def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024,
                 mute=False):
        """
        Arguments:
            device_index (int): pyaudio input device index (None = default)
            sample_rate (int): sample rate in Hz
            chunk_size (int): number of frames per buffer
            mute (bool): if True, start muted
        """
        super().__init__(device_index=device_index, sample_rate=sample_rate,
                         chunk_size=chunk_size)
        self.muted = False
        if mute:
            self.mute()

    def __enter__(self):
        """Open the pyaudio stream, wrapped for mute support."""
        assert self.stream is None, \
            "This audio source is already inside a context manager"
        self.audio = pyaudio.PyAudio()
        self.stream = MutableStream(self.audio.open(
            input_device_index=self.device_index, channels=1,
            format=self.format, rate=self.SAMPLE_RATE,
            frames_per_buffer=self.CHUNK,
            input=True,  # stream is an input stream
        ), self.format, self.muted)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop and close the stream, always terminating pyaudio.

        The finally-block guarantees self.audio.terminate() runs even if
        stopping/closing the stream raises, avoiding a resource leak.
        """
        try:
            if not self.stream.is_stopped():
                self.stream.stop_stream()
            self.stream.close()
        finally:
            self.stream = None
            self.audio.terminate()

    def mute(self):
        """Mute this source; applied to the stream if one is open."""
        self.muted = True
        if self.stream:
            self.stream.mute()

    def unmute(self):
        """Unmute this source; applied to the stream if one is open."""
        self.muted = False
        if self.stream:
            self.stream.unmute()

    def is_muted(self):
        """Return True if the source is currently muted."""
        return self.muted
157 
158 
def get_silence(num_bytes):
    """Return `num_bytes` bytes of silence (all zero-valued samples)."""
    return bytes(num_bytes)
161 
162 
class ResponsiveRecognizer(speech_recognition.Recognizer):
    """Recognizer that waits for a wake word, then records a phrase.

    Extends speech_recognition.Recognizer with Mycroft behavior:
    continuous wake-word monitoring on an audio source, energy-based
    end-of-phrase detection, and optional local saving / opt-in uploading
    of wake-word samples.
    """

    # Padding of silence when feeding to pocketsphinx
    SILENCE_SEC = 0.01

    # The minimum seconds of noise before a
    # phrase can be considered complete
    MIN_LOUD_SEC_PER_PHRASE = 0.5

    # The minimum seconds of silence required at the end
    # before a phrase will be considered complete
    MIN_SILENCE_AT_END = 0.25

    # The maximum seconds a phrase can be recorded,
    # provided there is noise the entire time
    RECORDING_TIMEOUT = 10.0

    # The maximum time it will continue to record silence
    # when not enough noise has been detected
    RECORDING_TIMEOUT_WITH_SILENCE = 3.0

    # Time between pocketsphinx checks for the wake word
    SEC_BETWEEN_WW_CHECKS = 0.2

    def __init__(self, wake_word_recognizer):
        """
        Arguments:
            wake_word_recognizer: hotword engine exposing key_phrase,
                                  num_phonemes, update() and
                                  found_wake_word()
        """
        self.config = Configuration.get()
        listener_config = self.config.get('listener')
        self.upload_url = listener_config['wake_word_upload']['url']
        self.upload_disabled = listener_config['wake_word_upload']['disable']
        self.wake_word_name = wake_word_recognizer.key_phrase

        self.overflow_exc = listener_config.get('overflow_exception', False)

        super().__init__()
        self.wake_word_recognizer = wake_word_recognizer
        self.audio = pyaudio.PyAudio()
        self.multiplier = listener_config.get('multiplier')
        self.energy_ratio = listener_config.get('energy_ratio')

        # check the config for the flag to save wake words.
        if 'record_utterances' in listener_config:
            # TODO: 19.08 remove this backwards compatibility
            self.save_utterances = listener_config.get('record_utterances')
        else:
            self.save_utterances = listener_config.get('save_utterances',
                                                       False)

        self.save_wake_words = listener_config.get('record_wake_words')
        self.saved_wake_words_dir = join(gettempdir(), 'mycroft_wake_words')

        self.upload_lock = Lock()
        self.mic_level_file = os.path.join(get_ipc_directory(), "mic_level")
        self._stop_signaled = False

        # The maximum audio in seconds to keep for transcribing a phrase
        # The wake word must fit in this time
        num_phonemes = wake_word_recognizer.num_phonemes
        len_phoneme = listener_config.get('phoneme_duration', 120) / 1000.0
        self.TEST_WW_SEC = num_phonemes * len_phoneme
        self.SAVED_WW_SEC = max(3, self.TEST_WW_SEC)

        try:
            self.account_id = DeviceApi().get()['user']['uuid']
        except (requests.RequestException, AttributeError):
            # Device not paired or network unavailable; uploads anonymous
            self.account_id = '0'

    def record_sound_chunk(self, source):
        """Read one CHUNK of raw audio from the source's stream."""
        return source.stream.read(source.CHUNK, self.overflow_exc)

    @staticmethod
    def calc_energy(sound_chunk, sample_width):
        """Return the RMS energy (loudness) of a raw audio chunk."""
        return audioop.rms(sound_chunk, sample_width)

    def _write_mic_level(self, energy):
        """Write current energy level stats to the IPC mic_level file.

        Other processes can read this to visualize the microphone input,
        e.g. a needle on a meter. (Replaces two duplicated open/write
        blocks that also called f.close() redundantly inside `with`.)
        """
        with open(self.mic_level_file, 'w') as f:
            f.write("Energy: cur=" + str(energy) + " thresh=" +
                    str(self.energy_threshold))

    def _record_phrase(self, source, sec_per_buffer, stream=None):
        """Record an entire spoken phrase.

        Essentially, this code waits for a period of silence and then returns
        the audio. If silence isn't detected, it will terminate and return
        a buffer of RECORDING_TIMEOUT duration.

        Args:
            source (AudioSource): Source producing the audio chunks
            sec_per_buffer (float): Fractional number of seconds in each chunk
            stream (AudioStreamHandler): Stream target that will receive chunks
                                         of the utterance audio while it is
                                         being recorded

        Returns:
            bytes: complete audio buffer recorded, including any
                   silence at the end of the user's utterance
        """
        num_loud_chunks = 0
        noise = 0

        max_noise = 25
        min_noise = 0

        silence_duration = 0

        def increase_noise(level):
            # Noise accumulates at 200 units/sec of loud audio, capped
            if level < max_noise:
                return level + 200 * sec_per_buffer
            return level

        def decrease_noise(level):
            # Noise decays at 100 units/sec of quiet audio, floored
            if level > min_noise:
                return level - 100 * sec_per_buffer
            return level

        # Smallest number of loud chunks required to return
        min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

        # Maximum number of chunks to record before timing out
        max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
        num_chunks = 0

        # Will return if exceeded this even if there's not enough loud chunks
        max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                    sec_per_buffer)

        # buffer to accumulate audio in (seeded with one silent sample)
        byte_data = get_silence(source.SAMPLE_WIDTH)

        if stream:
            stream.stream_start()

        phrase_complete = False
        while num_chunks < max_chunks and not phrase_complete:
            chunk = self.record_sound_chunk(source)
            byte_data += chunk
            num_chunks += 1

            if stream:
                stream.stream_chunk(chunk)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            test_threshold = self.energy_threshold * self.multiplier
            is_loud = energy > test_threshold
            if is_loud:
                noise = increase_noise(noise)
                num_loud_chunks += 1
            else:
                noise = decrease_noise(noise)
                self._adjust_threshold(energy, sec_per_buffer)

            # Periodically report energy levels for external meters
            if num_chunks % 10 == 0:
                self._write_mic_level(energy)

            was_loud_enough = num_loud_chunks > min_loud_chunks

            quiet_enough = noise <= min_noise
            if quiet_enough:
                silence_duration += sec_per_buffer
                if silence_duration < self.MIN_SILENCE_AT_END:
                    quiet_enough = False  # gotta be silent for min of 1/4 sec
            else:
                silence_duration = 0
            recorded_too_much_silence = num_chunks > max_chunks_of_silence
            if quiet_enough and (was_loud_enough or recorded_too_much_silence):
                phrase_complete = True

            # Pressing top-button will end recording immediately
            if check_for_signal('buttonPress'):
                phrase_complete = True

        return byte_data

    @staticmethod
    def sec_to_bytes(sec, source):
        """Convert a duration in seconds to a byte count for this source."""
        return int(sec * source.SAMPLE_RATE) * source.SAMPLE_WIDTH

    def _skip_wake_word(self):
        """Determine if the wake word should be bypassed for this cycle.

        Returns:
            bool: True if recording should begin without a wake word
        """
        # Check if told programmatically to skip the wake word, like
        # when we are in a dialog with the user.
        if check_for_signal('startListening'):
            return True

        # Pressing the Mark 1 button can start recording (unless
        # it is being used to mean 'stop' instead)
        if check_for_signal('buttonPress', 1):
            # give other processes time to consume this signal if
            # it was meant to be a 'stop'
            sleep(0.25)
            if check_for_signal('buttonPress'):
                # Signal is still here, assume it was intended to
                # begin recording
                LOG.debug("Button Pressed, wakeword not needed")
                return True

        return False

    def stop(self):
        """
        Signal stop and exit waiting state.
        """
        self._stop_signaled = True

    def _compile_metadata(self):
        """Build the metadata dict describing a captured wake-word sample."""
        ww_module = self.wake_word_recognizer.__class__.__name__
        if ww_module == 'PreciseHotword':
            # Hash the precise model file so samples map to the model used
            model_path = self.wake_word_recognizer.precise_model
            with open(model_path, 'rb') as f:
                model_hash = md5(f.read()).hexdigest()
        else:
            model_hash = '0'

        return {
            'name': self.wake_word_name.replace(' ', '-'),
            'engine': md5(ww_module.encode('utf-8')).hexdigest(),
            'time': str(int(1000 * get_time())),
            'sessionId': SessionManager.get().session_id,
            'accountId': self.account_id,
            'model': str(model_hash)
        }

    def _upload_wake_word(self, audio, metadata):
        """POST a wake-word sample and its metadata to the configured URL."""
        requests.post(
            self.upload_url, files={
                'audio': BytesIO(audio.get_wav_data()),
                'metadata': StringIO(json.dumps(metadata))
            }
        )

    def _wait_until_wake_word(self, source, sec_per_buffer):
        """Listen continuously on source until a wake word is spoken

        Args:
            source (AudioSource): Source producing the audio chunks
            sec_per_buffer (float): Fractional number of seconds in each chunk
        """
        num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                               source.SAMPLE_WIDTH)

        silence = get_silence(num_silent_bytes)

        # buffer to store audio in
        byte_data = silence

        buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
        buffers_since_check = 0.0

        # Max bytes for byte_data before audio is removed from the front
        max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)
        test_size = self.sec_to_bytes(self.TEST_WW_SEC, source)

        said_wake_word = False

        # Rolling buffer to track the audio energy (loudness) heard on
        # the source recently. An average audio energy is maintained
        # based on these levels.
        energies = []
        idx_energy = 0
        avg_energy = 0.0
        energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs
        counter = 0

        while not said_wake_word and not self._stop_signaled:
            if self._skip_wake_word():
                break
            chunk = self.record_sound_chunk(source)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            if energy < self.energy_threshold * self.multiplier:
                self._adjust_threshold(energy, sec_per_buffer)

            if len(energies) < energy_avg_samples:
                # build the average
                energies.append(energy)
                avg_energy += float(energy) / energy_avg_samples
            else:
                # maintain the running average and rolling buffer
                avg_energy -= float(energies[idx_energy]) / energy_avg_samples
                avg_energy += float(energy) / energy_avg_samples
                energies[idx_energy] = energy
                idx_energy = (idx_energy + 1) % energy_avg_samples

                # maintain the threshold using average
                if energy < avg_energy * 1.5:
                    if energy > self.energy_threshold:
                        # bump the threshold to just above this value
                        self.energy_threshold = energy * 1.2

            # Periodically output energy level stats. This can be used to
            # visualize the microphone input, e.g. a needle on a meter.
            # NOTE(review): `counter % 3` skips only every third chunk
            # (writes on 2 of 3) — looks like `% 3 == 0` was intended, but
            # preserved as-is to keep the meter update rate unchanged.
            if counter % 3:
                self._write_mic_level(energy)
            counter += 1

            # At first, the buffer is empty and must fill up. After that
            # just drop the first chunk bytes to keep it the same size.
            needs_to_grow = len(byte_data) < max_size
            if needs_to_grow:
                byte_data += chunk
            else:  # Remove beginning of audio and add new chunk to end
                byte_data = byte_data[len(chunk):] + chunk

            buffers_since_check += 1.0
            self.wake_word_recognizer.update(chunk)
            if buffers_since_check > buffers_per_check:
                buffers_since_check -= buffers_per_check
                chopped = byte_data[-test_size:] \
                    if test_size < len(byte_data) else byte_data
                audio_data = chopped + silence
                said_wake_word = \
                    self.wake_word_recognizer.found_wake_word(audio_data)

        # Save positive wake words as appropriate
        if said_wake_word:
            audio = None
            mtd = None
            if self.save_wake_words:
                # Save wake word locally
                audio = self._create_audio_data(byte_data, source)
                mtd = self._compile_metadata()
                if not isdir(self.saved_wake_words_dir):
                    os.mkdir(self.saved_wake_words_dir)

                fn = join(self.saved_wake_words_dir,
                          '_'.join([str(mtd[k]) for k in sorted(mtd)])
                          + '.wav')
                with open(fn, 'wb') as f:
                    f.write(audio.get_wav_data())

            if self.config['opt_in'] and not self.upload_disabled:
                # Upload wake word for opt_in people
                Thread(
                    target=self._upload_wake_word, daemon=True,
                    args=[audio or
                          self._create_audio_data(byte_data, source),
                          mtd or self._compile_metadata()]
                ).start()

    @staticmethod
    def _create_audio_data(raw_data, source):
        """
        Constructs an AudioData instance with the same parameters
        as the source and the specified frame_data
        """
        return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

    def listen(self, source, emitter, stream=None):
        """Listens for chunks of audio that Mycroft should perform STT on.

        This will listen continuously for a wake-up-word, then return the
        audio chunk containing the spoken phrase that comes immediately
        afterwards.

        Args:
            source (AudioSource): Source producing the audio chunks
            emitter (EventEmitter): Emitter for notifications of when recording
                                    begins and ends.
            stream (AudioStreamHandler): Stream target that will receive chunks
                                         of the utterance audio while it is
                                         being recorded

        Returns:
            AudioData: audio with the user's utterance, minus the wake-up-word
        """
        assert isinstance(source, AudioSource), "Source must be an AudioSource"

        sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE

        # Every time a new 'listen()' request begins, reset the threshold
        # used for silence detection. This is as good of a reset point as
        # any, as we expect the user and Mycroft to not be talking.
        # NOTE: adjust_for_ambient_noise() doc claims it will stop early if
        # speech is detected, but there is no code to actually do that.
        self.adjust_for_ambient_noise(source, 1.0)

        LOG.debug("Waiting for wake word...")
        self._wait_until_wake_word(source, sec_per_buffer)
        if self._stop_signaled:
            return

        LOG.debug("Recording...")
        emitter.emit("recognizer_loop:record_begin")

        # If enabled, play a wave file with a short sound to audibly
        # indicate recording has begun.
        if self.config.get('confirm_listening'):
            audio_file = resolve_resource_file(
                self.config.get('sounds').get('start_listening'))
            if audio_file:
                # Mute while the confirmation plays so it isn't recorded
                source.mute()
                play_wav(audio_file).wait()
                source.unmute()

        frame_data = self._record_phrase(source, sec_per_buffer, stream)
        audio_data = self._create_audio_data(frame_data, source)
        emitter.emit("recognizer_loop:record_end")
        if self.save_utterances:
            LOG.info("Recording utterance")
            stamp = str(datetime.datetime.now())
            # Use the platform temp dir instead of hard-coding /tmp
            filename = join(gettempdir(), "mycroft_utterance%s.wav" % stamp)
            with open(filename, 'wb') as filea:
                filea.write(audio_data.get_wav_data())
        LOG.debug("Thinking...")

        return audio_data

    def _adjust_threshold(self, energy, seconds_per_buffer):
        """Decay the energy threshold toward the latest (quiet) energy."""
        if self.dynamic_energy_threshold and energy > 0:
            # account for different chunk sizes and rates
            damping = (
                self.dynamic_energy_adjustment_damping ** seconds_per_buffer)
            target_energy = energy * self.energy_ratio
            self.energy_threshold = (
                self.energy_threshold * damping +
                target_energy * (1 - damping))
def _wait_until_wake_word(self, source, sec_per_buffer)
Definition: mic.py:391
def resolve_resource_file(res_name)
def __init__(self, wake_word_recognizer)
Definition: mic.py:186
def _create_audio_data(raw_data, source)
Definition: mic.py:505
def _adjust_threshold(self, energy, seconds_per_buffer)
Definition: mic.py:573
def get_silence(num_bytes)
Definition: mic.py:159
def check_for_signal(signal_name, sec_lifetime=0)
Definition: signal.py:105
def calc_energy(sound_chunk, sample_width)
Definition: mic.py:234
def read(self, size, of_exc=False)
Definition: mic.py:70
def listen(self, source, emitter, stream=None)
Definition: mic.py:512
def _upload_wake_word(self, audio, metadata)
Definition: mic.py:383
def __init__(self, wrapped_stream, format, muted=False)
Definition: mic.py:49
def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024, mute=False)
Definition: mic.py:118
def get_ipc_directory(domain=None)
Definition: signal.py:25
def __exit__(self, exc_type, exc_value, traceback)
Definition: mic.py:138
def get(phrase, lang=None, context=None)
def _record_phrase(self, source, sec_per_buffer, stream=None)
Definition: mic.py:237


mycroft_ros
Author(s):
autogenerated on Mon Apr 26 2021 02:35:40