import collections
import logging
import queue
import wave

import numpy as np
import scipy.signal
import webrtcvad

class UtteranceService:
    """The UtteranceService is responsible for compiling complete utterances for the speech transcriber."""

    channels: int = 1
    sample_rate: int = 16000
    input_rate: int
    blocks_per_second: int = 50
    reset_buffer_queue: bool = True

    # the number of samples/frames in each block (#samples == #blocks as channels == 1)
    block_size: int = sample_rate // blocks_per_second

    # duration of a single block/frame in milliseconds
    frame_duration_ms = property(
        lambda self: 1000 * self.block_size // self.sample_rate)

    buffer_queue: queue.Queue
    vad: webrtcvad.Vad
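
    # With the defaults each block is 320 samples, i.e. a 20ms frame, which is
    # one of the frame sizes webrtcvad accepts (10, 20 or 30ms).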
    def __init__(self, aggressiveness: int = 1, input_rate: int = 16000,
                 lowpass_frequency: int = 65, highpass_frequency: int = 4000) -> None:
        self.buffer_queue = queue.Queue()
        self.vad = webrtcvad.Vad(aggressiveness)
        # input_rate is the recording device's native sample rate; it defaults
        # to sample_rate, in which case no resampling is performed.
        self.input_rate = input_rate
        self._init_filter(lowpass_frequency, highpass_frequency)
    def _resample(self, data):
        """Resamples audio frames to the sample rate needed for DeepSpeech and webrtcvad (16000Hz).

        The user's microphone may not support the native processing sample rate of
        16000Hz, so audio data may have to be resampled from a sample rate supported
        by the recording device (input_rate) to sample_rate. For example, a block
        recorded at 44100Hz is shrunk by a factor of 16000/44100.
        """
        data16 = np.frombuffer(data, dtype=np.int16)
        resample_size = int(len(data16) * self.sample_rate / self.input_rate)
        resample = scipy.signal.resample(data16, resample_size)
        resample16 = np.array(resample, dtype=np.int16)
        return resample16.tobytes()
    def write_wav(self, filename: str, data):
        """Writes audio frames to a .wav file.

        Args:
            filename (str): The filename of the .wav file to be written.
            data: The audio frames.
        """
        logging.info("Writing wav file: %s", filename)
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(2)  # 16-bit samples, matching the int16 frames
            wf.setframerate(self.sample_rate)
            wf.writeframes(data)
    def _init_filter(self, lowpass_frequency: int, highpass_frequency: int):
        """Initialises the bandpass filter.

        Args:
            lowpass_frequency (int): The lower cutoff frequency of the passband.
            highpass_frequency (int): The upper cutoff frequency of the passband.
        """
        # butter() expects cutoffs normalised to the Nyquist frequency, so the
        # defaults of 65Hz and 4000Hz become Wn = [0.008125, 0.5].
        nyquist_frequency = 0.5 * self.sample_rate
        self.b, self.a = scipy.signal.butter(4, [
            lowpass_frequency / nyquist_frequency,
            highpass_frequency / nyquist_frequency
        ], btype='bandpass')
        self.zi = scipy.signal.lfilter_zi(self.b, self.a)
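
    # The coefficients above are applied one block at a time in _filter;
    # lfilter_zi seeds the initial filter state, a common way to reduce the
    # start-up transient on the first block.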
    def _filter(self, data):
        """Applies a bandpass filter to the audio signal."""
        data16 = np.frombuffer(data, dtype=np.int16)
        # zi carries the filter state across calls so consecutive blocks are
        # filtered as one continuous signal.
        filtered, self.zi = scipy.signal.lfilter(
            self.b, self.a, data16, axis=0, zi=self.zi)
        return np.array(filtered, dtype=np.int16).tobytes()
    def _frames(self):
        """Yields all audio frames from the microphone, blocking if necessary."""
        if self.input_rate == self.sample_rate:
            while True:
                yield self._filter(self.buffer_queue.get())
        else:
            while True:
                yield self._filter(self._resample(self.buffer_queue.get()))
    def utterances(self, padding_ms=300, ratio=0.75):
        """Yields a series of consecutive audio frames for each utterance, separated by a single None.

        It determines the level of voice activity from the ratio of voiced frames
        within the last padding_ms of audio, and uses a ring buffer so that the
        padding_ms of audio preceding the trigger is included in the utterance.

        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |--- utterance ---|      |--- utterance ---|
        """
        triggered = False
        ring_buffer = collections.deque(
            maxlen=(padding_ms // self.frame_duration_ms))
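
        # The ring buffer holds the most recent padding_ms worth of
        # (frame, is_speech) pairs: before triggering it preserves the audio
        # context that will be prepended to the utterance; after triggering it
        # is inspected to decide when the speaker has stopped.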
        try:
            for frame in self._frames():
                # webrtcvad requires complete 20ms frames (640 bytes of int16
                # audio at 16000Hz), so stop generating on a short block.
                if len(frame) < 640:
                    return

                is_speech = self.vad.is_speech(frame, self.sample_rate)

                if triggered:
                    yield frame
                    ring_buffer.append((frame, is_speech))
                    num_unvoiced = len(
                        [f for f, speech in ring_buffer if not speech])
                    # De-trigger once most of the recent window is silence.
                    if num_unvoiced > ratio * ring_buffer.maxlen:
                        triggered = False
                        yield None
                        ring_buffer.clear()
                        if self.reset_buffer_queue:
                            self.buffer_queue = queue.Queue()
                else:
                    ring_buffer.append((frame, is_speech))
                    num_voiced = len(
                        [f for f, speech in ring_buffer if speech])
                    # Trigger once most of the recent window is speech, and
                    # replay the buffered padding so the utterance keeps its
                    # leading context.
                    if num_voiced > ratio * ring_buffer.maxlen:
                        triggered = True
                        for f, _ in ring_buffer:
                            yield f
                        ring_buffer.clear()
        except KeyboardInterrupt:
            return
        finally:
            self._destroy()
    def _destroy(self) -> None:
        """Releases any resources held by the service (currently a no-op)."""
        pass
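
# A minimal usage sketch, not part of the original service: it assumes audio is
# pushed into buffer_queue by some producer (normally a microphone callback,
# e.g. from PyAudio) and shows the plumbing with a synthetic 440Hz tone
# instead, then saves the filtered frames with write_wav. The filename
# "tone_filtered.wav" is illustrative.
if __name__ == "__main__":
    import itertools

    logging.basicConfig(level=logging.INFO)
    service = UtteranceService()

    # One second of a 440Hz int16 tone at the native 16000Hz rate, split into
    # 320-sample blocks as a microphone callback would deliver them.
    t = np.arange(service.sample_rate) / service.sample_rate
    tone = (0.3 * np.iinfo(np.int16).max *
            np.sin(2 * np.pi * 440 * t)).astype(np.int16)
    for start in range(0, len(tone), service.block_size):
        service.buffer_queue.put(
            tone[start:start + service.block_size].tobytes())

    # Pull exactly the queued blocks back out through the bandpass filter
    # (islice, because _frames() otherwise blocks waiting for more audio) and
    # write them out for inspection. A real transcriber would instead iterate
    # service.utterances() while a microphone thread keeps feeding the queue.
    frames = list(itertools.islice(
        service._frames(), len(tone) // service.block_size))
    service.write_wav("tone_filtered.wav", b"".join(frames))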