import collections
import logging
import queue
import wave

import numpy as np
import scipy.signal
import webrtcvad

class UtteranceService:
    """The UtteranceService is responsible for compiling complete utterances for the speech transcriber."""

    channels: int = 1
    sample_rate: int = 16000
    input_rate: int
    blocks_per_second: int = 50
    reset_buffer_queue: bool = True

    # the number of samples/frames in each block (#samples == #blocks as channels == 1)
    block_size: int = sample_rate // blocks_per_second

    # duration of a single block/frame in milliseconds
    frame_duration_ms = property(
        lambda self: 1000 * self.block_size // self.sample_rate)

    buffer_queue: queue.Queue
    vad: webrtcvad.Vad
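
    # With the defaults each block is 320 samples, i.e. a 20ms frame, which is
    # one of the frame sizes webrtcvad accepts (10, 20 or 30ms).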
    def __init__(self, aggressiveness: int = 1, input_rate: int = 16000,
                 lowpass_frequency: int = 65, highpass_frequency: int = 4000) -> None:
        self.buffer_queue = queue.Queue()
        self.vad = webrtcvad.Vad(aggressiveness)
        # input_rate is the recording device's native sample rate; it defaults
        # to sample_rate, in which case no resampling is performed.
        self.input_rate = input_rate
        self._init_filter(lowpass_frequency, highpass_frequency)
    def _resample(self, data):
        """Resamples audio frames to the sample rate needed for DeepSpeech and webrtcvad (16000Hz).

        The user's microphone may not support the native processing sample rate of
        16000Hz, so audio data may have to be resampled from a sample rate supported
        by the recording device (input_rate) to sample_rate. For example, a block
        recorded at 44100Hz is shrunk by a factor of 16000/44100.
        """
        data16 = np.frombuffer(data, dtype=np.int16)
        resample_size = int(len(data16) * self.sample_rate / self.input_rate)
        resample = scipy.signal.resample(data16, resample_size)
        resample16 = np.array(resample, dtype=np.int16)
        return resample16.tobytes()
    def write_wav(self, filename: str, data):
        """Writes audio frames to a .wav file.

        Args:
            filename (str): The filename of the .wav file to be written.
            data: The audio frames.
        """
        logging.info("Writing wav file: %s", filename)
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(2)  # 16-bit samples, matching the int16 frames
            wf.setframerate(self.sample_rate)
            wf.writeframes(data)
    def _init_filter(self, lowpass_frequency: int, highpass_frequency: int):
        """Initialises the bandpass filter.

        Args:
            lowpass_frequency (int): The lower cutoff frequency of the passband.
            highpass_frequency (int): The upper cutoff frequency of the passband.
        """
        # butter() expects cutoffs normalised to the Nyquist frequency, so the
        # defaults of 65Hz and 4000Hz become Wn = [0.008125, 0.5].
        nyquist_frequency = 0.5 * self.sample_rate
        self.b, self.a = scipy.signal.butter(4, [
            lowpass_frequency / nyquist_frequency,
            highpass_frequency / nyquist_frequency
        ], btype='bandpass')
        self.zi = scipy.signal.lfilter_zi(self.b, self.a)
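
    # The coefficients above are applied one block at a time in _filter;
    # lfilter_zi seeds the initial filter state, a common way to reduce the
    # start-up transient on the first block.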
    def _filter(self, data):
        """Applies a bandpass filter to the audio signal."""
        data16 = np.frombuffer(data, dtype=np.int16)
        # zi carries the filter state across calls so consecutive blocks are
        # filtered as one continuous signal.
        filtered, self.zi = scipy.signal.lfilter(
            self.b, self.a, data16, axis=0, zi=self.zi)
        return np.array(filtered, dtype=np.int16).tobytes()
    def _frames(self):
        """Yields all audio frames from the microphone, blocking if necessary."""
        if self.input_rate == self.sample_rate:
            while True:
                yield self._filter(self.buffer_queue.get())
        else:
            while True:
                yield self._filter(self._resample(self.buffer_queue.get()))
    def utterances(self, padding_ms=300, ratio=0.75):
        """Yields a series of consecutive audio frames for each utterance, separated by a single None.

        It determines the level of voice activity from the ratio of voiced frames
        within the last padding_ms of audio, and uses a ring buffer so that the
        padding_ms of audio preceding the trigger is included in the utterance.

        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |--- utterance ---|      |--- utterance ---|
        """
        triggered = False
        ring_buffer = collections.deque(
            maxlen=(padding_ms // self.frame_duration_ms))
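
        # The ring buffer holds the most recent padding_ms worth of
        # (frame, is_speech) pairs: before triggering it preserves the audio
        # context that will be prepended to the utterance; after triggering it
        # is inspected to decide when the speaker has stopped.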
        try:
            for frame in self._frames():
                # webrtcvad requires complete 20ms frames (640 bytes of int16
                # audio at 16000Hz), so stop generating on a short block.
                if len(frame) < 640:
                    return

                is_speech = self.vad.is_speech(frame, self.sample_rate)

                if triggered:
                    yield frame
                    ring_buffer.append((frame, is_speech))
                    num_unvoiced = len(
                        [f for f, speech in ring_buffer if not speech])
                    # De-trigger once most of the recent window is silence.
                    if num_unvoiced > ratio * ring_buffer.maxlen:
                        triggered = False
                        yield None
                        ring_buffer.clear()
                        if self.reset_buffer_queue:
                            self.buffer_queue = queue.Queue()
                else:
                    ring_buffer.append((frame, is_speech))
                    num_voiced = len(
                        [f for f, speech in ring_buffer if speech])
                    # Trigger once most of the recent window is speech, and
                    # replay the buffered padding so the utterance keeps its
                    # leading context.
                    if num_voiced > ratio * ring_buffer.maxlen:
                        triggered = True
                        for f, _ in ring_buffer:
                            yield f
                        ring_buffer.clear()
        except KeyboardInterrupt:
            return
        finally:
            self._destroy()
    def _destroy(self) -> None:
        """Releases any resources held by the service (currently a no-op)."""
        pass
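
# A minimal usage sketch, not part of the original service: it assumes audio is
# pushed into buffer_queue by some producer (normally a microphone callback,
# e.g. from PyAudio) and shows the plumbing with a synthetic 440Hz tone
# instead, then saves the filtered frames with write_wav. The filename
# "tone_filtered.wav" is illustrative.
if __name__ == "__main__":
    import itertools

    logging.basicConfig(level=logging.INFO)
    service = UtteranceService()

    # One second of a 440Hz int16 tone at the native 16000Hz rate, split into
    # 320-sample blocks as a microphone callback would deliver them.
    t = np.arange(service.sample_rate) / service.sample_rate
    tone = (0.3 * np.iinfo(np.int16).max *
            np.sin(2 * np.pi * 440 * t)).astype(np.int16)
    for start in range(0, len(tone), service.block_size):
        service.buffer_queue.put(
            tone[start:start + service.block_size].tobytes())

    # Pull exactly the queued blocks back out through the bandpass filter
    # (islice, because _frames() otherwise blocks waiting for more audio) and
    # write them out for inspection. A real transcriber would instead iterate
    # service.utterances() while a microphone thread keeps feeding the queue.
    frames = list(itertools.islice(
        service._frames(), len(tone) // service.block_size))
    service.write_wav("tone_filtered.wav", b"".join(frames))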