Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import collections 

2import logging 

3import numpy as np 

4import queue 

5import scipy 

6import scipy.signal 

7import wave 

8import webrtcvad 

9 

10 

class UtteranceService:
    """Compiles complete utterances for the speech transcriber.

    Raw audio blocks arriving on ``buffer_queue`` are resampled to the 16 kHz
    processing rate when necessary, band-pass filtered, and grouped into
    utterances using WebRTC voice-activity detection (VAD).
    """

    channels: int = 1
    sample_rate: int = 16000  # rate required by the transcriber and webrtcvad
    input_rate: int  # sample rate of the recording device; set in __init__
    blocks_per_second: int = 50
    reset_buffer_queue: bool = True  # drop queued audio when an utterance ends

    # the number of samples/frames in each block (#samples == #blocks as channels == 1)
    block_size: int = sample_rate // blocks_per_second

    buffer_queue: queue.Queue
    vad: webrtcvad.Vad

    def __init__(self, aggressiveness: int = 1, lowpass_frequency: int = 65,
                 highpass_frequency: int = 4000, input_rate: int = 16000) -> None:
        """Initialises the VAD, the audio buffer queue and the bandpass filter.

        Args:
            aggressiveness (int): webrtcvad aggressiveness mode, 0..3
                (3 filters out non-speech most aggressively).
            lowpass_frequency (int): lower band edge of the filter in Hz.
            highpass_frequency (int): upper band edge of the filter in Hz.
                NOTE(review): these two names look semantically swapped
                (65 Hz is the low edge, 4000 Hz the high edge); kept
                unchanged for caller compatibility.
            input_rate (int): sample rate of the recording device. Defaults
                to 16000; previously this attribute was never initialised
                and had to be assigned externally before use.
        """
        self.buffer_queue = queue.Queue()
        self.vad = webrtcvad.Vad(aggressiveness)
        self.input_rate = input_rate

        self._init_filter(lowpass_frequency, highpass_frequency)

    @property
    def frame_duration_ms(self) -> int:
        """Duration of one audio block in milliseconds (20 ms by default)."""
        return 1000 * self.block_size // self.sample_rate

    def _resample(self, data):
        """Resamples audio frames to the sample rate needed for DeepSpeech and webrtcvad (16000Hz).

        The user's microphone may not support the native processing sampling
        rate of 16000Hz, so audio data may have to be resampled from a sample
        rate supported by the recording device (input_rate) to sample_rate.
        """
        data16 = np.frombuffer(data, dtype=np.int16)
        resample_size = int(len(data16) * self.sample_rate / self.input_rate)
        resampled = scipy.signal.resample(data16, resample_size)
        return np.array(resampled, dtype=np.int16).tobytes()

    def write_wav(self, filename: str, data):
        """Writes audio frames to a .wav file.

        Args:
            filename (str): The filename of the .wav file to be written.
            data: The audio frames (16-bit mono PCM bytes).
        """

        logging.info("Writing wav file: %s", filename)
        # context manager guarantees the handle is closed even on error
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(2)  # 16-bit samples
            wf.setframerate(self.sample_rate)
            wf.writeframes(data)

    def _init_filter(self, lowpass_frequency: int, highpass_frequency: int):
        """Initialises the 4th-order Butterworth bandpass filter.

        Args:
            lowpass_frequency (int): The lowpass filter cutoff frequency.
            highpass_frequency (int): The highpass filter cutoff frequency.
        """

        nyquist_frequency = 0.5 * self.sample_rate
        # public scipy.signal names replace the private filter_design /
        # signaltools submodules, which were removed in SciPy 1.8
        self.b, self.a = scipy.signal.butter(4, [
            lowpass_frequency / nyquist_frequency,
            highpass_frequency / nyquist_frequency
        ], btype='bandpass')
        self.zi = scipy.signal.lfilter_zi(self.b, self.a)

    def _filter(self, data):
        """Applies the bandpass filter to one block, carrying filter state (zi) across blocks."""

        data16 = np.frombuffer(data, dtype=np.int16)

        filtered, self.zi = scipy.signal.lfilter(
            self.b, self.a, data16, axis=0, zi=self.zi)

        return np.array(filtered, dtype=np.int16).tobytes()

    def _frames(self):
        """Yields filtered (and resampled, if needed) audio frames from the microphone, blocking if necessary."""
        needs_resample = self.input_rate != self.sample_rate
        while True:
            block = self.buffer_queue.get()
            if needs_resample:
                block = self._resample(block)
            yield self._filter(block)

    def utterances(self, padding_ms=300, ratio=0.75):
        """This is a generator that yields series of consecutive audio frames for each utterance, separated by a single None.

        It determines the level of voice activity using the ratio of voiced
        frames in a sliding window of padding_ms worth of frames: an utterance
        starts when more than ratio of the window is voiced and ends when more
        than ratio is unvoiced. A ring buffer supplies up to padding_ms of
        audio captured just prior to being triggered.

        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |    utterance    |      |    utterance    |
        """

        triggered = False
        ring_buffer = collections.deque(
            maxlen=(padding_ms // self.frame_duration_ms))

        try:
            for frame in self._frames():
                # a partial block (fewer than block_size 16-bit samples)
                # means the stream has ended
                if len(frame) < 2 * self.block_size:
                    return

                is_speech = self.vad.is_speech(frame, self.sample_rate)

                if triggered:
                    yield frame
                    ring_buffer.append((frame, is_speech))
                    num_unvoiced = sum(
                        1 for _, speech in ring_buffer if not speech)

                    if num_unvoiced > ratio * ring_buffer.maxlen:
                        triggered = False
                        yield None  # utterance boundary marker
                        ring_buffer.clear()
                        if self.reset_buffer_queue:
                            # discard audio captured while the utterance
                            # was being consumed downstream
                            self.buffer_queue = queue.Queue()
                else:
                    ring_buffer.append((frame, is_speech))
                    num_voiced = sum(
                        1 for _, speech in ring_buffer if speech)

                    if num_voiced > ratio * ring_buffer.maxlen:
                        triggered = True
                        # emit the buffered padding captured before the trigger
                        for f, _ in ring_buffer:
                            yield f
                        ring_buffer.clear()

        except KeyboardInterrupt:
            return
        finally:
            self._destroy()

    def _destroy(self) -> None:
        """Hook for releasing resources when the utterance generator exits; no-op by default."""
        pass