Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import datetime 

2from typing import Optional 

3import deepspeech 

4import enum 

5import logging 

6import numpy as np 

7import os 

8 

9from askbob.speech.listener.listener import UtteranceService 

10 

11 

class TranscriptionEvent(enum.Enum):
    """Events emitted while transcribing: the start and end of a spoken utterance."""
    START_UTTERANCE = enum.auto()  # first audio frame of an utterance was received
    END_UTTERANCE = enum.auto()    # the utterance finished and a transcript is available

16 

17 

class Transcriber:
    """The transcriber performs speech-to-text on captured utterances spoken by the user."""

    def __init__(self, model: str, scorer: str, us: UtteranceService, save_path: Optional[str] = None):
        """Initialises the transcriber.

        Args:
            model (str): The path to the DeepSpeech model.
            scorer (str): The path to an external scorer ("" disables it).
            us (UtteranceService): The utterance service supplying audio frames.
            save_path (str, optional): Directory in which to save a WAV recording
                of each transcribed utterance. Defaults to None (nothing saved).
        """
        # Load the DeepSpeech model
        self.model = self.init_deepspeech(model, scorer)

        # Utterance service
        self.us = us
        self.save_path = save_path

    def init_deepspeech(self, model_path: str, scorer_path: str = "") -> deepspeech.Model:
        """Initialises the DeepSpeech model.

        Args:
            model_path (str): The path to the DeepSpeech model.
            scorer_path (str, optional): The path to an external scorer. Defaults to "".

        Returns:
            deepspeech.Model: The DeepSpeech model.
        """

        # BUG FIX: this previously logged scorer_path; the message describes the
        # model, and the scorer is logged separately below.
        logging.info("Initialising DeepSpeech model: %s", model_path)

        model = deepspeech.Model(model_path)
        if scorer_path:
            logging.info("Enabling the external scorer: %s", scorer_path)
            model.enableExternalScorer(scorer_path)

        return model

    def transcribe(self):
        """Transcribes spoken words.

        Yields:
            TranscriptionEvent: Whether the utterance has started or ended.
            str: The transcribed phrase spoken by the user
                (None for START_UTTERANCE events).
        """

        # Without an utterance service there is nothing to transcribe.
        if not self.us:
            return

        if self.save_path:
            os.makedirs(self.save_path, exist_ok=True)

        stream_context = self.model.createStream()
        wav_data = bytearray()
        last_event = None
        try:
            for utterance in self.us.utterances():
                if utterance is not None:
                    # Emit START_UTTERANCE only on the first frame of an
                    # utterance, not on every subsequent audio chunk.
                    if last_event != TranscriptionEvent.START_UTTERANCE:
                        logging.debug("Utterance started.")
                        last_event = TranscriptionEvent.START_UTTERANCE
                        yield last_event, None

                    stream_context.feedAudioContent(
                        np.frombuffer(utterance, np.int16))

                    if self.save_path:
                        wav_data.extend(utterance)
                else:
                    # A None frame marks the end of the current utterance.
                    # BUG FIX: corrected the "Utterence" typo in this message.
                    logging.debug("Utterance ended.")

                    text = stream_context.finishStream()
                    if text and self.save_path:
                        self.us.write_wav(os.path.join(self.save_path, datetime.datetime.now().strftime(
                            "%Y-%m-%d_%H-%M-%S - " + text + ".wav")), wav_data)

                    last_event = TranscriptionEvent.END_UTTERANCE
                    yield last_event, text

                    if self.save_path:
                        wav_data = bytearray()

                    # Begin a fresh stream for the next utterance.
                    stream_context = self.model.createStream()
        except KeyboardInterrupt:
            return