Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import datetime
2from typing import Optional
3import deepspeech
4import enum
5import logging
6import numpy as np
7import os
9from askbob.speech.listener.listener import UtteranceService
class TranscriptionEvent(enum.Enum):
    """Events emitted while transcribing a stream of captured audio.

    START_UTTERANCE is yielded when the user begins speaking and
    END_UTTERANCE when the utterance has finished (and been transcribed).
    """

    # auto() assigns 1 and 2 in declaration order, matching the
    # original explicit values.
    START_UTTERANCE = enum.auto()
    END_UTTERANCE = enum.auto()
class Transcriber:
    """The transcriber performs speech-to-text on captured utterances spoken by the user."""

    def __init__(self, model: str, scorer: str, us: UtteranceService, save_path: Optional[str] = None):
        """Initialises the transcriber.

        Args:
            model (str): The path to the DeepSpeech model.
            scorer (str): The path to an external scorer ("" disables it).
            us (UtteranceService): The utterance service supplying audio frames.
            save_path (str, optional): Directory in which transcribed utterances
                are saved as WAV files. Defaults to None (no saving).
        """
        # Load the DeepSpeech model
        self.model = self.init_deepspeech(model, scorer)

        # Utterance service
        self.us = us
        self.save_path = save_path

    def init_deepspeech(self, model_path: str, scorer_path: str = "") -> deepspeech.Model:
        """Initialises the DeepSpeech model.

        Args:
            model_path (str): The path to the DeepSpeech model.
            scorer_path (str, optional): The path to an external scorer. Defaults to "".

        Returns:
            deepspeech.Model: The DeepSpeech model.
        """
        # Fix: the original logged scorer_path here, mislabelling it as the model path.
        logging.info("Initialising DeepSpeech model: %s", model_path)

        model = deepspeech.Model(model_path)
        if scorer_path:
            logging.info("Enabling the external scorer: %s", scorer_path)
            model.enableExternalScorer(scorer_path)

        return model

    def transcribe(self):
        """Transcribes spoken words captured by the utterance service.

        Runs until the utterance source is exhausted or a KeyboardInterrupt
        is raised. If `save_path` is set, each non-empty transcription is also
        written out as a timestamped WAV file.

        Yields:
            TranscriptionEvent: Whether the utterance has started or ended.
            str: The transcribed phrase spoken by the user
                (None for START_UTTERANCE events).
        """
        if not self.us:
            return

        if self.save_path:
            os.makedirs(self.save_path, exist_ok=True)

        stream_context = self.model.createStream()
        wav_data = bytearray()
        last_event = None
        try:
            for utterance in self.us.utterances():
                if utterance is not None:
                    # A non-None frame means speech is in progress; emit
                    # START_UTTERANCE only for the first frame of a run.
                    if last_event != TranscriptionEvent.START_UTTERANCE:
                        logging.debug("Utterance started.")
                        last_event = TranscriptionEvent.START_UTTERANCE
                        yield last_event, None

                    # Frames are raw 16-bit PCM bytes — presumably the format
                    # UtteranceService produces; verify against its docs.
                    stream_context.feedAudioContent(
                        np.frombuffer(utterance, np.int16))

                    if self.save_path:
                        wav_data.extend(utterance)
                else:
                    # A None frame marks the end of the current utterance.
                    # Fix: log message typo "Utterence" -> "Utterance".
                    logging.debug("Utterance ended.")

                    text = stream_context.finishStream()
                    if text and self.save_path:
                        self.us.write_wav(os.path.join(self.save_path, datetime.datetime.now().strftime(
                            "%Y-%m-%d_%H-%M-%S - " + text + ".wav")), wav_data)

                    last_event = TranscriptionEvent.END_UTTERANCE
                    yield last_event, text

                    # Reset per-utterance state for the next utterance.
                    if self.save_path:
                        wav_data = bytearray()

                    stream_context = self.model.createStream()
        except KeyboardInterrupt:
            return