import os
import contextlib
import asyncio
import keyboard
import datetime
from pvrecorder import PvRecorder
import wave
import struct
import whisper
import yaml
import torch
import random
from playsound import playsound
import requests
from TTS.api import TTS


class DankAssistant():
    """Push-to-talk voice assistant.

    Records microphone audio while the 'x' key is held, transcribes it
    with Whisper, and forwards the text to a chat bot (query_bot, defined
    elsewhere in this file). Replies are spoken via Coqui XTTS v2.
    """

    def start(self):
        """Initialise config, audio capture, speech recognition and TTS."""
        self.load_config()
        self.init_audio()
        self.init_speech_recognition()
        self.init_tts()
        self.bot_name = "dank-bot"
        # List of {"user": ..., "message": ...} dicts, oldest first.
        self.conversation_history = []

    def init_tts(self):
        """Load the Coqui XTTS v2 model, on GPU when available."""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS(
            model_name="tts_models/multilingual/multi-dataset/xtts_v2",
            progress_bar=False,
        ).to(device)

    def load_config(self):
        """Read settings.yaml into self.config and prompt.txt into
        self.prompt_template (trailing newlines stripped)."""
        config_filename = "settings.yaml"
        prompt_filename = "prompt.txt"
        print("Loading YAML config...")
        with open(config_filename, 'r') as conf_file:
            yaml_config = yaml.safe_load(conf_file)
        self.config = yaml_config.copy()
        # BUG FIX: the original shadowed the filename variable with the
        # open file handle; use distinct names for file name vs. handle.
        with open(prompt_filename, 'r') as prompt_file:
            self.prompt_template = prompt_file.read().rstrip('\n')

    def process_input(self):
        """Record one push-to-talk utterance to output.wav, transcribe it,
        and forward the transcription to the bot.

        Blocks until 'x' is pressed, records until it is released.
        Any error is reported and swallowed so the main loop continues.
        """
        print("Hold X to speak...")
        try:
            keyboard.wait("x")
            self.recorder.start()
            # BUG FIX: use a context manager so the wave file is closed
            # (header finalized, frames flushed) before Whisper reads it.
            with wave.open("output.wav", "w") as wavfile:
                # (nchannels, sampwidth, framerate, nframes, comptype,
                # compname) — nframes is rewritten by the wave module on
                # close, so passing frame_length here is harmless.
                wavfile.setparams((1, 2, self.recorder.sample_rate,
                                   self.recorder.frame_length, "NONE", "NONE"))
                playsound("sounds/beep-on.wav")
                while True:
                    frame = self.recorder.read()
                    # PvRecorder yields a list of 16-bit PCM samples.
                    wavfile.writeframes(struct.pack("h" * len(frame), *frame))
                    if not keyboard.is_pressed('x'):
                        break
            self.recorder.stop()
            playsound("sounds/beep-off.wav")
            result = self.get_speech_transcription("output.wav")
            self.query_bot(result)
        except Exception as e:
            # Top-level boundary for the interaction loop: report and
            # carry on rather than crashing the assistant.
            print(f"Error!\n{e}")

    def main_loop(self):
        """Run the push-to-talk interaction forever."""
        while True:
            self.process_input()

    def init_audio(self):
        """Pick a capture device and create the PvRecorder."""
        devices = PvRecorder.get_available_devices()
        # For some reason the last device is the default, at least on Windows
        chosen_device_index = len(devices) - 1
        chosen_device = devices[chosen_device_index]
        print(f"Using audio device {chosen_device}")
        self.recorder = PvRecorder(frame_length=512,
                                   device_index=chosen_device_index)

    def init_speech_recognition(self):
        """Load the Whisper 'small' speech-to-text model."""
        print("Loading speech recognition model...")
        self.whisper_model = whisper.load_model("small")

    def get_speech_transcription(self, speech_file):
        """Return the Whisper transcription text for the given audio file."""
        result = self.whisper_model.transcribe(speech_file)
        return result["text"]

    # Puts extra info in the prompt, like date, time, conversation history, etc
    def process_prompt(self):
        """Build the full LLM prompt from the template, substituting the
        conversation history, bot name and current date."""
        history_text = ""
        for entry in self.conversation_history:
            history_text += f"{entry['user']}: {entry['message']}\n"
        # NOTE(review): the placeholder tokens in the replace() calls below
        # appear to have been stripped from this copy of the source
        # (str.replace("", x) inserts x between every character, which
        # cannot be intended). They were presumably markers such as
        # "<history>", "<bot_name>", "<date>" — restore them from the
        # original file / prompt.txt template before relying on this method.
        full_prompt = self.prompt_template.replace("", history_text)
        full_prompt = full_prompt.replace("", self.bot_name)
        full_prompt = full_prompt.replace("", str(datetime.date.today()))
        # NOTE(review): the source was truncated mid-statement here; the
        # final replacement (most likely the current time, to pair with the
        # date above) and the return are reconstructed — confirm against
        # the original file.
        full_prompt = full_prompt.replace(
            "", str(datetime.datetime.now().time()))
        return full_prompt