# Reconstructed from a git diff (commit 0940e84, "Initial commit", author Cam
# Spry, 2024-05-13) that added assistant.py. The diff text in the available
# copy was collapsed onto a single line; the formatting below is restored by
# hand. process_prompt() was truncated mid-method in that copy -- see the
# NOTE there.

import os
import contextlib
import asyncio
import keyboard
import datetime
from pvrecorder import PvRecorder
import wave
import struct
import whisper
import yaml
import torch
import random
from playsound import playsound
import requests
from TTS.api import TTS


class DankAssistant():
    """Push-to-talk voice assistant.

    Records microphone audio while the 'x' key is held, transcribes it with
    Whisper, and forwards the transcription to the chat bot (query_bot,
    defined later in this file).
    """

    # Placeholder tokens expected inside prompt.txt.
    # NOTE(review): the original placeholder strings were lost when this
    # source was extracted (they appear as empty strings in the diff, and
    # str.replace("", x) would insert x between every character -- a certain
    # bug). Restore the exact tokens from prompt.txt before shipping.
    HISTORY_TOKEN = "<HISTORY>"
    BOT_NAME_TOKEN = "<BOT_NAME>"
    DATE_TOKEN = "<DATE>"

    def start(self):
        """Initialise all subsystems; must be called before main_loop()."""
        self.load_config()
        self.init_audio()
        self.init_speech_recognition()
        self.init_tts()
        self.bot_name = "dank-bot"
        self.conversation_history = []

    def init_tts(self):
        """Load the Coqui XTTS v2 model, on GPU when available."""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS(
            model_name="tts_models/multilingual/multi-dataset/xtts_v2",
            progress_bar=False,
        ).to(device)

    def load_config(self):
        """Read settings.yaml into self.config and prompt.txt into
        self.prompt_template (trailing newlines stripped)."""
        config_filename = "settings.yaml"
        # Renamed from prompt_file: the original reused one name for both the
        # filename and the open handle, shadowing the former.
        prompt_filename = "prompt.txt"
        print("Loading YAML config...")
        with open(config_filename, 'r') as conf_file:
            self.config = yaml.safe_load(conf_file).copy()
        with open(prompt_filename, 'r') as prompt_file:
            self.prompt_template = prompt_file.read().rstrip('\n')

    def process_input(self):
        """Record one push-to-talk utterance to output.wav, transcribe it,
        and hand the text to the bot.

        Blocks until 'x' is pressed, records while it is held, and plays
        beep sounds to mark recording start/stop.
        """
        print("Hold X to speak...")
        try:
            keyboard.wait("x")
            self.recorder.start()
            # FIX: the original never closed the wave file and transcribed it
            # while still open, so the RIFF header's frame count was never
            # finalised and the handle leaked. The with-block closes it
            # before Whisper reads it.
            with wave.open("output.wav", "w") as wavfile:
                # (nchannels, sampwidth, framerate, nframes, comptype, compname);
                # nframes is a placeholder -- wave fixes it on close.
                wavfile.setparams((1, 2, self.recorder.sample_rate,
                                   self.recorder.frame_length, "NONE", "NONE"))
                playsound("sounds/beep-on.wav")
                while True:
                    frame = self.recorder.read()
                    wavfile.writeframes(struct.pack("h" * len(frame), *frame))
                    if not keyboard.is_pressed('x'):
                        break
            self.recorder.stop()
            playsound("sounds/beep-off.wav")
            result = self.get_speech_transcription("output.wav")
            self.query_bot(result)
        except Exception as e:
            # Top-level boundary of the interaction loop: report the error
            # and let main_loop() try again rather than crashing.
            print(f"Error! {e}")

    def main_loop(self):
        """Run process_input() forever."""
        # (Removed an unused 'wavfile = None' local from the original.)
        while True:
            self.process_input()

    def init_audio(self):
        """Pick a capture device and construct the PvRecorder."""
        devices = PvRecorder.get_available_devices()
        # For some reason the last device is the default, at least on Windows.
        chosen_device_index = len(devices) - 1
        chosen_device = devices[chosen_device_index]
        print(f"Using audio device {chosen_device}")
        self.recorder = PvRecorder(frame_length=512,
                                   device_index=chosen_device_index)

    def init_speech_recognition(self):
        """Load the Whisper 'small' speech-to-text model."""
        print("Loading speech recognition model...")
        self.whisper_model = whisper.load_model("small")

    def get_speech_transcription(self, speech_file):
        """Return the Whisper transcription of *speech_file* as text."""
        # (Removed a stray trailing semicolon from the original return.)
        return self.whisper_model.transcribe(speech_file)["text"]

    # Puts extra info in the prompt, like date, time, conversation history, etc
    def process_prompt(self):
        """Expand the prompt template's placeholder tokens and return the
        full prompt string."""
        history_text = "".join(
            f"{entry['user']}: {entry['message']}\n"
            for entry in self.conversation_history
        )
        full_prompt = self.prompt_template.replace(self.HISTORY_TOKEN,
                                                   history_text)
        full_prompt = full_prompt.replace(self.BOT_NAME_TOKEN, self.bot_name)
        full_prompt = full_prompt.replace(self.DATE_TOKEN,
                                          str(datetime.date.today()))
        # NOTE(review): the available source was truncated mid-method here;
        # at least one more replacement (probably the current time, given the
        # comment above) is missing -- restore it from the original file.
        return full_prompt