#!/usr/bin/env python3
"""Hold-to-talk speech typing: record, transcribe with Whisper, type via xdotool.

Recording runs while the on-screen button is held, or while Right-Shift and
Right-Ctrl are both held down (global hotkey). On release the audio is
transcribed, shown in the window and typed into the active application.

Setup, as listed in the project's requirements.txt:

    pip install git+https://github.com/openai/whisper.git
    sudo apt install python3-cairo libcairo2-dev libgirepository2-dev portaudio19-dev
    pip install sounddevice pycairo pygobject
    pip install pynput
    sudo apt install xdotool

Origin: commit fb51972 by rnentjes (2026-02-16), "Add hotkey support and
`xdotool` integration for enhanced speech-to-text functionality".
"""

import os
import subprocess
import tempfile
import threading
import time
import wave

import gi
gi.require_version("Gtk", "3.0")
from gi.repository import Gtk, Gdk, GLib

import numpy as np
import sounddevice as sd
import whisper

from pynput import keyboard

# Load Whisper model once
model = whisper.load_model("medium")

# Recording parameters
# NOTE(review): 22000 Hz is kept from the original; 22050 or 16000 are the
# more common rates -- confirm the capture device accepts this value.
SAMPLE_RATE = 22000
CHANNELS = 1
DURATION_LIMIT = 300  # max seconds


class SpeechApp(Gtk.Window):
    """Main window: status bar, hold-to-speak button, transcript view, copy button.

    Threading model: GTK widgets are only touched from the GTK main loop.
    The recorder thread, the transcription thread and the pynput listener
    thread hand UI work back to the main loop with ``GLib.idle_add``.
    """

    def __init__(self):
        super().__init__(title="Speech Typing")
        self.set_border_width(20)
        self.set_default_size(400, 300)

        # Create a vertical box to hold our widgets
        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=10)
        self.add(vbox)

        # Create a status bar
        self.status_bar = Gtk.Label(label="Status: Ready")
        # Modern alternatives to set_alignment(0, 0.5)
        self.status_bar.set_halign(Gtk.Align.START)   # Horizontal alignment (left)
        self.status_bar.set_valign(Gtk.Align.CENTER)  # Vertical alignment (center)
        # Style the status bar
        context = self.status_bar.get_style_context()
        context.add_class("status-bar")
        # Create a frame for the status bar
        status_frame = Gtk.Frame()
        status_frame.add(self.status_bar)
        vbox.pack_start(status_frame, False, False, 0)

        # Add the button to the box
        self.button = Gtk.Button(label="🎙️ Hold to Speak")
        self.button.connect("pressed", self.on_button_pressed)
        self.button.connect("released", self.on_button_released)
        vbox.pack_start(self.button, False, False, 0)

        # Create a scrolled window to contain the text view
        scrolled_window = Gtk.ScrolledWindow()
        scrolled_window.set_hexpand(True)
        scrolled_window.set_vexpand(True)
        vbox.pack_start(scrolled_window, True, True, 0)

        # Create the text view
        self.text_view = Gtk.TextView()
        self.text_view.set_editable(False)
        self.text_view.set_wrap_mode(Gtk.WrapMode.WORD)
        scrolled_window.add(self.text_view)

        # Get the buffer associated with the text view
        self.text_buffer = self.text_view.get_buffer()

        # Add a copy button
        self.copy_button = Gtk.Button(label="📋 Copy Text")
        self.copy_button.connect("clicked", self.on_copy_button_clicked)
        vbox.pack_start(self.copy_button, False, False, 0)

        # Recording state. `recording` is the flag the recorder thread and
        # the audio callback poll to know when to stop.
        self.recording = False
        self.audio = []
        self.start_time = 0.0
        self.record_thread = None

        # Hotkey state; only touched from the pynput listener thread.
        self.hotkey_pressed = set()
        self.hotkey_active = False

        # Global hotkey listener for Right-Shift + Right-Ctrl
        self.listener = keyboard.Listener(
            on_press=self.on_key_press,
            on_release=self.on_key_release,
        )
        self.listener.daemon = True
        self.listener.start()

    def on_button_pressed(self, widget):
        """Start recording when the hold-to-speak button is pressed."""
        self.start_recording()

    def on_button_released(self, widget):
        """Stop recording when the hold-to-speak button is released."""
        self.stop_recording()

    def start_recording(self):
        """Clear the transcript and start the recorder thread.

        Does nothing if a recording is already in progress. Must run on the
        GTK main loop because it edits the text buffer.
        """
        if self.recording:
            return
        self.recording = True
        self.audio = []
        self.start_time = time.time()

        # Clear the text buffer
        start_iter = self.text_buffer.get_start_iter()
        end_iter = self.text_buffer.get_end_iter()
        self.text_buffer.delete(start_iter, end_iter)

        # Update status bar with initial countdown
        self.update_status(f"Recording... {DURATION_LIMIT:.1f}s remaining")
        self.record_thread = threading.Thread(target=self.record_audio)
        self.record_thread.start()

    def stop_recording(self):
        """Stop the recorder thread and start transcription in the background.

        Does nothing if no recording is in progress, so it is safe to call
        once from the duration limit and again from the key/button release.
        """
        if not self.recording:
            return
        self.recording = False
        # The recorder polls `recording` every 100 ms, so this join is short.
        self.record_thread.join()
        # Update status bar
        self.update_status("Transcribing...")
        threading.Thread(target=self.transcribe_and_type).start()

    def record_audio(self):
        """Capture microphone audio into ``self.audio`` (recorder thread body).

        Runs until ``self.recording`` is cleared or ``DURATION_LIMIT`` seconds
        have elapsed, posting a countdown to the status bar every 100 ms. If
        the limit is what ended the capture, ``stop_recording`` is scheduled
        on the main loop so transcription starts without waiting for the
        key or button to be released.
        """
        def callback(indata, frames, time_info, status):
            if self.recording and (time.time() - self.start_time < DURATION_LIMIT):
                self.audio.append(indata.copy())
            else:
                raise sd.CallbackStop()

        check_interval = 100  # milliseconds
        limit_ms = DURATION_LIMIT * 1000
        elapsed_time = 0
        with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, callback=callback):
            # Check recording status every 100ms instead of sleeping for the full duration
            while self.recording and elapsed_time < limit_ms:
                remaining_time = (limit_ms - elapsed_time) / 1000
                GLib.idle_add(self.update_status, f"Recording... {remaining_time:.1f}s remaining")
                sd.sleep(check_interval)
                elapsed_time += check_interval

        if self.recording:
            # The loop ended because of the time limit, not a release.
            GLib.idle_add(self.stop_recording)

    def update_status(self, status):
        """Show ``status`` in the status bar; usable as a ``GLib.idle_add`` callback."""
        self.status_bar.set_text(f"Status: {status}")
        return False  # Return False to prevent being called again

    def transcribe_and_type(self):
        """Transcribe the captured audio, display it and type it (worker thread body).

        The audio is written to a temporary WAV file that is always removed
        afterwards. If transcription fails, the error is shown in the status
        bar and the exception is re-raised so the traceback is still printed.
        """
        chunks = self.audio
        if not chunks:
            # Update status back to Ready if no audio was recorded
            GLib.idle_add(self.update_status, "Ready")
            return

        samples = np.concatenate(chunks, axis=0)

        # Only the path is needed; close the handle before `wave` reopens it.
        tmpfile = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        wav_path = tmpfile.name
        tmpfile.close()
        try:
            self._write_wav(wav_path, samples)
            # Replace "en" with your desired language code
            result = model.transcribe(wav_path, language="en")
        except Exception as exc:
            GLib.idle_add(self.update_status, f"Transcription failed: {exc}")
            raise
        finally:
            os.unlink(wav_path)

        text = result["text"].strip()

        # Display the text in the text area (on the GTK main loop)
        GLib.idle_add(self._append_transcript, text)

        # Type the text into the current active application
        self.send_text_to_active_app(text)

        # Update status back to Ready
        GLib.idle_add(self.update_status, "Ready")

    @staticmethod
    def _write_wav(path, samples):
        """Write ``samples`` to ``path`` as 16-bit PCM at ``SAMPLE_RATE``.

        Assumes float samples nominally in [-1, 1] (sounddevice's default
        float32 input); values outside that range are clipped so the int16
        conversion cannot wrap around.
        """
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        with wave.open(path, "wb") as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)
            wf.setframerate(SAMPLE_RATE)
            wf.writeframes(pcm.tobytes())

    def _append_transcript(self, text):
        """Append ``text`` to the transcript view and scroll to the end.

        Must run on the GTK main loop; returns False so it runs only once
        when scheduled with ``GLib.idle_add``.
        """
        buffer = self.text_buffer
        if buffer.get_char_count() > 0:
            buffer.insert(buffer.get_end_iter(), "\n\n")
        buffer.insert(buffer.get_end_iter(), text)

        # Scroll to the end of the text view
        mark = buffer.create_mark(None, buffer.get_end_iter(), False)
        self.text_view.scroll_to_mark(mark, 0.0, True, 0.0, 1.0)
        return False

    def on_copy_button_clicked(self, widget):
        """Copy the whole transcript to the clipboard."""
        # Get start and end iterators for the entire buffer
        start_iter = self.text_buffer.get_start_iter()
        end_iter = self.text_buffer.get_end_iter()

        # Get all text from the buffer
        text = self.text_buffer.get_text(start_iter, end_iter, False)

        # Get the clipboard
        clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)

        # Set the text to the clipboard
        clipboard.set_text(text, -1)

        # Make the text available to other applications
        clipboard.store()

    def send_text_to_active_app(self, text):
        """Type ``text`` into the active application using ``xdotool``.

        Empty text is ignored. A missing ``xdotool`` binary is tolerated;
        the transcript is still shown in the window and can be copied.
        """
        if not text:
            return
        try:
            # "--" ends option parsing so text starting with "-" is typed
            # literally instead of being read as an xdotool option.
            subprocess.run(
                ["xdotool", "type", "--delay", "50", "--", text],
                check=False,
            )
        except FileNotFoundError:
            # xdotool not installed; fail silently to avoid crashing the app
            pass

    def on_key_press(self, key):
        """Track hotkey keys; start recording once both are held (listener thread)."""
        if key == keyboard.Key.shift_r:
            self.hotkey_pressed.add("shift_r")
        elif key == keyboard.Key.ctrl_r:
            self.hotkey_pressed.add("ctrl_r")

        if not self.hotkey_active and self.hotkey_pressed == {"shift_r", "ctrl_r"}:
            self.hotkey_active = True
            GLib.idle_add(self.start_recording)

    def on_key_release(self, key):
        """Track hotkey keys; stop recording once either is released (listener thread)."""
        if key == keyboard.Key.shift_r:
            self.hotkey_pressed.discard("shift_r")
        elif key == keyboard.Key.ctrl_r:
            self.hotkey_pressed.discard("ctrl_r")

        if self.hotkey_active and self.hotkey_pressed != {"shift_r", "ctrl_r"}:
            self.hotkey_active = False
            GLib.idle_add(self.stop_recording)

    def on_destroy(self, widget):
        """Stop background activity and leave the GTK main loop."""
        # Let a running recorder thread exit promptly; it is not a daemon
        # thread and would otherwise keep the process alive until the
        # duration limit is reached.
        self.recording = False
        try:
            self.listener.stop()
        except Exception:
            # Best effort: shutting down must not be blocked by the listener.
            pass
        Gtk.main_quit()


win = SpeechApp()
win.connect("destroy", win.on_destroy)
win.show_all()
Gtk.main()