Add hotkey support and xdotool integration for enhanced speech-to-text functionality

2026-02-16 14:04:29 +01:00
parent 50dc55a6a6
commit fb51972ec9
2 changed files with 239 additions and 1 deletions
--- a/speech2text-keyed.py
+++ b/speech2text-keyed.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+
+import gi
+gi.require_version("Gtk", "3.0")
+from gi.repository import Gtk, Gdk, GLib
+
+import sounddevice as sd
+import numpy as np
+import whisper
+import tempfile
+import threading
+import time
+import wave
+import subprocess
+
+from pynput import keyboard
+
+# Load the Whisper "medium" model once at import; every transcription reuses it
+model = whisper.load_model("medium")
+
+# Recording parameters
+SAMPLE_RATE = 22000  # Hz; used for both the input stream and the WAV header
+CHANNELS = 1  # number of input channels (mono)
+DURATION_LIMIT = 300  # max seconds per recording
+
+class SpeechApp(Gtk.Window):
+    def __init__(self):
+        Gtk.Window.__init__(self, title="Speech Typing")
+        self.set_border_width(20)
+        self.set_default_size(400, 300)
+
+        # Create a vertical box to hold our widgets
+        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=10)
+        self.add(vbox)
+
+        # Create a status bar
+        self.status_bar = Gtk.Label(label="Status: Ready")
+        # Modern alternatives to set_alignment(0, 0.5)
+        self.status_bar.set_halign(Gtk.Align.START)  # Horizontal alignment (left)
+        self.status_bar.set_valign(Gtk.Align.CENTER)  # Vertical alignment (center)
+        # Style the status bar
+        context = self.status_bar.get_style_context()
+        context.add_class("status-bar")
+        # Create a frame for the status bar
+        status_frame = Gtk.Frame()
+        status_frame.add(self.status_bar)
+        vbox.pack_start(status_frame, False, False, 0)
+
+        # Add the button to the box
+        self.button = Gtk.Button(label="🎙️ Hold to Speak")
+        self.button.connect("pressed", self.on_button_pressed)
+        self.button.connect("released", self.on_button_released)
+        vbox.pack_start(self.button, False, False, 0)
+
+        # Create a scrolled window to contain the text view
+        scrolled_window = Gtk.ScrolledWindow()
+        scrolled_window.set_hexpand(True)
+        scrolled_window.set_vexpand(True)
+        vbox.pack_start(scrolled_window, True, True, 0)
+
+        # Create the text view
+        self.text_view = Gtk.TextView()
+        self.text_view.set_editable(False)
+        self.text_view.set_wrap_mode(Gtk.WrapMode.WORD)
+        scrolled_window.add(self.text_view)
+
+        # Get the buffer associated with the text view
+        self.text_buffer = self.text_view.get_buffer()
+
+        # Add a copy button
+        self.copy_button = Gtk.Button(label="📋 Copy Text")
+        self.copy_button.connect("clicked", self.on_copy_button_clicked)
+        vbox.pack_start(self.copy_button, False, False, 0)
+
+        self.recording = False
+        self.audio = []
+        self.hotkey_pressed = set()
+        self.hotkey_active = False
+
+        # Global hotkey listener for Right-Shift + Right-Ctrl
+        self.listener = keyboard.Listener(
+            on_press=self.on_key_press,
+            on_release=self.on_key_release,
+        )
+        self.listener.daemon = True
+        self.listener.start()
+
+    def on_button_pressed(self, widget):
+        self.start_recording()
+
+    def on_button_released(self, widget):
+        self.stop_recording()
+
+    def start_recording(self):
+        if self.recording:
+            return
+        self.recording = True
+        self.audio = []
+        self.start_time = time.time()
+
+        # Clear the text buffer
+        start_iter = self.text_buffer.get_start_iter()
+        end_iter = self.text_buffer.get_end_iter()
+        self.text_buffer.delete(start_iter, end_iter)
+
+        # Update status bar with initial countdown
+        self.update_status(f"Recording... {DURATION_LIMIT:.1f}s remaining")
+        self.record_thread = threading.Thread(target=self.record_audio)
+        self.record_thread.start()
+
+    def stop_recording(self):
+        if not self.recording:
+            return
+        self.recording = False
+        self.record_thread.join()
+        # Update status bar
+        self.update_status("Transcribing...")
+        threading.Thread(target=self.transcribe_and_type).start()
+
+    def record_audio(self):
+        def callback(indata, frames, time_info, status):
+            if self.recording and (time.time() - self.start_time < DURATION_LIMIT):
+                self.audio.append(indata.copy())
+            else:
+                raise sd.CallbackStop()
+
+        with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, callback=callback):
+            # Check recording status every 100ms instead of sleeping for the full duration
+            check_interval = 100  # milliseconds
+            elapsed_time = 0
+            while self.recording and elapsed_time < DURATION_LIMIT * 1000:
+                # Calculate remaining time in seconds
+                remaining_time = (DURATION_LIMIT * 1000 - elapsed_time) / 1000
+                # Update status bar with countdown
+                GLib.idle_add(self.update_status, f"Recording... {remaining_time:.1f}s remaining")
+                sd.sleep(check_interval)
+                elapsed_time += check_interval
+
+    def update_status(self, status):
+        # Helper method to update status
+        self.status_bar.set_text(f"Status: {status}")
+        return False  # Return False to prevent being called again
+
+    def transcribe_and_type(self):
+        if not self.audio:
+            # Update status back to Ready if no audio was recorded
+            GLib.idle_add(self.update_status, "Ready")
+            return
+
+        audio_np = np.concatenate(self.audio, axis=0)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
+            with wave.open(tmpfile.name, 'wb') as wf:
+                wf.setnchannels(CHANNELS)
+                wf.setsampwidth(2)
+                wf.setframerate(SAMPLE_RATE)
+                wf.writeframes((audio_np * 32767).astype(np.int16).tobytes())
+
+            # In the transcribe_and_type method, modify the transcribe call:
+            # Replace "en" with your desired language code
+            result = model.transcribe(tmpfile.name, language="en")
+            text = result["text"].strip()
+
+            # Display the text in the text area
+            end_iter = self.text_buffer.get_end_iter()
+            if self.text_buffer.get_char_count() > 0:
+                self.text_buffer.insert(end_iter, "\n\n")
+                end_iter = self.text_buffer.get_end_iter()
+            self.text_buffer.insert(end_iter, text)
+
+            # Scroll to the end of the text view
+            mark = self.text_buffer.create_mark(None, end_iter, False)
+            self.text_view.scroll_to_mark(mark, 0.0, True, 0.0, 1.0)
+
+            # Type the text into the current active application
+            self.send_text_to_active_app(text)
+
+            # Update status back to Ready
+            GLib.idle_add(self.update_status, "Ready")
+
+    def on_copy_button_clicked(self, widget):
+        # Get start and end iterators for the entire buffer
+        start_iter = self.text_buffer.get_start_iter()
+        end_iter = self.text_buffer.get_end_iter()
+
+        # Get all text from the buffer
+        text = self.text_buffer.get_text(start_iter, end_iter, False)
+
+        # Get the clipboard
+        clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
+
+        # Set the text to the clipboard
+        clipboard.set_text(text, -1)
+
+        # Make the text available to other applications
+        clipboard.store()
+
+    def send_text_to_active_app(self, text):
+        if not text:
+            return
+        try:
+            subprocess.run(["xdotool", "type", "--delay", "50", text], check=False)
+        except FileNotFoundError:
+            # xdotool not installed; fail silently to avoid crashing the app
+            pass
+
+    def on_key_press(self, key):
+        if key == keyboard.Key.shift_r:
+            self.hotkey_pressed.add("shift_r")
+        elif key == keyboard.Key.ctrl_r:
+            self.hotkey_pressed.add("ctrl_r")
+
+        if not self.hotkey_active and self.hotkey_pressed == {"shift_r", "ctrl_r"}:
+            self.hotkey_active = True
+            GLib.idle_add(self.start_recording)
+
+    def on_key_release(self, key):
+        if key == keyboard.Key.shift_r:
+            self.hotkey_pressed.discard("shift_r")
+        elif key == keyboard.Key.ctrl_r:
+            self.hotkey_pressed.discard("ctrl_r")
+
+        if self.hotkey_active and self.hotkey_pressed != {"shift_r", "ctrl_r"}:
+            self.hotkey_active = False
+            GLib.idle_add(self.stop_recording)
+
+    def on_destroy(self, widget):
+        try:
+            self.listener.stop()
+        except Exception:
+            pass
+        Gtk.main_quit()
+
+win = SpeechApp()  # builds the UI and starts the global hotkey listener
+win.connect("destroy", win.on_destroy)  # closing the window quits the main loop
+win.show_all()
+Gtk.main()  # blocks until Gtk.main_quit() is called