speech2text/speech2text.py

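"""Push-to-talk speech-to-text: hold the button to record from the microphone,
release to transcribe the clip with OpenAI Whisper, and copy the result from
the text view. Requires PyGObject (GTK 3), sounddevice, numpy, openai-whisper,
and ffmpeg on the PATH (used by Whisper to load the temporary WAV)."""
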
import gi
gi.require_version("Gtk", "3.0")
from gi.repository import Gtk, Gdk, GLib
import sounddevice as sd
import numpy as np
import whisper
import tempfile
import threading
import time
import wave
# Load Whisper model once
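# ("medium" is accurate but slow on CPU; "small" or "base" load faster at lower accuracy)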
model = whisper.load_model("medium")
# Recording parameters
SAMPLE_RATE = 22000
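# Whisper resamples the WAV to 16 kHz itself (via ffmpeg), so this rate only
# needs to be one the microphone supports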
CHANNELS = 1
DURATION_LIMIT = 300 # max seconds
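# Recording stops automatically once the limit is reached, even if the button is still held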


class SpeechApp(Gtk.Window):
    def __init__(self):
        Gtk.Window.__init__(self, title="Speech Typing")
        self.set_border_width(20)
        self.set_default_size(400, 300)

        # Create a vertical box to hold our widgets
        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=10)
        self.add(vbox)

        # Create a status bar
        self.status_bar = Gtk.Label(label="Status: Ready")
        # Modern alternatives to set_alignment(0, 0.5)
        self.status_bar.set_halign(Gtk.Align.START)   # Horizontal alignment (left)
        self.status_bar.set_valign(Gtk.Align.CENTER)  # Vertical alignment (center)

        # Style the status bar
        context = self.status_bar.get_style_context()
        context.add_class("status-bar")

        # Create a frame for the status bar
        status_frame = Gtk.Frame()
        status_frame.add(self.status_bar)
        vbox.pack_start(status_frame, False, False, 0)

        # Add the button to the box
        self.button = Gtk.Button(label="🎙️ Hold to Speak")
        self.button.connect("pressed", self.on_button_pressed)
        self.button.connect("released", self.on_button_released)
        vbox.pack_start(self.button, False, False, 0)

        # Create a scrolled window to contain the text view
        scrolled_window = Gtk.ScrolledWindow()
        scrolled_window.set_hexpand(True)
        scrolled_window.set_vexpand(True)
        vbox.pack_start(scrolled_window, True, True, 0)

        # Create the text view
        self.text_view = Gtk.TextView()
        self.text_view.set_editable(False)
        self.text_view.set_wrap_mode(Gtk.WrapMode.WORD)
        scrolled_window.add(self.text_view)

        # Get the buffer associated with the text view
        self.text_buffer = self.text_view.get_buffer()

        # Add a copy button
        self.copy_button = Gtk.Button(label="📋 Copy Text")
        self.copy_button.connect("clicked", self.on_copy_button_clicked)
        vbox.pack_start(self.copy_button, False, False, 0)

        self.recording = False
        self.audio = []

    def on_button_pressed(self, widget):
        self.recording = True
        self.audio = []
        self.start_time = time.time()

        # Clear the text buffer
        start_iter = self.text_buffer.get_start_iter()
        end_iter = self.text_buffer.get_end_iter()
        self.text_buffer.delete(start_iter, end_iter)

        # Update status bar with initial countdown
        self.update_status(f"Recording... {DURATION_LIMIT:.1f}s remaining")

        self.record_thread = threading.Thread(target=self.record_audio)
        self.record_thread.start()

    def on_button_released(self, widget):
        self.recording = False
        self.record_thread.join()

        # Update status bar
        self.update_status("Transcribing...")
        threading.Thread(target=self.transcribe_and_type).start()

    def record_audio(self):
        def callback(indata, frames, time_info, status):
            if self.recording and (time.time() - self.start_time < DURATION_LIMIT):
                self.audio.append(indata.copy())
            else:
                raise sd.CallbackStop()

        with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, callback=callback):
            # Check recording status every 100ms instead of sleeping for the full duration
            check_interval = 100  # milliseconds
            elapsed_time = 0
            while self.recording and elapsed_time < DURATION_LIMIT * 1000:
                # Calculate remaining time in seconds
                remaining_time = (DURATION_LIMIT * 1000 - elapsed_time) / 1000
                # Update status bar with countdown
                GLib.idle_add(self.update_status, f"Recording... {remaining_time:.1f}s remaining")
                sd.sleep(check_interval)
                elapsed_time += check_interval

    def update_status(self, status):
        # Helper method to update status
        self.status_bar.set_text(f"Status: {status}")
        return False  # Return False to prevent being called again

    def transcribe_and_type(self):
        if not self.audio:
            # Update status back to Ready if no audio was recorded
            GLib.idle_add(self.update_status, "Ready")
            return

        audio_np = np.concatenate(self.audio, axis=0)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            with wave.open(tmpfile.name, 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(2)
                wf.setframerate(SAMPLE_RATE)
                # Convert float32 samples in [-1, 1] to 16-bit PCM
                wf.writeframes((audio_np * 32767).astype(np.int16).tobytes())

        # Replace "en" with your desired language code
        result = model.transcribe(tmpfile.name, language="en")
        text = result["text"].strip()

        def show_text():
            # Display the text in the text area
            end_iter = self.text_buffer.get_end_iter()
            if self.text_buffer.get_char_count() > 0:
                self.text_buffer.insert(end_iter, "\n\n")
                end_iter = self.text_buffer.get_end_iter()
            self.text_buffer.insert(end_iter, text)
            # Scroll to the end of the text view
            mark = self.text_buffer.create_mark(None, end_iter, False)
            self.text_view.scroll_to_mark(mark, 0.0, True, 0.0, 1.0)
            return False

        # This method runs in a worker thread; GTK widgets must only be touched
        # from the main loop, so hand the text view update back via GLib.idle_add
        GLib.idle_add(show_text)

        # Type the text using xdotool (add "import subprocess" at the top if re-enabled)
        # subprocess.run(["xdotool", "type", "--delay", "50", text])

        # Update status back to Ready
        GLib.idle_add(self.update_status, "Ready")

    def on_copy_button_clicked(self, widget):
        # Get start and end iterators for the entire buffer
        start_iter = self.text_buffer.get_start_iter()
        end_iter = self.text_buffer.get_end_iter()
        # Get all text from the buffer
        text = self.text_buffer.get_text(start_iter, end_iter, False)

        # Get the clipboard
        clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
        # Set the text to the clipboard
        clipboard.set_text(text, -1)
        # Make the text available to other applications
        clipboard.store()

win = SpeechApp()
win.connect("destroy", Gtk.main_quit)
win.show_all()
Gtk.main()