From 5579b4dc6479ec5d7299cd2b6e7ca6bbeb3915b7 Mon Sep 17 00:00:00 2001 From: Victoria Fischer Date: Tue, 2 Jun 2026 11:25:22 +0200 Subject: [PATCH] code: implement audio transcription from mic audio --- Cargo.lock | 336 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 5 + src/main.rs | 155 +++++++++++++++++++++++- 3 files changed, 492 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8b93d51..a3f8c08 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -158,6 +158,43 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "audio-core" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93ebbf82d06013f4c41fe71303feb980cddd78496d904d06be627972de51a24" + +[[package]] +name = "audioadapter" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91f87b70b051c5866680ad79f6743a42ccab264c009d1a71f4d33a3872ae60c8" +dependencies = [ + "audio-core", + "num-traits", +] + +[[package]] +name = "audioadapter-buffers" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9097d67933fb083d382ce980430afdb758aada60846010aee6be068c06cef0ca" +dependencies = [ + "audioadapter", + "audioadapter-sample", + "num-traits", +] + +[[package]] +name = "audioadapter-sample" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ab94f2bc04a14e1f49ee5f222f66460e8a1b51627bdfedf34eed394d747938" +dependencies = [ + "audio-core", + "num-traits", +] + [[package]] name = "autocfg" version = "1.5.1" @@ -233,6 +270,9 @@ name = "bitflags" version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +dependencies = [ + "serde_core", +] [[package]] name = "bitmaps" @@ -487,6 +527,31 @@ dependencies = [ "libc", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crossterm" version = "0.29.0" @@ -878,17 +943,22 @@ dependencies = [ "crossterm", "futures", "futures-timer", + "hound", "iref 4.0.0", + "jack", "json-ld", + "oximedia-metering", "ratatui", "rdf-types", "reqwest 0.13.4", + "resampler", "schemars", "scraper", "serde", "serde_json", "sqlite", "static-iref", + "tempfile", "throbber-widgets-tui", "tokio", "tui-input", @@ -1240,6 +1310,11 @@ name = "hashbrown" version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "heck" @@ -1268,6 +1343,12 @@ dependencies = [ "digest", ] +[[package]] +name = "hound" +version = "3.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f" + [[package]] name = "html5ever" version = "0.39.0" @@ -1655,6 +1736,33 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "jack" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7811b07bcac5dafabf814ab52c4b0ca9b7948aa1e279f572f03aa6544d47d27" +dependencies = [ + "bitflags 2.11.1", + "jack-sys", + "lazy_static", + "libc", + "log", +] + +[[package]] +name = "jack-sys" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6013b7619b95a22b576dfb43296faa4ecbe40abbdb97dfd22ead520775fc86ab" +dependencies = [ + "bitflags 1.3.2", + "lazy_static", + "libc", + "libloading", + "log", + "pkg-config", +] + [[package]] name = "jni" version = "0.22.4" @@ -2019,6 +2127,22 @@ version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + [[package]] name = "line-clipping" version = "0.3.7" @@ -2277,6 +2401,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.2.2" @@ -2321,6 +2454,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -2414,6 +2548,86 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d211803b9b6b570f68772237e415a029d5a50c65d382910b879fb19d3271f94d" +[[package]] +name = "oxifft" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc0fe3b5f76266f8b860d18c307cacd467213b257dc3098641e19bc3ac17350b" +dependencies = [ + "hashbrown 0.17.1", + "libm", + "num-complex", + "num-traits", + "oxifft-codegen", + "rayon", + "seahash", + "serde", + "serde_json", + "spin", +] + +[[package]] +name = "oxifft-codegen" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b6627e29854ccb428fbf56e95c1970882096a53dae8954759ab09eb5d62f8d5" +dependencies = [ + "oxifft-codegen-impl", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "oxifft-codegen-impl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "660807d8bbd7453e595aca8523d6f6991dcba0ebd3509f2e5e0b98c1c348e920" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "oximedia-audio" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0408efebc08a1df406d8e56e4b47c19bbb2b69109f1298d943a8d6a435f57b9" +dependencies = [ + "audioadapter", + "audioadapter-buffers", + "bytes", + "oxifft", + "oximedia-core", + "rubato", + "thiserror 2.0.18", +] + +[[package]] +name = "oximedia-core" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb84c059565d2e515e8f7bf655dbda9719237cb4ec915de968bb599f9208efc1" +dependencies = [ + "bitflags 2.11.1", + "serde", + "thiserror 2.0.18", +] + +[[package]] +name = "oximedia-metering" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e8832a54065b5023cd7d1881ef572d485a20d8a691c1528cac4cbfc5d054ea4" +dependencies = [ + "oxifft", + "oximedia-audio", + "oximedia-core", + "rayon", + "thiserror 2.0.18", +] + [[package]] name = "parking_lot" version = "0.12.5" @@ -2683,6 +2897,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "primal-check" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08" +dependencies = [ + "num-integer", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -2943,6 +3166,26 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bd1f6fba6db8161b6818f9061152e751b4d6030b39b561bbbb0153b36a6cfc5" +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "rdf-types" version = "0.22.5" @@ -2961,6 +3204,15 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "realfft" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f821338fddb99d089116342c46e9f1fbf3828dba077674613e734e01d6ea8677" +dependencies = [ + "rustfft", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -3125,6 +3377,12 @@ dependencies = [ "tower-service", ] +[[package]] +name = "resampler" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28fdbea87ff02ebbfd904079d1e52138c0c7fbaa3aaddca4a1b9c7d3f85749f2" + [[package]] name = "ring" version = "0.17.14" @@ -3139,6 +3397,22 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rubato" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce96ead1a91f7895704a9f08ea5947dfc8bd7c1f2936a22295b655ec67e5c6ef" +dependencies = [ + "audioadapter", + "audioadapter-buffers", + "num-complex", + "num-integer", + "num-traits", + "realfft", + "visibility", + "windowfunctions", +] + [[package]] name = "rustc-demangle" version = "0.1.27" @@ -3160,6 +3434,20 @@ dependencies = [ "semver", ] +[[package]] +name = "rustfft" +version = "6.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89" +dependencies = [ + "num-complex", + "num-integer", + "num-traits", + "primal-check", + "strength_reduce", + "transpose", +] + [[package]] name = "rustix" version = "1.1.4" @@ -3336,6 +3624,12 @@ dependencies = [ "tendril", ] +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + [[package]] name = "secrecy" version = "0.10.3" @@ -3590,6 +3884,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "spin" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1527984ca054dfca79333baec451042863f485fbee01b7bf6d911de915cac865" + [[package]] name = "sqlite" version = "0.37.0" @@ -3705,6 +4005,12 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "string_cache" version = "0.9.0" @@ -4199,6 +4505,16 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "transpose" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e" +dependencies = [ + "num-integer", + "strength_reduce", +] + [[package]] name = "try-lock" version = "0.2.5" @@ -4360,6 +4676,17 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "visibility" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d674d135b4a8c1d7e813e2f8d1c9a58308aee4a680323066025e53132218bd91" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "vtparse" version = "0.6.2" @@ -4658,6 +4985,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windowfunctions" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90628d739333b7c5d2ee0b70210b97b8cddc38440c682c96fd9e2c24c2db5f3a" +dependencies = [ + "num-traits", +] + [[package]] name = "windows-core" version = "0.62.2" diff --git a/Cargo.toml b/Cargo.toml index 15cf0e0..741f9d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,17 +10,22 @@ color-eyre = "0.6.5" crossterm = { version = "0.29.0", features = ["event-stream"] } futures = "0.3.32" futures-timer = "3.0.4" +hound = "3.5.1" iref = { version = "4.0.0", features = ["url", "serde"] } +jack = "0.13.5" json-ld = { version = "0.21.4", features = ["reqwest", "serde"] } +oximedia-metering = "0.1.7" ratatui = "0.30.0" rdf-types = "0.22.5" reqwest = "0.13.4" +resampler = "0.5.1" schemars = "1.2.1" scraper = "0.27.0" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.150" sqlite = "0.37.0" static-iref = "3.0.0" +tempfile = "3.27.0" throbber-widgets-tui = "0.11.0" tokio = { version = "1.52.3", features = ["full"] } tui-input = "0.15.3" diff --git a/src/main.rs b/src/main.rs index 2f29d86..3c12c15 100644 --- a/src/main.rs +++ b/src/main.rs @@ -77,11 +77,14 @@ struct App { end_time: DateTime, throbber_state: ThrobberState, prediction_request_sink: watch::Sender, - is_requesting: bool + is_requesting: bool, + audio_level: f64, + recording_audio: bool, + audio_control_sink: tokio::sync::mpsc::Sender, } impl App { - fn new(prediction_request_sink: watch::Sender) -> Self { + fn new(prediction_request_sink: watch::Sender, audio_control_sink: tokio::sync::mpsc::Sender) -> Self { Self { scene: Scene::default(), next_reply_options: Vec::new(), @@ -90,7 +93,10 @@ impl App { end_time: Utc::now() + Duration::hours(2), throbber_state: ThrobberState::default(), prediction_request_sink, - is_requesting: false + is_requesting: false, + audio_level: -60., + recording_audio: false, + audio_control_sink, } } @@ -209,12 +215,37 @@ impl App { let status_layout = Layout::default() .direction(Direction::Horizontal) - .constraints([Constraint::Max(3), Constraint::Fill(1)]) + .constraints([Constraint::Max(3), Constraint::Fill(2), Constraint::Fill(1)]) .split(layout[3]); self.draw_user_input(frame, layout[2]); self.draw_io_throbber(frame, status_layout[0]); self.draw_status(frame, status_layout[1]); + self.draw_volume(frame, status_layout[2]); + } + + fn draw_volume(&self, frame: &mut Frame, area: Rect) { + const NOISE_FLOOR: f64 = 50.; + let vu_pct = 1.0 - (self.audio_level.abs().min(NOISE_FLOOR) / NOISE_FLOOR); + + let volume_color = if self.recording_audio { + if vu_pct >= 0.85 { + style::Color::Red + } else if vu_pct >= 0.60 { + style::Color::Yellow + } else { + style::Color::LightGreen + } + } else { + style::Color::Gray + }; + + let gauge = Gauge::default() + .ratio(vu_pct) + .use_unicode(true) + .gauge_style(volume_color) + .label(format!("{:.01}dB", self.audio_level)); + frame.render_widget(gauge, area); } fn insert_selected_prompt(&mut self) { @@ -232,6 +263,16 @@ impl App { if let Some(key) = evt.as_key_press_event() { match key.code { KeyCode::Char('r') if key.modifiers.contains(KeyModifiers::CONTROL) => self.regenerate_responses(), + KeyCode::Char('x') if key.modifiers.contains(KeyModifiers::CONTROL) => { + if self.recording_audio { + self.recording_audio = false; + self.audio_control_sink.send(AudioRecordRequest::Finish).await.unwrap(); + self.is_requesting = true; + } else { + self.recording_audio = true; + self.audio_control_sink.send(AudioRecordRequest::Start).await.unwrap(); + } + }, KeyCode::Down => self.reply_state.select_next(), KeyCode::Up => self.reply_state.select_previous(), KeyCode::Tab => { @@ -373,6 +414,11 @@ impl App { } } +enum AudioRecordRequest { + Start, + Finish +} + #[tokio::main] async fn main() { color_eyre::install().unwrap(); @@ -381,6 +427,100 @@ async fn main() { let (prediction_in, mut prediction_out) = tokio::sync::watch::channel(None); let (prediction_request_in, mut prediction_request_out) = tokio::sync::watch::channel(Scene::default()); + let (audio_in, mut audio_out) = tokio::sync::mpsc::channel(32); + let (audio_level_in, mut audio_level_out) = tokio::sync::watch::channel(0.); + + let (audio_control_in, mut audio_control_out) = tokio::sync::mpsc::channel(1); + let (transcrption_in, mut transcription_out) = tokio::sync::mpsc::channel(1); + + let mut app = App::new(prediction_request_in, audio_control_in); + app.load(); + + let (client, _status) = jack::Client::new("Eva-Cohost", ClientOptions::default() | ClientOptions::SESSION_ID).unwrap(); + let port = client.register_port("microphone-in", AudioIn::default()).unwrap(); + let rate = client.sample_rate(); + + + if let Ok(_) = client.connect_ports_by_name("mixxx-mic-1:capture_MONO", port.name().unwrap().as_str()) { + app.scene.insert_conversation(ConversationEntry::SystemMessage("Connected to audio.".into())); + } else { + app.scene.insert_conversation(ConversationEntry::SystemMessage("Failed to reconnect to audio.".into())); + } + + let handler = jack::contrib::ClosureProcessHandler::new(move |client, scope| { + if port.connected_count().unwrap() > 0 { + let buf: Vec<_> = port.as_slice(scope).iter().copied().collect(); + audio_in.blocking_send(buf).unwrap(); + } + jack::Control::Continue + }); + + std::mem::forget(client.activate_async((), handler).unwrap()); + + tokio::spawn(async move { + let spec = hound::WavSpec { + channels: 1, + sample_rate: rate, + bits_per_sample: 16, + sample_format: hound::SampleFormat::Int + }; + + let mut meter = VuMeter::new(rate.into(), 1, None); + + let mut writer = None; + + let client: Client = Client::default(); + + loop { + tokio::select! { + maybe_audio_event = audio_control_out.recv() => { + let audio_event = maybe_audio_event.unwrap(); + match audio_event { + AudioRecordRequest::Start => { + // FIXME: We should switch this over to using spooled tempfiles instead of a named file in the current directory that can easily get clobbered + //const SPOOL_SIZE: usize = 16 * (rate as usize) * 10;// 10 seconds of audio + //let mut outfile = tempfile::spooled_tempfile(SPOOL_SIZE); + writer = Some(hound::WavWriter::create("mic.wav", spec).unwrap()); + }, + AudioRecordRequest::Finish => { + writer = None; + + let response = client.audio().transcription().create(CreateTranscriptionRequest { + file: AudioInput { source: InputSource::Path { path: "mic.wav".into() } }, + model: "gpt-4o-mini-transcribe".into(), + ..Default::default() + }).await.unwrap(); + transcrption_in.send(response.text).await.unwrap(); + } + } + }, + maybe_audio_packet = audio_out.recv() => { + let buf = maybe_audio_packet.unwrap(); + + meter.process_interleaved(buf.as_slice()); + if let Some(w) = writer.as_mut() { + for sample in buf.iter().copied() { + let sample_i16 = (sample * 32768.0) + .clamp(i16::MIN as f32, i16::MAX as f32) + as i16; + w.write_sample(sample_i16).unwrap(); + } + w.flush().unwrap(); + } + audio_level_in.send_if_modified(|v| { + let next_vu = meter.channel_vu(0).unwrap(); + if *v != next_vu { + *v = next_vu; + true + } else { + false + } + }); + } + }; + } + }); + tokio::spawn(async move { let client: Client = Client::default(); loop { @@ -427,6 +567,13 @@ async fn main() { _ = prediction_out.changed() => { app.on_response(prediction_out.borrow().clone().unwrap()); }, + _ = audio_level_out.changed() => { + app.audio_level = audio_level_out.borrow().clone(); + }, + maybe_transcription = transcription_out.recv() => { + app.scene.insert_conversation(ConversationEntry::User(maybe_transcription.unwrap())); + app.regenerate_responses(); + } maybe_event = event => { match maybe_event { Some(Ok(event)) => {