From 16c6cc70011840b3b8ed58c2d982f87113012995 Mon Sep 17 00:00:00 2001 From: Victoria Fischer Date: Mon, 8 Jun 2026 15:59:25 +0200 Subject: [PATCH] tts: add an audio output port, and feed TTS utterances into it --- src/audio.rs | 36 ++++++++++++++++++++++++++++++++++-- src/main.rs | 7 +++---- src/tts.rs | 29 +++++++++++++++++++++++++---- 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/src/audio.rs b/src/audio.rs index 6a941bf..8021d42 100644 --- a/src/audio.rs +++ b/src/audio.rs @@ -1,4 +1,4 @@ -use jack::{AudioIn, ClientOptions}; +use jack::{AudioIn, AudioOut, ClientOptions}; use oximedia_metering::vu_meter::VuMeter; use tokio::sync::*; @@ -32,15 +32,23 @@ pub struct MicStream { pub sample_rate: u32 } -pub async fn start_audio_input(messages: &mpsc::Sender) -> (AudioInputControl, MicStream) { +#[derive(Debug)] +pub struct TtsOutStream { + pub sink: mpsc::Sender>, + pub sample_rate: u32 +} + +pub async fn start_audio_input(messages: &mpsc::Sender) -> (AudioInputControl, MicStream, TtsOutStream) { let (exit_tx, exit_rx) = oneshot::channel(); let (mic_audio_sink, mic_audio_src) = mpsc::channel(32); + let (tts_audio_sink, mut tts_audio_src) = mpsc::channel(32); let (volume_sink, volume_src) = watch::channel(0.); let (client, _status) = jack::Client::new("Eva-Cohost", ClientOptions::default() | ClientOptions::SESSION_ID).unwrap(); let mic_port = client.register_port("microphone-in", AudioIn::default()).unwrap(); + let mut tts_port = client.register_port("tts-out", AudioOut::default()).unwrap(); let rate = client.sample_rate(); if let Ok(_) = client.connect_ports_by_name("mixxx-mic-1:capture_MONO", mic_port.name().unwrap().as_str()) { @@ -50,6 +58,8 @@ pub async fn start_audio_input(messages: &mpsc::Sender) -> (AudioInputCo } let mut meter = VuMeter::new(rate.into(), 1, None); + let mut tts_output_buf = vec![]; + tts_output_buf.reserve(1024); let handler = jack::contrib::ClosureProcessHandler::new(move |_client, scope| { if mic_port.connected_count().unwrap() > 0 { @@ -67,6 +77,25 @@ pub async fn start_audio_input(messages: &mpsc::Sender) -> (AudioInputCo } }); } + + + if let Ok(mut next_outbuf) = tts_audio_src.try_recv() { + tts_output_buf.append(&mut next_outbuf); + } + + if tts_port.connected_count().unwrap() > 0 && !tts_output_buf.is_empty() { + let outbuf = tts_port.as_mut_slice(scope); + let mut next_segment: Vec = tts_output_buf.drain(0..(outbuf.len()).min(tts_output_buf.len())).collect(); + let underrun = outbuf.len() - next_segment.len(); + if underrun > 0 { + for _ in 0..underrun { + next_segment.push(0.); + } + } + + outbuf.copy_from_slice(&next_segment); + } + jack::Control::Continue }); @@ -84,5 +113,8 @@ pub async fn start_audio_input(messages: &mpsc::Sender) -> (AudioInputCo }, MicStream { sample_rate: rate, src: mic_audio_src + }, TtsOutStream { + sample_rate: rate, + sink: tts_audio_sink }) } \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index b818336..280411d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,7 +13,7 @@ use futures::{StreamExt, future::FutureExt}; use ratatui::prelude::*; use tui_skeleton::{AnimationMode, SkeletonText}; -use crate::{audio::{AudioInputControl, start_audio_input}, prediction::{BandcampResult, PossibleResponse}, scene::{ConversationEntry, Scene, StageActions, StageDirection}, transcription::TranscriptionControl, tts::{TtsControl, start_tts}}; +use crate::{audio::{AudioInputControl, TtsOutStream, start_audio_input}, prediction::{BandcampResult, PossibleResponse}, scene::{ConversationEntry, Scene, StageActions, StageDirection}, transcription::TranscriptionControl, tts::{TtsControl, start_tts}}; mod scene; mod events; @@ -609,9 +609,8 @@ async fn main() { SaveData::default() }; - let (audio_ctrl, mic_stream) = start_audio_input(&sys_message_sink).await; - - let tts_ctrl = start_tts().await; + let (audio_ctrl, mic_stream, tts_output) = start_audio_input(&sys_message_sink).await; + let tts_ctrl = start_tts(tts_output).await; let (prediction_request_in, mut prediction_out) = prediction::start_prediction(sys_message_src, saved_session.messages).await; let transcription_ctrl = transcription::start_transcription(mic_stream).await; diff --git a/src/tts.rs b/src/tts.rs index d5578b2..744b1b3 100644 --- a/src/tts.rs +++ b/src/tts.rs @@ -1,4 +1,6 @@ -use std::process::Command; +use std::process::{Command, Stdio}; + +use crate::audio::TtsOutStream; #[derive(Debug)] pub struct TtsControl { @@ -11,15 +13,34 @@ impl TtsControl { } } -pub async fn start_tts() -> TtsControl { +pub async fn start_tts(audio_sink: TtsOutStream) -> TtsControl { let (tts_request_sender, mut tts_request_receiver) = tokio::sync::mpsc::channel(3); // Set up the TTS task tokio::spawn(async move { while let Some(text) = tts_request_receiver.recv().await { - // TODO: We should also have espeak pipe out to stdout, then we can apply some audio effects and write to our own jack port. - Command::new("espeak-ng").arg("-v").arg("en-us+f3").arg(text).spawn().unwrap().wait().unwrap(); + let tts_output = Command::new("espeak-ng").args(["-v", "en-us+f3", "--stdout"]).arg(text).stdout(Stdio::piped()).spawn().unwrap().wait_with_output().unwrap().stdout; + let tts_fd = std::io::Cursor::new(tts_output); + let mut wav_reader = hound::WavReader::new(tts_fd).unwrap(); + + let mut bitrate_resample = resampler::ResamplerFir::new_from_hz(1, wav_reader.spec().sample_rate, audio_sink.sample_rate, Default::default(), Default::default()); + + let mut audio_out_buf = vec![]; + + for sample in wav_reader.samples() { + if let Ok(raw_sample) = sample { + let sample16: i16 = raw_sample; + let sample32: f32 = (sample16 as f32) / (i16::MAX as f32); + let mut audio_slice = [0.; 32]; + let (_, write_count) = bitrate_resample.resample(&[sample32], &mut audio_slice).unwrap(); + audio_out_buf.extend_from_slice(&audio_slice[0..write_count]); + } else { + break; + } + } + + audio_sink.sink.send(audio_out_buf).await.unwrap(); } });