tts: add an audio output port, and feed TTS utterances into it
This commit is contained in:
+34
-2
@@ -1,4 +1,4 @@
|
|||||||
use jack::{AudioIn, ClientOptions};
|
use jack::{AudioIn, AudioOut, ClientOptions};
|
||||||
use oximedia_metering::vu_meter::VuMeter;
|
use oximedia_metering::vu_meter::VuMeter;
|
||||||
use tokio::sync::*;
|
use tokio::sync::*;
|
||||||
|
|
||||||
@@ -32,15 +32,23 @@ pub struct MicStream {
|
|||||||
pub sample_rate: u32
|
pub sample_rate: u32
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn start_audio_input(messages: &mpsc::Sender<String>) -> (AudioInputControl, MicStream) {
|
/// Producer-side handle for TTS audio destined for the JACK `tts-out` port.
///
/// Returned by `start_audio_input` together with the microphone stream, and
/// consumed by `start_tts`, which sends one sample buffer per spoken utterance.
#[derive(Debug)]
|
||||||
|
pub struct TtsOutStream {
|
||||||
|
/// Sending half of a bounded channel (created with capacity 32) carrying
/// buffers of `f32` samples. The JACK process callback polls the receiving
/// half with `try_recv` and appends each buffer to its pending output queue.
/// NOTE(review): `start_tts` sends samples scaled by `1 / i16::MAX`; other
/// producers presumably need the same scaling — confirm.
pub sink: mpsc::Sender<Vec<f32>>,
|
||||||
|
/// Sample rate reported by the JACK client (`client.sample_rate()`).
/// `start_tts` resamples espeak-ng's WAV output to this rate before sending;
/// presumably in Hz, since it is passed to `ResamplerFir::new_from_hz`.
pub sample_rate: u32
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn start_audio_input(messages: &mpsc::Sender<String>) -> (AudioInputControl, MicStream, TtsOutStream) {
|
||||||
|
|
||||||
let (exit_tx, exit_rx) = oneshot::channel();
|
let (exit_tx, exit_rx) = oneshot::channel();
|
||||||
|
|
||||||
let (mic_audio_sink, mic_audio_src) = mpsc::channel(32);
|
let (mic_audio_sink, mic_audio_src) = mpsc::channel(32);
|
||||||
|
let (tts_audio_sink, mut tts_audio_src) = mpsc::channel(32);
|
||||||
let (volume_sink, volume_src) = watch::channel(0.);
|
let (volume_sink, volume_src) = watch::channel(0.);
|
||||||
|
|
||||||
let (client, _status) = jack::Client::new("Eva-Cohost", ClientOptions::default() | ClientOptions::SESSION_ID).unwrap();
|
let (client, _status) = jack::Client::new("Eva-Cohost", ClientOptions::default() | ClientOptions::SESSION_ID).unwrap();
|
||||||
let mic_port = client.register_port("microphone-in", AudioIn::default()).unwrap();
|
let mic_port = client.register_port("microphone-in", AudioIn::default()).unwrap();
|
||||||
|
let mut tts_port = client.register_port("tts-out", AudioOut::default()).unwrap();
|
||||||
let rate = client.sample_rate();
|
let rate = client.sample_rate();
|
||||||
|
|
||||||
if let Ok(_) = client.connect_ports_by_name("mixxx-mic-1:capture_MONO", mic_port.name().unwrap().as_str()) {
|
if let Ok(_) = client.connect_ports_by_name("mixxx-mic-1:capture_MONO", mic_port.name().unwrap().as_str()) {
|
||||||
@@ -50,6 +58,8 @@ pub async fn start_audio_input(messages: &mpsc::Sender<String>) -> (AudioInputCo
|
|||||||
}
|
}
|
||||||
|
|
||||||
let mut meter = VuMeter::new(rate.into(), 1, None);
|
let mut meter = VuMeter::new(rate.into(), 1, None);
|
||||||
|
let mut tts_output_buf = vec![];
|
||||||
|
tts_output_buf.reserve(1024);
|
||||||
|
|
||||||
let handler = jack::contrib::ClosureProcessHandler::new(move |_client, scope| {
|
let handler = jack::contrib::ClosureProcessHandler::new(move |_client, scope| {
|
||||||
if mic_port.connected_count().unwrap() > 0 {
|
if mic_port.connected_count().unwrap() > 0 {
|
||||||
@@ -67,6 +77,25 @@ pub async fn start_audio_input(messages: &mpsc::Sender<String>) -> (AudioInputCo
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if let Ok(mut next_outbuf) = tts_audio_src.try_recv() {
|
||||||
|
tts_output_buf.append(&mut next_outbuf);
|
||||||
|
}
|
||||||
|
|
||||||
|
if tts_port.connected_count().unwrap() > 0 && !tts_output_buf.is_empty() {
|
||||||
|
let outbuf = tts_port.as_mut_slice(scope);
|
||||||
|
let mut next_segment: Vec<f32> = tts_output_buf.drain(0..(outbuf.len()).min(tts_output_buf.len())).collect();
|
||||||
|
let underrun = outbuf.len() - next_segment.len();
|
||||||
|
if underrun > 0 {
|
||||||
|
for _ in 0..underrun {
|
||||||
|
next_segment.push(0.);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
outbuf.copy_from_slice(&next_segment);
|
||||||
|
}
|
||||||
|
|
||||||
jack::Control::Continue
|
jack::Control::Continue
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -84,5 +113,8 @@ pub async fn start_audio_input(messages: &mpsc::Sender<String>) -> (AudioInputCo
|
|||||||
}, MicStream {
|
}, MicStream {
|
||||||
sample_rate: rate,
|
sample_rate: rate,
|
||||||
src: mic_audio_src
|
src: mic_audio_src
|
||||||
|
}, TtsOutStream {
|
||||||
|
sample_rate: rate,
|
||||||
|
sink: tts_audio_sink
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
+3
-4
@@ -13,7 +13,7 @@ use futures::{StreamExt, future::FutureExt};
|
|||||||
use ratatui::prelude::*;
|
use ratatui::prelude::*;
|
||||||
use tui_skeleton::{AnimationMode, SkeletonText};
|
use tui_skeleton::{AnimationMode, SkeletonText};
|
||||||
|
|
||||||
use crate::{audio::{AudioInputControl, start_audio_input}, prediction::{BandcampResult, PossibleResponse}, scene::{ConversationEntry, Scene, StageActions, StageDirection}, transcription::TranscriptionControl, tts::{TtsControl, start_tts}};
|
use crate::{audio::{AudioInputControl, TtsOutStream, start_audio_input}, prediction::{BandcampResult, PossibleResponse}, scene::{ConversationEntry, Scene, StageActions, StageDirection}, transcription::TranscriptionControl, tts::{TtsControl, start_tts}};
|
||||||
|
|
||||||
mod scene;
|
mod scene;
|
||||||
mod events;
|
mod events;
|
||||||
@@ -609,9 +609,8 @@ async fn main() {
|
|||||||
SaveData::default()
|
SaveData::default()
|
||||||
};
|
};
|
||||||
|
|
||||||
let (audio_ctrl, mic_stream) = start_audio_input(&sys_message_sink).await;
|
let (audio_ctrl, mic_stream, tts_output) = start_audio_input(&sys_message_sink).await;
|
||||||
|
let tts_ctrl = start_tts(tts_output).await;
|
||||||
let tts_ctrl = start_tts().await;
|
|
||||||
let (prediction_request_in, mut prediction_out) = prediction::start_prediction(sys_message_src, saved_session.messages).await;
|
let (prediction_request_in, mut prediction_out) = prediction::start_prediction(sys_message_src, saved_session.messages).await;
|
||||||
let transcription_ctrl = transcription::start_transcription(mic_stream).await;
|
let transcription_ctrl = transcription::start_transcription(mic_stream).await;
|
||||||
|
|
||||||
|
|||||||
+25
-4
@@ -1,4 +1,6 @@
|
|||||||
use std::process::Command;
|
use std::process::{Command, Stdio};
|
||||||
|
|
||||||
|
use crate::audio::TtsOutStream;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct TtsControl {
|
pub struct TtsControl {
|
||||||
@@ -11,15 +13,34 @@ impl TtsControl {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn start_tts() -> TtsControl {
|
pub async fn start_tts(audio_sink: TtsOutStream) -> TtsControl {
|
||||||
|
|
||||||
let (tts_request_sender, mut tts_request_receiver) = tokio::sync::mpsc::channel(3);
|
let (tts_request_sender, mut tts_request_receiver) = tokio::sync::mpsc::channel(3);
|
||||||
|
|
||||||
// Set up the TTS task
|
// Set up the TTS task
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
while let Some(text) = tts_request_receiver.recv().await {
|
while let Some(text) = tts_request_receiver.recv().await {
|
||||||
// TODO: We should also have espeak pipe out to stdout, then we can apply some audio effects and write to our own jack port.
|
let tts_output = Command::new("espeak-ng").args(["-v", "en-us+f3", "--stdout"]).arg(text).stdout(Stdio::piped()).spawn().unwrap().wait_with_output().unwrap().stdout;
|
||||||
Command::new("espeak-ng").arg("-v").arg("en-us+f3").arg(text).spawn().unwrap().wait().unwrap();
|
let tts_fd = std::io::Cursor::new(tts_output);
|
||||||
|
let mut wav_reader = hound::WavReader::new(tts_fd).unwrap();
|
||||||
|
|
||||||
|
let mut bitrate_resample = resampler::ResamplerFir::new_from_hz(1, wav_reader.spec().sample_rate, audio_sink.sample_rate, Default::default(), Default::default());
|
||||||
|
|
||||||
|
let mut audio_out_buf = vec![];
|
||||||
|
|
||||||
|
for sample in wav_reader.samples() {
|
||||||
|
if let Ok(raw_sample) = sample {
|
||||||
|
let sample16: i16 = raw_sample;
|
||||||
|
let sample32: f32 = (sample16 as f32) / (i16::MAX as f32);
|
||||||
|
let mut audio_slice = [0.; 32];
|
||||||
|
let (_, write_count) = bitrate_resample.resample(&[sample32], &mut audio_slice).unwrap();
|
||||||
|
audio_out_buf.extend_from_slice(&audio_slice[0..write_count]);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
audio_sink.sink.send(audio_out_buf).await.unwrap();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user