code: implement audio transcription from mic audio

This commit is contained in:
2026-06-02 11:25:22 +02:00
parent 7c15eec10d
commit 5579b4dc64
3 changed files with 492 additions and 4 deletions
+151 -4
View File
@@ -77,11 +77,14 @@ struct App {
end_time: DateTime<Utc>,
throbber_state: ThrobberState,
prediction_request_sink: watch::Sender<Scene>,
is_requesting: bool
is_requesting: bool,
audio_level: f64,
recording_audio: bool,
audio_control_sink: tokio::sync::mpsc::Sender<AudioRecordRequest>,
}
impl App {
fn new(prediction_request_sink: watch::Sender<Scene>) -> Self {
fn new(prediction_request_sink: watch::Sender<Scene>, audio_control_sink: tokio::sync::mpsc::Sender<AudioRecordRequest>) -> Self {
Self {
scene: Scene::default(),
next_reply_options: Vec::new(),
@@ -90,7 +93,10 @@ impl App {
end_time: Utc::now() + Duration::hours(2),
throbber_state: ThrobberState::default(),
prediction_request_sink,
is_requesting: false
is_requesting: false,
audio_level: -60.,
recording_audio: false,
audio_control_sink,
}
}
@@ -209,12 +215,37 @@ impl App {
let status_layout = Layout::default()
.direction(Direction::Horizontal)
.constraints([Constraint::Max(3), Constraint::Fill(1)])
.constraints([Constraint::Max(3), Constraint::Fill(2), Constraint::Fill(1)])
.split(layout[3]);
self.draw_user_input(frame, layout[2]);
self.draw_io_throbber(frame, status_layout[0]);
self.draw_status(frame, status_layout[1]);
self.draw_volume(frame, status_layout[2]);
}
/// Renders the microphone level gauge into `area`.
///
/// `self.audio_level` is interpreted as a level in dB relative to full scale:
/// 0 dB (or anything louder) fills the gauge, and anything at or below
/// `-NOISE_FLOOR` dB leaves it empty. The bar is coloured by loudness while
/// recording and drawn gray otherwise; the label always shows the raw level.
fn draw_volume(&self, frame: &mut Frame, area: Rect) {
    // Levels this many dB below full scale (or quieter) render as an empty gauge.
    const NOISE_FLOOR: f64 = 50.;

    // Clamp into [-NOISE_FLOOR, 0] instead of taking `abs()`: with `abs()` a level
    // above 0 dB (a clipping signal) was mirrored back down the scale and shown as
    // *quieter* than full scale. `max`/`min` are used rather than `clamp` because
    // they return the non-NaN operand, so a NaN level collapses to the floor and
    // the ratio handed to the gauge always stays within 0.0..=1.0.
    let level_db = self.audio_level.max(-NOISE_FLOOR).min(0.0);
    let vu_pct = (level_db + NOISE_FLOOR) / NOISE_FLOOR;

    let volume_color = match (self.recording_audio, vu_pct) {
        // Not recording: keep the meter visible but de-emphasised.
        (false, _) => style::Color::Gray,
        (true, pct) if pct >= 0.85 => style::Color::Red,
        (true, pct) if pct >= 0.60 => style::Color::Yellow,
        (true, _) => style::Color::LightGreen,
    };

    let gauge = Gauge::default()
        .ratio(vu_pct)
        .use_unicode(true)
        .gauge_style(volume_color)
        .label(format!("{:.01}dB", self.audio_level));
    frame.render_widget(gauge, area);
}
fn insert_selected_prompt(&mut self) {
@@ -232,6 +263,16 @@ impl App {
if let Some(key) = evt.as_key_press_event() {
match key.code {
KeyCode::Char('r') if key.modifiers.contains(KeyModifiers::CONTROL) => self.regenerate_responses(),
KeyCode::Char('x') if key.modifiers.contains(KeyModifiers::CONTROL) => {
if self.recording_audio {
self.recording_audio = false;
self.audio_control_sink.send(AudioRecordRequest::Finish).await.unwrap();
self.is_requesting = true;
} else {
self.recording_audio = true;
self.audio_control_sink.send(AudioRecordRequest::Start).await.unwrap();
}
},
KeyCode::Down => self.reply_state.select_next(),
KeyCode::Up => self.reply_state.select_previous(),
KeyCode::Tab => {
@@ -373,6 +414,11 @@ impl App {
}
}
/// Control message sent from the UI to the background audio task to toggle
/// microphone recording.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AudioRecordRequest {
    /// Begin capturing microphone audio.
    Start,
    /// Stop capturing and hand the recording off for transcription.
    Finish,
}
#[tokio::main]
async fn main() {
color_eyre::install().unwrap();
@@ -381,6 +427,100 @@ async fn main() {
let (prediction_in, mut prediction_out) = tokio::sync::watch::channel(None);
let (prediction_request_in, mut prediction_request_out) = tokio::sync::watch::channel(Scene::default());
let (audio_in, mut audio_out) = tokio::sync::mpsc::channel(32);
let (audio_level_in, mut audio_level_out) = tokio::sync::watch::channel(0.);
let (audio_control_in, mut audio_control_out) = tokio::sync::mpsc::channel(1);
let (transcrption_in, mut transcription_out) = tokio::sync::mpsc::channel(1);
let mut app = App::new(prediction_request_in, audio_control_in);
app.load();
let (client, _status) = jack::Client::new("Eva-Cohost", ClientOptions::default() | ClientOptions::SESSION_ID).unwrap();
let port = client.register_port("microphone-in", AudioIn::default()).unwrap();
let rate = client.sample_rate();
if let Ok(_) = client.connect_ports_by_name("mixxx-mic-1:capture_MONO", port.name().unwrap().as_str()) {
app.scene.insert_conversation(ConversationEntry::SystemMessage("Connected to audio.".into()));
} else {
app.scene.insert_conversation(ConversationEntry::SystemMessage("Failed to reconnect to audio.".into()));
}
let handler = jack::contrib::ClosureProcessHandler::new(move |client, scope| {
if port.connected_count().unwrap() > 0 {
let buf: Vec<_> = port.as_slice(scope).iter().copied().collect();
audio_in.blocking_send(buf).unwrap();
}
jack::Control::Continue
});
std::mem::forget(client.activate_async((), handler).unwrap());
// Background audio task: multiplexes record-control messages (`audio_control_out`)
// and raw sample buffers (`audio_out`). Every buffer is fed to the VU meter; while a
// recording is active the samples are also written to `mic.wav`, and on `Finish` that
// file is sent off for transcription.
tokio::spawn(async move {
// Output format for the recording: mono, 16-bit integer PCM at `rate`.
let spec = hound::WavSpec {
channels: 1,
sample_rate: rate,
bits_per_sample: 16,
sample_format: hound::SampleFormat::Int
};
// NOTE(review): the second argument is presumably the channel count — confirm against VuMeter.
let mut meter = VuMeter::new(rate.into(), 1, None);
// `Some` only between a `Start` and the following `Finish`.
let mut writer = None;
let client: Client<OpenAIConfig> = Client::default();
loop {
tokio::select! {
maybe_audio_event = audio_control_out.recv() => {
// `recv()` yields `None` once the channel is closed, so this unwrap panics the task then.
let audio_event = maybe_audio_event.unwrap();
match audio_event {
AudioRecordRequest::Start => {
// FIXME: We should switch this over to using spooled tempfiles instead of a named file in the current directory that can easily get clobbered
//const SPOOL_SIZE: usize = 16 * (rate as usize) * 10;// 10 seconds of audio
//let mut outfile = tempfile::spooled_tempfile(SPOOL_SIZE);
writer = Some(hound::WavWriter::create("mic.wav", spec).unwrap());
},
AudioRecordRequest::Finish => {
// Drop the writer before the file is read back for upload.
// NOTE(review): this presumably relies on hound finalizing the WAV header on drop —
// confirm; an explicit `finalize()` would surface write errors instead of hiding them.
writer = None;
// NOTE(review): while this request is awaited the audio arm below is not polled,
// so incoming buffers queue up in the channel and the level meter stops updating.
let response = client.audio().transcription().create(CreateTranscriptionRequest {
file: AudioInput { source: InputSource::Path { path: "mic.wav".into() } },
model: "gpt-4o-mini-transcribe".into(),
..Default::default()
}).await.unwrap();
transcrption_in.send(response.text).await.unwrap();
}
}
},
maybe_audio_packet = audio_out.recv() => {
let buf = maybe_audio_packet.unwrap();
// Metering runs for every buffer, whether or not a recording is active.
meter.process_interleaved(buf.as_slice());
if let Some(w) = writer.as_mut() {
for sample in buf.iter().copied() {
// Scale the float sample to the i16 range; the clamp keeps +1.0 (which scales to
// 32768) and any out-of-range input inside i16 bounds.
// Assumes samples are normalized to [-1.0, 1.0] — TODO confirm.
let sample_i16 = (sample * 32768.0)
.clamp(i16::MIN as f32, i16::MAX as f32)
as i16;
w.write_sample(sample_i16).unwrap();
}
// Flushed once per buffer rather than once per sample.
w.flush().unwrap();
}
// Publish the new level, reporting a modification only when the value actually changed.
audio_level_in.send_if_modified(|v| {
let next_vu = meter.channel_vu(0).unwrap();
if *v != next_vu {
*v = next_vu;
true
} else {
false
}
});
}
};
}
});
tokio::spawn(async move {
let client: Client<OpenAIConfig> = Client::default();
loop {
@@ -427,6 +567,13 @@ async fn main() {
_ = prediction_out.changed() => {
app.on_response(prediction_out.borrow().clone().unwrap());
},
_ = audio_level_out.changed() => {
app.audio_level = audio_level_out.borrow().clone();
},
maybe_transcription = transcription_out.recv() => {
app.scene.insert_conversation(ConversationEntry::User(maybe_transcription.unwrap()));
app.regenerate_responses();
}
maybe_event = event => {
match maybe_event {
Some(Ok(event)) => {