code: implement audio transcription from mic audio

This commit is contained in:
2026-06-02 11:25:22 +02:00
parent 7c15eec10d
commit 5579b4dc64
3 changed files with 492 additions and 4 deletions
Generated
+336
View File
@@ -158,6 +158,43 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
[[package]]
name = "audio-core"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f93ebbf82d06013f4c41fe71303feb980cddd78496d904d06be627972de51a24"
[[package]]
name = "audioadapter"
version = "3.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91f87b70b051c5866680ad79f6743a42ccab264c009d1a71f4d33a3872ae60c8"
dependencies = [
"audio-core",
"num-traits",
]
[[package]]
name = "audioadapter-buffers"
version = "3.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9097d67933fb083d382ce980430afdb758aada60846010aee6be068c06cef0ca"
dependencies = [
"audioadapter",
"audioadapter-sample",
"num-traits",
]
[[package]]
name = "audioadapter-sample"
version = "3.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34ab94f2bc04a14e1f49ee5f222f66460e8a1b51627bdfedf34eed394d747938"
dependencies = [
"audio-core",
"num-traits",
]
[[package]]
name = "autocfg"
version = "1.5.1"
@@ -233,6 +270,9 @@ name = "bitflags"
version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
dependencies = [
"serde_core",
]
[[package]]
name = "bitmaps"
@@ -487,6 +527,31 @@ dependencies = [
"libc",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crossterm"
version = "0.29.0"
@@ -878,17 +943,22 @@ dependencies = [
"crossterm",
"futures",
"futures-timer",
"hound",
"iref 4.0.0",
"jack",
"json-ld",
"oximedia-metering",
"ratatui",
"rdf-types",
"reqwest 0.13.4",
"resampler",
"schemars",
"scraper",
"serde",
"serde_json",
"sqlite",
"static-iref",
"tempfile",
"throbber-widgets-tui",
"tokio",
"tui-input",
@@ -1240,6 +1310,11 @@ name = "hashbrown"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash 0.2.0",
]
[[package]]
name = "heck"
@@ -1268,6 +1343,12 @@ dependencies = [
"digest",
]
[[package]]
name = "hound"
version = "3.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
[[package]]
name = "html5ever"
version = "0.39.0"
@@ -1655,6 +1736,33 @@ version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
[[package]]
name = "jack"
version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7811b07bcac5dafabf814ab52c4b0ca9b7948aa1e279f572f03aa6544d47d27"
dependencies = [
"bitflags 2.11.1",
"jack-sys",
"lazy_static",
"libc",
"log",
]
[[package]]
name = "jack-sys"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6013b7619b95a22b576dfb43296faa4ecbe40abbdb97dfd22ead520775fc86ab"
dependencies = [
"bitflags 1.3.2",
"lazy_static",
"libc",
"libloading",
"log",
"pkg-config",
]
[[package]]
name = "jni"
version = "0.22.4"
@@ -2019,6 +2127,22 @@ version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
[[package]]
name = "libloading"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f"
dependencies = [
"cfg-if",
"winapi",
]
[[package]]
name = "libm"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
[[package]]
name = "line-clipping"
version = "0.3.7"
@@ -2277,6 +2401,15 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]]
name = "num-conv"
version = "0.2.2"
@@ -2321,6 +2454,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
"libm",
]
[[package]]
@@ -2414,6 +2548,86 @@ version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d211803b9b6b570f68772237e415a029d5a50c65d382910b879fb19d3271f94d"
[[package]]
name = "oxifft"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc0fe3b5f76266f8b860d18c307cacd467213b257dc3098641e19bc3ac17350b"
dependencies = [
"hashbrown 0.17.1",
"libm",
"num-complex",
"num-traits",
"oxifft-codegen",
"rayon",
"seahash",
"serde",
"serde_json",
"spin",
]
[[package]]
name = "oxifft-codegen"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b6627e29854ccb428fbf56e95c1970882096a53dae8954759ab09eb5d62f8d5"
dependencies = [
"oxifft-codegen-impl",
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "oxifft-codegen-impl"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "660807d8bbd7453e595aca8523d6f6991dcba0ebd3509f2e5e0b98c1c348e920"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "oximedia-audio"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0408efebc08a1df406d8e56e4b47c19bbb2b69109f1298d943a8d6a435f57b9"
dependencies = [
"audioadapter",
"audioadapter-buffers",
"bytes",
"oxifft",
"oximedia-core",
"rubato",
"thiserror 2.0.18",
]
[[package]]
name = "oximedia-core"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb84c059565d2e515e8f7bf655dbda9719237cb4ec915de968bb599f9208efc1"
dependencies = [
"bitflags 2.11.1",
"serde",
"thiserror 2.0.18",
]
[[package]]
name = "oximedia-metering"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e8832a54065b5023cd7d1881ef572d485a20d8a691c1528cac4cbfc5d054ea4"
dependencies = [
"oxifft",
"oximedia-audio",
"oximedia-core",
"rayon",
"thiserror 2.0.18",
]
[[package]]
name = "parking_lot"
version = "0.12.5"
@@ -2683,6 +2897,15 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "primal-check"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08"
dependencies = [
"num-integer",
]
[[package]]
name = "proc-macro-error"
version = "1.0.4"
@@ -2943,6 +3166,26 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bd1f6fba6db8161b6818f9061152e751b4d6030b39b561bbbb0153b36a6cfc5"
[[package]]
name = "rayon"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "rdf-types"
version = "0.22.5"
@@ -2961,6 +3204,15 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "realfft"
version = "3.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f821338fddb99d089116342c46e9f1fbf3828dba077674613e734e01d6ea8677"
dependencies = [
"rustfft",
]
[[package]]
name = "redox_syscall"
version = "0.5.18"
@@ -3125,6 +3377,12 @@ dependencies = [
"tower-service",
]
[[package]]
name = "resampler"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28fdbea87ff02ebbfd904079d1e52138c0c7fbaa3aaddca4a1b9c7d3f85749f2"
[[package]]
name = "ring"
version = "0.17.14"
@@ -3139,6 +3397,22 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "rubato"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce96ead1a91f7895704a9f08ea5947dfc8bd7c1f2936a22295b655ec67e5c6ef"
dependencies = [
"audioadapter",
"audioadapter-buffers",
"num-complex",
"num-integer",
"num-traits",
"realfft",
"visibility",
"windowfunctions",
]
[[package]]
name = "rustc-demangle"
version = "0.1.27"
@@ -3160,6 +3434,20 @@ dependencies = [
"semver",
]
[[package]]
name = "rustfft"
version = "6.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89"
dependencies = [
"num-complex",
"num-integer",
"num-traits",
"primal-check",
"strength_reduce",
"transpose",
]
[[package]]
name = "rustix"
version = "1.1.4"
@@ -3336,6 +3624,12 @@ dependencies = [
"tendril",
]
[[package]]
name = "seahash"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]]
name = "secrecy"
version = "0.10.3"
@@ -3590,6 +3884,12 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "spin"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1527984ca054dfca79333baec451042863f485fbee01b7bf6d911de915cac865"
[[package]]
name = "sqlite"
version = "0.37.0"
@@ -3705,6 +4005,12 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "strength_reduce"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82"
[[package]]
name = "string_cache"
version = "0.9.0"
@@ -4199,6 +4505,16 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "transpose"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e"
dependencies = [
"num-integer",
"strength_reduce",
]
[[package]]
name = "try-lock"
version = "0.2.5"
@@ -4360,6 +4676,17 @@ version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "visibility"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d674d135b4a8c1d7e813e2f8d1c9a58308aee4a680323066025e53132218bd91"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "vtparse"
version = "0.6.2"
@@ -4658,6 +4985,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windowfunctions"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90628d739333b7c5d2ee0b70210b97b8cddc38440c682c96fd9e2c24c2db5f3a"
dependencies = [
"num-traits",
]
[[package]]
name = "windows-core"
version = "0.62.2"
+5
View File
@@ -10,17 +10,22 @@ color-eyre = "0.6.5"
crossterm = { version = "0.29.0", features = ["event-stream"] }
futures = "0.3.32"
futures-timer = "3.0.4"
hound = "3.5.1"
iref = { version = "4.0.0", features = ["url", "serde"] }
jack = "0.13.5"
json-ld = { version = "0.21.4", features = ["reqwest", "serde"] }
oximedia-metering = "0.1.7"
ratatui = "0.30.0"
rdf-types = "0.22.5"
reqwest = "0.13.4"
resampler = "0.5.1"
schemars = "1.2.1"
scraper = "0.27.0"
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.150"
sqlite = "0.37.0"
static-iref = "3.0.0"
tempfile = "3.27.0"
throbber-widgets-tui = "0.11.0"
tokio = { version = "1.52.3", features = ["full"] }
tui-input = "0.15.3"
+151 -4
View File
@@ -77,11 +77,14 @@ struct App {
end_time: DateTime<Utc>,
throbber_state: ThrobberState,
prediction_request_sink: watch::Sender<Scene>,
is_requesting: bool
is_requesting: bool,
audio_level: f64,
recording_audio: bool,
audio_control_sink: tokio::sync::mpsc::Sender<AudioRecordRequest>,
}
impl App {
fn new(prediction_request_sink: watch::Sender<Scene>) -> Self {
fn new(prediction_request_sink: watch::Sender<Scene>, audio_control_sink: tokio::sync::mpsc::Sender<AudioRecordRequest>) -> Self {
Self {
scene: Scene::default(),
next_reply_options: Vec::new(),
@@ -90,7 +93,10 @@ impl App {
end_time: Utc::now() + Duration::hours(2),
throbber_state: ThrobberState::default(),
prediction_request_sink,
is_requesting: false
is_requesting: false,
audio_level: -60.,
recording_audio: false,
audio_control_sink,
}
}
@@ -209,12 +215,37 @@ impl App {
let status_layout = Layout::default()
.direction(Direction::Horizontal)
.constraints([Constraint::Max(3), Constraint::Fill(1)])
.constraints([Constraint::Max(3), Constraint::Fill(2), Constraint::Fill(1)])
.split(layout[3]);
self.draw_user_input(frame, layout[2]);
self.draw_io_throbber(frame, status_layout[0]);
self.draw_status(frame, status_layout[1]);
self.draw_volume(frame, status_layout[2]);
}
/// Renders the microphone level gauge into `area`.
///
/// `self.audio_level` is read as a level in dB and mapped linearly onto the
/// gauge: `-NOISE_FLOOR` dB or quieter is empty, 0 dB or louder is full.
/// While `self.recording_audio` is set the bar is coloured by fill ratio
/// (green, yellow from 60 %, red from 85 %); otherwise it is gray. The label
/// always shows the raw level with one decimal place.
fn draw_volume(&self, frame: &mut Frame, area: Rect) {
    // Attenuation (in dB below 0) at which the gauge reads empty.
    const NOISE_FLOOR: f64 = 50.;

    // Attenuation below 0 dB, limited to [0, NOISE_FLOOR]. Negating instead of
    // taking the absolute value keeps levels above 0 dB pinned at a full gauge
    // rather than folding them back down as if they were quiet. NaN is mapped
    // to the floor so the ratio given to the gauge is always a real number
    // inside [0, 1].
    let attenuation = if self.audio_level.is_nan() {
        NOISE_FLOOR
    } else {
        (-self.audio_level).clamp(0.0, NOISE_FLOOR)
    };
    let vu_pct = 1.0 - attenuation / NOISE_FLOOR;

    let volume_color = if !self.recording_audio {
        // Not recording: show the level, but de-emphasised.
        style::Color::Gray
    } else if vu_pct >= 0.85 {
        style::Color::Red
    } else if vu_pct >= 0.60 {
        style::Color::Yellow
    } else {
        style::Color::LightGreen
    };

    let gauge = Gauge::default()
        .ratio(vu_pct)
        .use_unicode(true)
        .gauge_style(volume_color)
        .label(format!("{:.01}dB", self.audio_level));
    frame.render_widget(gauge, area);
}
fn insert_selected_prompt(&mut self) {
@@ -232,6 +263,16 @@ impl App {
if let Some(key) = evt.as_key_press_event() {
match key.code {
KeyCode::Char('r') if key.modifiers.contains(KeyModifiers::CONTROL) => self.regenerate_responses(),
KeyCode::Char('x') if key.modifiers.contains(KeyModifiers::CONTROL) => {
if self.recording_audio {
self.recording_audio = false;
self.audio_control_sink.send(AudioRecordRequest::Finish).await.unwrap();
self.is_requesting = true;
} else {
self.recording_audio = true;
self.audio_control_sink.send(AudioRecordRequest::Start).await.unwrap();
}
},
KeyCode::Down => self.reply_state.select_next(),
KeyCode::Up => self.reply_state.select_previous(),
KeyCode::Tab => {
@@ -373,6 +414,11 @@ impl App {
}
}
/// Control message sent from the UI to the audio capture task.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AudioRecordRequest {
    /// Start writing incoming microphone samples to the recording file.
    Start,
    /// Stop recording and submit the recorded file for transcription.
    Finish,
}
#[tokio::main]
async fn main() {
color_eyre::install().unwrap();
@@ -381,6 +427,100 @@ async fn main() {
let (prediction_in, mut prediction_out) = tokio::sync::watch::channel(None);
let (prediction_request_in, mut prediction_request_out) = tokio::sync::watch::channel(Scene::default());
let (audio_in, mut audio_out) = tokio::sync::mpsc::channel(32);
let (audio_level_in, mut audio_level_out) = tokio::sync::watch::channel(0.);
let (audio_control_in, mut audio_control_out) = tokio::sync::mpsc::channel(1);
let (transcrption_in, mut transcription_out) = tokio::sync::mpsc::channel(1);
let mut app = App::new(prediction_request_in, audio_control_in);
app.load();
let (client, _status) = jack::Client::new("Eva-Cohost", ClientOptions::default() | ClientOptions::SESSION_ID).unwrap();
let port = client.register_port("microphone-in", AudioIn::default()).unwrap();
let rate = client.sample_rate();
if let Ok(_) = client.connect_ports_by_name("mixxx-mic-1:capture_MONO", port.name().unwrap().as_str()) {
app.scene.insert_conversation(ConversationEntry::SystemMessage("Connected to audio.".into()));
} else {
app.scene.insert_conversation(ConversationEntry::SystemMessage("Failed to reconnect to audio.".into()));
}
// JACK process callback: copies each period of microphone samples out of the
// input port and hands it to the audio task over `audio_in`.
let handler = jack::contrib::ClosureProcessHandler::new(move |_client, scope| {
    // A failed connection query is treated like "nothing connected" instead of
    // panicking inside the audio callback.
    if matches!(port.connected_count(), Ok(n) if n > 0) {
        let buf = port.as_slice(scope).to_vec();
        // Never block or panic here. The receiving task does not drain this
        // channel while it awaits the transcription request, so a blocking send
        // would stall the audio callback once the queue fills up. If the
        // channel is full or closed, this buffer is dropped instead.
        let _ = audio_in.try_send(buf);
    }
    jack::Control::Continue
});
std::mem::forget(client.activate_async((), handler).unwrap());
tokio::spawn(async move {
let spec = hound::WavSpec {
channels: 1,
sample_rate: rate,
bits_per_sample: 16,
sample_format: hound::SampleFormat::Int
};
let mut meter = VuMeter::new(rate.into(), 1, None);
let mut writer = None;
let client: Client<OpenAIConfig> = Client::default();
loop {
tokio::select! {
maybe_audio_event = audio_control_out.recv() => {
let audio_event = maybe_audio_event.unwrap();
match audio_event {
AudioRecordRequest::Start => {
// FIXME: We should switch this over to using spooled tempfiles instead of a named file in the current directory that can easily get clobbered
//const SPOOL_SIZE: usize = 16 * (rate as usize) * 10;// 10 seconds of audio
//let mut outfile = tempfile::spooled_tempfile(SPOOL_SIZE);
writer = Some(hound::WavWriter::create("mic.wav", spec).unwrap());
},
AudioRecordRequest::Finish => {
writer = None;
let response = client.audio().transcription().create(CreateTranscriptionRequest {
file: AudioInput { source: InputSource::Path { path: "mic.wav".into() } },
model: "gpt-4o-mini-transcribe".into(),
..Default::default()
}).await.unwrap();
transcrption_in.send(response.text).await.unwrap();
}
}
},
maybe_audio_packet = audio_out.recv() => {
let buf = maybe_audio_packet.unwrap();
meter.process_interleaved(buf.as_slice());
if let Some(w) = writer.as_mut() {
for sample in buf.iter().copied() {
let sample_i16 = (sample * 32768.0)
.clamp(i16::MIN as f32, i16::MAX as f32)
as i16;
w.write_sample(sample_i16).unwrap();
}
w.flush().unwrap();
}
audio_level_in.send_if_modified(|v| {
let next_vu = meter.channel_vu(0).unwrap();
if *v != next_vu {
*v = next_vu;
true
} else {
false
}
});
}
};
}
});
tokio::spawn(async move {
let client: Client<OpenAIConfig> = Client::default();
loop {
@@ -427,6 +567,13 @@ async fn main() {
_ = prediction_out.changed() => {
app.on_response(prediction_out.borrow().clone().unwrap());
},
_ = audio_level_out.changed() => {
app.audio_level = audio_level_out.borrow().clone();
},
maybe_transcription = transcription_out.recv() => {
app.scene.insert_conversation(ConversationEntry::User(maybe_transcription.unwrap()));
app.regenerate_responses();
}
maybe_event = event => {
match maybe_event {
Some(Ok(event)) => {