code: implement audio transcription from mic audio
This commit is contained in:
+151
-4
@@ -77,11 +77,14 @@ struct App {
|
||||
end_time: DateTime<Utc>,
|
||||
throbber_state: ThrobberState,
|
||||
prediction_request_sink: watch::Sender<Scene>,
|
||||
is_requesting: bool
|
||||
is_requesting: bool,
|
||||
audio_level: f64,
|
||||
recording_audio: bool,
|
||||
audio_control_sink: tokio::sync::mpsc::Sender<AudioRecordRequest>,
|
||||
}
|
||||
|
||||
impl App {
|
||||
fn new(prediction_request_sink: watch::Sender<Scene>) -> Self {
|
||||
fn new(prediction_request_sink: watch::Sender<Scene>, audio_control_sink: tokio::sync::mpsc::Sender<AudioRecordRequest>) -> Self {
|
||||
Self {
|
||||
scene: Scene::default(),
|
||||
next_reply_options: Vec::new(),
|
||||
@@ -90,7 +93,10 @@ impl App {
|
||||
end_time: Utc::now() + Duration::hours(2),
|
||||
throbber_state: ThrobberState::default(),
|
||||
prediction_request_sink,
|
||||
is_requesting: false
|
||||
is_requesting: false,
|
||||
audio_level: -60.,
|
||||
recording_audio: false,
|
||||
audio_control_sink,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,12 +215,37 @@ impl App {
|
||||
|
||||
let status_layout = Layout::default()
|
||||
.direction(Direction::Horizontal)
|
||||
.constraints([Constraint::Max(3), Constraint::Fill(1)])
|
||||
.constraints([Constraint::Max(3), Constraint::Fill(2), Constraint::Fill(1)])
|
||||
.split(layout[3]);
|
||||
|
||||
self.draw_user_input(frame, layout[2]);
|
||||
self.draw_io_throbber(frame, status_layout[0]);
|
||||
self.draw_status(frame, status_layout[1]);
|
||||
self.draw_volume(frame, status_layout[2]);
|
||||
}
|
||||
|
||||
fn draw_volume(&self, frame: &mut Frame, area: Rect) {
|
||||
const NOISE_FLOOR: f64 = 50.;
|
||||
let vu_pct = 1.0 - (self.audio_level.abs().min(NOISE_FLOOR) / NOISE_FLOOR);
|
||||
|
||||
let volume_color = if self.recording_audio {
|
||||
if vu_pct >= 0.85 {
|
||||
style::Color::Red
|
||||
} else if vu_pct >= 0.60 {
|
||||
style::Color::Yellow
|
||||
} else {
|
||||
style::Color::LightGreen
|
||||
}
|
||||
} else {
|
||||
style::Color::Gray
|
||||
};
|
||||
|
||||
let gauge = Gauge::default()
|
||||
.ratio(vu_pct)
|
||||
.use_unicode(true)
|
||||
.gauge_style(volume_color)
|
||||
.label(format!("{:.01}dB", self.audio_level));
|
||||
frame.render_widget(gauge, area);
|
||||
}
|
||||
|
||||
fn insert_selected_prompt(&mut self) {
|
||||
@@ -232,6 +263,16 @@ impl App {
|
||||
if let Some(key) = evt.as_key_press_event() {
|
||||
match key.code {
|
||||
KeyCode::Char('r') if key.modifiers.contains(KeyModifiers::CONTROL) => self.regenerate_responses(),
|
||||
KeyCode::Char('x') if key.modifiers.contains(KeyModifiers::CONTROL) => {
|
||||
if self.recording_audio {
|
||||
self.recording_audio = false;
|
||||
self.audio_control_sink.send(AudioRecordRequest::Finish).await.unwrap();
|
||||
self.is_requesting = true;
|
||||
} else {
|
||||
self.recording_audio = true;
|
||||
self.audio_control_sink.send(AudioRecordRequest::Start).await.unwrap();
|
||||
}
|
||||
},
|
||||
KeyCode::Down => self.reply_state.select_next(),
|
||||
KeyCode::Up => self.reply_state.select_previous(),
|
||||
KeyCode::Tab => {
|
||||
@@ -373,6 +414,11 @@ impl App {
|
||||
}
|
||||
}
|
||||
|
||||
/// Control message sent from the UI to the audio task to toggle microphone
/// recording.
///
/// The type is a plain fieldless enum, so it derives the cheap standard traits
/// to make it printable in diagnostics, comparable, and freely copyable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AudioRecordRequest {
    /// Begin capturing microphone audio.
    Start,
    /// Stop capturing; the receiver then submits the recording for transcription.
    Finish,
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
color_eyre::install().unwrap();
|
||||
@@ -381,6 +427,100 @@ async fn main() {
|
||||
let (prediction_in, mut prediction_out) = tokio::sync::watch::channel(None);
|
||||
let (prediction_request_in, mut prediction_request_out) = tokio::sync::watch::channel(Scene::default());
|
||||
|
||||
let (audio_in, mut audio_out) = tokio::sync::mpsc::channel(32);
|
||||
let (audio_level_in, mut audio_level_out) = tokio::sync::watch::channel(0.);
|
||||
|
||||
let (audio_control_in, mut audio_control_out) = tokio::sync::mpsc::channel(1);
|
||||
let (transcrption_in, mut transcription_out) = tokio::sync::mpsc::channel(1);
|
||||
|
||||
let mut app = App::new(prediction_request_in, audio_control_in);
|
||||
app.load();
|
||||
|
||||
let (client, _status) = jack::Client::new("Eva-Cohost", ClientOptions::default() | ClientOptions::SESSION_ID).unwrap();
|
||||
let port = client.register_port("microphone-in", AudioIn::default()).unwrap();
|
||||
let rate = client.sample_rate();
|
||||
|
||||
|
||||
if let Ok(_) = client.connect_ports_by_name("mixxx-mic-1:capture_MONO", port.name().unwrap().as_str()) {
|
||||
app.scene.insert_conversation(ConversationEntry::SystemMessage("Connected to audio.".into()));
|
||||
} else {
|
||||
app.scene.insert_conversation(ConversationEntry::SystemMessage("Failed to reconnect to audio.".into()));
|
||||
}
|
||||
|
||||
let handler = jack::contrib::ClosureProcessHandler::new(move |client, scope| {
|
||||
if port.connected_count().unwrap() > 0 {
|
||||
let buf: Vec<_> = port.as_slice(scope).iter().copied().collect();
|
||||
audio_in.blocking_send(buf).unwrap();
|
||||
}
|
||||
jack::Control::Continue
|
||||
});
|
||||
|
||||
std::mem::forget(client.activate_async((), handler).unwrap());
|
||||
|
||||
tokio::spawn(async move {
|
||||
let spec = hound::WavSpec {
|
||||
channels: 1,
|
||||
sample_rate: rate,
|
||||
bits_per_sample: 16,
|
||||
sample_format: hound::SampleFormat::Int
|
||||
};
|
||||
|
||||
let mut meter = VuMeter::new(rate.into(), 1, None);
|
||||
|
||||
let mut writer = None;
|
||||
|
||||
let client: Client<OpenAIConfig> = Client::default();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
maybe_audio_event = audio_control_out.recv() => {
|
||||
let audio_event = maybe_audio_event.unwrap();
|
||||
match audio_event {
|
||||
AudioRecordRequest::Start => {
|
||||
// FIXME: We should switch this over to using spooled tempfiles instead of a named file in the current directory that can easily get clobbered
|
||||
//const SPOOL_SIZE: usize = 16 * (rate as usize) * 10;// 10 seconds of audio
|
||||
//let mut outfile = tempfile::spooled_tempfile(SPOOL_SIZE);
|
||||
writer = Some(hound::WavWriter::create("mic.wav", spec).unwrap());
|
||||
},
|
||||
AudioRecordRequest::Finish => {
|
||||
writer = None;
|
||||
|
||||
let response = client.audio().transcription().create(CreateTranscriptionRequest {
|
||||
file: AudioInput { source: InputSource::Path { path: "mic.wav".into() } },
|
||||
model: "gpt-4o-mini-transcribe".into(),
|
||||
..Default::default()
|
||||
}).await.unwrap();
|
||||
transcrption_in.send(response.text).await.unwrap();
|
||||
}
|
||||
}
|
||||
},
|
||||
maybe_audio_packet = audio_out.recv() => {
|
||||
let buf = maybe_audio_packet.unwrap();
|
||||
|
||||
meter.process_interleaved(buf.as_slice());
|
||||
if let Some(w) = writer.as_mut() {
|
||||
for sample in buf.iter().copied() {
|
||||
let sample_i16 = (sample * 32768.0)
|
||||
.clamp(i16::MIN as f32, i16::MAX as f32)
|
||||
as i16;
|
||||
w.write_sample(sample_i16).unwrap();
|
||||
}
|
||||
w.flush().unwrap();
|
||||
}
|
||||
audio_level_in.send_if_modified(|v| {
|
||||
let next_vu = meter.channel_vu(0).unwrap();
|
||||
if *v != next_vu {
|
||||
*v = next_vu;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
}
|
||||
});
|
||||
|
||||
tokio::spawn(async move {
|
||||
let client: Client<OpenAIConfig> = Client::default();
|
||||
loop {
|
||||
@@ -427,6 +567,13 @@ async fn main() {
|
||||
_ = prediction_out.changed() => {
|
||||
app.on_response(prediction_out.borrow().clone().unwrap());
|
||||
},
|
||||
_ = audio_level_out.changed() => {
|
||||
app.audio_level = audio_level_out.borrow().clone();
|
||||
},
|
||||
maybe_transcription = transcription_out.recv() => {
|
||||
app.scene.insert_conversation(ConversationEntry::User(maybe_transcription.unwrap()));
|
||||
app.regenerate_responses();
|
||||
}
|
||||
maybe_event = event => {
|
||||
match maybe_event {
|
||||
Some(Ok(event)) => {
|
||||
|
||||
Reference in New Issue
Block a user