diff --git a/camera_hub/Cargo.lock b/camera_hub/Cargo.lock index 7663255..dd8c92f 100644 --- a/camera_hub/Cargo.lock +++ b/camera_hub/Cargo.lock @@ -2801,6 +2801,24 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "opusic-c" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89f8e9c909466f15e60277212cc4fec082c68a5e1c9f6e373eee716fec2fed47" +dependencies = [ + "opusic-sys", +] + +[[package]] +name = "opusic-sys" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2804e694ef0de3b4cbb254de565053b7cb48d3398df7fd60c6c62bed40c5372a" +dependencies = [ + "cmake", +] + [[package]] name = "order-stat" version = "0.1.3" @@ -3714,6 +3732,7 @@ dependencies = [ "log", "ndarray 0.17.2", "openmls", + "opusic-c", "rand 0.9.4", "reqwest", "retina", diff --git a/camera_hub/Cargo.toml b/camera_hub/Cargo.toml index fe625e2..4c8f5b3 100644 --- a/camera_hub/Cargo.toml +++ b/camera_hub/Cargo.toml @@ -8,7 +8,7 @@ authors = ["Ardalan Amiri Sani "] default = ["logging"] logging = ["log"] ip = ["dep:rpassword", "dep:reqwest", "dep:http-auth", "dep:linfa", "dep:linfa-clustering", "dep:retina", "dep:serde_yaml2", "dep:ndarray", "dep:futures", "secluso-client-lib/camera_secret_qrcode"] -raspberry = ["dep:secluso-motion-ai"] +raspberry = ["dep:secluso-motion-ai", "dep:opusic-c"] manual = [] telemetry = [] # todo: dep on the motion_ai crate test = [] @@ -46,3 +46,4 @@ linfa-clustering = { version = "0.8.1", optional = true } # Raspberry Specific Dependencies secluso-motion-ai = { path = "../motion_ai/pipeline", optional = true, default-features = false } +opusic-c = { version = "1.6.1", optional = true } diff --git a/camera_hub/src/mp4.rs b/camera_hub/src/mp4.rs index 70c8335..2ebd401 100644 --- a/camera_hub/src/mp4.rs +++ b/camera_hub/src/mp4.rs @@ -70,13 +70,14 @@ pub struct TrakTrackerCore { /// is calculated using the PTS of the following sample. pub durations: Vec<(u32, u32)>, pub last_pts: Option, + pub last_duration: Option, pub tot_duration: u64, } impl TrakTrackerCore { pub fn finish(&mut self) { if self.last_pts.is_some() { - self.durations.push((1, 0)); + self.durations.push((1, self.last_duration.unwrap_or(0))); } } @@ -172,6 +173,7 @@ impl TrakTracker { }; self.core.tot_duration += duration; let duration = u32::try_from(duration)?; + self.core.last_duration = Some(duration); match self.core.durations.last_mut() { Some((s, d)) if *d == duration => *s += 1, _ => self.core.durations.push((1, duration)), @@ -350,26 +352,24 @@ impl Mp4WriterCor self.audio_params.write_codec_box(buf)?; }); audio_trak_core.write_common_stbl_parts(buf)?; - - // AAC requires two samples (really, each is a set of 960 or 1024 samples) - // to decode accurately. See - // https://developer.apple.com/library/archive/documentation/QuickTime/QTFF/QTFFAppenG/QTFFAppenG.html . - write_box!(buf, b"sgpd", { - // BMFF section 8.9.3: SampleGroupDescriptionBox - buf.put_u32(0); // version - buf.extend_from_slice(b"roll"); // grouping type - buf.put_u32(1); // entry_count - // BMFF section 10.1: AudioRollRecoveryEntry - buf.put_i16(-1); // roll_distance - }); - write_box!(buf, b"sbgp", { - // BMFF section 8.9.2: SampleToGroupBox - buf.put_u32(0); // version - buf.extend_from_slice(b"roll"); // grouping type - buf.put_u32(1); // entry_count - buf.put_u32(audio_trak_core.samples); - buf.put_u32(1); // group_description_index - }); + if let Some(roll_distance) = self.audio_params.audio_roll_distance() { + write_box!(buf, b"sgpd", { + // BMFF section 8.9.3: SampleGroupDescriptionBox + buf.put_u32(0); // version + buf.extend_from_slice(b"roll"); // grouping type + buf.put_u32(1); // entry_count + // BMFF section 10.1: AudioRollRecoveryEntry + buf.put_i16(roll_distance); + }); + write_box!(buf, b"sbgp", { + // BMFF section 8.9.2: SampleToGroupBox + buf.put_u32(0); // version + buf.extend_from_slice(b"roll"); // grouping type + buf.put_u32(1); // entry_count + buf.put_u32(audio_trak_core.samples); + buf.put_u32(1); // group_description_index + }); + } }); }); }); @@ -400,6 +400,7 @@ impl Result<(), Error> { self.video_trak.core.finish(); self.audio_trak.core.finish(); + let movie_timescale = 90_000u64; + let movie_duration = self.video_trak.core.tot_duration.max( + self.audio_trak + .core + .tot_duration + .saturating_mul(movie_timescale) + / u64::from(self.core.audio_params.get_clock_rate().max(1)), + ); let mut buf = BytesMut::with_capacity( 1024 + self.video_trak.core.size_estimate() + self.audio_trak.core.size_estimate() @@ -426,8 +435,8 @@ impl 0 { self.core diff --git a/camera_hub/src/raspberry_pi/rpi_camera.rs b/camera_hub/src/raspberry_pi/rpi_camera.rs index 0b8d28b..e148521 100644 --- a/camera_hub/src/raspberry_pi/rpi_camera.rs +++ b/camera_hub/src/raspberry_pi/rpi_camera.rs @@ -36,6 +36,16 @@ use tokio::runtime::Runtime; const TOTAL_FRAME_RATE: usize = 10; const I_FRAME_INTERVAL: usize = TOTAL_FRAME_RATE; // 1-second fragments +// We currently standardize Raspberry Pi audio on mono 48 kHz Opus. +// 20 ms packets are the common low-latency Opus framing and map to 960 PCM samples at 48 kHz. +pub const AUDIO_SAMPLE_RATE: u32 = 48_000; +pub const AUDIO_CHANNELS: u16 = 1; +pub const OPUS_FRAME_SAMPLES: u32 = 960; // 20 ms at 48 kHz +pub const OPUS_BITRATE_BPS: u32 = 32_000; +pub const OPUS_COMPLEXITY: u8 = 3; +// 80 ms is the recommended preroll for random access and is = to 3840 samples at 48 kHz. +pub const OPUS_PREROLL_SAMPLES: u16 = 3_840; // 80 ms at 48 kHz + //These are for our local SPS/PPS channel #[derive(PartialEq, Debug, Clone)] pub enum FrameKind { @@ -141,7 +151,8 @@ impl RaspberryPiCamera { debug!("Exited controller tick loop"); }); - let resolution: CameraResolution = Self::fetch_resolution().expect("A supported camera module was not found"); + let resolution: CameraResolution = + Self::fetch_resolution().expect("A supported camera module was not found"); // Start the new shared stream. rpi_dual_stream::start( @@ -154,7 +165,7 @@ impl RaspberryPiCamera { ps_tx, motion_fps as u8, ) - .expect("Failed to start shared stream"); + .expect("Failed to start shared stream"); rpi_dual_stream::start_audio(Arc::clone(&frame_queue)) .expect("Failed to start audio stream"); @@ -199,6 +210,7 @@ impl RaspberryPiCamera { mp4: &mut M, duration: Option, frame_queue: Arc>>, + include_audio: bool, ) -> Result<(), Error> { let recording_window = duration.map(|secs| Duration::new(secs, 0)); let recording_start_time = Instant::now(); @@ -236,9 +248,13 @@ impl RaspberryPiCamera { if started { if frame.kind == FrameKind::Audio { + // TODO: Livestream fMP4 audio is disabled for now. + if !include_audio { + continue; + } let ts_audio = audio_sample_count; // 48k timescale mp4.audio(&frame.data, ts_audio).await?; - audio_sample_count += 1024; // AAC-LC fixed + audio_sample_count += u64::from(OPUS_FRAME_SAMPLES); continue; } else { let ts = video_frame_count * ticks_per_video_frame; @@ -315,10 +331,10 @@ impl RaspberryPiCamera { RpiCameraAudioParameters::new(), file, ) - .await?; + .await?; // Process the rest of the frames, writing both to the MP4 writer and to the raw file. - Self::copy(&mut mp4, Some(duration), frame_queue).await?; + Self::copy(&mut mp4, Some(duration), frame_queue, true).await?; mp4.finish().await?; Ok(()) @@ -435,10 +451,10 @@ impl RaspberryPiCamera { RpiCameraAudioParameters::new(), livestream_writer, ) - .await?; + .await?; fmp4.finish_header(None).await?; - Self::copy(&mut fmp4, None, frame_queue).await?; + Self::copy(&mut fmp4, None, frame_queue, false).await?; Ok(()) } @@ -556,7 +572,7 @@ impl RaspberryPiCamera { }) } else { None - } + }; } } @@ -608,7 +624,7 @@ impl Camera for RaspberryPiCamera { Arc::clone(&self.frame_queue), self.sps_frame.clone(), self.pps_frame.clone(), - self.resolution.clone() + self.resolution.clone(), ); rt.block_on(future).unwrap(); @@ -634,7 +650,7 @@ impl Camera for RaspberryPiCamera { frame_queue_clone, sps_frame_clone, pps_frame_clone, - resolution_clone + resolution_clone, ); if let Err(e) = rt.block_on(future) { eprintln!("[Livestream] write_fmp4 error: {e:?}"); @@ -669,7 +685,11 @@ struct RpiCameraVideoParameters { impl RpiCameraVideoParameters { pub fn new(sps: Vec, pps: Vec, dimensions: CameraResolution) -> Self { - Self { sps, pps, dimensions } + Self { + sps, + pps, + dimensions, + } } } @@ -757,95 +777,50 @@ impl CodecParameters for RpiCameraVideoParameters { } fn get_dimensions(&self) -> (u32, u32) { - ((self.dimensions.width as u32) << 16, (self.dimensions.height as u32) << 16) + ( + (self.dimensions.width as u32) << 16, + (self.dimensions.height as u32) << 16, + ) } } struct RpiCameraAudioParameters { - sample_rate: u32, // 48000 - channels: u16, // 1 - asc: [u8; 2], // AudioSpecificConfig + sample_rate: u32, + channels: u16, } impl RpiCameraAudioParameters { fn new() -> Self { Self { - sample_rate: 48_000, - channels: 1, - asc: [0x11, 0x88], // AAC-LC, 48kHz (idx=3), mono (1) + sample_rate: AUDIO_SAMPLE_RATE, + channels: AUDIO_CHANNELS, } } } impl CodecParameters for RpiCameraAudioParameters { fn write_codec_box(&self, buf: &mut BytesMut) -> Result<(), Error> { - write_box!(buf, b"mp4a", { - // 6 reserved bytes - buf.put_u8(0); - buf.put_u8(0); - buf.put_u8(0); - buf.put_u8(0); - buf.put_u8(0); - buf.put_u8(0); - - // data_reference_index + write_box!(buf, b"Opus", { + // AudioSampleEntry layout mirrors mp4a + // however, Opus-specific decoder metadata goes in the dOps child box defined by the Opus-in-ISOBMFF mapping. + buf.extend_from_slice(&[0; 6]); buf.put_u16(1); - - // AudioSampleEntry - buf.put_u16(0); // version - buf.put_u16(0); // revision - buf.put_u32(0); // vendor - - buf.put_u16(self.channels); // channelcount - buf.put_u16(16); // samplesize - buf.put_u16(0); // compressionid - buf.put_u16(0); // packetsize - buf.put_u32(self.sample_rate << 16); // samplerate 16.16 - - // ES Descriptor box - write_box!(buf, b"esds", { - // FullBox: version(1) + flags(3) - buf.put_u32(0); - - let asc = &self.asc; - - // Build the descriptor payload in a temp vec, then write with size. - let mut d = Vec::new(); - - // ES_Descriptor - d.push(0x03); - let mut es_payload = Vec::new(); - es_payload.extend_from_slice(&0x0002u16.to_be_bytes()); // ES_ID - es_payload.push(0x00); // flags - - // DecoderConfigDescriptor (tag 0x04) --- - let mut dec_payload = Vec::new(); - dec_payload.push(0x40); // objectTypeIndication = MPEG-4 Audio - dec_payload.push(0x15); // streamType=5 (AudioStream) <<2 | 1 reserved - dec_payload.extend_from_slice(&[0x00, 0x00, 0x00]); // bufferSizeDB (unknown) - dec_payload.extend_from_slice(&0u32.to_be_bytes()); // maxBitrate - dec_payload.extend_from_slice(&0u32.to_be_bytes()); // avgBitrate - - // DecoderSpecificInfo (tag 0x05) inside DecoderConfigDescriptor - dec_payload.push(0x05); - dec_payload.push(asc.len() as u8); - dec_payload.extend_from_slice(asc); - - // Write DecoderConfigDescriptor: tag + length + payload - es_payload.push(0x04); - es_payload.extend_from_slice(&write_desc_len(dec_payload.len())); - es_payload.extend_from_slice(&dec_payload); - - // SLConfigDescriptor (tag 0x06) is a sibling of 0x04 inside ES_Descriptor - es_payload.push(0x06); - es_payload.push(0x01); - es_payload.push(0x02); - - // Write length for ES_Descriptor - d.extend_from_slice(&write_desc_len(es_payload.len())); - d.extend_from_slice(&es_payload); - - buf.extend_from_slice(&d); + buf.put_u16(0); + buf.put_u16(0); + buf.put_u32(0); + buf.put_u16(self.channels); + buf.put_u16(16); + buf.put_u16(0); + buf.put_u16(0); + buf.put_u32(self.sample_rate << 16); + + write_box!(buf, b"dOps", { + buf.put_u8(0); // version + buf.put_u8(self.channels as u8); + buf.put_u16(OPUS_PREROLL_SAMPLES); // pre_skip / decoder preroll in PCM samples + buf.put_u32(self.sample_rate); // original input sample rate + buf.put_i16(0); // output gain + buf.put_u8(0); // channel mapping family }); }); @@ -859,28 +834,8 @@ impl CodecParameters for RpiCameraAudioParameters { fn get_dimensions(&self) -> (u32, u32) { (0, 0) } -} -// Size encoding: 7-bit continuation -fn write_desc_len(mut len: usize) -> Vec { - // Up to 4 bytes - let mut out = vec![0u8; 4]; - for i in (0..4).rev() { - out[i] = (len & 0x7F) as u8; - len >>= 7; - } - // Set continuation bit on first 3 - out[0] |= 0x80; - out[1] |= 0x80; - out[2] |= 0x80; - - // Trim leading 0x80 chunks if possible - while out.len() > 1 && out[0] == 0x80 && (out[1] & 0x80) != 0 { - out.remove(0); - } - // If needed, also trim if the very first byte is 0 - while out.len() > 1 && out[0] == 0 { - out.remove(0); + fn audio_roll_distance(&self) -> Option { + Some(-(OPUS_PREROLL_SAMPLES as i16)) } - out } diff --git a/camera_hub/src/raspberry_pi/rpi_dual_stream.rs b/camera_hub/src/raspberry_pi/rpi_dual_stream.rs index 7b3b50b..fbfb062 100644 --- a/camera_hub/src/raspberry_pi/rpi_dual_stream.rs +++ b/camera_hub/src/raspberry_pi/rpi_dual_stream.rs @@ -7,22 +7,55 @@ use std::collections::VecDeque; use std::os::unix::net::UnixStream; use std::sync::{Arc, Mutex}; use std::thread::sleep; -use std::time::{SystemTime}; +use std::time::SystemTime; use std::{ io::{BufReader, Read, Write}, - process::{Command, Stdio}, + process::{Child, ChildStdout, Command, Stdio}, thread, time::Duration, }; -use bytes::Buf; -use crate::raspberry_pi::rpi_camera::{Frame, FrameKind}; +use crate::raspberry_pi::rpi_camera::{ + Frame, FrameKind, OPUS_BITRATE_BPS, OPUS_COMPLEXITY, OPUS_FRAME_SAMPLES, +}; use anyhow::anyhow; use bytes::BytesMut; use crossbeam_channel::Sender; +use opusic_c::{Application, Bitrate, Channels, Encoder, Signal}; use secluso_motion_ai::frame::RawFrame; use secluso_motion_ai::logic::pipeline::PipelineController; +const AUDIO_PROBE_PACKETS: usize = 50; +const AUDIO_SIGNAL_RMS_THRESHOLD: f64 = 8.0; +const AUDIO_SIGNAL_PEAK_THRESHOLD: i16 = 32; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum AudioCaptureMode { + I2s32StereoLeft, + Mono16, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +struct AudioCaptureCandidate { + device: String, + mode: AudioCaptureMode, + forced: bool, +} + +#[derive(Clone, Copy, Debug)] +enum I2sDecodeMode { + LeftShift16, + LeftShift8, + RightShift16, + RightShift8, +} + +#[derive(Clone, Copy, Debug)] +struct AudioProbeStats { + rms: f64, + peak: i16, +} + /// Provides two channels: one for raw YUV420 frames from rpicam‑vid (for motion detection), one for H.264 frames converted by rpicam-vid. #[allow(clippy::too_many_arguments)] pub fn start( @@ -274,45 +307,209 @@ fn extract_h264_frame(buffer: &mut BytesMut) -> anyhow::Result> { Ok(Some(Frame::new(nal_unit.to_vec(), kind))) } -fn adts_frame_len(header: &[u8]) -> Option { - if header.len() < 7 { return None; } - // syncword 0xFFF - if header[0] != 0xFF || (header[1] & 0xF0) != 0xF0 { return None; } - let protection_absent = header[1] & 0x01; - let hdr_len = if protection_absent == 1 { 7 } else { 9 }; +pub fn start_audio( + frame_queue: Arc>>, +) -> Result<(), Box> { + // Spawn a thread to continuously + // (1) look for a usable ALSA capture device & + // (2) encode one 20 ms Opus packet at a time from whichever source is available. + thread::spawn(move || loop { + let candidates = audio_device_candidates(); + let mut used_device = false; + + for candidate in candidates { + match spawn_arecord(&candidate) { + Ok((mut child, stdout)) => { + eprintln!( + "Using audio input device: {} ({:?}, forced={})", + candidate.device, candidate.mode, candidate.forced + ); + used_device = encode_audio_stream(stdout, &candidate, Arc::clone(&frame_queue)); + let _ = child.wait(); + + if used_device { + eprintln!( + "Audio input device ended: {}. Retrying discovery.", + candidate.device + ); + break; + } + + eprintln!( + "Audio input device produced no audio: {}. Trying next candidate.", + candidate.device + ); + } + Err(err) => { + eprintln!( + "Failed to start audio input device {} ({:?}): {:?}", + candidate.device, candidate.mode, err + ); + } + } + } - let frame_length = (((header[3] & 0x03) as usize) << 11) - | ((header[4] as usize) << 3) - | (((header[5] & 0xE0) as usize) >> 5); + if !used_device { + eprintln!("No usable audio input device found. Retrying in 5 seconds."); + thread::sleep(Duration::from_secs(5)); + } else { + thread::sleep(Duration::from_secs(1)); + } + }); - if frame_length < hdr_len { return None; } - Some(frame_length) + Ok(()) } -fn strip_adts(frame: &[u8]) -> Option<&[u8]> { - if frame.len() < 7 { return None; } - if frame[0] != 0xFF || (frame[1] & 0xF0) != 0xF0 { return None; } - let protection_absent = frame[1] & 0x01; - let hdr_len = if protection_absent == 1 { 7 } else { 9 }; - if frame.len() < hdr_len { return None; } - Some(&frame[hdr_len..]) +fn audio_device_candidates() -> Vec { + let mut devices = Vec::new(); + + if let Ok(device) = std::env::var("SECLUSO_AUDIO_DEVICE") { + let device = device.trim(); + if !device.is_empty() { + eprintln!( + "SECLUSO_AUDIO_DEVICE is set. Forcing audio capture attempts on {}.", + device + ); + devices.push(AudioCaptureCandidate { + device: device.to_string(), + mode: AudioCaptureMode::I2s32StereoLeft, + forced: true, + }); + devices.push(AudioCaptureCandidate { + device: device.to_string(), + mode: AudioCaptureMode::Mono16, + forced: true, + }); + eprintln!( + "Audio candidate list: {} ({:?}, forced), {} ({:?}, forced).", + device, + AudioCaptureMode::I2s32StereoLeft, + device, + AudioCaptureMode::Mono16 + ); + return devices; + } + } + + devices.push(AudioCaptureCandidate { + device: "plughw:ICS43432Mic,0".to_string(), + mode: AudioCaptureMode::I2s32StereoLeft, + forced: false, + }); + // We prioritize the HAT capture device first because that is our default config + eprintln!( + "Audio candidate list starts with HAT device {} ({:?}).", + devices[0].device, devices[0].mode + ); + + for device in detect_usb_capture_devices() { + let candidate = AudioCaptureCandidate { + device, + mode: AudioCaptureMode::Mono16, + forced: false, + }; + if !devices.iter().any(|existing| existing == &candidate) { + eprintln!( + "Discovered USB audio candidate {} ({:?}).", + candidate.device, candidate.mode + ); + devices.push(candidate); + } + } + + eprintln!("Final audio candidate count: {}.", devices.len()); + devices } -pub fn start_audio( - frame_queue: Arc>>, -) -> Result<(), Box> { +fn detect_usb_capture_devices() -> Vec { + let output = match Command::new("arecord") + .arg("-l") + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .output() + { + Ok(output) => output, + Err(err) => { + eprintln!("Failed to enumerate ALSA capture devices: {:?}", err); + return Vec::new(); + } + }; - let cmd = "\ - arecord -D plughw:0,0 -f S16_LE -r 48000 -c 1 -t raw | \ - sox -t raw -b 16 -e signed-integer -r 48000 -c 1 - \ - -t raw -b 16 -e signed-integer -r 48000 -c 1 - \ - highpass 100 lowpass 7000 gain 20 | \ - fdkaac --raw --raw-channels 1 --raw-rate 48000 \ - --bitrate 96k --transport-format 2 -o - -"; + let listing = String::from_utf8_lossy(&output.stdout); + let mut devices = Vec::new(); - let mut child = Command::new("sh") - .arg("-c") - .arg(cmd) + for line in listing.lines() { + let lower = line.to_ascii_lowercase(); + // Catch the class-compliant USB microphone devices that have been tested (and not unrelated ALSA hardware) + if !lower.contains("usb") && !lower.contains("c-media") { + continue; + } + + if let Some((card, device)) = parse_arecord_card_and_device(line) { + eprintln!("Matched USB ALSA capture line: {}", line.trim()); + devices.push(format!("plughw:{},{}", card, device)); + } + } + + if devices.is_empty() { + eprintln!("No USB ALSA capture devices matched arecord -l output."); + } + + devices +} + +fn parse_arecord_card_and_device(line: &str) -> Option<(u32, u32)> { + let card_start = line.find("card ")? + "card ".len(); + let card_end = line[card_start..].find(':')? + card_start; + let device_marker = ", device "; + let device_start = line.find(device_marker)? + device_marker.len(); + let device_end = line[device_start..].find(':')? + device_start; + + let card = line[card_start..card_end].trim().parse().ok()?; + let device = line[device_start..device_end].trim().parse().ok()?; + + Some((card, device)) +} + +fn spawn_arecord( + candidate: &AudioCaptureCandidate, +) -> Result<(Child, ChildStdout), Box> { + let args: &[&str] = match candidate.mode { + AudioCaptureMode::I2s32StereoLeft => &[ + "-D", + &candidate.device, + "-f", + "S32_LE", + "-r", + "48000", + "-c", + "2", + "-t", + "raw", + ], + AudioCaptureMode::Mono16 => &[ + "-D", + &candidate.device, + "-f", + "S16_LE", + "-r", + "48000", + "-c", + "1", + "-t", + "raw", + ], + }; + + eprintln!( + "Launching arecord for {} with mode {:?}: arecord {}", + candidate.device, + candidate.mode, + args.join(" ") + ); + + let mut child = Command::new("arecord") + .args(args) .stdout(Stdio::piped()) .stderr(Stdio::null()) .spawn()?; @@ -321,47 +518,300 @@ pub fn start_audio( .take() .ok_or_else(|| anyhow!("Failed to capture stdout"))?; - // Spawn a thread to read arecord|sox's stdout and extract audio frames. - { - thread::spawn(move || { - let mut r = BufReader::new(stdout); - let mut buf = BytesMut::with_capacity(64 * 1024); - let mut tmp = [0u8; 4096]; + Ok((child, stdout)) +} - loop { - match r.read(&mut tmp) { - Ok(0) => break, - Ok(n) => { - buf.extend_from_slice(&tmp[..n]); - - loop { - if buf.len() < 7 { break; } - let len = match adts_frame_len(&buf[..7]) { - Some(l) => l, - None => { - // resync: drop 1 byte - buf.advance(1); - continue; - } - }; - if buf.len() < len { break; } - - let adts = buf.split_to(len).to_vec(); - if let Some(aac_au) = strip_adts(&adts) { - let frame = Frame { - data: aac_au.to_vec(), - kind: FrameKind::Audio, - timestamp: SystemTime::now(), - }; - add_frame_and_drop_old(Arc::clone(&frame_queue), frame); - } - } - } - Err(_) => break, +fn encode_audio_stream( + stdout: ChildStdout, + candidate: &AudioCaptureCandidate, + frame_queue: Arc>>, +) -> bool { + let mut r = BufReader::new(stdout); + let mut encoder = match Encoder::new( + Channels::Mono, + opusic_c::SampleRate::Hz48000, + Application::Audio, + ) { + Ok(encoder) => encoder, + Err(err) => { + eprintln!("Failed to initialize Opus encoder: {:?}", err); + return false; + } + }; + if let Err(err) = encoder.set_bitrate(Bitrate::Value(OPUS_BITRATE_BPS)) { + eprintln!("Failed to set Opus bitrate: {:?}", err); + return false; + } + if let Err(err) = encoder.set_vbr(true) { + eprintln!("Failed to enable Opus VBR: {:?}", err); + return false; + } + if let Err(err) = encoder.set_complexity(OPUS_COMPLEXITY) { + eprintln!("Failed to set Opus complexity: {:?}", err); + return false; + } + if let Err(err) = encoder.set_signal(Signal::Voice) { + eprintln!("Failed to set Opus signal mode: {:?}", err); + return false; + } + + let bytes_per_frame = match candidate.mode { + AudioCaptureMode::I2s32StereoLeft => (OPUS_FRAME_SAMPLES as usize) * 8, + AudioCaptureMode::Mono16 => (OPUS_FRAME_SAMPLES as usize) * 2, + }; + eprintln!( + "Audio encoder configured for {}: opus_frame_samples={} bytes_per_capture_frame={} bitrate_bps={} complexity={}.", + candidate.device, + OPUS_FRAME_SAMPLES, + bytes_per_frame, + OPUS_BITRATE_BPS, + OPUS_COMPLEXITY + ); + let mut pcm_bytes = vec![0u8; bytes_per_frame]; + let mut pcm_samples = vec![0u16; OPUS_FRAME_SAMPLES as usize]; + let mut opus_packet = vec![0u8; 1_500]; + let mut encoded_any = false; + let mut probe_raw = Vec::with_capacity(bytes_per_frame * AUDIO_PROBE_PACKETS); + + // We collect an initial probe window before committing to this device (basically a sanity check) + // That gives us enough PCM to both detect obvious silence and, for I2S microphones, infer which slot/shift interpretation contains the live samples. + for _ in 0..AUDIO_PROBE_PACKETS { + if r.read_exact(&mut pcm_bytes).is_err() { + break; + } + probe_raw.extend_from_slice(&pcm_bytes); + } + + if probe_raw.is_empty() { + eprintln!( + "Audio probe failed: device {} opened but no PCM bytes arrived.", + candidate.device + ); + return false; + } + + let i2s_decode_mode = match candidate.mode { + AudioCaptureMode::I2s32StereoLeft => { + let mut best_mode = I2sDecodeMode::LeftShift16; + let mut best_stats = AudioProbeStats { rms: 0.0, peak: 0 }; + + // The I2S microphone has not always presented its usable 16-bit audio samples in the same byte position / slot arrangement across our boards and overlays. + // Thus, we score a few plausible interpretations and keep whichever one produces the strongest signal. + for mode in [ + I2sDecodeMode::LeftShift16, + I2sDecodeMode::LeftShift8, + I2sDecodeMode::RightShift16, + I2sDecodeMode::RightShift8, + ] { + let stats = probe_i2s_mode(&probe_raw, mode); + if stats.rms > best_stats.rms + || (stats.rms == best_stats.rms && stats.peak > best_stats.peak) + { + best_mode = mode; + best_stats = stats; } } - }); + + eprintln!( + "Audio probe for {} picked {:?} with rms={:.2} peak={}.", + candidate.device, best_mode, best_stats.rms, best_stats.peak + ); + Some(best_mode) + } + AudioCaptureMode::Mono16 => { + let stats = probe_mono16(&probe_raw); + eprintln!( + "Audio probe for {} mono16 rms={:.2} peak={}.", + candidate.device, stats.rms, stats.peak + ); + None + } + }; + + let probe_stats = match candidate.mode { + AudioCaptureMode::I2s32StereoLeft => probe_i2s_mode( + &probe_raw, + i2s_decode_mode.expect("i2s decode mode must be selected"), + ), + AudioCaptureMode::Mono16 => probe_mono16(&probe_raw), + }; + let signal_present = probe_stats.rms >= AUDIO_SIGNAL_RMS_THRESHOLD + || probe_stats.peak >= AUDIO_SIGNAL_PEAK_THRESHOLD; + eprintln!( + "Audio signal status for {}: pcm_bytes={} signal_present={} rms={:.2} peak={} thresholds(rms>={:.2}, peak>={}).", + candidate.device, + probe_raw.len(), + signal_present, + probe_stats.rms, + probe_stats.peak, + AUDIO_SIGNAL_RMS_THRESHOLD, + AUDIO_SIGNAL_PEAK_THRESHOLD + ); + + if !signal_present && !candidate.forced { + // For auto-discovered devices, a silent probe means we should probably go to the next candidate on the list + // Allows us to skip a dead/default ALSA source and keep searching for a live microphone. + eprintln!( + "Audio probe saw no meaningful signal on {}. Trying another candidate.", + candidate.device + ); + return false; } - Ok(()) -} \ No newline at end of file + eprintln!( + "Beginning steady-state audio encode for {} using mode {:?} and decode {:?}.", + candidate.device, candidate.mode, i2s_decode_mode + ); + let start_time = SystemTime::now(); + let mut packets_encoded = 0usize; + let mut audio_bytes_encoded = 0usize; + + for frame in probe_raw.chunks_exact(bytes_per_frame) { + fill_pcm_samples(frame, candidate.mode, i2s_decode_mode, &mut pcm_samples); + if let Some(encoded_len) = + encode_packet(&mut encoder, &pcm_samples, &mut opus_packet, &frame_queue) + { + encoded_any = true; + packets_encoded += 1; + audio_bytes_encoded += encoded_len; + } + } + + loop { + if r.read_exact(&mut pcm_bytes).is_err() { + break; + } + + fill_pcm_samples( + &pcm_bytes, + candidate.mode, + i2s_decode_mode, + &mut pcm_samples, + ); + + if let Some(encoded_len) = + encode_packet(&mut encoder, &pcm_samples, &mut opus_packet, &frame_queue) + { + encoded_any = true; + packets_encoded += 1; + audio_bytes_encoded += encoded_len; + if packets_encoded % 250 == 0 { + let elapsed = start_time.elapsed().unwrap_or_default(); + eprintln!( + "Audio encode progress for {}: packets={} opus_bytes={} elapsed_ms={}.", + candidate.device, + packets_encoded, + audio_bytes_encoded, + elapsed.as_millis() + ); + } + } + } + + let elapsed = start_time.elapsed().unwrap_or_default(); + eprintln!( + "Audio encode loop finished for {}: encoded_any={} packets={} opus_bytes={} elapsed_ms={}.", + candidate.device, + encoded_any, + packets_encoded, + audio_bytes_encoded, + elapsed.as_millis() + ); + + encoded_any +} + +fn encode_packet( + encoder: &mut Encoder, + pcm_samples: &[u16], + opus_packet: &mut [u8], + frame_queue: &Arc>>, +) -> Option { + match encoder.encode_to_slice(pcm_samples, opus_packet) { + Ok(encoded_len) if encoded_len > 0 => { + let frame = Frame { + data: opus_packet[..encoded_len].to_vec(), + kind: FrameKind::Audio, + timestamp: SystemTime::now(), + }; + add_frame_and_drop_old(Arc::clone(frame_queue), frame); + Some(encoded_len) + } + Ok(_) => None, + Err(err) => { + eprintln!("Failed to encode Opus audio: {:?}", err); + None + } + } +} + +fn fill_pcm_samples( + bytes: &[u8], + mode: AudioCaptureMode, + i2s_decode_mode: Option, + pcm_samples: &mut [u16], +) { + match mode { + AudioCaptureMode::I2s32StereoLeft => { + let decode_mode = i2s_decode_mode.expect("i2s decode mode must be selected"); + for (dst, frame) in pcm_samples.iter_mut().zip(bytes.chunks_exact(8)) { + *dst = decode_i2s_sample(frame, decode_mode); + } + } + AudioCaptureMode::Mono16 => { + for (dst, chunk) in pcm_samples.iter_mut().zip(bytes.chunks_exact(2)) { + *dst = u16::from_le_bytes([chunk[0], chunk[1]]); + } + } + } +} + +fn probe_mono16(bytes: &[u8]) -> AudioProbeStats { + let samples = bytes + .chunks_exact(2) + .map(|chunk| i16::from_le_bytes([chunk[0], chunk[1]])); + compute_probe_stats(samples) +} + +fn probe_i2s_mode(bytes: &[u8], mode: I2sDecodeMode) -> AudioProbeStats { + let samples = bytes + .chunks_exact(8) + .map(|frame| i16::from_ne_bytes(decode_i2s_sample(frame, mode).to_ne_bytes())); + compute_probe_stats(samples) +} + +fn compute_probe_stats(samples: impl Iterator) -> AudioProbeStats { + let mut sum_sq = 0.0f64; + let mut count = 0usize; + let mut peak = 0i16; + + for sample in samples { + let abs = sample.saturating_abs(); + if abs > peak { + peak = abs; + } + let sample_f = f64::from(sample); + sum_sq += sample_f * sample_f; + count += 1; + } + + let rms = if count == 0 { + 0.0 + } else { + (sum_sq / count as f64).sqrt() + }; + + AudioProbeStats { rms, peak } +} + +fn decode_i2s_sample(frame: &[u8], mode: I2sDecodeMode) -> u16 { + let left = i32::from_le_bytes([frame[0], frame[1], frame[2], frame[3]]); + let right = i32::from_le_bytes([frame[4], frame[5], frame[6], frame[7]]); + let sample_i16 = match mode { + I2sDecodeMode::LeftShift16 => (left >> 16) as i16, + I2sDecodeMode::LeftShift8 => (left >> 8) as i16, + I2sDecodeMode::RightShift16 => (right >> 16) as i16, + I2sDecodeMode::RightShift8 => (right >> 8) as i16, + }; + sample_i16 as u16 +} diff --git a/camera_hub/src/traits.rs b/camera_hub/src/traits.rs index 189bfd3..78a910d 100644 --- a/camera_hub/src/traits.rs +++ b/camera_hub/src/traits.rs @@ -16,6 +16,9 @@ pub trait CodecParameters { fn write_codec_box(&self, buf: &mut BytesMut) -> Result<(), Error>; fn get_clock_rate(&self) -> u32; fn get_dimensions(&self) -> (u32, u32); + fn audio_roll_distance(&self) -> Option { + None + } } #[cfg(any(feature = "raspberry", feature = "ip"))]