Audio Sync with Cross-Correlation in Dart
Finding the time offset between two audio recordings using signal processing
The Problem
When recording guitar with video on your phone while simultaneously recording high-quality audio on a dedicated device, you end up with two files:
- Video file with low-quality audio (phone mic)
- WAV file with pristine audio (Hoopi device)
To combine them, we need to find the time offset between the two audio tracks.
Cross-Correlation Explained
Cross-correlation measures similarity between two signals at different time offsets:
Mathematical Definition
$$R_{xy}[\tau] = \sum_{n} x[n] \cdot y[n + \tau]$$
Where:
- $x$ = reference signal (phone audio)
- $y$ = target signal (device audio)
- $\tau$ = lag (offset being tested)
- $R_{xy}[\tau]$ = correlation at lag $\tau$
Naive Implementation
The simplest approach: slide one signal across the other:
/// Brute-force cross-correlation search.
///
/// Slides [target] against [reference] over every lag in
/// [-maxLag, maxLag] and returns the lag with the largest dot
/// product over the overlapping region. A positive result means the
/// target signal is ahead of the reference.
///
/// Complexity is O(reference.length * maxLag) — fine for short
/// clips, far too slow for long recordings (use the FFT path).
int findOffsetNaive(Float64List reference, Float64List target, int maxLag) {
  var bestLag = 0;
  var bestScore = double.negativeInfinity;
  // Sweep every candidate lag, keeping the first best score seen.
  for (var lag = -maxLag; lag <= maxLag; lag++) {
    var score = 0.0;
    for (var i = 0; i < reference.length; i++) {
      final j = i + lag;
      // Only accumulate where the shifted index lands inside target.
      if (j < 0 || j >= target.length) continue;
      score += reference[i] * target[j];
    }
    if (score > bestScore) {
      bestScore = score;
      bestLag = lag;
    }
  }
  return bestLag; // Positive = target is ahead
}
Problem: O(n × maxLag) complexity. For 30 seconds at 48kHz with ±5 second search:
- 1.44M samples × 480K lags = 691 billion operations
FFT-Based Approach
Cross-correlation can be computed efficiently via FFT:
$$R_{xy} = \mathcal{F}^{-1}(\mathcal{F}(x) \cdot \mathcal{F}^*(y))$$
Complexity: O(n log n) - much better!
Practical Implementation
Downsampling First
We don't need full 48kHz resolution to find sync. Downsample to ~4kHz:
/// Decimates [samples] by [factor], averaging each window of
/// [factor] consecutive input samples into one output sample.
///
/// Averaging (rather than picking every Nth sample) gives a crude
/// low-pass effect. Trailing samples that do not fill a complete
/// window are dropped.
Float64List downsample(Float64List samples, int factor) {
  final outLength = samples.length ~/ factor;
  final out = Float64List(outLength);
  var src = 0;
  for (var i = 0; i < outLength; i++) {
    var acc = 0.0;
    for (var j = 0; j < factor; j++) {
      acc += samples[src++];
    }
    out[i] = acc / factor;
  }
  return out;
}
Zero-Padding for Linear Correlation
/// Returns the smallest power of two that is greater than or equal
/// to [n].
///
/// For n <= 1 the result is 1, which is the minimum useful FFT
/// length.
int nextPowerOf2(int n) {
  var p = 1;
  while (p < n) {
    p <<= 1; // Double until we reach or pass n.
  }
  return p;
}
/// Extends [signal] with trailing zeros to [targetLength].
///
/// If the signal is already at least [targetLength] samples long it
/// is returned as-is (no truncation, no copy). Zero-padding before
/// the FFT turns circular correlation into linear correlation.
Float64List zeroPad(Float64List signal, int targetLength) {
  if (targetLength <= signal.length) {
    return signal;
  }
  // Float64List is zero-initialized, so only the prefix is copied.
  final padded = Float64List(targetLength)
    ..setRange(0, signal.length, signal);
  return padded;
}
The Core Algorithm
/// Result of an audio-sync search.
class AudioSyncResult {
  /// Offset in original-rate samples; positive means the target
  /// audio is ahead of the reference.
  final int offsetSamples;

  /// [offsetSamples] expressed in seconds.
  final double offsetSeconds;

  /// Peak-quality score in [0, 1]; higher means a more reliable match.
  final double confidence;

  /// All fields are final and value-like, so the constructor is
  /// `const` — instances can be canonicalized compile-time constants.
  const AudioSyncResult({
    required this.offsetSamples,
    required this.offsetSeconds,
    required this.confidence,
  });
}
/// Estimates the lag of [targetAudio] relative to [referenceAudio]
/// using FFT-based cross-correlation.
///
/// Returns the offset in original-rate samples and in seconds, plus
/// a 0-1 confidence score. The search is limited to
/// +/-[maxOffsetSeconds].
///
/// NOTE(review): assumes both inputs are mono PCM captured at the
/// same [sampleRate] — confirm with the callers. Relies on the
/// sibling helpers fft/ifft/complexMultiplyConjugate/findPeakInRange
/// defined elsewhere in this file.
Future<AudioSyncResult> findAudioOffset({
  required Float64List referenceAudio,
  required Float64List targetAudio,
  required int sampleRate,
  double maxOffsetSeconds = 5.0,
}) async {
  // 1. Downsample for efficiency (12x: 48 kHz -> 4 kHz).
  const downsampleFactor = 12;
  final ref = downsample(referenceAudio, downsampleFactor);
  final tar = downsample(targetAudio, downsampleFactor);
  final dsRate = sampleRate ~/ downsampleFactor;
  // 2. Use only a portion for faster computation.
  final analysisLength = min(dsRate * 10, min(ref.length, tar.length)); // 10 seconds
  final refSegment = Float64List.sublistView(ref, 0, analysisLength);
  final tarSegment = Float64List.sublistView(tar, 0, analysisLength);
  // 3. Zero-pad to the next power of 2: the FFT needs a power-of-two
  //    length, and padding to >= lenA + lenB - 1 makes the circular
  //    correlation equivalent to linear correlation.
  final fftLength = nextPowerOf2(refSegment.length + tarSegment.length - 1);
  final refPadded = zeroPad(refSegment, fftLength);
  final tarPadded = zeroPad(tarSegment, fftLength);
  // 4. FFT both signals.
  final refFFT = fft(refPadded);
  final tarFFT = fft(tarPadded);
  // 5. Multiply reference spectrum by the conjugate of the target
  //    spectrum: R = F^-1(F(x) . F*(y)) is cross-correlation.
  final product = complexMultiplyConjugate(refFFT, tarFFT);
  // 6. Inverse FFT to get the correlation sequence.
  final correlation = ifft(product);
  // 7. Find the peak within the allowed lag range (negative lags
  //    wrap to the tail of the correlation buffer).
  final maxLagSamples = (maxOffsetSeconds * dsRate).round();
  final peakResult = findPeakInRange(
    correlation,
    -maxLagSamples,
    maxLagSamples,
  );
  // 8. Convert the downsampled-rate lag back to original samples.
  //    Resolution is therefore downsampleFactor samples (~0.25 ms
  //    at 48 kHz with factor 12).
  final offsetOriginal = peakResult.index * downsampleFactor;
  final offsetSeconds = offsetOriginal / sampleRate;
  return AudioSyncResult(
    offsetSamples: offsetOriginal,
    offsetSeconds: offsetSeconds,
    confidence: peakResult.normalizedValue,
  );
}
FFT Implementation
Dart doesn't have a built-in FFT. Here's a Cooley-Tukey implementation:
/// Immutable complex number with just the operations the FFT needs.
class Complex {
  final double real;
  final double imag;

  const Complex(this.real, [this.imag = 0]);

  /// The complex conjugate (imaginary part negated).
  Complex get conjugate => Complex(real, -imag);

  /// Euclidean length: sqrt(re^2 + im^2).
  double get magnitude => sqrt(real * real + imag * imag);

  Complex operator +(Complex other) {
    return Complex(real + other.real, imag + other.imag);
  }

  Complex operator -(Complex other) {
    return Complex(real - other.real, imag - other.imag);
  }

  /// Standard complex product: (a+bi)(c+di) = (ac-bd) + (ad+bc)i.
  Complex operator *(Complex other) {
    final re = real * other.real - imag * other.imag;
    final im = real * other.imag + imag * other.real;
    return Complex(re, im);
  }
}
/// Recursive radix-2 Cooley-Tukey FFT of a real-valued [input].
///
/// [input].length must be a power of two (size buffers with
/// [nextPowerOf2] + [zeroPad]). Returns the full complex spectrum of
/// length n. O(n log n) time, but allocates temporaries at every
/// level of recursion — fine for offline sync, not for a real-time
/// path.
///
/// Throws [ArgumentError] for lengths that are not a power of two:
/// previously an odd length silently dropped a sample (n ~/ 2) and
/// produced a wrong spectrum, and n == 0 recursed forever.
List<Complex> fft(Float64List input) {
  final n = input.length;
  // n & (n - 1) == 0 iff n is a power of two (for n > 0).
  if (n == 0 || (n & (n - 1)) != 0) {
    throw ArgumentError.value(
        n, 'input.length', 'FFT length must be a power of two');
  }
  if (n == 1) return [Complex(input[0])];
  // Split into even- and odd-indexed halves.
  final even = Float64List(n ~/ 2);
  final odd = Float64List(n ~/ 2);
  for (int i = 0; i < n ~/ 2; i++) {
    even[i] = input[2 * i];
    odd[i] = input[2 * i + 1];
  }
  // Recurse on each half.
  final evenFFT = fft(even);
  final oddFFT = fft(odd);
  // Butterfly: X[k] = E[k] + w^k O[k], X[k + n/2] = E[k] - w^k O[k],
  // with twiddle w^k = e^(-2*pi*i*k/n).
  final result = List<Complex>.filled(n, Complex(0));
  for (int k = 0; k < n ~/ 2; k++) {
    final angle = -2 * pi * k / n;
    final twiddle = Complex(cos(angle), sin(angle));
    final t = twiddle * oddFFT[k];
    result[k] = evenFFT[k] + t;
    result[k + n ~/ 2] = evenFFT[k] - t;
  }
  return result;
}
/// Inverse FFT via the conjugation identity:
/// ifft(X) = conj(fft(conj(X))) / n.
///
/// Returns only the real part, which is all we need here: the
/// correlation of two real signals is real (any imaginary residue is
/// numerical noise).
///
/// Previously this called `fftComplex`, which was never defined in
/// the article; the private helper below supplies it.
Float64List ifft(List<Complex> input) {
  final n = input.length;
  // Conjugate, forward-transform, conjugate back, scale by 1/n.
  final conjugated = [for (final c in input) c.conjugate];
  final spectrum = _fftComplex(conjugated);
  final result = Float64List(n);
  for (int i = 0; i < n; i++) {
    result[i] = spectrum[i].conjugate.real / n;
  }
  return result;
}

/// Radix-2 Cooley-Tukey FFT for complex-valued input.
///
/// Same structure as [fft] but accepts complex samples. [input]'s
/// length must be a power of two.
List<Complex> _fftComplex(List<Complex> input) {
  final n = input.length;
  if (n == 0 || (n & (n - 1)) != 0) {
    throw ArgumentError.value(
        n, 'input.length', 'FFT length must be a power of two');
  }
  if (n == 1) return [input[0]];
  // Split into even- and odd-indexed halves and recurse.
  final even = <Complex>[for (int i = 0; i < n; i += 2) input[i]];
  final odd = <Complex>[for (int i = 1; i < n; i += 2) input[i]];
  final evenFFT = _fftComplex(even);
  final oddFFT = _fftComplex(odd);
  // Butterfly combine with twiddle factors e^(-2*pi*i*k/n).
  final result = List<Complex>.filled(n, Complex(0));
  for (int k = 0; k < n ~/ 2; k++) {
    final angle = -2 * pi * k / n;
    final t = Complex(cos(angle), sin(angle)) * oddFFT[k];
    result[k] = evenFFT[k] + t;
    result[k + n ~/ 2] = evenFFT[k] - t;
  }
  return result;
}
Finding the Peak
/// Location and strength of a correlation peak.
class PeakResult {
  /// Lag at which the peak was found (may be negative).
  final int index;

  /// Raw correlation value at the peak.
  final double value;

  /// Peak strength relative to the RMS of the scanned range,
  /// clamped to 0-1.
  final double normalizedValue;

  PeakResult(this.index, this.value, this.normalizedValue);
}

/// Scans the circular [correlation] buffer for the strongest value
/// at any lag in [minLag, maxLag] inclusive.
///
/// Negative lags wrap to the tail of the buffer — the standard
/// layout of an FFT-based correlation. The normalized value is the
/// peak divided by 10x the RMS over the scanned window, clamped to
/// [0, 1].
PeakResult findPeakInRange(Float64List correlation, int minLag, int maxLag) {
  final n = correlation.length;
  double maxValue = double.negativeInfinity;
  int maxIndex = 0;
  double sumSquares = 0;
  for (int lag = minLag; lag <= maxLag; lag++) {
    // Negative lags live at the end of the FFT output buffer.
    final index = lag < 0 ? n + lag : lag;
    if (index >= 0 && index < n) {
      final value = correlation[index];
      sumSquares += value * value;
      if (value > maxValue) {
        maxValue = value;
        maxIndex = lag;
      }
    }
  }
  // Normalize: peak relative to RMS over the scanned window.
  final rms = sqrt(sumSquares / (maxLag - minLag + 1));
  // BUG FIX: the else-branch must be the double literal 0.0 — with a
  // bare int 0 the conditional's static type is num, which cannot be
  // implicitly downcast to double under null safety. Likewise,
  // num.clamp returns num, so .toDouble() is required before passing
  // it to the double field.
  final normalized = rms > 0 ? maxValue / (rms * 10) : 0.0;
  return PeakResult(maxIndex, maxValue, normalized.clamp(0.0, 1.0).toDouble());
}
Visualizing the Process
Confidence Scoring
Not all correlations are reliable. Measure confidence:
/// Scores how trustworthy a correlation peak at [peakIndex] is.
///
/// Combines two heuristics: (1) the ratio of the peak to the mean of
/// |correlation|, and (2) the ratio of the main peak to the largest
/// secondary peak more than 100 samples away. The result is roughly
/// in 0-1 for typical signals but is NOT clamped; callers compare it
/// against the thresholds table in this document.
double calculateConfidence(Float64List correlation, int peakIndex) {
  final peakValue = correlation[peakIndex].abs();
  // Method 1: Peak-to-mean ratio of absolute values.
  double sum = 0;
  for (final v in correlation) {
    sum += v.abs();
  }
  final mean = sum / correlation.length;
  final peakToMean = peakValue / mean;
  // Method 2: Peak-to-second-peak ratio; ignore the 100 samples on
  // either side of the main lobe so we don't measure its own skirt.
  double secondPeak = 0;
  for (int i = 0; i < correlation.length; i++) {
    if ((i - peakIndex).abs() > 100) { // Not near main peak
      if (correlation[i].abs() > secondPeak) {
        secondPeak = correlation[i].abs();
      }
    }
  }
  // BUG FIX: 10.0 (not the int literal 10) — otherwise the
  // conditional's static type is num, and a num cannot be returned
  // from a double-returning function under null safety.
  final peakRatio = secondPeak > 0 ? peakValue / secondPeak : 10.0;
  // Combine metrics; the divisors 5 and 3 are empirical scale
  // factors chosen so a "good" match lands near 1.0.
  return ((peakToMean / 5) + (peakRatio / 3)) / 2;
}
Confidence Thresholds
| Confidence | Interpretation |
|---|---|
| > 0.8 | Excellent match |
| 0.5 - 0.8 | Good match |
| 0.3 - 0.5 | Possible match, verify manually |
| < 0.3 | Unreliable, audio may not match |
Handling Edge Cases
1. Very Different Lengths
/// Truncates [a] to the length of the shorter of [a] and [b].
///
/// Call once per signal (swapping the arguments) so both correlation
/// operands cover the same duration. Returns a view over [a]'s
/// buffer, not a copy.
Float64List normalizeLength(Float64List a, Float64List b) {
  final limit = a.length < b.length ? a.length : b.length;
  return Float64List.sublistView(a, 0, limit);
}
2. Silent Audio
/// Whether every sample in [samples] stays strictly below
/// [threshold] in absolute value.
///
/// Used to bail out of sync attempts on effectively-silent audio,
/// where the correlation peak would be meaningless. An empty signal
/// counts as silent.
bool isSilent(Float64List samples, {double threshold = 0.001}) {
  for (final sample in samples) {
    // One loud sample is enough to disqualify — exit early.
    if (sample.abs() >= threshold) return false;
  }
  return true;
}
// Sketch only: '...' stands for the full parameter list of
// findAudioOffset above. The nullable return type signals
// "sync not possible" to callers instead of returning a bogus offset.
Future<AudioSyncResult?> findAudioOffset(...) async {
  if (isSilent(referenceAudio) || isSilent(targetAudio)) {
    return null; // Can't sync silent audio
  }
  // ... (the FFT correlation pipeline shown earlier)
}
3. DC Offset Removal
/// Returns a copy of [samples] with the signal's mean subtracted,
/// centering it around zero.
///
/// A DC bias inflates the correlation at every lag, masking the true
/// peak — strip it before syncing. An empty input yields an empty
/// output.
Float64List removeDCOffset(Float64List samples) {
  // First pass: accumulate the mean.
  var total = 0.0;
  for (final sample in samples) {
    total += sample;
  }
  final mean = total / samples.length;
  // Second pass: re-center every sample.
  final centered = Float64List(samples.length);
  for (var i = 0; i < samples.length; i++) {
    centered[i] = samples[i] - mean;
  }
  return centered;
}
Performance Optimization
Use Isolates for Heavy Work
/// Runs the full offset search on a background isolate so the UI
/// isolate stays responsive during the heavy FFT work.
///
/// NOTE(review): `compute` is Flutter's isolate helper (from
/// package:flutter/foundation.dart) — confirm that import exists in
/// the real file.
Future<AudioSyncResult> findAudioOffsetAsync({
  required String referencePath,
  required String targetPath,
}) async {
  // Only plain strings cross the isolate boundary; the file I/O,
  // decode, and correlation all happen inside the worker isolate.
  return compute(_findOffsetInIsolate, {
    'referencePath': referencePath,
    'targetPath': targetPath,
  });
}
/// Isolate entry point: loads both files, decodes them to samples,
/// and runs the synchronous offset search.
///
/// Synchronous I/O is acceptable here because this never runs on the
/// UI isolate. NOTE(review): extractAudio and findAudioOffsetSync
/// are not shown in this file — presumably defined alongside; verify
/// their contracts.
AudioSyncResult _findOffsetInIsolate(Map<String, String> args) {
  final refBytes = File(args['referencePath']!).readAsBytesSync();
  final tarBytes = File(args['targetPath']!).readAsBytesSync();
  final refSamples = extractAudio(refBytes);
  final tarSamples = extractAudio(tarBytes);
  return findAudioOffsetSync(refSamples, tarSamples);
}
Progress Reporting
/// Emits coarse progress values (0.0-1.0) as the sync pipeline runs.
/// Sketch only: '...' stands for the real parameter list.
///
/// NOTE(review): the stream yields progress but never the computed
/// `result` — callers would need a separate channel (or a stream of
/// progress+result records) to receive the offset. Confirm intent.
Stream<double> findAudioOffsetWithProgress(...) async* {
  yield 0.1; // Loading files
  final ref = await loadAudio(referencePath);
  yield 0.2;
  final tar = await loadAudio(targetPath);
  yield 0.4;
  // Downsample (hard-coded 12x factor, matching findAudioOffset)
  final refDs = downsample(ref, 12);
  yield 0.5;
  final tarDs = downsample(tar, 12);
  yield 0.6;
  // FFT, run via compute to keep the calling isolate responsive
  final refFFT = await compute(fft, refDs);
  yield 0.75;
  final tarFFT = await compute(fft, tarDs);
  yield 0.9;
  // Correlation
  final result = computeCorrelation(refFFT, tarFFT);
  yield 1.0;
}
Integration with Video Sync
/// End-to-end sync: extracts the phone audio from [videoPath], finds
/// its offset against the device recording at [audioPath], and muxes
/// the device audio onto the video at [outputPath].
///
/// Throws an [Exception] when the correlation confidence falls below
/// 0.3 (see the thresholds table above). NOTE(review): the sample
/// rate is hard-coded to 48000 — confirm both sources are really
/// 48 kHz, or read the rate from the WAV headers.
Future<void> syncVideoWithAudio({
  required String videoPath,
  required String audioPath,
  required String outputPath,
}) async {
  // 1. Extract audio from video
  final videoAudioPath = await extractAudioFromVideo(videoPath);
  // 2. Load both audio files
  final videoAudio = await loadWavFile(videoAudioPath);
  final deviceAudio = await loadWavFile(audioPath);
  // 3. Find offset (the video's own audio is the reference timeline)
  final result = await findAudioOffset(
    referenceAudio: videoAudio,
    targetAudio: deviceAudio,
    sampleRate: 48000,
  );
  log('Found offset: ${result.offsetSeconds}s (confidence: ${result.confidence})');
  if (result.confidence < 0.3) {
    throw Exception('Low confidence sync - audio may not match');
  }
  // 4. Combine with offset
  await VideoAudioSyncService.combineVideoAudio(
    videoPath: videoPath,
    audioPath: audioPath,
    outputPath: outputPath,
    offsetSeconds: result.offsetSeconds,
  );
}
Key Takeaways
- Downsample first - Full resolution isn't needed for sync
- FFT is essential - O(n log n) vs O(n²) matters
- Zero-pad for linear correlation - Avoid circular correlation
- Measure confidence - Not all matches are reliable
- Handle edge cases - Silence, DC offset, length mismatch
- Use isolates - Keep UI responsive during heavy computation
Conclusion
Cross-correlation is a powerful technique that transforms a difficult sync problem into a mathematical search for a peak - elegant and reliable when implemented correctly.