@inproceedings{3c1a6b009c9545aab7f1a05b7aa2fbf8,
title = "Noisy audio feature enhancement using audio-visual speech data",
abstract = "We investigate improving automatic speech recognition (ASR) in noisy conditions by enhancing noisy audio features using visual speech captured from the speaker's face. The enhancement is achieved by applying a linear filter to the concatenated vector of noisy audio and visual features, obtained by mean square error estimation of the clean audio features in a training stage. The performance of the enhanced audio features is evaluated on two ASR tasks: A connected digits task and speaker-independent, large-vocabulary, continuous speech recognition. In both cases and at sufficiently low signal-to-noise ratios (SNRs), ASR trained on the enhanced audio features significantly outperforms ASR trained on the noisy audio, achieving for example a 46% relative reduction in word error rate on the digits task at -3.5 dB SNR. However, the method fails to capture the full visual modality benefit to ASR, as demonstrated by its comparison to discriminant audio-visual feature fusion introduced in previous work.",
keywords = "audio-visual speech, noisy audio, feature enhancement",
author = "Roland Goecke and Gerasimos Potamianos and Chalapathy Neti",
year = "2002",
doi = "10.1109/ICASSP.2002.5745030",
language = "English",
isbn = "0-7803-7402-9",
volume = "2",
series = "ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings",
publisher = "IEEE, Institute of Electrical and Electronics Engineers",
pages = "2025--2028",
booktitle = "ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings",
address = "United States",
note = "2002 IEEE International Conference on Acoustics, Speech, and Signal Processing ICASSP 2002, ICASSP 2012 ; Conference date: 12-05-2002 Through 17-05-2002",
}
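
The abstract above describes enhancing noisy audio features by applying a linear filter, estimated in the mean-square-error sense on training data, to the concatenated vector of noisy audio and visual features. The following is a minimal NumPy sketch of that general idea only; the feature dimensions, the bias term, and the synthetic data are illustrative assumptions, not details taken from the paper.

import numpy as np

def train_enhancement_filter(noisy_audio, visual, clean_audio):
    # Learn a linear map W (with a bias column) from the concatenated
    # [noisy audio | visual] feature vector to the clean audio features
    # by minimising the mean square error over the training frames.
    n_frames = noisy_audio.shape[0]
    X = np.hstack([noisy_audio, visual, np.ones((n_frames, 1))])
    W, _, _, _ = np.linalg.lstsq(X, clean_audio, rcond=None)
    return W

def enhance(noisy_audio, visual, W):
    # Apply the learned filter to produce enhanced audio features.
    n_frames = noisy_audio.shape[0]
    X = np.hstack([noisy_audio, visual, np.ones((n_frames, 1))])
    return np.dot(X, W)

# Toy usage with synthetic, correlated features (all dimensions are assumptions).
rng = np.random.default_rng(0)
n, d_audio, d_visual = 2000, 39, 41
clean = rng.standard_normal((n, d_audio))
noisy = clean + 0.8 * rng.standard_normal((n, d_audio))  # additive noise
visual = (np.dot(clean, rng.standard_normal((d_audio, d_visual)))
          + 0.3 * rng.standard_normal((n, d_visual)))    # correlated with clean speech
W = train_enhancement_filter(noisy, visual, clean)
enhanced = enhance(noisy, visual, W)
print("MSE noisy vs clean:   ", np.mean((noisy - clean) ** 2))
print("MSE enhanced vs clean:", np.mean((enhanced - clean) ** 2))

In the paper the enhanced features are then used to train the ASR systems; only the enhancement step is sketched here.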