Books, Chapters, Theses

@book{mandel19,
  editor = {Michael Mandel and Justin Salamon and Daniel P.W. Ellis},
  title = {Proceedings of the Detection and Classification of Acoustic Scenes and Events 2019 Workshop (DCASE2019)},
  address = {New York University, NY, USA},
  year = 2019,
  month = {October},
  isbn = {978-0-578-59596-2},
  doi = {10.33682/1syg-dy60}
}
@incollection{mandel16e,
  author = {Michael I Mandel and Shoko Araki and Tomohiro Nakatani},
  title = {Multichannel clustering and classification approaches},
  chapter = {12},
  booktitle = {Audio Source Separation and Speech Enhancement},
  editor = {Emmanuel Vincent and Tuomas Virtanen and Sharon Gannot},
  year = {2018},
  publisher = {Wiley}
}
@incollection{MandelAndBarker2017,
  author = {Michael I Mandel and Jon P Barker},
  title = {Multichannel spatial clustering using model-based source separation},
  chapter = {3},
  booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
  editor = {Shinji Watanabe and Marc Delcroix and Florian Metze and John R. Hershey},
  year = {2017},
  publisher = {Springer},
  doi = {10.1007/978-3-319-64680-0},
  isbn = {978-3-319-64679-4}
}
@incollection{XiaoEtAl2017,
  author = {Xiong Xiao and Shinji Watanabe and Hakan Erdogan and Michael Mandel and Liang Lu and John R. Hershey and Michael L. Seltzer and Guoguo Chen and Yu Zhang and Dong Yu},
  title = {Discriminative beamforming with phase-aware neural networks for speech enhancement and recognition},
  chapter = {4},
  booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
  editor = {Shinji Watanabe and Marc Delcroix and Florian Metze and John R. Hershey},
  year = {2017},
  publisher = {Springer},
  doi = {10.1007/978-3-319-64680-0},
  isbn = {978-3-319-64679-4}
}
@book{devaney16b,
  title = {Proceedings of the 17th International Society for Music Information Retrieval Conference (ISMIR)},
  year = {2016},
  editor = {Johanna Devaney and Michael I Mandel and Douglas Turnbull and George Tzanetakis},
  isbn = {978-0-692-75506-8},
  address = {New York},
  url = {https://drive.google.com/file/d/0B2SQvWn0_78BaWxUNEdyakROLWM/view?usp=sharing}
}
@incollection{bertin-mahieux09,
  title = {Automatic Tagging of Audio: The State-of-the-Art},
  author = {Thierry Bertin-Mahieux and Douglas Eck and Michael I. Mandel},
  booktitle = {Machine Audition: Principles, Algorithms and Systems},
  editor = {Wenwu Wang},
  publisher = {IGI Publishing},
  year = {2010},
  chapter = {14},
  pages = {334--352}
}
@phdthesis{mandel09c,
  author = {Michael I. Mandel},
  title = {Binaural Model-Based Source Separation and Localization},
  year = {2010},
  month = feb,
  school = {Columbia University},
  department = {Electrical Engineering},
  url = {http://m.mr-pc.org/work/dissertation.pdf},
  abstract = {When listening in noisy and reverberant environments, human listeners are able to focus on a particular sound of interest while ignoring interfering sounds. Computer listeners, however, can only perform highly constrained versions of this task. While automatic speech recognition systems and hearing aids work well in quiet conditions, source separation is necessary for them to be able to function in these challenging situations.

This dissertation introduces a system that separates more than two sound sources from reverberant, binaural mixtures based on the sources' locations. Each source is modelled probabilistically using information about its interaural time and level differences at every frequency, with parameters learned using an expectation maximization (EM) algorithm. The system is therefore called Model-based EM Source Separation and Localization (MESSL). This EM algorithm alternates between refining its estimates of the model parameters (location) for each source and refining its estimates of the regions of the spectrogram dominated by each source. In addition to successfully separating sources, the algorithm estimates model parameters from a mixture that have direct psychoacoustic relevance and can usually only be measured for isolated sources. One of the key features enabling this separation is a novel probabilistic localization model that can be evaluated at individual time-frequency points and over arbitrarily-shaped regions of the spectrogram.

The localization performance of the systems introduced here is comparable to that of humans in both anechoic and reverberant conditions, with a 40\% lower mean absolute error than four comparable algorithms. When target and masker sources are mixed at similar levels, MESSL's separations have signal-to-distortion ratios 2.0 dB higher than four comparable separation algorithms and estimated speech quality 0.19 mean opinion score units higher. When target and masker sources are mixed anechoically at very different levels, MESSL's performance is comparable to humans', but in similar reverberant mixtures it only achieves 20--25\% of human performance. While MESSL successfully rejects enough of the direct-path portion of the masking source in reverberant mixtures to improve energy-based signal-to-noise ratio results, it has difficulty rejecting enough reverberation to improve automatic speech recognition results significantly. This problem is shared by other comparable separation systems.}
}

Journal

@article{TrinhAndMandel2020b,
  title = {Directly comparing the listening strategies of humans and machines},
  author = {Viet Anh Trinh and Michael I Mandel},
  journal = { {IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2021},
  doi = {10.1109/TASLP.2020.3040545},
  volume = {29},
  pages = {312--323},
  abstract = {Automatic speech recognition (ASR) has reached human performance on many clean speech corpora, but it remains worse than human listeners in noisy environments. This paper investigates whether this difference in performance might be due to a difference in the time-frequency regions that each listener utilizes in making their decisions and how these ``important'' regions change for ASRs using different acoustic models (AMs) and language models (LMs). We define important regions as time-frequency points in a spectrogram that tend to be audible when the listener correctly recognizes that utterance in noise. The evidence from this study indicates that a neural network AM attends to regions that are more similar to those of humans (capturing certain high-energy regions) than those of a traditional Gaussian mixture model (GMM) AM. Our analysis also shows that the neural network AM has not yet captured all the cues that human listeners utilize, such as certain transitions between silence and high speech energy. We also find that differences in important time-frequency regions tend to track differences in accuracy on specific words in a test sentence, suggesting a connection. Because of this connection, adapting an ASR to attend to the same regions humans use might improve its generalization in noise.}
}
@article{mandelEtAl2019,
  title = {The Bubble-Noise Technique for Speech Perception Research},
  author = {Michael I Mandel and Vikas Grover and Mengxuan Zhao and Jiyoung Choi and Valerie Shafer},
  journal = {Perspectives of the ASHA Special Interest Groups},
  year = {2019},
  doi = {10.1044/2019_PERS-19-00058},
  volume = {4},
  number = {6},
  pages = {1653--1666},
  abstract = {Purpose: The ``bubble noise'' technique has recently been introduced as a method to identify
the regions in time-frequency maps (that is, spectrograms) of speech that are especially
important for listeners in speech recognition. This technique identifies regions of ``importance''
that are specific to the speech stimulus and the listener, thus permitting these regions to be
compared across different listener groups. For example, in cross-linguistic and second language
(L2) speech perception, this method identifies differences in regions of importance in
accomplishing decisions of phoneme category membership. This paper describes the
application of bubble noise to the study of language learning for three different language pairs:
Hindi-English bilinguals' perception of the /v/-/w/ contrast in American English, native English
speakers' perception of the tense/lax contrast for Korean fricatives and affricates, and native
English speakers' perception of Mandarin lexical tone.
Conclusion: We demonstrate that this technique provides insight on what information in the
speech signal is important for native/first-language listeners compared to non-native/L2
listeners. Furthermore, the method can be used to examine whether L2 speech perception
training is effective in bringing the listener's attention to the important cues.}
}
@article{mandel16c,
  author = {Michael I Mandel and Sarah E Yoho and Eric W Healy},
  title = {Measuring time-frequency importance functions of speech with bubble noise},
  journal = {Journal of the Acoustical Society of America},
  year = {2016},
  volume = {140},
  number = {4},
  pages = {2542--2553},
  doi = {10.1121/1.4964102},
  url = {http://m.mr-pc.org/work/jasa16.pdf},
  abstract = {Listeners can reliably perceive speech in noisy conditions, but it is not well understood what specific features of speech they use to do this. This paper introduces a data-driven framework to identify the time-frequency locations of these features. Using the same speech utterance mixed with many different noise instances, the framework is able to compute the importance of each time-frequency point in the utterance to its intelligibility. The mixtures have approximately the same global signal-to-noise ratio at each frequency, but very different recognition rates. The difference between these intelligible vs unintelligible mixtures is the alignment between the speech and spectro-temporally modulated noise, providing different combinations of “glimpses” of speech in each mixture. The current results reveal the locations of these important noise-robust phonetic features in a restricted set of syllables. Classification models trained to predict whether individual mixtures are intelligible based on the location of these glimpses can generalize to new conditions, successfully predicting the intelligibility of novel mixtures. They are able to generalize to novel noise instances, novel productions of the same word by the same talker, novel utterances of the same word spoken by different talkers, and, to some extent, novel consonants.},
  code = {http://github.com/mim/auditoryBubbles}
}
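
A rough illustration of the per-point analysis this abstract describes (an assumed sketch with made-up variable names, not the interface of the auditoryBubbles toolbox linked in the code field above): importance can be estimated as the correlation, across many bubble-noise mixtures of one utterance, between whether each time-frequency point was audible and whether that mixture was recognized correctly.

import numpy as np

def tf_importance(audible, correct):
    """Point-biserial correlation between per-mixture audibility and correctness.

    audible: (n_mixtures, n_freq, n_time) boolean array, True where the speech
             at that time-frequency point was audible through the bubble noise
             in that mixture (an assumed input representation).
    correct: (n_mixtures,) boolean array, True if the listener (or recognizer)
             identified that mixture correctly.
    Returns an (n_freq, n_time) importance map; large positive values mark
    points whose audibility co-occurs with correct recognition.
    """
    a = audible.astype(float)
    c = correct.astype(float)
    a_centered = a - a.mean(axis=0)
    c_centered = c - c.mean()
    cov = np.einsum('nft,n->ft', a_centered, c_centered) / len(c)
    denom = a.std(axis=0) * c.std() + 1e-12
    return cov / denom

The paper's classification models go further, predicting the intelligibility of held-out mixtures from glimpse locations; the map above corresponds only to the per-point importance analysis.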
@article{larochelle12,
  title = {Learning Algorithms for the Classification Restricted Boltzmann Machine},
  author = {Hugo Larochelle and Michael I Mandel and Razvan Pascanu and Yoshua Bengio},
  volume = {13},
  pages = {643--669},
  year = {2012},
  journal = {Journal of Machine Learning Research},
  month = mar,
  url = {http://www.jmlr.org/papers/volume13/larochelle12a/larochelle12a.pdf},
  abstract = {Recent developments have demonstrated the capacity of restricted
Boltzmann machines (RBM) to be powerful generative models, able to
extract useful features from input data or construct deep artificial
neural networks. In such settings, the RBM only yields a preprocessing
or an initialization for some other model, instead of acting as a
complete supervised model in its own right. In this paper, we argue
that RBMs can provide a self-contained framework for developing
competitive classifiers. We study the Classification RBM (ClassRBM), a
variant on the RBM adapted to the classification setting. We study
different strategies for training the ClassRBM and show that
competitive classification performances can be reached when
appropriately combining discriminative and generative training
objectives. Since training according to the generative objective
requires the computation of a generally intractable gradient, we also
compare different approaches to estimating this gradient and address
the issue of obtaining such a gradient for problems with very high
dimensional inputs. Finally, we describe how to adapt the ClassRBM to
two special cases of classification problems, namely semi-supervised
and multitask learning.}
}
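
To make the model this abstract refers to concrete, here is the Classification RBM written in the form it usually takes (a sketch in my own notation, not copied from the paper). For an input vector $\mathbf{x}$, a one-hot label vector $\mathbf{e}_y$, and binary hidden units $\mathbf{h}$, the ClassRBM defines

\[
E(y, \mathbf{x}, \mathbf{h}) = -\mathbf{h}^\top W \mathbf{x} - \mathbf{b}^\top \mathbf{x} - \mathbf{c}^\top \mathbf{h} - \mathbf{d}^\top \mathbf{e}_y - \mathbf{h}^\top U \mathbf{e}_y,
\qquad
p(y, \mathbf{x}) \propto \sum_{\mathbf{h}} e^{-E(y, \mathbf{x}, \mathbf{h})}.
\]

The discriminative objective $-\log p(y \mid \mathbf{x})$ is tractable because the sum over $\mathbf{h}$ factorizes over hidden units; the generative objective $-\log p(y, \mathbf{x})$ involves the generally intractable gradient mentioned in the abstract; and the hybrid strategies the paper studies weight the two, e.g. $-\log p(y \mid \mathbf{x}) - \alpha \log p(y, \mathbf{x})$.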
@article{weiss11,
  title = {Combining localization cues and source model constraints for binaural source separation},
  author = {Ron Weiss and Michael I. Mandel and Daniel P. W. Ellis},
  journal = {Speech Communication},
  year = {2011},
  month = may,
  volume = {53},
  number = {5},
  pages = {606--621},
  doi = {10.1016/j.specom.2011.01.003},
  url = {http://www.ee.columbia.edu/~dpwe/pubs/WeissME11-messlev.pdf},
  abstract = {We describe a system for separating multiple sources from a two-channel recording based on interaural cues and prior knowledge of the statistics of the underlying source signals. The proposed algorithm effectively combines information derived from low level perceptual cues, similar to those used by the human auditory system, with higher level information related to speaker identity. We combine a probabilistic model of the observed interaural level and phase differences with a prior model of the source statistics and derive an EM algorithm for finding the maximum likelihood parameters of the joint model. The system is able to separate more sound sources than there are observed channels in the presence of reverberation. In simulated mixtures of speech from two and three speakers the proposed algorithm gives a signal-to-noise ratio improvement of 1.7 dB over a baseline algorithm which uses only interaural cues. Further improvement is obtained by incorporating eigenvoice speaker adaptation to enable the source model to better match the sources present in the signal. This improves performance over the baseline by 2.7 dB when the speakers used for training and testing are matched. However, the improvement is minimal when the test data is very different from that used in training.}
}
@article{mandel11b,
  title = {Contextual tag inference},
  author = {Michael I.~Mandel and Razvan Pascanu and Douglas Eck and Yoshua Bengio and Luca M.~Aiello and Rossano Schifanella and Filippo Menczer},
  journal = { {ACM} Transactions on Multimedia Computing, Communications and Applications},
  year = {2011},
  month = oct,
  doi = {10.1145/2037676.2037689},
  volume = {7S},
  number = {1},
  articleno = {32},
  pages = {32:1--32:18},
  acmid = {2037689},
  publisher = {ACM},
  address = {New York, NY, USA},
  url = {http://m.mr-pc.org/work/tomccap11.pdf},
  abstract = {This paper examines the use of two kinds of context to improve the results of content-based music taggers: the relationships between tags and between the clips of songs that are tagged.  We show that users agree more on tags applied to clips temporally "closer" to one another; that conditional restricted Boltzmann machine models of tags can more accurately predict related tags when they take context into account; and that when training data is "smoothed" using context, support vector machines can better rank these clips according to the original, unsmoothed tags and do this more accurately than three standard multi-label classifiers.}
}
@article{devaney12a,
  author = {Johanna Devaney and Michael I. Mandel and Daniel P. W. Ellis and Ichiro Fujinaga},
  year = {2012},
  title = {Automatically extracting performance data from recordings of trained singers},
  journal = {Psychomusicology: Music, Mind \& Brain},
  volume = {21},
  number = {1--2},
  pages = {108--136},
  url = {http://music.mcgill.ca/~devaney/files/devaney11automatically.pdf},
  abstract = {Recorded music offers a wealth of information for studying performance practice. This paper examines the challenges of automatically extracting performance information from audio recordings of the singing voice and discusses our technique for automatically extracting information such as note timings, intonation, vibrato rates, and dynamics. An experiment is also presented that focuses on the tuning of semitones in solo soprano performances of Schubert's ``Ave Maria'' by non-professional and professional singers. We found a small decrease in size of intervals with a leading tone function only in the non-professional group.}
}
@article{mandel10a,
  title = {Evaluating source separation algorithms with reverberant speech},
  author = {Michael I. Mandel and Scott Bressler and Barbara Shinn-Cunningham and Daniel P. W. Ellis},
  journal = { {IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2010},
  url = {http://m.mr-pc.org/work/taslp10b.pdf},
  volume = {18},
  number = {7},
  pages = {1872--1883},
  doi = {10.1109/TASL.2010.2052252},
  abstract = {This paper examines the performance of several source separation
systems on a speech separation task for which human intelligibility
has previously been measured.  For anechoic mixtures, automatic speech
recognition (ASR) performance on the separated signals is quite
similar to human performance.  In reverberation, however, while signal
separation has some benefit for ASR, the results are still far below
those of human listeners facing the same task.  Performing this same
experiment with a number of oracle masks created with \emph{a priori}
knowledge of the separated sources motivates a new objective measure
of separation performance, the DERTM (Direct-path, Early echo, and
Reverberation, of the Target and Masker), which is closely related to
the ASR results.  This measure indicates that while the non-oracle
algorithms successfully reject the direct-path signal from the masking
source, they reject less of its reverberation, explaining the
disappointing ASR performance.}
}
@article{mandel09a,
  title = {Model-based expectation maximization source separation and localization},
  author = {Michael I. Mandel and Ron J. Weiss and Daniel P. W. Ellis},
  journal = { {IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2010},
  month = feb,
  volume = {18},
  number = {2},
  pages = {382--394},
  url = {http://m.mr-pc.org/work/taslp10.pdf},
  doi = {10.1109/TASL.2009.2029711},
  abstract = {This paper describes a system, referred to as model-based expectation-maximization source separation and localization (MESSL), for separating and localizing multiple sound sources from an underdetermined reverberant two-channel recording. By clustering individual spectrogram points based on their interaural phase and level differences, MESSL generates masks that can be used to isolate individual sound sources. We first describe a probabilistic model of interaural parameters that can be evaluated at individual spectrogram points. By creating a mixture of these models over sources and delays, the multi-source localization problem is reduced to a collection of single source problems. We derive an expectation-maximization algorithm for computing the maximum-likelihood parameters of this mixture model, and show that these parameters correspond well with interaural parameters measured in isolation. As a byproduct of fitting this mixture model, the algorithm creates probabilistic spectrogram masks that can be used for source separation. In simulated anechoic and reverberant environments, separations using MESSL produced on average a signal-to-distortion ratio 1.6 dB greater and perceptual evaluation of speech quality (PESQ) results 0.27 mean opinion score units greater than four comparable algorithms.}
}
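
A compact picture of the model structure this abstract describes (the symbols below are mine; see the paper for the exact parameterization): each spectrogram point's interaural phase difference $\phi_{\omega t}$ and level difference $\ell_{\omega t}$ are evaluated under per-source, per-delay Gaussians, and the mixture posteriors serve as probabilistic masks,

\[
p(\phi_{\omega t}, \ell_{\omega t}) = \sum_{i,\tau} \psi_{i\tau}\,
\mathcal{N}\!\left(\hat{\phi}_{\omega t}(\tau);\, \xi_{i\tau}(\omega), \sigma_{i\tau}^2(\omega)\right)
\mathcal{N}\!\left(\ell_{\omega t};\, \mu_i(\omega), \eta_i^2(\omega)\right),
\qquad
M_i(\omega, t) = \sum_{\tau} p\!\left(i, \tau \mid \phi_{\omega t}, \ell_{\omega t}\right),
\]

where $\hat{\phi}_{\omega t}(\tau)$ is the phase residual after removing the delay-predicted phase $\omega\tau$, $i$ indexes sources, and $\tau$ indexes candidate delays. EM alternates between computing the posteriors that define the masks $M_i$ (E-step) and re-estimating the per-source parameters $\psi$, $\xi$, $\sigma$, $\mu$, $\eta$ (M-step).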
@article{mandel08b,
  title = {A Web-Based Game for Collecting Music Metadata},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  journal = {Journal of New Music Research},
  year = {2008},
  volume = {37},
  number = {2},
  pages = {151--165},
  url = {http://m.mr-pc.org/work/jnmr08.pdf},
  doi = {10.1080/09298210802479300},
  abstract = {We have designed a web-based game, MajorMiner, that makes collecting descriptions of musical excerpts fun, easy, useful, and objective. Participants describe 10 second clips of songs and score points when their descriptions match those of other participants. The rules were designed to encourage players to be thorough and the clip length was chosen to make judgments objective and specific. To analyse the data, we measured the degree to which binary classifiers could be trained to spot popular tags. We also compared the performance of clip classifiers trained with MajorMiner's tag data to those trained with social tag data from a popular website. On the top 25 tags from each source, MajorMiner's tags were classified correctly 67.2\% of the time, while the social tags were classified correctly 62.6\% of the time.}
}
@article{huang08,
  title = {Active Learning for Interactive Multimedia Retrieval},
  author = {Thomas S. Huang and Charlie K. Dagli and 
             Shyamsundar Rajaram and Edward Y. Chang and 
             Michael I. Mandel and Graham E. Poliner and Daniel P. W. Ellis},
  journal = {Proceedings of the {IEEE}},
  pages = {648--667},
  volume = {96},
  number = {4},
  year = {2008},
  doi = {10.1109/JPROC.2008.916364},
  abstract = {As the first decade of the 21st century comes to a close, growth in multimedia delivery infrastructure and public demand for applications built on this backbone are converging like never before. The push towards reaching truly interactive multimedia technologies becomes stronger as our media consumption paradigms continue to change. In this paper, we profile a technology leading the way in this revolution: active learning. Active learning is a strategy that helps alleviate challenges inherent in multimedia information retrieval through user interaction. We show how active learning is ideally suited for the multimedia information retrieval problem by giving an overview of the paradigm and component technologies used with special attention given to the application scenarios in which these technologies are useful. Finally, we give insight into the future of this growing field and how it fits into the larger context of multimedia information retrieval.}
}
@article{mandel06b,
  author = {Michael I. Mandel and Graham E. Poliner and Daniel P. W. Ellis},
  title = {Support vector machine active learning for music retrieval},
  journal = {Multimedia Systems},
  pages = {1--11},
  year = {2006},
  month = aug,
  volume = {12},
  number = {1},
  url = {http://m.mr-pc.org/work/mmsj05.pdf},
  doi = {10.1007/s00530-006-0032-2},
  abstract = {Searching and organizing growing digital music collections requires a computational model of music similarity. This paper describes a system for performing flexible music similarity queries using SVM active learning. We evaluated the success of our system by classifying 1210 pop songs according to mood and style (from an online music guide) and by the performing artist. In comparing a number of representations for songs, we found the statistics of mel-frequency cepstral coefficients to perform best in precision-at-20 comparisons. We also show that by choosing training examples intelligently, active learning requires half as many labeled examples to achieve the same accuracy as a standard scheme.}
}
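
Because the abstract only summarizes the selection strategy, here is a generic margin-based uncertainty-sampling loop of the kind it describes, written for a binary relevance query (this uses scikit-learn and made-up names purely for illustration; it is not the paper's system):

import numpy as np
from sklearn.svm import SVC

def svm_active_learning(X_pool, y_pool, seed_idx, n_rounds=10, batch=5):
    """Uncertainty sampling for a binary relevance task: repeatedly ask for
    labels on the pool items closest to the SVM decision boundary.

    X_pool:   (n_clips, n_features) song-level features.
    y_pool:   (n_clips,) relevance labels; stands in for the human labeler.
    seed_idx: indices of an initial labeled set containing both classes.
    """
    labeled = list(seed_idx)
    clf = SVC(kernel='rbf', gamma='scale')
    for _ in range(n_rounds):
        clf.fit(X_pool[labeled], y_pool[labeled])
        margins = np.abs(clf.decision_function(X_pool))  # distance to boundary
        margins[labeled] = np.inf                        # skip labeled items
        query = np.argsort(margins)[:batch]              # most uncertain clips
        labeled.extend(query.tolist())                   # "ask the user" here
    return clf, labeled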

Conference

@inproceedings{CobanEtAl2023,
  author = {Enis Berk \c{C}oban and Megan Perra and Michael I Mandel},
  title = {Towards high resolution weather monitoring with sound data},
  year = {2024},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  note = {To appear}
}
@inproceedings{SyedAndMandel2023,
  author = {Ali Raza Syed and Michael I Mandel},
  title = {Estimating Shapley values of training utterances for automatic speech recognition models},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  year = {2023}
}
@inproceedings{TrinhEtAl2022,
  author = {Viet Anh Trinh and Hassan Salami Kavaki and Michael I Mandel},
  title = {ImportantAug: a data augmentation agent for speech},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  year = 2022
}
@inproceedings{CobanEtAl2022,
  author = {Enis Berk \c{C}oban and Megan Perra and Dara Pir and Michael I Mandel},
  title = {EDANSA-2019: The ecoacoustic dataset from Arctic North Slope Alaska},
  booktitle = {Workshop on the Detection and Classification of Acoustic Scenes and Events},
  year = {2022}
}
@inproceedings{CobanEtAl2021,
  title = {Towards Large Scale Ecoacoustic Monitoring With Small Amounts of Labeled Data},
  author = {Enis Berk \c{C}oban and Ali R Syed and Dara Pir and Michael I Mandel},
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  year = {2021}
}
@inproceedings{NiEtAl2020,
  title = { {WPD}++: an improved neural beamformer for simultaneous speech separation and dereverberation},
  author = {Zhaoheng Ni and Yong Xu and Meng Yu and Bo Wu and Shixiong Zhang and Dong Yu and Michael I Mandel},
  booktitle = { {IEEE} Spoken Language Technology Workshop},
  year = {2020}
}
@inproceedings{KavakiAndMandel2020,
  title = {Identifying Important Time-frequency Locations in Continuous Speech Utterances},
  author = {Hassan Salami Kavaki and Michael I Mandel},
  booktitle = {Proceedings of Interspeech},
  year = {2020},
  pages = {1639--1643},
  url = {https://isca-speech.org/archive/Interspeech_2020/pdfs/2637.pdf},
  doi = {10.21437/Interspeech.2020-2637},
  abstract = {Human listeners use specific cues to recognize speech and recent experiments have shown that certain time-frequency regions of individual utterances are more important to their correct identification than others. A model that could identify such cues or regions from clean speech would facilitate speech recognition and speech enhancement by focusing on those important regions. Thus, in this paper we present a model that can predict the regions of individual utterances that are important to an automatic speech recognition (ASR) "listener" by learning to add as much noise as possible to these utterances while still permitting the ASR to correctly identify them. This work utilizes a continuous speech recognizer to recognize multi-word utterances and builds upon our previous work that performed the same process for an isolated word recognizer. Our experimental results indicate that our model can apply noise to obscure 90.5\% of the spectrogram while leaving recognition performance nearly unchanged.
}
}
@inproceedings{TrinhAndMandel2020,
  author = {Viet Anh Trinh and Michael I. Mandel},
  title = {Large Scale Evaluation of Importance Maps in Automatic Speech Recognition},
  year = 2020,
  booktitle = {Proceedings of Interspeech},
  pages = {1166--1170},
  doi = {10.21437/Interspeech.2020-2883},
  url = {https://www.isca-speech.org/archive/Interspeech_2020/pdfs/2883.pdf},
  abstract = {This paper proposes a metric that we call the structured saliency benchmark (SSBM) to evaluate importance maps computed for automatic speech recognizers on individual utterances. These maps indicate time-frequency points of the utterance that are most important for correct recognition of a target word. Our evaluation technique is not only suitable for standard classification tasks, but is also appropriate for structured prediction tasks like sequence-to-sequence models. Additionally, we use this approach to perform a comparison of the importance maps created by our previously introduced technique using “bubble noise” to identify important points through correlation with a baseline approach based on smoothed speech energy and forced alignment. Our results show that the bubble analysis approach is better at identifying important speech regions than this baseline on 100 sentences from the AMI corpus.}
}
@inproceedings{GhalyAndMandel2020,
  title = {Using Prosody to Improve Dependency Parsing},
  author = {Hussein Ghaly and Michael I Mandel},
  booktitle = {Speech Prosody},
  year = {2020}
}
@inproceedings{CobanEtAl2020,
  author = {Enis Berk \c{C}oban and Dara Pir and Richard So and Michael I Mandel},
  title = {Transfer learning from YouTube soundtracks to tag Arctic ecoacoustic recordings},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  year = {2020},
  url = {http://m.mr-pc.org/work/icassp20coban.pdf},
  abstract = {Sound provides a valuable tool for long-term monitoring of sensitive animal habitats at a spatial scale larger than camera traps or field observations, while also providing more details than satellite imagery. Currently, the ability to collect such recordings outstrips the ability to analyze them manually, necessitating the development of automatic analysis methods.  While several datasets and models of large corpora of video soundtracks have recently been released, it is not clear to what extent these models will generalize to environmental recordings and the scientific questions of interest in analyzing them. This paper investigates this generalization in several ways and finds that models themselves display limited performance, however, their intermediate representations can be used to train successful models on small sets of labeled data. 
},
  pages = {726--730},
  doi = {10.1109/ICASSP40776.2020.9053338}
}
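
The recipe in this abstract, i.e., freezing a model pretrained on large soundtrack corpora and training a small classifier on its intermediate representations, reduces to a few lines; the sketch below is generic, with embed_fn standing in for whichever pretrained embedding model is used (it is not the paper's code):

import numpy as np
from sklearn.linear_model import LogisticRegression

def train_on_embeddings(embed_fn, clips, labels):
    """Fit a small classifier on frozen pretrained-audio embeddings.

    embed_fn: placeholder for a pretrained model's embedding function,
              mapping a waveform to a fixed-size feature vector.
    clips:    list of 1-D numpy waveforms (the small labeled set).
    labels:   per-clip labels for the ecoacoustic categories of interest.
    """
    X = np.stack([embed_fn(clip) for clip in clips])
    return LogisticRegression(max_iter=1000).fit(X, labels)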
@inproceedings{MaitiAndMandel2019c,
  author = {Soumi Maiti and Michael I Mandel},
  title = {Speaker independence of neural vocoders and their effect on parametric resynthesis speech enhancement},
  year = {2020},
  eprint = {1911.06266},
  eprinttype = {arxiv},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  http = {http://mr-pc.org/work/icassp20/},
  url = {http://m.mr-pc.org/work/icassp20maiti.pdf},
  slides = {http://m.mr-pc.org/work/icassp20maitiSlides.pdf},
  abstract = {Traditional speech enhancement systems produce speech with compromised quality.  Here we propose to use the high quality speech generation capability of neural vocoders for better quality speech enhancement. We term this \emph{parametric resynthesis} (PR). In previous work, we showed that PR systems generate high quality speech for a single speaker using two neural vocoders, WaveNet and WaveGlow. Both these vocoders are traditionally speaker dependent. Here we first show that when trained on data from enough speakers, these vocoders can generate speech from unseen speakers, both male and female, with similar quality as seen speakers in training. 
Next using these two vocoders and a new vocoder LPCNet, we evaluate the noise reduction quality of PR on unseen speakers and show that objective signal and overall quality is higher than the state-of-the-art speech enhancement systems Wave-U-Net, Wavenet-denoise, and SEGAN.  Moreover, in subjective quality, multiple-speaker PR out-performs the oracle Wiener mask.},
  pages = {206--210},
  doi = {10.1109/ICASSP40776.2020.9053296}
}
@inproceedings{NiAndMandel2019,
  author = {Zhaoheng Ni and Michael I Mandel},
  title = {Mask-dependent Phase Estimation for Monaural Speaker Separation},
  year = {2020},
  eprint = {1911.02746},
  eprinttype = {arxiv},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  abstract = {Speaker separation refers to isolating speech of interest in a multi-talker environment. Most methods apply real-valued Time-Frequency (T-F) masks to the mixture Short-Time Fourier Transform (STFT) to reconstruct the clean speech. Hence there is an unavoidable mismatch between the phase of the reconstruction and the original phase of the clean speech. In this paper, we propose a simple yet effective phase estimation network that predicts the phase of the clean speech based on a T-F mask predicted by a chimera++ network. To overcome the label-permutation problem for both the T-F mask and the phase, we propose a mask-dependent permutation invariant training (PIT) criterion to select the phase signal based on the loss from the T-F mask prediction. We also propose an Inverse Mask Weighted Loss Function for phase prediction to focus the model on the T-F regions in which the phase is more difficult to predict. Results on the WSJ0-2mix dataset show that the phase estimation network achieves comparable performance to models that use iterative phase reconstruction or end-to-end time-domain loss functions, but in a more straightforward manner.},
  url = {http://m.mr-pc.org/work/icassp20ni.pdf}
}
@inproceedings{MaitiAndMandel2019b,
  title = {Parametric Resynthesis with Neural Vocoders},
  author = {Soumi Maiti and Michael I Mandel},
  year = 2019,
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  doi = {10.1109/WASPAA.2019.8937165},
  pages = {303--307},
  issn = {1931-1168},
  eprint = {1906.06762},
  eprinttype = {arxiv},
  arxiv = {1906.06762},
  http = {http://mr-pc.org/work/waspaa19/},
  url = {http://m.mr-pc.org/work/waspaa19.pdf},
  abstract = {Noise suppression systems generally produce output speech with compromised quality. We propose to utilize the high quality speech generation capability of neural vocoders for noise suppression. We use a neural network to predict clean mel-spectrogram features from noisy speech and then compare two neural vocoders, WaveNet and WaveGlow, for synthesizing clean speech from the predicted mel spectrogram. Both WaveNet and WaveGlow achieve better subjective and objective quality scores than the source separation model Chimera++. Further, WaveNet and WaveGlow also achieve significantly better subjective quality ratings than the oracle Wiener mask. Moreover, we observe that between WaveNet and WaveGlow, WaveNet achieves the best subjective quality scores, although at the cost of much slower waveform generation.}
}
@inproceedings{MaitiAndMandel2019,
  title = {Speech denoising by parametric resynthesis},
  author = {Soumi Maiti and Michael I Mandel},
  year = 2019,
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  doi = {10.1109/ICASSP.2019.8683130},
  pages = {6995--6999},
  url = {http://m.mr-pc.org/work/icassp19.pdf},
  poster = {http://m.mr-pc.org/work/icassp19poster.pdf},
  http = {http://mr-pc.org/work/icassp19/},
  abstract = {This work proposes the use of clean speech vocoder parameters as the target for a neural network performing speech enhancement. These parameters have been designed for text-to-speech synthesis so that they both produce high-quality resyntheses and also are straightforward to model with neural networks, but have not been utilized in speech enhancement until now.  In comparison to a matched text-to-speech system that is given the ground truth transcripts of the noisy speech, our model is able to produce more natural speech because it has access to the true prosody in the noisy speech. In comparison to two denoising systems, the oracle Wiener mask and a DNN-based mask predictor, our model equals the oracle Wiener mask in subjective quality and intelligibility and surpasses the realistic system.  A vocoder-based upper bound shows that there is still room for improvement with this approach beyond the oracle Wiener mask. We test speaker-dependence with two speakers and show that a single model can be used for multiple speakers.}
}
@inproceedings{TrinhEtAl2018,
  title = {Bubble cooperative networks for identifying important speech cues},
  author = {Viet~Anh Trinh and Brian McFee and Michael I Mandel},
  year = 2018,
  booktitle = {Proceedings of Interspeech},
  url = {http://m.mr-pc.org/work/interspeech18trinh.pdf},
  poster = {http://m.mr-pc.org/work/interspeech18trinhPoster.pdf},
  pages = {1616--1620},
  doi = {10.21437/Interspeech.2018-2377},
  abstract = {Predicting the intelligibility of noisy recordings is difficult and most current algorithms treat all speech energy as equally important to intelligibility. Our previous work on human perception used a listening test paradigm and correlational analysis to show that some energy is more important to intelligibility than other energy. In this paper, we propose a system called the Bubble Cooperative Network (BCN), which aims to predict important areas of individual utterances directly from clean speech. Given such a prediction, noise is added to the utterance in unimportant regions and then presented to a recognizer.  The  BCN is trained with a loss that encourages it to add as much noise as possible while preserving recognition performance, encouraging it to identify important regions precisely and place the noise everywhere else.  Empirical evaluation shows that the BCN can obscure 97.7\% of the spectrogram with noise while maintaining recognition accuracy for a simple speech recognizer that compares a noisy test utterance with a clean reference utterance.  The masks predicted by a single BCN on several utterances show patterns that are similar to analyses derived from human listening tests that analyze each utterance separately, while exhibiting better generalization and less context-dependence than previous approaches.}
}
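
One plausible way to write the loss this abstract describes in words (symbols assumed here, not taken from the paper): with a frozen recognizer $f$, noise $\mathbf{n}$, and a predicted mask $\mathbf{m} \in [0,1]^{F \times T}$ controlling where noise is admitted,

\[
\mathcal{L}(\mathbf{m}) = \mathcal{L}_{\text{rec}}\big(f(\mathbf{x} + \mathbf{m} \odot \mathbf{n}),\, y\big) \;-\; \lambda\, \|\mathbf{m}\|_1,
\]

so that minimizing $\mathcal{L}$ pushes the network to cover as much of the spectrogram with noise as possible (the $-\lambda\|\mathbf{m}\|_1$ term) while keeping the recognizer's output correct (the $\mathcal{L}_{\text{rec}}$ term); the regions left clean are then read off as the important cues.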
@inproceedings{SyedEtAl2018,
  author = {Ali Raza Syed and Viet~Anh Trinh and Michael I.~Mandel},
  title = {Concatenative resynthesis with improved training signals for speech enhancement},
  booktitle = {Proceedings of Interspeech},
  year = 2018,
  url = {http://m.mr-pc.org/work/interspeech18syed.pdf},
  poster = {http://m.mr-pc.org/work/interspeech18syedPoster.pdf},
  pages = {1195--1199},
  doi = {10.21437/Interspeech.2018-2439},
  abstract = {Noise reduction in speech signals remains an important area of research with potential for high impact in speech processing domains such as voice communication and hearing prostheses.  We extend and demonstrate significant improvements to our previous work in synthesis-based speech enhancement, which performs concatenative resynthesis of speech signals for the production of noiseless, high quality speech.  Concatenative resynthesis methods perform unit selection through learned non-linear similarity functions between short chunks of clean and noisy signals.  These mappings are learned using deep neural networks (DNN) trained to predict high similarity for the exact chunk of speech that is contained within a chunk of noisy speech, and low similarity for all other pairings.  We find here that more robust mappings can be learned with a more efficient use of the available data by selecting pairings that are not exact matches, but contain similar clean speech that matches the original in terms of acoustic, phonetic, and prosodic content.  The resulting output is evaluated on the small vocabulary CHiME2-GRID corpus and outperforms our original baseline system in terms of intelligibility by combining phonetic similarity with similarity of acoustic intensity, fundamental frequency, and periodicity.}
}
@inproceedings{MaitiEtAl2018,
  author = {Soumi Maiti and Joey Ching and Michael I.~Mandel},
  title = {Large vocabulary concatenative resynthesis},
  booktitle = {Proceedings of Interspeech},
  year = 2018,
  url = {http://m.mr-pc.org/work/interspeech18maiti.pdf},
  poster = {http://m.mr-pc.org/work/interspeech18maitiPoster.pdf},
  pages = {1190--1194},
  doi = {10.21437/Interspeech.2018-2383},
  abstract = {Traditional speech enhancement systems reduce noise by modifying the noisy signal, which suffer from two problems: under-suppression of noise and over-suppression of speech. As an alternative, in this paper, we use the recently introduced concatenative resynthesis approach where we replace the noisy speech with its clean resynthesis. The output of such a system can produce speech that is both noise-free and high quality. This paper generalizes our previous small-vocabulary system to large vocabulary.  To do so, we employ efficient decoding techniques using fast approximate nearest neighbor (ANN) algorithms. Firstly, we  apply ANN techniques on the original small vocabulary task and get $5\times$ speedup.  We then apply the techniques to the construction of a large vocabulary concatenative resynthesis system and scale the system up to $12 \times$ larger dictionary. We perform listening tests with five participants to measure subjective quality and intelligibility of the output speech. }
}
@inproceedings{MaitiAndMandel2017,
  author = {Soumi Maiti and Michael I Mandel},
  title = {Concatenative resynthesis using twin networks},
  booktitle = {Proceedings of Interspeech},
  url = {http://m.mr-pc.org/work/interspeech17.pdf},
  year = 2017,
  pages = {3647--3651},
  doi = {10.21437/Interspeech.2017-1653},
  abstract = {Traditional noise reduction systems modify a noisy signal to make it more like the original clean signal. For speech, these methods suffer from two main problems: under-suppression of noise and over-suppression of target speech. Instead, synthesizing clean speech based on the noisy signal could produce outputs that are both noise-free and high quality.  Our previous work introduced such a system using concatenative synthesis, but it required processing the clean speech at run time, which was slow and not scalable. In order to make such a system scalable, we propose here learning a similarity metric using two separate networks, one network processing the clean segments offline and another processing the noisy segments at run time. This system incorporates a ranking loss to optimize for the retrieval of appropriate clean speech segments. This model is compared against our original on the CHiME2-GRID corpus, measuring ranking performance and subjective listening tests of resyntheses.}
}
@inproceedings{syed17,
  author = {Ali Syed and Andrew Rosenberg and Michael I Mandel},
  title = {Active Learning for Low-Resource Speech Recognition: Impact of Selection Size and Language Modeling Data},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  year = {2017},
  url = {http://m.mr-pc.org/work/syed17.pdf},
  abstract = {Active learning aims to reduce the time and cost of developing speech recognition systems by selecting for transcription highly informative subsets from large pools of audio data.  Previous evaluations at OpenKWS and IARPA BABEL have investigated data selection for low-resource languages in very constrained scenarios with 2-hour data selections given a 1-hour seed set.  We expand on this to investigate what happens with larger selections and fewer constraints on language modeling data.  Our results, on four languages from the final BABEL OP3 period, show that active learning is helpful at larger selections with consistent gains up to 14 hours. We also find that the impact of additional language model data is orthogonal to the impact of the active learning selection criteria.}
}
@inproceedings{devaney17,
  author = {Johanna Devaney and Michael I Mandel},
  title = {An evaluation of score-informed methods for estimating fundamental frequency and power from polyphonic audio},
  year = {2017},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  url = {http://m.mr-pc.org/work/devaney17.pdf},
  abstract = {Robust extraction of performance data from polyphonic musical performances requires precise frame-level estimation of fundamental frequency (f0) and power. This paper evaluates a new score-guided approach to f0 and power estimation in polyphonic audio and compares the use of four different input features: the central bin frequencies of the spectrogram, the instantaneous frequency, and two variants of a high resolution spectral analysis. These four features were evaluated on four-part multi-track ensemble recordings, consisting of either four vocalists or bassoon, clarinet, saxophone, and violin (the Bach10 data set) created from polyphonic mixes of the monophonic tracks both with and without artificial reverberation. Score information was used to identify time-frequency regions of interest in the polyphonic mixes for each note in a corresponding aligned score, from which f0 and power estimates were made. The approach was able to recover ground truth f0 within 20 cents on average in reverberation and power within 5\,dB for anechoic mixtures, but only within 10\,dB for reverberant.
}
}
@inproceedings{mandel16b,
  author = {Michael I Mandel and Jon P Barker},
  title = {Multichannel spatial clustering for robust far-field automatic speech recognition in mismatched conditions},
  year = {2016},
  booktitle = {Proceedings of Interspeech},
  doi = {10.21437/Interspeech.2016-1275},
  pages = {1991--1995},
  url = {http://m.mr-pc.org/work/interspeech16b.pdf},
  slides = {http://m.mr-pc.org/work/interspeech16bslides.pdf},
  abstract = {Recent automatic speech recognition (ASR) results are quite good when the training data is matched to the test data, but much worse when they differ in some important regard, like the number and arrangement of microphones or differences in reverberation and noise conditions.  This paper proposes an unsupervised spatial clustering approach to microphone array processing that can overcome such train-test mismatches.  This approach, known as Model-based EM Source Separation and Localization (MESSL), clusters spectrogram points based on the relative differences in phase and level between pairs of microphones.  Here it is used for the first time to drive minimum variance distortionless response (MVDR) beamforming in several ways.  We compare it to a standard delay-and-sum beamformer on the CHiME-3 noisy test set (real recordings), using each system as a pre-processor for the same recognizer trained on the AMI meeting corpus. We find that the spatial clustering front end reduces word error rates by between 9.9 and 17.1\% relative to the baseline.}
}
@inproceedings{mandel16,
  author = {Michael I Mandel},
  title = {Directly comparing the listening strategies of humans and machines},
  year = {2016},
  booktitle = {Proceedings of Interspeech},
  doi = {10.21437/Interspeech.2016-932},
  pages = {660--664},
  url = {http://m.mr-pc.org/work/interspeech16.pdf},
  poster = {http://m.mr-pc.org/work/interspeech16poster.pdf},
  abstract = {In a given noisy environment, human listeners can more accurately identify spoken words than automatic speech recognizers.  It is not clear, however, what information the humans are able to utilize in doing so that the machines are not.  This paper uses a recently introduced technique to directly characterize the information used by humans and machines on the same task.  The task was a forced choice between eight sentences spoken by a single talker from the small-vocabulary GRID corpus that were selected to be maximally confusable with one another.  These sentences were mixed with ``bubble'' noise, which is designed to reveal randomly selected time-frequency glimpses of the sentence.  Responses to these noisy mixtures allowed the identification of time-frequency regions that were important for each listener to recognize each sentence, i.e., regions that were frequently audible when a sentence was correctly identified and inaudible when it was not.  In comparing these regions across human and machine listeners, we found that dips in noise allowed the humans to recognize words based on informative speech cues.  In contrast, the baseline CHiME-2-GRID recognizer correctly identified sentences only when the time-frequency profile of the noisy mixture matched that of the underlying speech.}
}
@inproceedings{erdogan16,
  author = {Hakan Erdogan and John Hershey and Shinji Watanabe and Michael I Mandel and Jonathan Le Roux},
  title = {Improved {MVDR} beamforming using single-channel mask prediction networks},
  year = {2016},
  booktitle = {Proceedings of Interspeech},
  doi = {10.21437/Interspeech.2016-552},
  url = {http://www.isca-speech.org/archive/Interspeech_2016/pdfs/0552.PDF},
  pages = {1981--1985},
  abstract = {Recent studies on multi-microphone speech databases indicate
that it is beneficial to perform beamforming to improve speech
recognition accuracies, especially when there is a high level of
background noise. Minimum variance distortionless response
(MVDR) beamforming is an important beamforming method
that performs quite well for speech recognition purposes especially
if the steering vector is known. However, steering the
beamformer to focus on speech in unknown acoustic conditions
remains a challenging problem. In this study, we use single-channel
speech enhancement deep networks to form masks that
can be used for noise spatial covariance estimation, which steers
the MVDR beamforming toward the speech. We analyze how
mask prediction affects performance and also discuss various
ways to use masks to obtain the speech and noise spatial covariance
estimates in a reliable way. We show that using a
single mask across microphones for covariance prediction with
minima-limited post-masking yields the best result in terms of
signal-level quality measures and speech recognition word error
rates in a mismatched training condition.}
}
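
The core computation this abstract describes, estimating speech and noise spatial covariances from a predicted mask and steering an MVDR beamformer with them, is compact enough to sketch. The numpy function below is a generic illustration, not the authors' implementation: it uses the principal eigenvector of the speech covariance as the steering vector (one common choice) and omits the minima-limited post-masking step discussed in the paper.

import numpy as np

def mask_mvdr(X, speech_mask):
    """Mask-driven MVDR beamformer.

    X:           (F, T, C) complex STFT of the C-channel mixture.
    speech_mask: (F, T) values in [0, 1] from a single-channel enhancement
                 network, shared across all microphones.
    Returns the (F, T) beamformed STFT.
    """
    F, T, C = X.shape
    noise_mask = 1.0 - speech_mask
    Y = np.zeros((F, T), dtype=complex)
    for f in range(F):
        Xf = X[f]  # (T, C) observations at this frequency
        # Mask-weighted spatial covariance estimates
        Phi_s = (speech_mask[f, :, None, None] * Xf[:, :, None] *
                 Xf[:, None, :].conj()).sum(0) / (speech_mask[f].sum() + 1e-8)
        Phi_n = (noise_mask[f, :, None, None] * Xf[:, :, None] *
                 Xf[:, None, :].conj()).sum(0) / (noise_mask[f].sum() + 1e-8)
        # Steering vector: principal eigenvector of the speech covariance
        _, eigvecs = np.linalg.eigh(Phi_s)
        d = eigvecs[:, -1]
        # MVDR weights: w = Phi_n^{-1} d / (d^H Phi_n^{-1} d)
        Phi_n_inv_d = np.linalg.solve(Phi_n + 1e-8 * np.eye(C), d)
        w = Phi_n_inv_d / (d.conj() @ Phi_n_inv_d)
        Y[f] = Xf @ w.conj()  # y(t) = w^H x(t)
    return Y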
@inproceedings{xiao16,
  author = {Xiao, Xiong and Watanabe, Shinji and Erdogan, Hakan and Lu, Liang and Hershey, John and Seltzer, Michael L and Chen, Guoguo and Zhang, Yu and Mandel, Michael and Yu, Dong},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  doi = {10.1109/ICASSP.2016.7472778},
  isbn = {9781479999880},
  issn = {15206149},
  keywords = {AMI meeting corpus,Acoustics,Array signal processing,Feature extraction,Microphones,Neural networks,Speech,Speech recognition,absolute word error rate reduction,acoustic modeling network,array signal application,array signal processing,backpropagation,beamforming,common crossentropy objective function,deep beamforming network,deep neural network,deep neural networks,direction of arrival,entropy,feature extraction,filter coefficient,filter- and-sum beamforming,filtering theory,frequency-domain analysis,frequency-domain beamformer,learning (artificial intelligence),microphone arrays,microphone channels,multichannel far-field speech recognition,network-specific objective function,reverberation,signal classification,signal enhancement,signal representation,single unified computational network,speech recognition},
  month = {mar},
  pages = {5745--5749},
  publisher = {IEEE},
  title = {Deep Beamforming Networks for Multi-Channel Speech Recognition},
  url = {http://www.clsp.jhu.edu/~guoguo/papers/icassp2016_deep_beamforming.pdf},
  year = {2016},
  abstract = {Despite the significant progress in speech recognition enabled by deep neural networks, poor performance persists in some scenarios. In this work, we focus on far-field speech recognition which remains challenging due to high levels of noise and reverberation in the captured speech signals. We propose to represent the stages of acoustic processing including beamforming, feature extraction, and acoustic modeling, as three components of a single unified computational network. The parameters of a frequency-domain beamformer are first estimated by a network based on features derived from the microphone channels. These filter coefficients are then applied to the array signals to form an enhanced signal. Conventional features are then extracted from this signal and passed to a second network that performs acoustic modeling for classification. The parameters of both the beamforming and acoustic modeling networks are trained jointly using back-propagation with a common crossentropy objective function. In experiments on the AMI meeting corpus, we observed improvements by pre-training each sub-network with a network-specific objective function before joint training of both networks. The proposed method obtained a 3.2\% absolute word error rate reduction compared to a conventional pipeline of independent processing stages.}
}
@inproceedings{bagchi15,
  author = {Deblin Bagchi and Michael I Mandel and Zhongqiu Wang and Yanzhang He and Andrew Plummer and Eric Fosler-Lussier},
  title = {Combining spectral feature mapping and multi-channel model-based source separation for noise-robust automatic speech recognition},
  booktitle = {Proceedings of the {IEEE} Workshop on Automatic Speech Recognition and Understanding},
  pages = {496--503},
  doi = {10.1109/ASRU.2015.7404836},
  year = {2015},
  url = {http://m.mr-pc.org/work/asru15.pdf},
  abstract = {Automatic Speech Recognition systems suffer from severe performance degradation in the presence of myriad complicating factors such as noise, reverberation, multiple speech sources, multiple recording devices, etc.  Previous challenges have sparked much innovation when it comes to designing systems capable of handling these complications.  In this spirit, the CHiME-3 challenge presents system builders with the task of recognizing speech in a real-world noisy setting wherein speakers talk to an array of 6 microphones in a tablet.  In order to address these issues, we explore the effectiveness of first applying a model-based source separation mask  to the output of a beamformer that combines the source signals recorded by each microphone, followed by a DNN-based front end spectral mapper that predicts clean filterbank features.  The source separation algorithm MESSL (Model-based EM Source Separation and Localization) has been extended from two channels to multiple channels in order to meet the demands of the challenge.  We report on interactions between the two systems, cross-cut by the use of a robust beamforming algorithm called BeamformIt.   Evaluations of different system settings reveal that combining MESSL and the spectral mapper together on the baseline beamformer algorithm boosts the performance substantially.
}
}
@inproceedings{tirumala15,
  author = {Sreyas Srimath Tirumala and Michael I Mandel},
  title = {Exciting estimated clean spectra for speech resynthesis},
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  year = {2015},
  url = {http://m.mr-pc.org/work/waspaa15b.pdf},
  abstract = {Spectral masking techniques are prevalent for noise suppression but they damage speech in regions of the spectrum where both noise and speech are present. This paper instead utilizes a recently introduced analysis-by-synthesis technique to estimate the spectral envelope of the speech at all frequencies, and adds to it a model of the speech excitation necessary to fully resynthesize a clean speech signal. Such a resynthesis should have little noise and high quality compared to mask-based approaches. We compare several different excitation signals on the Aurora4 corpus, including those derived from the high quefrency components of the noisy mixture and from the combination of a noise robust pitch tracker and a voiced/unvoiced classifier. Preliminary subjective evaluations suggest that the speech synthesized using our approach has higher voice quality and noise suppression than spectral masking.},
  poster = {http://m.mr-pc.org/work/waspaa15bposter.pdf}
}
@inproceedings{mandel15d,
  author = {Michael I Mandel and Young Suk Cho},
  title = {Audio super-resolution using concatenative resynthesis},
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  year = {2015},
  url = {http://m.mr-pc.org/work/waspaa15.pdf},
  abstract = {  This paper utilizes a recently introduced non-linear
  dictionary-based denoising system in another voice mapping task,
  that of transforming low-bandwidth, low-bitrate speech into
  high-bandwidth, high-quality speech.  The system uses a deep neural
  network as a learned non-linear comparison function to drive unit
  selection in a concatenative synthesizer based on clean recordings.
  This neural network is trained to predict
  whether a given clean audio segment from the dictionary could be
  transformed into a given segment of the degraded observation.
  Speaker-dependent
  experiments on the small-vocabulary CHiME2-GRID corpus show that
  this model is able to resynthesize high quality clean speech from
  degraded observations.  Preliminary listening tests show that the
  system is able to improve subjective speech quality evaluations by
  up to 50 percentage points, while a similar system based on
  non-negative matrix factorization and trained on the same data
  produces no significant improvement.},
  slides = {http://m.mr-pc.org/work/waspaa15slides.pdf},
  http = {http://mr-pc.org/work/waspaa15/}
}
@inproceedings{mandel15c,
  author = {Michael I Mandel and Nicoleta Roman},
  title = {Enforcing consistency in spectral masks using Markov random fields},
  year = {2015},
  booktitle = {Proceedings of {EUSIPCO}},
  pages = {2028--2032},
  abstract = {Localization-based multichannel source separation
                  algorithms typically operate by clustering or
                  classifying individual time-frequency points based
                  on their spatial characteristics, treating adjacent
                  points as independent observations.  The Model-based
                  EM Source Separation and Localization (MESSL)
                  algorithm is one such approach for binaural signals
                  that achieves additional robustness by enforcing
                  consistency across frequencies in interaural phase
                  differences.  This paper incorporates MESSL into a
                  Markov Random Field (MRF) framework in order to
                  enforce consistency in the assignment of neighboring
                  time-frequency units to sources.  Approximate
                  inference in the MRF is performed using loopy belief
                  propagation, and the same approach can be used to
                  smooth any probabilistic source separation mask. The
                  proposed MESSL-MRF algorithm is tested on binaural
                  mixtures of three sources in reverberant conditions
                  and shows significant improvements over the original
                  MESSL algorithm as measured by both
                  signal-to-distortion ratios as well as a speech
                  intelligibility predictor.  },
  url = {http://m.mr-pc.org/work/eusipco15.pdf}
}
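
As a rough illustration of the mask-smoothing idea in mandel15c, the sketch below regularizes a soft time-frequency mask under a 4-connected Ising-style MRF. For brevity it uses a mean-field update as a stand-in for the loopy belief propagation described in the abstract, and the potential strength beta, the neighbourhood, and the iteration count are illustrative assumptions rather than the paper's settings.

import numpy as np
from scipy.ndimage import convolve

def smooth_mask_mean_field(mask, beta=2.0, n_iters=20, eps=1e-6):
    """Smooth a soft source mask (F x T, values in [0, 1]) under an Ising-style
    MRF that favours neighbouring time-frequency points taking the same label."""
    log_p1 = np.log(mask + eps)          # unary log-potential for "source"
    log_p0 = np.log(1.0 - mask + eps)    # unary log-potential for "interference"
    q = mask.copy()                      # q(x=1), initialised from the input mask
    kernel = np.array([[0., 1., 0.],
                       [1., 0., 1.],
                       [0., 1., 0.]])    # 4-connected neighbourhood
    for _ in range(n_iters):
        nb1 = convolve(q, kernel, mode="nearest")        # expected neighbours labelled "source"
        nb0 = convolve(1.0 - q, kernel, mode="nearest")  # expected neighbours labelled "interference"
        logit = (log_p1 + beta * nb1) - (log_p0 + beta * nb0)
        q = 1.0 / (1.0 + np.exp(-logit))                 # mean-field update
    return q

As the abstract notes, this kind of spatial smoothing can be applied to any probabilistic separation mask, not only the one produced by MESSL.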
@inproceedings{mandel14c,
  author = {Michael I Mandel and Young-Suk Cho and Yuxuan Wang},
  title = {Learning a concatenative resynthesis system for noise suppression},
  booktitle = {Proceedings of the {IEEE} {GlobalSIP} conference},
  year = 2014,
  url = {http://m.mr-pc.org/work/globalsip14.pdf},
  poster = {http://m.mr-pc.org/work/globalsip14poster.pdf},
  http = {http://mr-pc.org/work/globalsip14/},
  abstract = {This paper introduces a new approach to dictionary-based source
  separation employing a learned non-linear metric.  In contrast to
  existing parametric source separation systems, this model is able to
  utilize a rich dictionary of speech signals.  In contrast to
  previous dictionary-based source separation systems, the system can
  utilize perceptually relevant non-linear features of the noisy and
  clean audio.  This approach utilizes a deep neural network (DNN) to
  predict whether a noisy chunk of audio contains a given clean chunk.
  Speaker-dependent experiments on the CHiME2-GRID corpus show 
  that this model is able to accurately resynthesize clean speech from
  noisy observations.  Preliminary listening tests show that the
  system's output has much higher audio quality than existing parametric
  systems trained on the same data, achieving noise suppression levels
  close to those of the original clean speech.}
}
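
To make the concatenative-resynthesis idea in mandel14c concrete, here is a minimal sketch of a learned match function driving unit selection: a small network scores (noisy chunk, clean dictionary chunk) pairs, and the best-scoring dictionary chunk is chosen for each noisy chunk. The layer sizes, features, and greedy selection rule are assumptions for illustration, not the paper's architecture.

import torch
import torch.nn as nn

class MatchNet(nn.Module):
    """Scores how plausibly a clean dictionary chunk underlies a noisy chunk."""
    def __init__(self, n_feats):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * n_feats, 512), nn.ReLU(),
            nn.Linear(512, 512), nn.ReLU(),
            nn.Linear(512, 1))

    def forward(self, noisy, clean):
        return self.net(torch.cat([noisy, clean], dim=-1)).squeeze(-1)  # match logit

def select_units(model, noisy_chunks, dictionary):
    """Greedy unit selection: for each noisy chunk, pick the clean dictionary
    chunk with the highest predicted match score; concatenating the selected
    chunks resynthesizes the output."""
    picks = []
    with torch.no_grad():
        for x in noisy_chunks:                                   # x: (n_feats,)
            scores = model(x.expand(len(dictionary), -1), dictionary)
            picks.append(int(torch.argmax(scores)))
    return picks

Training such a model would pair each noisy chunk with its true clean chunk (label 1) and with random dictionary chunks (label 0) under a binary cross-entropy loss, matching the abstract's description of predicting whether a noisy chunk contains a given clean chunk.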
@inproceedings{mandel14b,
  author = {Michael I Mandel and Sarah E Yoho and Eric W Healy},
  title = {Generalizing time-frequency importance functions across noises, talkers, and phonemes},
  booktitle = {Proceedings of Interspeech},
  year = 2014,
  url = {http://m.mr-pc.org/work/interspeech14.pdf},
  poster = {http://m.mr-pc.org/work/interspeech14poster.pdf},
  abstract = {Listeners can reliably identify speech in noisy conditions, although it is generally not known what specific features of speech are used to do this.  We utilize a recently introduced data-driven framework to identify these features.  By analyzing listening-test results involving the same speech utterance mixed with many different noise instances, the framework is able to compute the importance of each time-frequency point in the utterance to its intelligibility.  This paper shows that a trained model resulting from this framework can generalize to new conditions, successfully predicting the intelligibility of novel mixtures. First, it can generalize to novel noise instances after being trained on mixtures involving the same speech utterance but different noises. Second, it can generalize to novel talkers after being trained on mixtures involving the same syllables produced by different talkers in different noises. Finally, it can generalize to novel phonemes, after being trained on mixtures involving different consonants produced by the same or different talkers in different noises. Aligning the clean utterances in time and then propagating this alignment to the features used in the intelligibility prediction improves this generalization performance further.}
}
@inproceedings{mandel14a,
  author = {Michael I Mandel and Arun Narayanan},
  title = {Analysis-by-synthesis feature estimation for robust automatic speech recognition using spectral masks},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  year = {2014},
  url = {http://m.mr-pc.org/work/icassp14a.pdf},
  poster = {http://m.mr-pc.org/work/icassp14poster.pdf},
  abstract = {Spectral masking is a promising method for noise suppression in which regions of the spectrogram that are dominated by noise are attenuated while regions dominated by speech are preserved. It is not clear, however, how best to combine spectral masking with the non-linear processing necessary to compute automatic speech recognition features. We propose an analysis-by-synthesis approach to automatic speech recognition, which, given a spectral mask, poses the estimation of mel frequency cepstral coefficients (MFCCs) of the clean speech as an optimization problem. MFCCs are found that minimize a combination of the distance from the resynthesized clean power spectrum to the regions of the noisy spectrum selected by the mask and the negative log likelihood under an unmodified large vocabulary continuous speech recognizer. In evaluations on the Aurora4 noisy speech recognition task with both ideal and estimated masks, analysis-by-synthesis decreases both word error rates and distances to clean speech as compared to traditional approaches.}
}
@inproceedings{nandi14,
  author = {Arnab Nandi and Lilong Jiang and Michael I Mandel},
  title = {Gestural Query Specification},
  booktitle = {Proceedings of the International Conference on Very Large Data Bases},
  year = {2014},
  volume = 7,
  issue = 4,
  url = {http://www.vldb.org/pvldb/vol7/p289-nandi.pdf},
  slides = {https://speakerdeck.com/arnabdotorg/gestural-query-specification-querying-without-keyboards},
  abstract = {Direct, ad-hoc interaction with databases has typically been performed over console-oriented conversational interfaces using query languages such as SQL. With the rise in popularity of gestural user interfaces and computing devices that use gestures as their exclusive modes of interaction, database query interfaces require a fundamental rethinking to work without keyboards. We present a novel query specification system that allows the user to query databases using a series of gestures. We present a novel gesture recognition system that uses both the interaction and the state of the database to classify gestural input into relational database queries. We conduct exhaustive systems performance tests and user studies to demonstrate that our system is not only performant and capable of interactive latencies, but it is also more usable, faster to use and more intuitive than existing systems.}
}
@inproceedings{mandel13,
  author = {Michael I. Mandel},
  title = {Learning an intelligibility map of individual utterances},
  year = {2013},
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  abstract = {Predicting the intelligibility of noisy recordings is difficult and most current algorithms only aim to be correct on average across many recordings.  This paper describes a listening test paradigm and associated analysis technique that can predict the intelligibility of a specific recording of a word in the presence of a specific noise instance.  The analysis learns a map of the importance of each point in the recording's spectrogram to the overall intelligibility of the word when glimpsed through ``bubbles'' in many noise instances.  By treating this as a classification problem, a linear classifier can be used to predict intelligibility and can be examined to determine the importance of spectral regions.  This approach was tested on recordings of vowels and consonants. The important regions identified by the model in these tests agreed with those identified by a standard, non-predictive statistical test of independence and with the acoustic phonetics literature.},
  url = {http://m.mr-pc.org/work/waspaa13.pdf}
}
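
The classification view described in mandel13 can be sketched compactly: summarize each bubble-noise mixture by which time-frequency points were audible, fit a linear classifier to the listeners' correct/incorrect responses, and read the learned weights as a time-frequency importance map. The audibility feature, regularization, and solver below are assumptions for illustration.

import numpy as np
from sklearn.linear_model import LogisticRegression

def tf_importance_map(bubble_masks, correct):
    """bubble_masks: (n_mixtures, F, T) audibility of each time-frequency point per mixture.
    correct: (n_mixtures,) 1 if listeners identified the word in that mixture, else 0.
    Returns an (F, T) map whose weights indicate each point's importance."""
    n, F, T = bubble_masks.shape
    X = bubble_masks.reshape(n, F * T)
    clf = LogisticRegression(penalty="l2", C=1.0, max_iter=1000)
    clf.fit(X, correct)
    return clf.coef_.reshape(F, T)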
@inproceedings{roman13,
  author = {Nicoleta Roman and Michael I Mandel},

  title = {Classification based binaural dereverberation},
  year = 2013,
  booktitle = {Proceedings of Interspeech},
  abstract = {Reverberation has a detrimental effect on speech perception both in terms of quality as well as intelligibility, as late reflections smear temporal and spectral cues. The ideal binary mask, which is an established computational approach to sound separation, was recently extended to remove reverberation. Experiments with both normal hearing and hearing impaired listeners have shown significant intelligibility improvements for reverberant speech processed using such a priori binary masks. The dereverberation problem can thus be formulated as a classification problem, where the desired output is the ideal binary mask. The goal in this approach is to produce a mask that selects the time-frequency regions where the direct energy dominates the energy from the late reflections. In this study, a binaural dereverberation algorithm is proposed which utilizes the binaural cues of interaural time and level differences as features. The algorithm is tested in highly reverberant environments using both simulated and recorded room impulse responses. Evaluations show significant improvements over the unprocessed condition as measured by both a speech quality measure and a speech intelligibility predictor.}
}
@inproceedings{devaney12b,
  author = {Johanna Devaney and Michael I. Mandel and Ichiro Fujinaga},
  year = {2012},
  title = {A Study of Intonation in Three-Part Singing using the Automatic  Music Performance Analysis and Comparison Toolkit ({AMPACT})},
  booktitle = {Proceedings of the International Society for Music Information Retrieval conference},
  location = {Porto},
  url = {http://ismir2012.ismir.net/event/papers/511-ismir-2012.pdf},
  abstract = {This paper introduces the Automatic Music Performance Analysis and Comparison Toolkit ({AMPACT}), a {MATLAB} toolkit for accurately aligning monophonic audio to {MIDI} scores as well as extracting and analyzing timing-, pitch-, and dynamics-related performance data from the aligned recordings. This paper also presents the results of an analysis performed with {AMPACT} on an experiment studying intonation in three-part singing. The experiment examines the interval size and drift in four ensembles' performances of a short exercise by Benedetti, which was designed to highlight the conflict between Just Intonation tuning and pitch drift.}
}
@inproceedings{devaney11,
  author = {Johanna Devaney and Michael I. Mandel and Ichiro Fujinaga},
  year = {2011},
  title = {Characterizing Singing Voice Fundamental Frequency Trajectories},
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  month = oct,
  pages = {73--76},
  url = {http://www.music.mcgill.ca/~devaney/files/devaney11waspaa.pdf},
  poster = {http://music.mcgill.ca/~devaney/files/devaney11waspaaPoster.pdf},
  abstract = {This paper evaluates the utility of the Discrete Cosine Transform (DCT) for characterizing singing voice fundamental frequency (F0) trajectories. Specifically, it focuses on the use of the 1 st and 2nd DCT coefficients as approximations of slope and curvature. It also considers the impact of vocal vibrato on the DCT calculations, including the influence of segmentation on the consistency of the reported DCT coefficient values. These characterizations are useful for describing similarities in the evolution of the fundamental frequency in different notes. Such descriptors can be applied in the areas of performance analysis and singing synthesis.}
}
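
A minimal version of the DCT-based trajectory description in devaney11: take the DCT of a note's F0 contour and read the first and second non-constant coefficients as approximate slope and curvature. Working in cents relative to the note mean and using the orthonormal DCT are assumptions made here for illustration.

import numpy as np
from scipy.fft import dct

def f0_shape_descriptors(f0_hz):
    """f0_hz: 1-D array of F0 estimates (Hz) across a single note."""
    cents = 1200.0 * np.log2(f0_hz / np.mean(f0_hz))  # contour in cents around the note mean
    coeffs = dct(cents, type=2, norm="ortho")
    return {"slope": coeffs[1], "curvature": coeffs[2]}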
@inproceedings{mandel10b,
  title = {Learning tags that vary within a song},
  author = {Michael I. Mandel and Douglas Eck and Yoshua Bengio},
  year = {2010},
  month = aug,
  pages = {399--404},
  booktitle = {Proceedings of the International Society for Music Information Retrieval conference},
  url = {http://m.mr-pc.org/work/ismir10.pdf},
  slides = {http://m.mr-pc.org/work/ismir10slides.pdf},
  abstract = {This paper examines the relationship between human generated tags
  describing different parts of the same song.  These tags were
  collected using Amazon's Mechanical Turk service.  We find that the
  agreement between different people's tags decreases as the distance
  between the parts of a song that they heard increases.  To model
  these tags and these relationships, we describe a conditional
  restricted Boltzmann machine.  Using this model to fill in tags that
  should probably be present given a context of other tags, we train
  automatic tag classifiers (autotaggers) that
  outperform those trained on the original data.}
}
@inproceedings{bergstra10,
  title = {Scalable genre and tag prediction with spectral covariance},
  author = {James Bergstra and Michael I. Mandel and Douglas Eck},
  year = {2010},
  month = aug,
  pages = {507--512},
  booktitle = {Proceedings of the International Society for Music Information Retrieval conference},
  url = {http://m.mr-pc.org/work/ismir10b.pdf},
  abstract = {Cepstral analysis is effective in separating source from filter in vocal and monophonic [pitched] recordings, but is it a good general-purpose framework for working with music audio? We evaluate covariance in spectral features as an alternative to means and variances in cepstral features (particularly MFCCs) as summaries of frame-level features. We find that spectral covariance is more effective than mean, variance, and covariance statistics of MFCCs for genre and social tag prediction. Support for our model comes from strong and state-of-the-art performance on the GTZAN genre dataset, MajorMiner, and MagnaTagatune. Our classification strategy based on linear classifiers is easy to implement, exhibits very little sensitivity to hyper-parameters, trains quickly (even for web-scale datasets), is fast to apply, and offers competitive performance in genre and tag prediction.}
}
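
In the spirit of bergstra10's spectral-covariance summaries, the sketch below represents a clip by the covariance of its frame-level spectral features and feeds the vectorized upper triangle to a linear classifier. The use of librosa log-mel spectra and a LinearSVC are assumptions; the paper evaluates several frame-level features and its own linear classification strategy.

import numpy as np
import librosa
from sklearn.svm import LinearSVC

def spectral_covariance_summary(path, n_mels=40):
    """Summarize a clip by the covariance of its frame-level log-mel spectra."""
    y, sr = librosa.load(path, sr=22050, mono=True)
    S = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels))
    C = np.cov(S)                       # (n_mels, n_mels) spectral covariance
    iu = np.triu_indices(n_mels)
    return C[iu]                        # vectorized upper triangle as the clip feature

# A linear classifier on these summaries, e.g.:
# X = np.stack([spectral_covariance_summary(p) for p in train_paths])
# clf = LinearSVC().fit(X, train_labels)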
@inproceedings{mandel09b,
  title = {The ideal interaural parameter mask: a bound on binaural separation systems},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  year = {2009},
  month = oct,
  pages = {85--88},
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  url = {http://m.mr-pc.org/work/waspaa09.pdf},
  poster = {http://m.mr-pc.org/work/waspaa09poster.pdf},
  doi = {10.1109/ASPAA.2009.5346506},
  abstract = {We introduce the Ideal Interaural Parameter Mask as an upper bound on the performance of mask-based source separation algorithms that are based on the differences between signals from two microphones or ears. With two additions to our Model-based EM Source Separation and Localization system, its performance approaches that of the IIPM upper bound to within 0.9 dB. These additions battle the effects of reverberation by absorbing reverberant energy and by forcing the ILD estimate to be larger than it might otherwise be. An oracle reliability measure was also added, in the hope that estimating parameters from more reliable regions of the spectrogram would improve separation, but it was not consistently useful.}
}
@inproceedings{devaney09,
  title = {Improving {MIDI}-audio alignment with acoustic features},
  author = {Johanna Devaney and Michael I. Mandel and Daniel P. W. Ellis},
  year = {2009},
  month = oct,
  pages = {45--48},
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  url = {http://m.mr-pc.org/work/devaney_waspaa09.pdf},
  doi = {10.1109/ASPAA.2009.5346500},
  abstract = {This paper describes a technique to improve the accuracy of dynamic time warping-based MIDI-audio alignment. The technique implements a hidden Markov model that uses aperiodicity and power estimates from the signal as observations and the results of a dynamic time warping alignment as a prior. In addition to improving the overall alignment, this technique also identifies the transient and steady state sections of the note. This information is important for describing various aspects of a musical performance, including both pitch and rhythm.}
}
@inproceedings{law09,
  title = {Evaluation of algorithms using games: the case of music annotation},
  author = {Edith Law and Kris West and Michael I Mandel and Mert Bay and J. Stephen Downie},
  year = {2009},
  month = oct,
  booktitle = {Proceedings of the International Society for Music Information Retrieval conference},
  pages = {387--392},
  url = {http://m.mr-pc.org/work/ismir09.pdf},
  abstract = {Search by keyword is an extremely popular method for retrieving music. To support this, novel algorithms that automatically tag music are being developed. The conventional way to evaluate audio tagging algorithms is to compute measures of agreement between the output and the ground truth set. In this work, we introduce a new method for evaluating audio tagging algorithms on a large scale by collecting set-level judgments from players of a human computation game called TagATune. We present the design and preliminary results of an experiment comparing five algorithms using this new evaluation metric, and contrast the results with those obtained by applying several conventional agreement-based evaluation metrics.}
}
@inproceedings{weiss08,
  title = {Source separation based on binaural cues and source
               model constraints},
  author = {Ron J. Weiss and Michael I. Mandel and Daniel P. W. Ellis},
  booktitle = {Proceedings of Interspeech},
  month = sep,
  year = {2008},
  pages = {419--422},
  location = {Brisbane, Australia},
  url = {http://m.mr-pc.org/work/interspeech08.pdf},
  http = {http://www.isca-speech.org/archive/interspeech_2008/i08_0419.html},
  abstract = {We describe a system for separating multiple sources from a two-channel recording based on interaural cues and known characteristics of the source signals. We combine a probabilistic model of the observed interaural level and phase differences with a prior model of the source statistics and derive an EM algorithm for finding the maximum likelihood parameters of the joint model. The system is able to separate more sound sources than there are observed channels. In simulated reverberant mixtures of three speakers the proposed algorithm gives a signal-to-noise ratio improvement of 2.1 dB over a baseline algorithm using only interaural cues.}
}
@inproceedings{mandel08a,
  title = {Multiple-instance learning for music information retrieval},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  booktitle = {Proceedings of the International Society for Music Information Retrieval conference},
  month = sep,
  year = {2008},
  pages = {577--582},
  url = {http://m.mr-pc.org/work/ismir08.pdf},
  poster = {http://m.mr-pc.org/work/ismir08poster.pdf},
  abstract = {Multiple-instance learning algorithms train classifiers from lightly supervised data, i.e. labeled collections of items, rather than labeled items.  We compare the multiple-instance learners mi-SVM and MILES on the task of classifying 10-second song clips.  These classifiers are trained on tags at the track, album, and artist levels, or granularities, that have been derived from tags at the clip granularity, allowing us to test the effectiveness of the learners at recovering the clip labeling in the training set and predicting the clip labeling for a held-out test set.  We find that mi-SVM is better than a control at the recovery task on training clips, with an average classification accuracy as high as 87\% over 43 tags; on test clips, it is comparable to the control with an average classification accuracy of up to 68\%.  MILES performed adequately on the recovery task, but poorly on the test clips.}
}
@inproceedings{ellis08,
  title = {Cross-correlation of beat-synchronous representations 
               for music similarity},
  author = {Daniel P. W. Ellis and Courtenay V. Cotton 
               and Michael I. Mandel},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  pages = {57--60},
  month = apr,
  year = {2008},
  location = {Las Vegas, NV},
  url = {http://m.mr-pc.org/work/icassp08.pdf},
  doi = {10.1109/ICASSP.2008.4517545},
  abstract = {Systems to predict human judgments of music similarity directly from the audio have generally been based on the global statistics of spectral feature vectors i.e. collapsing any large-scale temporal structure in the data. Based on our work in identifying alternative ("cover") versions of pieces, we investigate using direct correlation of beat-synchronous representations of music audio to find segments that are similar not only in feature statistics, but in the relative positioning of those features in tempo-normalized time. Given a large enough search database, good matches by this metric should have very high perceived similarity to query items. We evaluate our system through a listening test in which subjects rated system-generated matches as similar or not similar, and compared results to a more conventional timbral and rhythmic similarity baseline, and to random selections.}
}
@inproceedings{mandel07c,
  title = { {EM} localization and separation using interaural level and
               phase cues},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  booktitle = { {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics},
  pages = {275--278},
  month = oct,
  year = {2007},
  url = {http://m.mr-pc.org/work/waspaa07.pdf},
  poster = {http://m.mr-pc.org/work/waspaa07poster.pdf},
  doi = {10.1109/ASPAA.2007.4392987},
  abstract = {We describe a system for localizing and separating multiple sound sources from a reverberant two-channel recording. It consists of a probabilistic model of interaural level and phase differences and an EM algorithm for finding the maximum likelihood parameters of this model. By assigning points in the interaural spectrogram probabilistically to sources with the best-fitting parameters and then estimating the parameters of the sources from the points assigned to them, the system is able to separate and localize more sound sources than there are available channels. It is also able to estimate frequency-dependent level differences of sources in a mixture that correspond well to those measured in isolation. In experiments in simulated anechoic and reverberant environments, the proposed system improved the signal-to-noise ratio of target sources by 2.7 and 3.4dB more than two comparable algorithms on average.}
}
@inproceedings{mandel07b,
  title = {A web-based game for collecting music metadata},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  pages = {365--366},
  booktitle = {Proceedings of the International Society for Music Information Retrieval conference},
  editor = {Simon Dixon and David Bainbridge and Rainer Typke},
  location = {Vienna, Austria},
  month = sep,
  year = {2007},
  url = {http://m.mr-pc.org/work/ismir07.pdf},
  poster = {http://m.mr-pc.org/work/ismir07poster.pdf},
  abstract = {We have designed a web-based game to make collecting descriptions of musical excerpts fun, easy, useful, and objective.  Participants describe 10 second clips of songs and score points when their descriptions match those of other participants.  The rules were designed to encourage users to be thorough and the clip length was chosen to make judgments more objective and specific.  Analysis of preliminary data shows that we are able to collect objective and specific descriptions of clips and that players tend to agree with one another.}
}
@incollection{mandel07a,
  title = {An {EM} Algorithm for Localizing Multiple Sound Sources in
               Reverberant Environments},
  author = {Michael I. Mandel and Daniel P. W. Ellis and Tony Jebara},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
  publisher = { {MIT} Press},
  address = {Cambridge, MA},
  pages = {953--960},
  year = {2007},
  url = {http://m.mr-pc.org/work/nips06.pdf},
  poster = {http://m.mr-pc.org/work/nips06poster.pdf},
  abstract = {We present a method for localizing and separating sound sources in stereo recordings that is robust to reverberation and does not make any assumptions about the source statistics.  The method consists of a probabilistic model of binaural multi-source recordings and an expectation maximization algorithm for finding the maximum likelihood parameters of that model.  These parameters include distributions over delays and assignments of time-frequency regions to sources.  We evaluate this method against two comparable algorithms on simulations of simultaneous speech from two or three sources.  Our method outperforms the others in anechoic conditions and performs as well as the better of the two in the presence of reverberation.}
}
@inproceedings{mandel05,
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  title = {Song-Level Features and Support Vector Machines for Music
               Classification},
  pages = {594--599},
  booktitle = {Proceedings of the International Society for Music Information Retrieval conference},
  editor = {Joshua D. Reiss and Geraint A. Wiggins},
  location = {London, UK},
  month = sep,
  year = {2005},
  url = {http://m.mr-pc.org/work/ismir05.pdf},
  poster = {http://m.mr-pc.org/work/ismir05poster.pdf},
  abstract = {Searching and organizing growing digital music collections requires automatic classification of music.  This paper describes a new system, tested on the task of artist identification, that uses support vector machines to classify songs based on features calculated over their entire lengths.  Since support vector machines are exemplar-based classifiers, training on and classifying entire songs instead of short-time features makes intuitive sense.  On a dataset of 1200 pop songs performed by 18 artists, we show that this classifier outperforms similar classifiers that use only SVMs or song-level features.  We also show that the KL divergence between single Gaussians and Mahalanobis distance between MFCC statistics vectors perform comparably when classifiers are trained and tested on separate albums, but KL divergence outperforms Mahalanobis distance when trained and tested on songs from the same albums.}
}
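
The song-level comparison in mandel05 can be sketched as follows: fit one Gaussian to each song's MFCC frames, compute a symmetrized KL divergence between songs, and turn that divergence into an SVM kernel, e.g. exp(-gamma * KL). The covariance ridge and the kernel form in the comment are assumptions for illustration.

import numpy as np

def fit_gaussian(mfccs):
    """mfccs: (n_frames, n_dims) MFCC frames from one song."""
    mu = mfccs.mean(axis=0)
    cov = np.cov(mfccs, rowvar=False) + 1e-6 * np.eye(mfccs.shape[1])  # ridge for stability
    return mu, cov

def kl_gaussian(mu_p, cov_p, mu_q, cov_q):
    """KL(N_p || N_q) between two multivariate Gaussians."""
    d = mu_p.shape[0]
    inv_q = np.linalg.inv(cov_q)
    diff = mu_q - mu_p
    _, logdet_p = np.linalg.slogdet(cov_p)
    _, logdet_q = np.linalg.slogdet(cov_q)
    return 0.5 * (np.trace(inv_q @ cov_p) + diff @ inv_q @ diff - d + logdet_q - logdet_p)

def symmetric_kl(song_a, song_b):
    return kl_gaussian(*song_a, *song_b) + kl_gaussian(*song_b, *song_a)

# One SVM kernel entry between two songs (gamma is a tuning assumption):
# k_ab = np.exp(-gamma * symmetric_kl(fit_gaussian(mfccs_a), fit_gaussian(mfccs_b)))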
@incollection{sudderth05,
  author = {Erik B. Sudderth and Michael I. Mandel and William
               T. Freeman and Alan S. Willsky},
  title = {Distributed Occlusion Reasoning for Tracking with
               Nonparametric Belief Propagation},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {Lawrence K. Saul and Yair Weiss and {L\'{e}on} Bottou},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {1369--1376},
  year = {2005},
  url = {http://m.mr-pc.org/work/nips04.pdf},
  http = {http://ssg.mit.edu/nbp/},
  abstract = {We describe a three-dimensional geometric hand model suitable for visual tracking applications. The kinematic constraints implied by the model's joints have a probabilistic structure which is well described by a graphical model. Inference in this model is complicated by the hand's many degrees of freedom, as well as multimodal likelihoods caused by ambiguous image measurements. We use nonparametric belief propagation (NBP) to develop a tracking algorithm which exploits the graph's structure to control complexity, while avoiding costly discretization. While kinematic constraints naturally have a local structure, self-occlusions created by the imaging process lead to complex interdependencies in color and edge-based likelihood functions. However, we show that local structure may be recovered by introducing binary hidden variables describing the occlusion state of each pixel. We augment the NBP algorithm to infer these occlusion variables in a distributed fashion, and then analytically marginalize over them to produce hand position estimates which properly account for occlusion events. We provide simulations showing that NBP may be used to refine inaccurate model initializations, as well as track hand motion through extended image sequences.}
}

Other

@inproceedings{DavolEtAl2021,
  title = {Automated Soundscape Analysis Reveals Strong Influence of Time Since Wildfire on Boreal Breeding Birds},
  author = {Davol, Eleanor and Boelman, Natalie and Brinkman, Todd and Brown, Carissa and Liston, Glen and Mandel, Michael and Coban, Enis and Perra, Megan and Reid, Kirsten and Leorna, Scott and others},
  booktitle = {AGU Fall Meeting Abstracts},
  volume = {2021},
  pages = {B23C--03},
  year = {2021}
}
@misc{NiEtAl2020b,
  title = {Improved {MVDR} Beamforming Using {LSTM} Speech Models to Clean Spatial Clustering Masks},
  author = {Ni, Zhaoheng and Grezes, Felix and Trinh, Viet Anh and Mandel, Michael I},
  eprint = {2012.02191},
  eprinttype = {arxiv},
  year = {2020},
  url = {https://arxiv.org/pdf/2012.02191.pdf}
}
@misc{GrezesEtAl2020,
  title = {Enhancement of spatial clustering-based time-frequency masks using LSTM neural networks},
  author = {Grezes, Felix and Ni, Zhaoheng and Trinh, Viet Anh and Mandel, Michael},
  year = {2020},
  eprint = {2012.01576},
  eprinttype = {arxiv},
  url = {https://arxiv.org/pdf/2012.01576.pdf}
}
@misc{GrezesEtAl2020b,
  title = {Combining spatial clustering with LSTM speech models for multichannel speech enhancement},
  author = {Grezes, Felix and Ni, Zhaoheng and Trinh, Viet Anh and Mandel, Michael},
  year = {2020},
  eprint = {2012.03388},
  eprinttype = {arxiv},
  url = {https://arxiv.org/pdf/2012.03388.pdf}
}
@inproceedings{CaiEtAl2020,
  title = {Music Autotagging as Captioning},
  author = {Tian Cai and Michael I Mandel and Di He},
  booktitle = {First Workshop on NLP for Music and Audio},
  year = {2020},
  location = {Montreal, QC},
  url = {https://drive.google.com/file/d/1kHxvQH0zwO9C4EJ0kiIPxfTdQukfr2O7/view?usp=sharing},
  poster = {https://drive.google.com/file/d/1q1KUrYaP-ajGO9_I93oM7qHhz8XeAhOy/view?usp=sharing},
  video = {https://www.youtube.com/watch?v=Y6XiXlZ73ac&list=PL44xXQ2KNZ0Inxy6ZIol47RkmPWWRCTBQ&index=10},
  abstract = {Music autotagging has typically been formulated as a multilabel classification problem. This approach assumes that tags associated with a clip of music are an unordered set. With the recent success of image and video captioning as well as environmental audio captioning, we propose formulating music autotagging as a captioning task, which automatically associates tags with a clip of music in the order a human would apply them. Under the formulation of captioning as a sequence-to-sequence problem, previous music autotagging systems can be used as the encoder, extracting a representation of the musical audio. An attention-based decoder is added to learn to predict a sequence of tags describing the given clip. Experiments are conducted on data collected from the MajorMiner game, which includes the order and timing that tags were applied to clips by individual users, and contains 3.95 captions per clip on average.}
}
@misc{WatanabeEtAl2020,
  author = {Shinji Watanabe and Michael I Mandel and Jon Barker and Emmanuel Vincent},
  title = { {CHiME}-6 Challenge: Tackling Multispeaker Speech Recognition for Unsegmented Recordings},
  year = {2020},
  eprint = {2004.09249},
  eprinttype = {arxiv},
  abstract = {Following the success of the 1st, 2nd, 3rd, 4th and 5th CHiME challenges we
organize the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6).
The new challenge revisits the previous CHiME-5 challenge and further considers
the problem of distant multi-microphone conversational speech diarization and
recognition in everyday home environments. Speech material is the same as the
previous CHiME-5 recordings except for accurate array synchronization. The
material was elicited using a dinner party scenario with efforts taken to
capture data that is representative of natural conversational speech. This
paper provides a baseline description of the CHiME-6 challenge for both
segmented multispeaker speech recognition (Track 1) and unsegmented
multispeaker speech recognition (Track 2). Of note, Track 2 is the first
challenge activity in the community to tackle an unsegmented multispeaker
speech recognition scenario with a complete set of reproducible open source
baselines providing speech enhancement, speaker diarization, and speech
recognition modules.}
}
@inproceedings{mandelEtAl2019b,
  author = {Lauren Mandel and Michael I. Mandel and Chris Streb},
  title = {Soundscape Ecology: How listening to the environment can shape design and planning},
  booktitle = {American Society for Landscape Architects Conference on Landscape Architecture},
  year = {2019},
  address = {San Diego, CA}
}
@inproceedings{NiAndMandel2019b,
  author = {Zhaoheng Ni and Michael I Mandel},
  title = {Onssen: an open-source speech separation and enhancement library},
  year = {2020},
  eprint = {1911.00982},
  eprinttype = {arxiv},
  abstract = {Speech separation is an essential task for multi-talker speech recognition. Recently, many deep learning approaches have been proposed and have constantly refreshed the state-of-the-art performance. The lack of publicly available implementations makes it difficult for researchers to compare algorithms on the same dataset. Building a generic platform benefits researchers by making it easy to implement novel separation algorithms and compare them with existing ones on customized datasets. We introduce "$onssen$": an open-source speech separation and enhancement library, mainly for deep learning separation and enhancement algorithms. It uses the LibRosa and NumPy libraries for feature extraction and PyTorch as the back-end for model training. $onssen$ supports most time-frequency mask-based separation algorithms (e.g. deep clustering, chimera net, chimera++, and so on) and also supports customized datasets. In this paper, we describe the functionality of the modules in $onssen$ and show that the algorithms implemented in $onssen$ achieve the same performance as reported in the original papers.},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  pages = {7269--7273},
  doi = {10.1109/ICASSP40776.2020.9054265}
}
@inproceedings{GroverEtAl2018,
  title = {Understanding Acoustic Cues Non-Native Speakers Use for Identifying English /v/-/w/ Using Bubble Noise Method},
  author = {Vikas Grover and Michael I Mandel and Valerie Shafer and Yusra Syed and Austin Twine},
  year = 2018,
  booktitle = {ASHA Convention},
  abstract = {Hindi speakers of English perceive the English /v/-/w/ contrast less accurately than English speakers (Grover et al., 2016). The specific acoustic information misperceived in /v/-/w/ contrast remains unclear. This study, using a novel method of “bubble” noise (Mandel et al., 2016), identifies the acoustic cues for perception of /v/-/w/ contrast in English and Hindi speakers of English.

Learner Outcome(s):
Describe the effects of first language phonology on second language phonology
Discuss a novel method (Bubble Noise) to identify specific acoustic cues
Explain the importance of targeted training studies for difficult non-native contrasts


Keywords: Speech perception, Bubble Noise, Non-native speakers, Hindi, Acoustic cues
},
  url = {https://plan.core-apps.com/asha2018/event/b70a14de3fdfa5add67a659fc86a3ef5}
}
@inproceedings{GhalyEtA2017,
  title = {Analyzing Human and Machine Performance In Resolving Ambiguous Spoken Sentences},
  author = {Hussein Ghaly and Michael I Mandel},
  year = {2017},
  booktitle = {1st Workshop on Speech-Centric Natural Language Processing (SCNLP)},
  url = {http://m.mr-pc.org/work/scnlp17.pdf},
  pages = {18--26}
}
@inproceedings{choi17,
  author = {Jiyoung Choi and Michael I Mandel},
  title = {Perception of Korean fricatives and affricates in 'bubble' noise by native and nonnative speakers},
  year = {2017},
  booktitle = {International Circle of Korean Linguistics}
}
@inproceedings{mandel15b,
  author = {Michael I Mandel and Nicoleta Roman},
  title = {Integrating Markov random fields and model-based expectation maximization source separation and localization},
  year = {2015},
  booktitle = {Acoustical Society of America Spring Meeting},
  location = {Pittsburgh, PA},
  slides = {http://m.mr-pc.org/work/asa15slides.pdf}
}
@inproceedings{mandel15a,
  author = {Michael I Mandel and Sarah E Yoho and Eric W Healy},
  title = {Listener consistency in identifying speech mixed with particular “bubble” noise instances},
  year = {2015},
  booktitle = {Acoustical Society of America Spring Meeting},
  location = {Pittsburgh, PA},
  poster = {http://m.mr-pc.org/work/asa15poster.pdf}
}
@inproceedings{mandel14d,
  author = {Michael I Mandel and Song Hui Chon},
  title = {Using auditory bubbles to determine spectro-temporal cues of timbre},
  year = {2014},
  booktitle = {Cognitively Based Music Informatics Research (CogMIR)},
  location = {Toronto, ON},
  slides = {http://m.mr-pc.org/work/cogmir14slides.pdf},
  abstract = {Listeners can reliably identify speech in noisy conditions, but it is not well understood which specific features of speech they use to do this.  This talk presents a data-driven framework for identifying these features.  By analyzing listening-test results involving the same speech utterance mixed with many different "bubble" noise instances, the framework is able to compute the importance of each time-frequency point in the utterance to its intelligibility, which we call the time-frequency importance function.  We show that listeners are self-consistent in their ability to identify the word in individual mixtures and are also fairly consistent with other listeners, and that different listeners' time-frequency importance functions are similar for the same utterance.  In addition, a predictive model trained under this framework is able to generalize to new conditions, successfully predicting the intelligibility of mixtures involving novel noise instances, novel utterances of the same word from the same and different talkers, and even to some extent novel consonants.  If there is time, I will also discuss a preliminary experiment applying this framework to the determination of the time-frequency points in a musical note that are most important to listeners for recognizing its timbre.
}
}
@inproceedings{nandi13,
  author = {Arnab Nandi and Michael I Mandel},
  title = {The Interactive Join: Recognizing Gestures for Database Queries},
  booktitle = {CHI Works-In-Progress},
  year = {2013},
  url = {http://m.mr-pc.org/work/chiwip13.pdf},
  poster = {http://m.mr-pc.org/work/chiwip13poster.pdf},
  abstract = {Direct, ad-hoc interaction with databases has typically
been performed over console-oriented conversational
interfaces using query languages such as SQL. With the
rise in popularity of gestural user interfaces and
computing devices that use gestures as their exclusive
mode of interaction, database query interfaces require a
fundamental rethinking to work without keyboards. Unlike
domain-specific applications, the scope of possible actions
is significantly larger if not infinite. Thus, the recognition
of gestures and their consequent queries is a challenge.
We present a novel gesture recognition system that uses
both the interaction and the state of the database to
classify gestural input into relational database queries.
Preliminary results show that using this approach allows
for fast, efficient and interactive gesture-based querying
over relational databases.}
}
@misc{mandel11a,
  abstract = {This paper describes two applications of conditional restricted Boltzmann
machines ({CRBMs}) to the task of autotagging music. The first consists of
training a {CRBM} to predict tags that a user would apply to a clip of a song
based on tags already applied by other users. By learning the relationships
between tags, this model is able to pre-process training data to significantly
improve the performance of a support vector machine ({SVM}) autotagging. The
second is the use of a discriminative {RBM}, a type of {CRBM}, to autotag music. By
simultaneously exploiting the relationships among tags and between tags and
audio-based features, this model is able to significantly outperform {SVMs},
logistic regression, and multi-layer perceptrons. In order to be applied to
this problem, the discriminative {RBM} was generalized to the multi-label setting
and four different learning algorithms for it were evaluated, the first such
in-depth analysis of which we are aware.},
  archiveprefix = {arXiv},
  author = {Mandel, Michael and Pascanu, Razvan and Larochelle, Hugo and Bengio, Yoshua},
  day = 15,
  eprint = {1103.2832},
  eprinttype = {arxiv},
  month = mar,
  title = {Autotagging music with conditional restricted Boltzmann machines},
  url = {http://arxiv.org/abs/1103.2832},
  year = 2011
}
@inproceedings{mandel06a,
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  title = {A Probability Model for Interaural Phase Difference},
  year = {2006},
  booktitle = { {ISCA} Workshop on Statistical and Perceptual Audio Processing },
  location = {Pittsburgh, PA},
  pages = {1--6},
  url = {http://m.mr-pc.org/work/sapa06.pdf},
  slides = {http://m.mr-pc.org/work/sapa06slides.pdf},
  http = {http://www.isca-speech.org/archive/sapa_2006/sap6_001.html},
  abstract = {In this paper, we derive a probability model for interaural phase differences at individual spectrogram points. Such a model can combine observations across arbitrary time and frequency regions in a structured way and does not make any assumptions about the characteristics of the sound sources. In experiments with speech from twenty speakers in simulated reverberant environments, this probabilistic method predicted the correct interaural delay of a signal more accurately than generalized cross-correlation methods.}
}
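
As a toy version of the interaural phase model in mandel06a, the sketch below scores candidate interaural delays by how well they explain the observed per-point interaural phase differences of a stereo STFT, using a Gaussian penalty on the wrapped phase residual. The fixed residual width sigma and the grid of candidate delays are assumptions; the paper's model is richer than this.

import numpy as np

def best_interaural_delay(stft_left, stft_right, sr, candidate_delays_s, sigma=1.0):
    """stft_left/right: (n_freq, n_frames) complex one-sided STFTs of the two ears.
    Returns the candidate delay (in seconds) with the highest total log-likelihood."""
    n_freq = stft_left.shape[0]
    freqs = np.linspace(0.0, sr / 2.0, n_freq)                  # Hz for each STFT bin
    ipd = np.angle(stft_left * np.conj(stft_right))             # observed interaural phase differences
    scores = []
    for tau in candidate_delays_s:
        predicted = 2.0 * np.pi * freqs[:, None] * tau          # phase shift implied by delay tau
        residual = np.angle(np.exp(1j * (ipd - predicted)))     # wrap residual to (-pi, pi]
        scores.append(np.sum(-0.5 * (residual / sigma) ** 2))   # Gaussian log-likelihood (up to a constant)
    return candidate_delays_s[int(np.argmax(scores))]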
@inproceedings{sudderth04,
  title = {Visual Hand Tracking Using Nonparametric Belief Propagation},
  author = {Erik B. Sudderth and Michael I. Mandel and William
               T. Freeman and Alan S. Willsky},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition Workshops},
  pages = {189--197},
  year = {2004},
  url = {http://m.mr-pc.org/work/gmbv04.pdf},
  doi = {10.1109/CVPR.2004.200},
  http = {http://ssg.mit.edu/nbp/},
  abstract = {This paper develops probabilistic methods for visual tracking of a three-dimensional geometric hand model from monocular image sequences. We consider a redundant representation in which each model component is described by its position and orientation in the world coordinate frame. A prior model is then defined which enforces the kinematic constraints implied by the model's joints. We show that this prior has a local structure, and is in fact a pairwise Markov random field. Furthermore, our redundant representation allows color and edge-based likelihood measures, such as the Chamfer distance, to be similarly decomposed in cases where there is no self-occlusion. Given this graphical model of hand kinematics, we may track the hand's motion using the recently proposed nonparametric belief propagation (NBP) algorithm. Like particle filters, NBP approximates the posterior distribution over hand configurations as a collection of samples. However, NBP uses the graphical structure to greatly reduce the dimensionality of these distributions, providing improved robustness. Several methods are used to improve NBP's computational efficiency, including a novel KD-tree based method for fast Chamfer distance evaluation. We provide simulations showing that NBP may be used to refine inaccurate model initializations, as well as track hand motion through extended image sequences.}
}