Book chapter

@incollection{bertin-mahieux09,
  author    = {Thierry Bertin-Mahieux and Douglas Eck and Michael I. Mandel},
  title     = {Automatic Tagging of Audio: The State-of-the-Art},
  booktitle = {Machine Audition: Principles, Algorithms and Systems},
  editor    = {Wenwu Wang},
  publisher = {IGI Publishing},
  year      = {2010},
  note      = {In press},
}

Journal

@article{mandel10a,
  title = {Evaluating source separation algorithms with reverberant speech},
  author = {Michael I. Mandel and Scott Bressler and Barbara Shinn-Cunningham and Daniel P. W. Ellis},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2010},
  url = {http://mr-pc.org/work/taslp10b.pdf},
  note = {In press},
  abstract = {This paper examines the performance of several source separation
systems on a speech separation task for which human intelligibility
has previously been measured.  For anechoic mixtures, automatic speech
recognition (ASR) performance on the separated signals is quite
similar to human performance.  In reverberation, however, while signal
separation has some benefit for ASR, the results are still far below
those of human listeners facing the same task.  Performing this same
experiment with a number of oracle masks created with \emph{a priori}
knowledge of the separated sources motivates a new objective measure
of separation performance, the DERTM (Direct-path, Early echo, and
Reverberation, of the Target and Masker), which is closely related to
the ASR results.  This measure indicates that while the non-oracle
algorithms successfully reject the direct-path signal from the masking
source, they reject less of its reverberation, explaining the
disappointing ASR performance.}
}
@article{mandel09a,
  title = {Model-based expectation maximization source separation and localization},
  author = {Michael I. Mandel and Ron J. Weiss and Daniel P. W. Ellis},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2010},
  month = feb,
  volume = {18},
  number = {2},
  pages = {382--394},
  url = {http://mr-pc.org/work/taslp10.pdf},
  doi = {10.1109/TASL.2009.2029711},
  abstract = {This paper describes a system, referred to as model-based expectation-maximization source separation and localization (MESSL), for separating and localizing multiple sound sources from an underdetermined reverberant two-channel recording. By clustering individual spectrogram points based on their interaural phase and level differences, MESSL generates masks that can be used to isolate individual sound sources. We first describe a probabilistic model of interaural parameters that can be evaluated at individual spectrogram points. By creating a mixture of these models over sources and delays, the multi-source localization problem is reduced to a collection of single source problems. We derive an expectation-maximization algorithm for computing the maximum-likelihood parameters of this mixture model, and show that these parameters correspond well with interaural parameters measured in isolation. As a byproduct of fitting this mixture model, the algorithm creates probabilistic spectrogram masks that can be used for source separation. In simulated anechoic and reverberant environments, separations using MESSL produced on average a signal-to-distortion ratio 1.6 dB greater and perceptual evaluation of speech quality (PESQ) results 0.27 mean opinion score units greater than four comparable algorithms.}
}
@article{mandel08b,
  author   = {Michael I. Mandel and Daniel P. W. Ellis},
  title    = {A Web-Based Game for Collecting Music Metadata},
  journal  = {Journal of New Music Research},
  volume   = {37},
  number   = {2},
  pages    = {151--165},
  year     = {2008},
  doi      = {10.1080/09298210802479300},
  url      = {http://mr-pc.org/work/jnmr08.pdf},
  abstract = {We have designed a web-based game, MajorMiner, that makes collecting descriptions of musical excerpts fun, easy, useful, and objective. Participants describe 10 second clips of songs and score points when their descriptions match those of other participants. The rules were designed to encourage players to be thorough and the clip length was chosen to make judgments objective and specific. To analyse the data, we measured the degree to which binary classifiers could be trained to spot popular tags. We also compared the performance of clip classifiers trained with MajorMiner's tag data to those trained with social tag data from a popular website. On the top 25 tags from each source, MajorMiner's tags were classified correctly 67.2\% of the time, while the social tags were classified correctly 62.6\% of the time.},
}
@article{huang08,
  author   = {Thomas S. Huang and Charlie K. Dagli and 
             Shyamsundar Rajaram and Edward Y. Chang and 
             Michael I. Mandel and Graham E. Poliner and Daniel P. W. Ellis},
  title    = {Active Learning for Interactive Multimedia Retrieval},
  journal  = {Proceedings of the IEEE},
  volume   = {96},
  number   = {4},
  pages    = {648--667},
  year     = {2008},
  doi      = {10.1109/JPROC.2008.916364},
  abstract = {As the first decade of the 21st century comes to a close, growth in multimedia delivery infrastructure and public demand for applications built on this backbone are converging like never before. The push towards reaching truly interactive multimedia technologies becomes stronger as our media consumption paradigms continue to change. In this paper, we profile a technology leading the way in this revolution: active learning. Active learning is a strategy that helps alleviate challenges inherent in multimedia information retrieval through user interaction. We show how active learning is ideally suited for the multimedia information retrieval problem by giving an overview of the paradigm and component technologies used with special attention given to the application scenarios in which these technologies are useful. Finally, we give insight into the future of this growing field and how it fits into the larger context of multimedia information retrieval.},
}
@article{mandel06b,
  author = {Michael I. Mandel and Graham E. Poliner and Daniel P. W. Ellis},
  title = {Support vector machine active learning for music retrieval},
  journal = {Multimedia Systems},
  pages = {1--11},
  year = {2006},
  month = aug,
  volume = {12},
  number = {1},
  url = {http://mr-pc.org/work/mmsj05.pdf},
  doi = {10.1007/s00530-006-0032-2},
  abstract = {Searching and organizing growing digital music collections requires a computational model of music similarity. This paper describes a system for performing flexible music similarity queries using SVM active learning. We evaluated the success of our system by classifying 1210 pop songs according to mood and style (from an online music guide) and by the performing artist. In comparing a number of representations for songs, we found the statistics of mel-frequency cepstral coefficients to perform best in precision-at-20 comparisons. We also show that by choosing training examples intelligently, active learning requires half as many labeled examples to achieve the same accuracy as a standard scheme.}
}

Conference

@inproceedings{mandel10b,
  title = {Learning tags that vary within a song},
  author = {Michael Mandel and Douglas Eck and Yoshua Bengio},
  year = {2010},
  month = aug,
  booktitle = {Proceedings of the 11th International Conference on
               Music Information Retrieval ({ISMIR})},
  note = {To appear},
  url = {http://mr-pc.org/work/ismir10.pdf},
  abstract = {This paper examines the relationship between human generated tags
  describing different parts of the same song.  These tags were
  collected using Amazon's Mechanical Turk service.  We find that the
  agreement between different people's tags decreases as the distance
  between the parts of a song that they heard increases.  To model
  these tags and these relationships, we describe a conditional
  restricted Boltzmann machine.  Using this model to fill in tags that
  should probably be present given a context of other tags, we train
  automatic tag classifiers (autotaggers) that
  outperform those trained on the original data.}
}
@inproceedings{bergstra10,
  title = {Scalable genre and tag prediction with spectral covariance},
  author = {James Bergstra and Michael Mandel and Douglas Eck},
  year = {2010},
  month = aug,
  booktitle = {Proceedings of the 11th International Conference on
               Music Information Retrieval ({ISMIR})},
  note = {To appear}
}
@inproceedings{mandel09b,
  title = {The ideal interaural parameter mask: a bound on binaural separation systems},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  year = {2009},
  month = oct,
  pages = {85--88},
  booktitle = {{IEEE} Workshop on Applications of Signal Processing to
               Audio and Acoustics ({WASPAA})},
  url = {http://mr-pc.org/work/waspaa09.pdf},
  poster = {http://mr-pc.org/work/waspaa09poster.pdf},
  doi = {10.1109/ASPAA.2009.5346506},
  abstract = {We introduce the Ideal Interaural Parameter Mask as an upper bound on the performance of mask-based source separation algorithms that are based on the differences between signals from two microphones or ears. With two additions to our Model-based EM Source Separation and Localization system, its performance approaches that of the IIPM upper bound to within 0.9 dB. These additions battle the effects of reverberation by absorbing reverberant energy and by forcing the ILD estimate to be larger than it might otherwise be. An oracle reliability measure was also added, in the hope that estimating parameters from more reliable regions of the spectrogram would improve separation, but it was not consistently useful.}
}
@inproceedings{devaney09,
  title = {Improving {MIDI}-audio alignment with acoustic features},
  author = {Johanna Devaney and Michael I. Mandel and Daniel P. W. Ellis},
  year = {2009},
  month = oct,
  pages = {45--48},
  booktitle = {{IEEE} Workshop on Applications of Signal Processing to
               Audio and Acoustics ({WASPAA})},
  url = {http://mr-pc.org/work/devaney_waspaa09.pdf},
  doi = {10.1109/ASPAA.2009.5346500},
  abstract = {This paper describes a technique to improve the accuracy of dynamic time warping-based MIDI-audio alignment. The technique implements a hidden Markov model that uses aperiodicity and power estimates from the signal as observations and the results of a dynamic time warping alignment as a prior. In addition to improving the overall alignment, this technique also identifies the transient and steady state sections of the note. This information is important for describing various aspects of a musical performance, including both pitch and rhythm.}
}
@inproceedings{law09,
  title = {Evaluation of algorithms using games: the case of music annotation},
  author = {Edith Law and Kris West and Michael Mandel and Mert Bay and J. Stephen Downie},
  year = {2009},
  month = oct,
  booktitle = {Proceedings of the 10th International Conference on
               Music Information Retrieval ({ISMIR})},
  pages = {387--392},
  url = {http://mr-pc.org/work/ismir09.pdf},
  abstract = {Search by keyword is an extremely popular method for retrieving music. To support this, novel algorithms that automatically tag music are being developed. The conventional way to evaluate audio tagging algorithms is to compute measures of agreement between the output and the ground truth set. In this work, we introduce a new method for evaluating audio tagging algorithms on a large scale by collecting set-level judgments from players of a human computation game called TagATune. We present the design and preliminary results of an experiment comparing five algorithms using this new evaluation metric, and contrast the results with those obtained by applying several conventional agreement-based evaluation metrics.}
}
@inproceedings{weiss08,
  title = {Source separation based on binaural cues and source
               model constraints},
  author = {Ron J. Weiss and Michael I. Mandel and Daniel P. W. Ellis},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2008},
  pages = {419--422},
  location = {Brisbane, Australia},
  url = {http://mr-pc.org/work/interspeech08.pdf},
  http = {http://www.isca-speech.org/archive/interspeech_2008/i08_0419.html},
  abstract = {We describe a system for separating multiple sources from a two-channel recording based on interaural cues and known characteristics of the source signals. We combine a probabilistic model of the observed interaural level and phase differences with a prior model of the source statistics and derive an EM algorithm for finding the maximum likelihood parameters of the joint model. The system is able to separate more sound sources than there are observed channels. In simulated reverberant mixtures of three speakers the proposed algorithm gives a signal-to-noise ratio improvement of 2.1 dB over a baseline algorithm using only interaural cues.}
}
@inproceedings{mandel08a,
  title = {Multiple-instance learning for music information retrieval},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  booktitle = {Proceedings of the 9th International Conference on
               Music Information Retrieval ({ISMIR})},
  month = sep,
  year = {2008},
  pages = {577--582},
  url = {http://mr-pc.org/work/ismir08.pdf},
  poster = {http://mr-pc.org/work/ismir08poster.pdf},
  abstract = {Multiple-instance learning algorithms train classifiers from lightly supervised data, i.e. labeled collections of items, rather than labeled items.  We compare the multiple-instance learners mi-SVM and MILES on the task of classifying 10-second song clips.  These classifiers are trained on tags at the track, album, and artist levels, or granularities, that have been derived from tags at the clip granularity, allowing us to test the effectiveness of the learners at recovering the clip labeling in the training set and predicting the clip labeling for a held-out test set.  We find that mi-SVM is better than a control at the recovery task on training clips, with an average classification accuracy as high as 87\% over 43 tags; on test clips, it is comparable to the control with an average classification accuracy of up to 68\%.  MILES performed adequately on the recovery task, but poorly on the test clips.}
}
@inproceedings{ellis08,
  title = {Cross-correlation of beat-synchronous representations
               for music similarity},
  author = {Daniel P. W. Ellis and Courtenay V. Cotton
               and Michael I. Mandel},
  booktitle = {Proceedings of the {IEEE} International Conference
               on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages = {57--60},
  month = apr,
  year = {2008},
  location = {Las Vegas, NV},
  url = {http://mr-pc.org/work/icassp08.pdf},
  doi = {10.1109/ICASSP.2008.4517545},
  abstract = {Systems to predict human judgments of music similarity directly from the audio have generally been based on the global statistics of spectral feature vectors i.e. collapsing any large-scale temporal structure in the data. Based on our work in identifying alternative ("cover") versions of pieces, we investigate using direct correlation of beat-synchronous representations of music audio to find segments that are similar not only in feature statistics, but in the relative positioning of those features in tempo-normalized time. Given a large enough search database, good matches by this metric should have very high perceived similarity to query items. We evaluate our system through a listening test in which subjects rated system-generated matches as similar or not similar, and compared results to a more conventional timbral and rhythmic similarity baseline, and to random selections.}
}
@inproceedings{mandel07c,
  title = {{EM} localization and separation using interaural level and
               phase cues},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  booktitle = {{IEEE} Workshop on Applications of Signal Processing to
               Audio and Acoustics ({WASPAA})},
  pages = {275--278},
  month = oct,
  year = {2007},
  url = {http://mr-pc.org/work/waspaa07.pdf},
  poster = {http://mr-pc.org/work/waspaa07poster.pdf},
  doi = {10.1109/ASPAA.2007.4392987},
  abstract = {We describe a system for localizing and separating multiple sound sources from a reverberant two-channel recording. It consists of a probabilistic model of interaural level and phase differences and an EM algorithm for finding the maximum likelihood parameters of this model. By assigning points in the interaural spectrogram probabilistically to sources with the best-fitting parameters and then estimating the parameters of the sources from the points assigned to them, the system is able to separate and localize more sound sources than there are available channels. It is also able to estimate frequency-dependent level differences of sources in a mixture that correspond well to those measured in isolation. In experiments in simulated anechoic and reverberant environments, the proposed system improved the signal-to-noise ratio of target sources by 2.7 and 3.4dB more than two comparable algorithms on average.}
}
@inproceedings{mandel07b,
  title = {A web-based game for collecting music metadata},
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  pages = {365--366},
  booktitle = {Proceedings of the 8th International Conference on
               Music Information Retrieval ({ISMIR})},
  editor = {Simon Dixon and David Bainbridge and Rainer Typke},
  location = {Vienna, Austria},
  month = sep,
  year = {2007},
  url = {http://mr-pc.org/work/ismir07.pdf},
  poster = {http://mr-pc.org/work/ismir07poster.pdf},
  abstract = {We have designed a web-based game to make collecting descriptions of musical excerpts fun, easy, useful, and objective.  Participants describe 10 second clips of songs and score points when their descriptions match those of other participants.  The rules were designed to encourage users to be thorough and the clip length was chosen to make judgments more objective and specific.  Analysis of preliminary data shows that we are able to collect objective and specific descriptions of clips and that players tend to agree with one another.}
}
@incollection{mandel07a,
  title = {An {EM} Algorithm for Localizing Multiple Sound Sources in
               Reverberant Environments},
  author = {Michael I. Mandel and Daniel P. W. Ellis and Tony Jebara},
  booktitle = {Advances in Neural Information Processing Systems 19},
  editor = {B. Sch{\"o}lkopf and J. Platt and T. Hoffman},
  publisher = {{MIT} Press},
  address = {Cambridge, MA},
  pages = {953--960},
  year = {2007},
  url = {http://mr-pc.org/work/nips06.pdf},
  poster = {http://mr-pc.org/work/nips06poster.pdf},
  abstract = {We present a method for localizing and separating sound sources in stereo recordings that is robust to reverberation and does not make any assumptions about the source statistics.  The method consists of a probabilistic model of binaural multi-source recordings and an expectation maximization algorithm for finding the maximum likelihood parameters of that model.  These parameters include distributions over delays and assignments of time-frequency regions to sources.  We evaluate this method against two comparable algorithms on simulations of simultaneous speech from two or three sources.  Our method outperforms the others in anechoic conditions and performs as well as the better of the two in the presence of reverberation.}
}
@inproceedings{mandel05,
  author = {Michael I. Mandel and Daniel P. W. Ellis},
  title = {Song-Level Features and Support Vector Machines for Music
               Classification},
  pages = {594--599},
  booktitle = {Proceedings of the 6th International Conference on
               Music Information Retrieval ({ISMIR})},
  editor = {Joshua D. Reiss and Geraint A. Wiggins},
  location = {London, UK},
  month = sep,
  year = {2005},
  url = {http://mr-pc.org/work/ismir05.pdf},
  poster = {http://mr-pc.org/work/ismir05poster.pdf},
  abstract = {Searching and organizing growing digital music collections requires automatic classification of music.  This paper describes a new system, tested on the task of artist identification, that uses support vector machines to classify songs based on features calculated over their entire lengths.  Since support vector machines are exemplar-based classifiers, training on and classifying entire songs instead of short-time features makes intuitive sense.  On a dataset of 1200 pop songs performed by 18 artists, we show that this classifier outperforms similar classifiers that use only SVMs or song-level features.  We also show that the KL divergence between single Gaussians and Mahalanobis distance between MFCC statistics vectors perform comparably when classifiers are trained and tested on separate albums, but KL divergence outperforms Mahalanobis distance when trained and tested on songs from the same albums.}
}
@incollection{sudderth05,
  author = {Erik B. Sudderth and Michael I. Mandel and William
               T. Freeman and Alan S. Willsky},
  title = {Distributed Occlusion Reasoning for Tracking with
               Nonparametric Belief Propagation},
  booktitle = {Advances in Neural Information Processing Systems 17},
  editor = {Lawrence K. Saul and Yair Weiss and L{\'e}on Bottou},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {1369--1376},
  year = {2005},
  url = {http://mr-pc.org/work/nips04.pdf},
  http = {http://ssg.mit.edu/nbp/},
  abstract = {We describe a three-dimensional geometric hand model suitable for visual tracking applications. The kinematic constraints implied by the model's joints have a probabilistic structure which is well described by a graphical model. Inference in this model is complicated by the hand's many degrees of freedom, as well as multimodal likelihoods caused by ambiguous image measurements. We use nonparametric belief propagation (NBP) to develop a tracking algorithm which exploits the graph's structure to control complexity, while avoiding costly discretization. While kinematic constraints naturally have a local structure, self-occlusions created by the imaging process lead to complex interdependencies in color and edge-based likelihood functions. However, we show that local structure may be recovered by introducing binary hidden variables describing the occlusion state of each pixel. We augment the NBP algorithm to infer these occlusion variables in a distributed fashion, and then analytically marginalize over them to produce hand position estimates which properly account for occlusion events. We provide simulations showing that NBP may be used to refine inaccurate model initializations, as well as track hand motion through extended image sequences.}
}

Workshop

@inproceedings{mandel06a,
  author    = {Michael I. Mandel and Daniel P. W. Ellis},
  title     = {A Probability Model for Interaural Phase Difference},
  booktitle = {{ISCA} Workshop on Statistical and Perceptual Audio
               Processing ({SAPA})},
  pages     = {1--6},
  year      = {2006},
  location  = {Pittsburgh, PA},
  url       = {http://mr-pc.org/work/sapa06.pdf},
  http      = {http://www.isca-speech.org/archive/sapa_2006/sap6_001.html},
  slides    = {http://mr-pc.org/work/sapa06slides.pdf},
  abstract  = {In this paper, we derive a probability model for interaural phase differences at individual spectrogram points. Such a model can combine observations across arbitrary time and frequency regions in a structured way and does not make any assumptions about the characteristics of the sound sources. In experiments with speech from twenty speakers in simulated reverberant environments, this probabilistic method predicted the correct interaural delay of a signal more accurately than generalized cross-correlation methods.},
}
@inproceedings{sudderth04,
  author    = {Erik B. Sudderth and Michael I. Mandel and William
               T. Freeman and Alan S. Willsky},
  title     = {Visual Hand Tracking Using Nonparametric Belief Propagation},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition Workshops},
  pages     = {189--197},
  year      = {2004},
  doi       = {10.1109/CVPR.2004.200},
  url       = {http://mr-pc.org/work/gmbv04.pdf},
  http      = {http://ssg.mit.edu/nbp/},
  abstract  = {This paper develops probabilistic methods for visual tracking of a three-dimensional geometric hand model from monocular image sequences. We consider a redundant representation in which each model component is described by its position and orientation in the world coordinate frame. A prior model is then defined which enforces the kinematic constraints implied by the model's joints. We show that this prior has a local structure, and is in fact a pairwise Markov random field. Furthermore, our redundant representation allows color and edge-based likelihood measures, such as the Chamfer distance, to be similarly decomposed in cases where there is no self-occlusion. Given this graphical model of hand kinematics, we may track the hand's motion using the recently proposed nonparametric belief propagation (NBP) algorithm. Like particle filters, NBP approximates the posterior distribution over hand configurations as a collection of samples. However, NBP uses the graphical structure to greatly reduce the dimensionality of these distributions, providing improved robustness. Several methods are used to improve NBP's computational efficiency, including a novel KD-tree based method for fast Chamfer distance evaluation. We provide simulations showing that NBP may be used to refine inaccurate model initializations, as well as track hand motion through extended image sequences.},
}
mr-pc.org updated
Copyright © 2004-9 Michael I Mandel