Automatically generated by Mendeley Desktop 1.16.3
Any changes to this file will be lost if it is regenerated by Mendeley.
BibTeX export options can be customized via Preferences -> BibTeX in Mendeley Desktop
@article{Donoho2009,
author = {Donoho, David L. and Maleki, Arian and Rahman, Inam Ur and Shahram, Morteza and Stodden, Victoria},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Donoho et al. - 2009 - Reproducible Research in Computational Harmonic Analysis.pdf:pdf},
issn = {1521-9615},
journal = {Computing in Science {\&} Engineering},
number = {1},
pages = {8--18},
title = {{Reproducible Research in Computational Harmonic Analysis}},
volume = {11},
year = {2009}
}
@article{Markatou2005,
author = {Markatou, Marianthi and Tian, Hong and Biswas, Shameek and Hripcsak, George},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Markatou, Tian - 2005 - Analysis of variance of cross-validation estimators of the generalization error.pdf:pdf},
isbn = {1532-4435},
issn = {1532-4435},
journal = {Journal of Machine Learning Research},
keywords = {cross-validation,generalization error,moment approximation,prediction,variance},
pages = {1127--1168},
title = {{Analysis of variance of cross-validation estimators of the generalization error}},
url = {http://machinelearning.wustl.edu/mlpapers/paper{\_}files/MarkatouTBH05.pdf},
volume = {6},
year = {2005}
}
@inproceedings{Dasgupta2002,
author = {Dasgupta, Sanjoy and Littman, Michael L. and McAllester, David},
booktitle = {Advances in Neural Information Processing Systems},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Dasgupta, Littman, McAlles - 2002 - PAC generalization bounds for co-training.pdf:pdf},
pages = {375--382},
title = {{PAC generalization bounds for co-training}},
url = {http://books.google.com/books?hl=en{\&}lr={\&}id=PGrlRWV5-v0C{\&}oi=fnd{\&}pg=PA375{\&}dq=PAC+Generalization+Bounds+for+Co-training{\&}ots=auaN1CGPip{\&}sig=0dID1oXJYgeENxwSzfsntvwz{\_}oU},
year = {2002}
}
@article{Ireland1968a,
author = {Ireland, C.T. and Kullback, S.},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Ireland, Kullback - 1968 - Contingence tables with given marginals.pdf:pdf},
journal = {Biometrika},
number = {1},
pages = {179--188},
title = {{Contingency tables with given marginals}},
volume = {55},
year = {1968}
}
@article{Bartlett2007,
abstract = {One of the nice properties of kernel classifiers such as SVMs is that they often produce sparse solutions. However, the decision functions of these classifiers cannot always be used to estimate the conditional probability of the class label. We investigate the relationship between these two properties and show that these are intimately related: sparseness does not occur when the conditional probabilities can be unambiguously estimated. We consider a family of convex loss functions and derive sharp asymptotic results for the fraction of data that becomes support vectors. This enables us to characterize the exact trade-off between sparseness and the ability to estimate conditional probabilities for these loss functions.},
author = {Bartlett, Peter L. and Tewari, Ambuj},
doi = {10.1007/978-3-540-27819-1_39},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Bartlett et al. - 2007 - Sparseness vs Estimating Conditional Probabilities Some Asymptotic Results.pdf:pdf},
isbn = {1532-4435},
issn = {15324435},
journal = {Journal of Machine Learning Research},
keywords = {calibration,estimating conditional proba-,kernel methods,sparseness,support vector machines},
pages = {775--790},
title = {{Sparseness vs Estimating Conditional Probabilities: Some Asymptotic Results}},
volume = {8},
year = {2007}
}
@article{Smola2005,
abstract = {We present methods for dealing with missing variables in the context
of Gaussian Processes and Support Vector Machines. This solves an
important problem which has largely been ignored by kernel methods:
How to systematically deal with incomplete data? Our method can also
be applied to problems with partially observed labels as well as to
the transductive setting where we view the labels as missing data.
Our approach relies on casting kernel methods as an estimation
problem in exponential families. Hence, estimation with missing
variables becomes a problem of computing marginal distributions, and
finding efficient optimization methods. To that extent we propose an
optimization scheme which extends the Concave Convex Procedure (CCP)
of Yuille and Rangarajan, and present a simplified and intuitive
proof of its convergence. We show how our algorithm can be
specialized to various cases in order to efficiently solve the
optimization problems that arise. Encouraging preliminary
experimental results on the USPS dataset are also presented.},
author = {Smola, Alex and Vishwanathan, S V N and Hofmann, Thomas},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Smola, Vishwanathan, Hoffman - 2005 - Kernel Methods for Missing Variables.pdf:pdf},
isbn = {097273581X},
keywords = {Learning/Statistics {\&} Optimisation,Theory {\&} Algorithms},
title = {{Kernel Methods for Missing Variables}},
url = {http://eprints.pascal-network.org/archive/00002053/},
year = {2005}
}
@article{Michie1994,
author = {Michie, D and Spiegelhalter, D J and Taylor, C C},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Michie, Spiegelhalter, Taylor - 1994 - Statlog.pdf:pdf},
title = {{Statlog}},
year = {1994}
}
@article{Ho2002,
author = {Ho, Tin Kam and Basu, Mitra},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Ho, Basu - 2002 - Complexity Measures of Supervised Classification Problems.pdf:pdf},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
number = {3},
pages = {289--300},
title = {{Complexity Measures of Supervised Classification Problems}},
volume = {24},
year = {2002}
}
@article{Rothschild2009,
abstract = {Using the 2008 elections, I explore the accuracy and informational content of forecasts derived from two different types of data: polls and prediction markets. Both types of data suffer from inherent biases, and this is the first analysis to compare the accuracy of these forecasts adjusting for these biases. Moreover, the analysis expands on previous research by evaluating state-level forecasts in Presidential and Senatorial races, rather than just the national popular vote. Utilizing several different estimation strategies, I demonstrate that early in the cycle and in not-certain races debiased prediction market-based forecasts provide more accurate probabilities of victory and more information than debiased poll-based forecasts. These results are significant because accurately documenting the underlying probabilities, at any given day before the election, is critical for enabling academics to determine the impact of shocks to the campaign, for the public to invest wisely and for practitioners to spend efficiently. Starting in the 2008 Presidential campaign, Nate Silver's FiveThirtyEight.com revolutionized election forecasting for the general public. Until his website was launched in March of 2008, those interested in predicting election outcomes typically reviewed national polling results that asked a representative cross-section of voters who they would vote for if the election were held that day. Yet, these raw poll numbers are volatile, subject to random sampling error on either side of the true underlying value.},
author = {Rothschild, David},
doi = {10.1093/poq/nfp082},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Rothschild - 2009 - Forecasting Elections Comparing prediction markets, polls, and their biases.pdf:pdf},
isbn = {0033-362X},
issn = {0033362X},
journal = {Public Opinion Quarterly},
number = {5},
pages = {895--916},
title = {{Forecasting Elections: Comparing prediction markets, polls, and their biases}},
volume = {73},
year = {2009}
}
@article{Welinder2013,
author = {Welinder, Peter and Welling, Max and Perona, Pietro},
doi = {10.1109/CVPR.2013.419},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Welinder, Welling, Perona - 2013 - A Lazy Man's Approach to Benchmarking Semisupervised Classifier Evaluation and Recalibration.pdf:pdf},
isbn = {978-0-7695-4989-7},
journal = {2013 IEEE Conference on Computer Vision and Pattern Recognition},
month = {jun},
pages = {3262--3269},
publisher = {IEEE},
title = {{A Lazy Man's Approach to Benchmarking: Semisupervised Classifier Evaluation and Recalibration}},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=6619263},
year = {2013}
}
@article{Heitjan1994,
author = {Heitjan, Daniel F and Landis, J Richard},
doi = {10.2307/2290900},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Heitjan, Landis - 1994 - Assessing Secular Trends in Blood Pressure {\{}A{\}} Multiple-imputation Approach.pdf:pdf},
issn = {01621459},
journal = {Journal of the American Statistical Association},
keywords = {bayesian bootstrap,hot deck,incomplete data,missing data,observational study,predictive-mean matching},
pages = {750--759},
title = {{Assessing Secular Trends in Blood Pressure: {A} Multiple-imputation Approach}},
volume = {89},
year = {1994}
}
@article{Castelli1996,
author = {Castelli, Vittorio and Cover, Thomas M.},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Castelli, Cover - 1996 - The Relative Value of Labeled and Unlabeled Samples in Pattern Recognition with an Unknown Mixing Parameter.pdf:pdf},
journal = {IEEE Transactions on Information Theory},
number = {6},
pages = {2102--2117},
title = {{The Relative Value of Labeled and Unlabeled Samples in Pattern Recognition with an Unknown Mixing Parameter}},
volume = {42},
year = {1996}
}
@article{Wickenberg-Bolin2006,
abstract = {Supervised learning for classification of cancer employs a set of design examples to learn how to discriminate between tumors. In practice it is crucial to confirm that the classifier is robust with good generalization performance to new examples, or at least that it performs better than random guessing. A suggested alternative is to obtain a confidence interval of the error rate using repeated design and test sets selected from available examples. However, it is known that even in the ideal situation of repeated designs and tests with completely novel samples in each cycle, a small test set size leads to a large bias in the estimate of the true variance between design sets. Therefore different methods for small sample performance estimation such as a recently proposed procedure called Repeated Random Sampling (RSS) is also expected to result in heavily biased estimates, which in turn translates into biased confidence intervals. Here we explore such biases and develop a refined algorithm called Repeated Independent Design and Test (RIDT).},
author = {Wickenberg-Bolin, Ulrika and G{\"{o}}ransson, Hanna and Frykn{\"{a}}s, M{\aa}rten and Gustafsson, Mats G and Isaksson, Anders},
doi = {10.1186/1471-2105-7-127},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Wickenberg-Bolin et al. - 2006 - Improved variance estimation of classification performance via reduction of bias caused by small sample.pdf:pdf},
issn = {1471-2105},
journal = {BMC bioinformatics},
keywords = {Analysis of Variance,Artificial Intelligence,Bias (Epidemiology),Diagnosis, Computer-Assisted,Diagnosis, Computer-Assisted: methods,Gene Expression Profiling,Gene Expression Profiling: methods,Humans,Models, Biological,Models, Statistical,Neoplasm Proteins,Neoplasm Proteins: analysis,Neoplasms,Neoplasms: diagnosis,Neoplasms: metabolism,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Pattern Recognition, Automated,Pattern Recognition, Automated: methods,Reproducibility of Results,Sample Size,Sensitivity and Specificity,Tumor Markers, Biological,Tumor Markers, Biological: analysis},
month = {jan},
pages = {127},
pmid = {16533392},
title = {{Improved variance estimation of classification performance via reduction of bias caused by small sample size.}},
url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=1435937{\&}tool=pmcentrez{\&}rendertype=abstract},
volume = {7},
year = {2006}
}
@article{Grunwald2007a,
abstract = {We show that forms of Bayesian and MDL inference that are often applied to classification problems can be {\em inconsistent}. This means that there exists a learning problem such that for all amounts of data the generalization errors of the MDL classifier and the Bayes classifier relative to the Bayesian posterior both remain bounded away from the smallest achievable generalization error. We extensively discuss the result from both a Bayesian and an MDL perspective.},
archivePrefix = {arXiv},
arxivId = {math/0406221},
author = {Gr{\"{u}}nwald, Peter and Langford, John},
doi = {10.1007/s10994-007-0716-7},
eprint = {0406221},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Gr{\"{u}}nwald, Langford - 2007 - Suboptimal behavior of Bayes and MDL in classification under misspecification.pdf:pdf},
issn = {08856125},
journal = {Machine Learning},
keywords = {Bayesian statistics,Classification,Consistency,Inconsistency,Minimum description length,Misspecification},
number = {2-3},
pages = {119--149},
primaryClass = {math},
title = {{Suboptimal behavior of Bayes and MDL in classification under misspecification}},
volume = {66},
year = {2007}
}
@article{Fourure,
author = {Fourure, Damien and Fromont, Elisa and Muselet, Damien and Tr{\'{e}}meau, Alain and Wolf, Christian},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Fourure et al. - Unknown - Semantic Segmentation via Multi-task , Multi-domain Learning.pdf:pdf},
keywords = {convolutional neural networks,deep learning,domain adaptation,multi-task learning,segmentation,semantic},
title = {{Semantic Segmentation via Multi-task, Multi-domain Learning}}
}
@unpublished{Bresson2012,
archivePrefix = {arXiv},
arxivId = {arXiv:1210.0699v1},
author = {Bresson, Xavier and Zhang, Ruiliang},
booktitle = {arXiv preprint},
eprint = {arXiv:1210.0699v1},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Bresson, Zhang - 2012 - TV-SVM Total Variation Support Vector Machine for Semi-Supervised Data Classification.pdf:pdf},
title = {{TV-SVM: Total Variation Support Vector Machine for Semi-Supervised Data Classification}},
url = {http://arxiv.org/abs/1210.0699},
year = {2012}
}
@article{Wang2009b,
author = {Wang, Junhui and Shen, Xiaotong and Pan, Wei},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Wang, Shen, Pan - 2009 - On efficient large margin semisupervised learning Method and theory.pdf:pdf},
journal = {The Journal of Machine Learning Research},
keywords = {classification,difference convex programming,nonconvex minimization,regulariza-,support vectors,tion},
pages = {719--742},
title = {{On efficient large margin semisupervised learning: Method and theory}},
url = {http://dl.acm.org/citation.cfm?id=1577094},
volume = {10},
year = {2009}
}
@article{Gelman2013b,
abstract = {The missionary zeal of many Bayesians of old has been matched, in the other direction, by an attitude among some theoreticians that Bayesian methods were absurd—not merely misguided but obviously wrong in principle. We consider several examples, beginning with Feller's classic text on probability theory and continuing with more recent cases such as the perceived Bayesian nature of the so-called doomsday argument. We analyze in this note the intellectual background behind various misconceptions about Bayesian statistics, without aiming at a complete historical coverage of the reasons for this dismissal.},
archivePrefix = {arXiv},
arxivId = {arXiv:1006.5366v5},
author = {Gelman, Andrew and Robert, Christian P.},
doi = {10.1080/00031305.2013.760987},
eprint = {arXiv:1006.5366v5},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Gelman, Robert - 2013 - “Not Only Defended But Also Applied” The Perceived Absurdity of Bayesian Inference.pdf:pdf},
isbn = {0003-1305},
issn = {0003-1305},
journal = {The American Statistician},
keywords = {bayesian,bogosity,doomsdsay argument,foundations,frequentist,laplace law of succession},
number = {1},
pages = {1--5},
title = {{“Not Only Defended But Also Applied”: The Perceived Absurdity of Bayesian Inference}},
url = {http://basepub.dauphine.fr/handle/123456789/11069$\backslash$nhttp://www.tandfonline.com/doi/abs/10.1080/00031305.2013.760987},
volume = {67},
year = {2013}
}
@article{Caticha2011,
author = {Caticha, Ariel},
doi = {10.1063/1.3573619},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Caticha et al. - 2011 - Entropic Inference.pdf:pdf},
isbn = {9780735408609},
keywords = {bayes rule,entropy,information,maximum entropy},
number = {1},
pages = {20--29},
title = {{Entropic Inference}},
url = {http://link.aip.org/link/APCPCS/v1305/i1/p20/s1{\&}Agg=doi},
volume = {1305},
year = {2011}
}
@article{Dwork2015,
author = {Dwork, Cynthia and Feldman, Vitaly and Hardt, Moritz and Pitassi, Toniann and Reingold, Omer and Roth, Aaron},
doi = {10.1126/science.aaa9375},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Dwork et al. - 2015 - The reusable holdout Preserving validity in adaptive data analysis(2).pdf:pdf;:Users/jkrijthe/Documents/Mendeley Desktop/Dwork et al. - 2015 - The reusable holdout Preserving validity in adaptive data analysis.pdf:pdf},
issn = {0036-8075},
journal = {Science},
number = {6248},
pages = {636--638},
title = {{The reusable holdout: Preserving validity in adaptive data analysis}},
url = {http://www.sciencemag.org/cgi/doi/10.1126/science.aaa9375},
volume = {349},
year = {2015}
}
@article{Gelman2013e,
abstract = {Researcher degrees of freedom can lead to a multiple comparisons problem, even in settings where researchers perform only a single analysis on their data. The problem is there can be a large number of potential comparisons when the details of data analysis are highly contingent on data, without the researcher having to perform any conscious procedure of fishing or examining multiple p-values. We discuss in the context of several examples of published papers where data-analysis decisions were theoretically-motivated based on previous literature, but where the details of data selection and analysis were not pre-specified and, as a result, were contingent on data. 1.},
author = {Gelman, Andrew and Loken, Eric},
doi = {10.1037/a0037714},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Gelman, Loken - 2013 - The garden of forking paths Why multiple comparisons can be a problem, even when there is no “fishing exp.pdf:pdf},
issn = {1939-1455},
pages = {1--17},
pmid = {25180805},
title = {{The garden of forking paths: Why multiple comparisons can be a problem, even when there is no “fishing expedition” or “p-hacking” and the research hypothesis was posited ahead of time}},
url = {http://www.stat.columbia.edu/{~}gelman/research/unpublished/p{\_}hacking.pdf},
year = {2013}
}
@book{Shalev-Shwartz2014,
author = {Shalev-Shwartz, Shai and Ben-David, Shai},
publisher = {Cambridge University Press},
title = {{Understanding Machine Learning: From Theory to Algorithms}},
year = {2014}
}
@techreport{Minka2005,
abstract = {This paper presents a unifying view of message-passing algorithms, as methods to approximate a complex Bayesian network by a simpler network with minimum information divergence. In this view, the difference between mean-field methods and belief propagation is not the amount of structure they model, but only the measure of loss they minimize (`exclusive' versus `inclusive' Kullback-Leibler divergence). In each case, message-passing arises by minimizing a localized version of the divergence, local to each factor. By examining these divergence measures, we can intuit the types of solution they prefer (symmetry-breaking, for example) and their suitability for different tasks. Furthermore, by considering a wider variety of divergence measures (such as alpha-divergences), we can achieve different complexity and performance goals.},
author = {Minka, Thomas},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Minka - 2005 - Divergence measures and message passing.pdf:pdf},
institution = {Microsoft Research},
number = {MSR-TR-2005-173},
title = {{Divergence measures and message passing}},
year = {2005}
}
@inproceedings{Krijthe2016rssl,
author = {Krijthe, Jesse Hendrik},
booktitle = {Workshop on Reproducible Research in Pattern Recognition (Lecture Notes in Computer Science) (To Appear)},
title = {{RSSL: R package for Semi-supervised Learning}},
year = {2016}
}
@article{Lattimore2011,
archivePrefix = {arXiv},
arxivId = {1111.3846},
author = {Lattimore, Tor and Hutter, Marcus},
eprint = {1111.3846},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Lattimore, Hutter - 2011 - No Free Lunch versus Occam's Razor in Supervised Learning.pdf:pdf},
journal = {arXiv preprint},
keywords = {kolmogorov complexity,no free lunch,occam,s razor,supervised learning},
title = {{No Free Lunch versus Occam's Razor in Supervised Learning}},
url = {http://arxiv.org/abs/1111.3846},
year = {2011}
}
@article{Reitmaier2015,
archivePrefix = {arXiv},
arxivId = {arXiv:1502.04033v2},
author = {Reitmaier, Tobias and Sick, Bernhard},
eprint = {arXiv:1502.04033v2},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Reitmaier, Sick - 2015 - The Responsibility Weighted Mahalanobis Kernel for Semi-Supervised Training of Support Vector Machines for Clas.pdf:pdf},
journal = {Information Sciences},
keywords = {kernel function,pattern classification,responsibility weighted mahalanobis kernel,semi-supervised learning,support vector machine},
pages = {179--198},
title = {{The Responsibility Weighted Mahalanobis Kernel for Semi-Supervised Training of Support Vector Machines for Classification}},
volume = {323},
year = {2015}
}
@inproceedings{Foulds2011,
author = {Foulds, James and Smyth, Padhraic},
booktitle = {SIAM International Conference on Data Mining},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Foulds, Smyth - 2011 - Multi-instance mixture models and semi-supervised learning.pdf:pdf},
title = {{Multi-instance mixture models and semi-supervised learning}},
url = {http://siam.omnibooksonline.com/2011datamining/data/papers/256.pdf},
year = {2011}
}
@inproceedings{Moutafis2014,
author = {Moutafis, Panagiotis and Kakadiaris, Ioannis A},
booktitle = {Pacific-Asia Conference on Knowledge Discovery and Data Mining},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Moutafis, Kakadiaris - 2014 - GS4 Generating synthetic samples for semi-supervised nearest neighbor classification.pdf:pdf},
isbn = {9783319131856},
issn = {16113349},
keywords = {Classification,K-nearest neighbor,Semi-supervised learning,Synthetic samples},
number = {13},
pages = {393--403},
title = {{GS4: Generating synthetic samples for semi-supervised nearest neighbor classification}},
volume = {8643},
year = {2014}
}
@article{Adams,
archivePrefix = {arXiv},
arxivId = {arXiv:1504.01344v1},
author = {Adams, Ryan P},
eprint = {arXiv:1504.01344v1},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Adams - Unknown - Early Stopping is Nonparametric Variational Inference.pdf:pdf},
title = {{Early Stopping is Nonparametric Variational Inference}}
}
@inproceedings{VanOmmen2014,
archivePrefix = {arXiv},
arxivId = {arXiv:1406.6200v1},
author = {van Ommen, Thijs},
booktitle = {Proceedings of the 30th Conference Annual Conference on Uncertainty in Artificial Intelligence},
eprint = {arXiv:1406.6200v1},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/van Ommen - 2014 - Combining predictions from linear models when training and test inputs differ.pdf:pdf},
pages = {653--662},
title = {{Combining predictions from linear models when training and test inputs differ}},
year = {2014}
}
@book{RamonyCajal1897a,
abstract = {Santiago Ramon y Cajal was a mythic figure in science. Hailed as the father of modern anatomy and neurobiology, he was largely responsible for the modern conception of the brain. His groundbreaking works were New Ideas on the Structure of the Nervous System and Histology of the Nervous System in Man and Vertebrates. In addition to leaving a legacy of unparalleled scientific research, Cajal sought to educate the novice scientist about how science was done and how he thought it should be done. This recently rediscovered classic, first published in 1897, is an anecdotal guide for the perplexed new investigator as well as a refreshing resource for the old pro. Cajal was a pragmatist, aware of the pitfalls of being too idealistic -- and he had a sense of humor, particularly evident in his diagnoses of various stereotypes of eccentric scientists. The book covers everything from valuable personality traits for an investigator to social factors conducive to scientific work.},
author = {{Ram{\'{o}}n y Cajal}, Santiago},
booktitle = {Advice for a Young Investigator},
doi = {10.1016/S0166-2236(00)01546-0},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Ram{\'{o}}n y Cajal - 1897 - Advice for a Young Investigator (translated by Neely Swanson and Larry W. Swanson).pdf:pdf},
isbn = {0262181916},
issn = {01662236},
number = {7},
pages = {1--150},
title = {{Advice for a Young Investigator (translated by Neely Swanson and Larry W. Swanson)}},
volume = {23},
year = {1897}
}
@article{Gelman2013,
abstract = {A substantial school in the philosophy of science identifies Bayesian inference with inductive inference and even rationality as such, and seems to be strengthened by the rise and practical success of Bayesian statistics. We argue that the most successful forms of Bayesian statistics do not actually support that particular philosophy but rather accord much better with sophisticated forms of hypothetico-deductivism. We examine the actual role played by prior distributions in Bayesian models, and the crucial aspects of model checking and model revision, which fall outside the scope of Bayesian confirmation theory. We draw on the literature on the consistency of Bayesian updating and also on our experience of applied work in social science. Clarity about these matters should benefit not just philosophy of science, but also statistical practice. At best, the inductivist view has encouraged researchers to fit and compare models without checking them; at worst, theorists have actively discouraged practitioners from performing model checking because it does not fit into their framework.},
author = {Gelman, Andrew and Shalizi, Cosma Rohilla},
doi = {10.1111/j.2044-8317.2011.02037.x},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Gelman, Shalizi - 2013 - Philosophy and the practice of Bayesian statistics.pdf:pdf},
issn = {2044-8317},
journal = {The British journal of mathematical and statistical psychology},
month = {feb},
number = {1},
pages = {8--38},
pmid = {22364575},
title = {{Philosophy and the practice of Bayesian statistics.}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/22364575},
volume = {66},
year = {2013}
}
@article{Krahenbuhl2011,
abstract = {Most state-of-the-art techniques for multi-class image segmentation and labeling use conditional random fields defined over pixels or image regions. While region-level models often feature dense pairwise connectivity, pixel-level models are considerably larger and have only permitted sparse graph structures. In this paper, we consider fully connected CRF models defined on the complete set of pixels in an image. The resulting graphs have billions of edges, making traditional inference algorithms impractical. Our main contribution is a highly efficient approximate inference algorithm for fully connected CRF models in which the pairwise edge potentials are defined by a linear combination of Gaussian kernels. Our experiments demonstrate that dense connectivity at the pixel level substantially improves segmentation and labeling accuracy.},
archivePrefix = {arXiv},
arxivId = {1210.5644},
author = {Kr{\"{a}}henb{\"{u}}hl, Philipp and Koltun, Vladlen},
eprint = {1210.5644},
file = {:Users/jkrijthe/Documents/Mendeley Desktop//Krahenbuhl et al. - 2011 - Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials.pdf:pdf;:Users/jkrijthe/Documents/Mendeley Desktop//Krahenbuhl et al. - 2011 - Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials.pdf:pdf},
isbn = {9781618395993},
journal = {Advances in Neural Information Processing Systems},
keywords = {conditional random field,filtering,message passing,sampling,segmentation},
number = {4},
pages = {1--9},
title = {{Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials}},
year = {2011}
}
@book{Pearl2014,
author = {Pearl, Judea and Glymour, Madelyn and Jewell, Nicholas P.},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Pearl, Glymour, Jewell - 2016 - Causal Inference in Statistics A Primer.pdf:pdf},
publisher = {Wiley},
title = {{Causal Inference in Statistics: A Primer}},
year = {2016}
}
@inproceedings{Ji2012,
author = {Ji, Ming and Yang, Tianbao and Lin, Binbin and Jin, Rong and Han, Jiawei},
booktitle = {Proceedings of the 29th International Conference on Machine Learning},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Ji et al. - 2012 - A simple algorithm for semi-supervised learning with improved generalization error bound.pdf:pdf},
number = {2},
title = {{A simple algorithm for semi-supervised learning with improved generalization error bound}},
url = {http://arxiv.org/abs/1206.6412},
year = {2012}
}
@inproceedings{Fujino2005,
author = {Fujino, Akinori and Ueda, Naonori and Saito, Kazumi},
booktitle = {Proceedings of the National Conference on Artificial Intelligence},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Fujino, Ueda, Saito - 2005 - A hybrid generativediscriminative approach to semi-supervised classifier design.pdf:pdf},
number = {2},
pages = {764--769},
title = {{A hybrid generative/discriminative approach to semi-supervised classifier design}},
url = {http://www.aaai.org/Papers/AAAI/2005/AAAI05-120.pdf},
volume = {20},
year = {2005}
}
@inproceedings{Carroll2007,
author = {Carroll, James L. and Seppi, Kevin D.},
booktitle = {IJCNN Workshop on Meta-Learning},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Carroll, Seppi - 2007 - No-free-lunch and Bayesian optimality.pdf:pdf},
title = {{No-free-lunch and Bayesian optimality}},
url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.142.7564{\&}rep=rep1{\&}type=pdf},
year = {2007}
}
@article{Chan1997,
author = {Chan, Philip K. and Stolfo, Salvatore J.},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Chan, Stolfo - 1997 - On the accuracy of meta-learning for scalable data mining.pdf:pdf},
journal = {Journal of Intelligent Information Systems},
title = {{On the accuracy of meta-learning for scalable data mining}},
url = {http://www.springerlink.com/index/M27133K052552242.pdf},
year = {1997}
}
@inproceedings{Sa1994,
author = {de Sa, Virginia R.},
booktitle = {Advances in Neural Information Processing Systems},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Sa - 1994 - Learning Classification with Unlabeled Data.pdf:pdf},
pages = {112--112},
title = {{Learning Classification with Unlabeled Data}},
year = {1994}
}
@inproceedings{DeBie2003,
author = {de Bie, Tijl and Cristianini, Nello},
booktitle = {Advances in Neural Information Processing Systems},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/de Bie, Cristianini - 2003 - Convex Methods for Transduction.pdf:pdf},
title = {{Convex Methods for Transduction}},
year = {2003}
}
@article{Dhillon2013,
author = {Dhillon, Paramveer S. and Foster, Dean P. and Kakade, Sham M. and Ungar, Lyle H.},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Dhillon et al. - 2013 - A Risk Comparison of Ordinary Least Squares vs Ridge Regression.pdf:pdf},
journal = {Journal of Machine Learning Research},
keywords = {pca,ridge regression,risk inflation},
pages = {1505--1511},
title = {{A Risk Comparison of Ordinary Least Squares vs Ridge Regression}},
url = {http://adsabs.harvard.edu/abs/2011arXiv1105.0875D},
volume = {14},
year = {2013}
}
@article{Kulesza2012,
abstract = {Determinantal point processes (DPPs) are elegant probabilistic models of repulsion that arise in quantum physics and random matrix theory. In contrast to traditional structured models like Markov random fields, which become intractable and hard to approximate in the presence of negative correlations, DPPs offer efficient and exact algorithms for sampling, marginalization, conditioning, and other inference tasks. We provide a gentle introduction to DPPs, focusing on the intuitions, algorithms, and extensions that are most relevant to the machine learning community, and show how DPPs can be applied to real-world applications like finding diverse sets of high-quality search results, building informative summaries by selecting diverse sentences from documents, modeling non-overlapping human poses in images or video, and automatically building timelines of important news stories.},
archivePrefix = {arXiv},
arxivId = {1207.6083},
author = {Kulesza, Alex and Taskar, Ben},
doi = {10.1561/2200000044},
eprint = {1207.6083},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Kulesza, Taskar - 2012 - Determinantal Point Processes for Machine Learning.pdf:pdf},
isbn = {9781601986283},
issn = {1935-8237},
journal = {Foundations and Trends{\textregistered} in Machine Learning},
number = {2-3},
pages = {123--286},
title = {{Determinantal Point Processes for Machine Learning}},
url = {http://arxiv.org/abs/1207.6083$\backslash$nhttp://www.nowpublishers.com/product.aspx?product=MAL{\&}doi=2200000044},
volume = {5},
year = {2012}
}
@article{King1995,
author = {King, R.D. and Feng, C and Sutherland, A},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/King, Feng, Sutherland - 1995 - Statlog comparison of classification algorithms on large real-world problems.pdf:pdf},
journal = {Applied Artificial Intelligence an International Journal},
number = {3},
pages = {289--333},
title = {{Statlog: comparison of classification algorithms on large real-world problems}},
url = {http://www.tandfonline.com/doi/abs/10.1080/08839519508945477},
volume = {9},
year = {1995}
}
@article{Ye2007a,
address = {New York, New York, USA},
author = {Ye, Jieping},
doi = {10.1145/1273496.1273633},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Ye - 2007 - Least squares linear discriminant analysis.pdf:pdf},
isbn = {9781595937933},
journal = {Proceedings of the 24th International Conference on Machine Learning},
keywords = {18,3,8,are linear combinations of,class separability,derived features in lda,dimension reduction,least squares,linear discriminant anal-,linear regression,the,the data achieves maximum,the orig-,ysis},
pages = {1087--1093},
publisher = {ACM Press},
title = {{Least squares linear discriminant analysis}},
url = {http://portal.acm.org/citation.cfm?doid=1273496.1273633},
year = {2007}
}
@inproceedings{Duin2002,
author = {Pekalska, Ella and Duin, Robert P.W. and Skurichina, Marina},
booktitle = {Multiple Classifier Systems},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Pekalska, Duin, Skurichina - 2002 - A discussion on the classifier projection space for classifier combining.pdf:pdf},
pages = {137--148},
title = {{A discussion on the classifier projection space for classifier combining}},
url = {http://www.springerlink.com/index/A98FBKT93AK0YNNE.pdf},
year = {2002}
}
@unpublished{Krijthe2016limits,
author = {Krijthe, Jesse Hendrik and Loog, Marco},
title = {{The Pessimistic Limits of Margin-based Losses in Semi-supervised Learning}},
year = {2016}
}
@article{Krijthe2016,
archivePrefix = {arXiv},
arxivId = {1602.07865},
author = {Krijthe, Jesse Hendrik and Loog, Marco},
eprint = {1602.07865},
title = {{Projected Estimators for Robust Semi-supervised Classification}},
url = {http://arxiv.org/abs/1602.07865},
year = {2016}
}
@article{Chapelle2006a,
address = {New York, New York, USA},
author = {Chapelle, Olivier and Chi, Mingmin and Zien, Alexander},
doi = {10.1145/1143844.1143868},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Chapelle, Chi, Zien - 2006 - A continuation method for semi-supervised SVMs.pdf:pdf},
isbn = {1595933832},
journal = {Proceedings of the 23rd international conference on Machine learning - ICML '06},
pages = {185--192},
publisher = {ACM Press},
title = {{A continuation method for semi-supervised SVMs}},
url = {http://portal.acm.org/citation.cfm?doid=1143844.1143868},
year = {2006}
}
@inproceedings{Ravi2016,
abstract = {Traditional graph-based semi-supervised learning (SSL) approaches, even though widely applied, are not suited for massive data and large label scenarios since they scale linearly with the number of edges |E| and distinct labels m. To deal with the large label size problem, recent works propose sketch-based methods to approximate the distribution on labels per node thereby achieving a space reduction from O(m) to O(log m), under certain conditions. In this paper, we present a novel streaming graph-based SSL approximation that captures the sparsity of the label distribution and ensures the algorithm propagates labels accurately, and further reduces the space complexity per node to O(1). We also provide a distributed version of the algorithm that scales well to large data sizes. Experiments on real-world datasets demonstrate that the new method achieves better performance than existing state-of-the-art algorithms with significant reduction in memory footprint. We also study different graph construction mechanisms for natural language applications and propose a robust graph augmentation strategy trained using state-of-the-art unsupervised deep learning architectures that yields further significant quality gains.},
archivePrefix = {arXiv},
arxivId = {1512.01752},
author = {Ravi, Sujith and Diao, Qiming},
booktitle = {Proceedings of the 19th International Conference on Artificial Intelligence and Statistics (AISTATS)},
eprint = {1512.01752},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Ravi, Diao - 2016 - Large Scale Distributed Semi-Supervised Learning Using Streaming Approximation.pdf:pdf},
title = {{Large Scale Distributed Semi-Supervised Learning Using Streaming Approximation}},
volume = {51},
year = {2016}
}
@article{Fakeri-Tabrizi2015,
abstract = {In many applications, observations are available with different views. This is, for example, the case with image-text classification, multilingual document classification or document classification on the web. In addition, unlabeled multiview examples can be easily acquired, but assigning labels to these examples is usually a time consuming task. We describe a multiview self-learning strategy which trains different voting classifiers on different views. The margin distributions over the unlabeled training data, obtained with each view-specific classifier are then used to estimate an upper-bound on their transductive Bayes error. Minimizing this upper-bound provides an automatic margin-threshold which is used to assign pseudo-labels to unlabeled examples. Final class labels are then assigned to these examples, by taking a vote on the pool of the previous pseudo-labels. New view-specific classifiers are then trained using the labeled and pseudo-labeled training data. We consider applications to image-text classification and to multilingual document classification. We present experimental results on the NUS-WIDE collection and on Reuters RCV1-RCV2 which show that despite its simplicity, our approach is competitive with other state-of-the-art techniques.},
author = {Fakeri-Tabrizi, Ali and Amini, Massih Reza and Goutte, Cyril and Usunier, Nicolas},
doi = {10.1016/j.neucom.2014.12.041},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Fakeri-Tabrizi et al. - 2015 - Multiview self-learning.pdf:pdf},
issn = {18728286},
journal = {Neurocomputing},
keywords = {Image annotation,Multilingual document categorization,Multiview learning,Self-learning},
pages = {117--127},
publisher = {Elsevier},
title = {{Multiview self-learning}},
url = {http://dx.doi.org/10.1016/j.neucom.2014.12.041},
volume = {155},
year = {2015}
}
@article{Bengio2007,
author = {Bengio, Yoshua and LeCun, Yann},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Bengio, LeCun - 2007 - Scaling Learning Algorithms towards AI.pdf:pdf},
journal = {Large-Scale Kernel Machines},
number = {1},
pages = {1--41},
title = {{Scaling Learning Algorithms towards AI}},
url = {http://www.iro.umontreal.ca/{~}lisa/bib/pub{\_}subject/language/pointeurs/bengio+lecun-chapter2007.pdf},
year = {2007}
}
@inproceedings{Ho2000,
author = {Ho, Tin Kam},
booktitle = {Multiple Classifier Systems},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Ho - 2000 - Complexity of Classification Problems and Comparative Advantages of Combined Classifiers.pdf:pdf},
pages = {97--106},
title = {{Complexity of Classification Problems and Comparative Advantages of Combined Classifiers}},
year = {2000}
}
@inproceedings{Hoekstra1996,
author = {Hoekstra, Aarnoud and Duin, Robert P.W.},
booktitle = {Proceedings of the 13th International Conference on Pattern Recognition},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Hoekstra, Duin - 1996 - On the nonlinearity of pattern classifiers.pdf:pdf},
pages = {271--275},
title = {{On the nonlinearity of pattern classifiers}},
url = {http://ieeexplore.ieee.org/xpls/abs{\_}all.jsp?arnumber=547429},
year = {1996}
}
@inproceedings{Giraud-Carrier2005,
author = {Giraud-Carrier, Christophe and Provost, Foster},
booktitle = {Proceedings of the ICML-2005 Workshop on Meta-learning},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Giraud-carrier, Provost - 2005 - Toward a justification of meta-learning Is the no free lunch theorem a show-stopper.pdf:pdf},
pages = {12--19},
title = {{Toward a justification of meta-learning: Is the no free lunch theorem a show-stopper?}},
url = {http://dml.cs.byu.edu/{~}cgc/pubs/ICML2005WS.pdf},
year = {2005}
}
@inproceedings{Jaakkola2002,
author = {Szummer, Martin and Jaakkola, Tommi},
booktitle = {Advances in Neural Information Processing Systems 14},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Jaakkola, Szummer - 2002 - Partially labeled classification with Markov random walks.pdf:pdf},
pages = {945--952},
title = {{Partially labeled classification with Markov random walks}},
url = {http://books.google.com/books?hl=en{\&}lr={\&}id=GbC8cqxGR7YC{\&}oi=fnd{\&}pg=PA945{\&}dq=Partially+labeled+classification+with+Markov+random+walks{\&}ots=ZvP5J{\_}YBx6{\&}sig=dk27TWzUdp9G-e9OyvfYcGR14ro},
year = {2002}
}
@article{Wilkinson1958,
author = {Wilkinson, G. N.},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Wilkinson - 1958 - Estimation of Missing Values for the Analysis of Incomplete Data.pdf:pdf},
issn = {0006-341X},
journal = {Biometrics},
number = {2},
pages = {257--286},
title = {{Estimation of Missing Values for the Analysis of Incomplete Data}},
volume = {14},
year = {1958}
}
@article{Robert2016,
abstract = {This note is made of comments on Watson and Holmes (2016) and about their proposals towards more robust decisions. While we acknowledge and commend the authors for setting new and all-encompassing principles of Bayesian robustness, we remain uncertain as to which extent such principles can be applied outside binary decision. We also wonder at the ultimate relevance of Kullback-Leibler neighbourhoods to characterise robustness.},
archivePrefix = {arXiv},
arxivId = {1603.09088},
author = {Robert, Christian P. and Rousseau, Judith},
eprint = {1603.09088},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Robert, Rousseau - 2016 - Some comments about James Watson's and Chris Holmes' Approximate Models and Robust Decisions.pdf:pdf},
keywords = {1,and phrases,decision-theory,decision-theory, prior selection, robust methodolo,first-hand,introduction,misspecification,ology,prior selection,robust method-,there is nothing like},
pages = {1--7},
title = {{Some comments about James Watson's and Chris Holmes' "Approximate Models and Robust Decisions"}},
url = {http://arxiv.org/abs/1603.09088},
year = {2016}
}
@phdthesis{Lu2009,
author = {Lu, Tyler},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Lu - 2009 - Fundamental Limitations of Semi-Supervised Learning.pdf:pdf},
title = {{Fundamental Limitations of Semi-Supervised Learning}},
year = {2009}
}
@book{Little2002,
address = {New York},
author = {Little, Roderick J. A. and Rubin, Donald B.},
edition = {Second},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Little, Rubin - 2002 - Statistical Analysis with Missing Data.pdf:pdf},
isbn = {3175723993},
publisher = {Wiley},
title = {{Statistical Analysis with Missing Data}},
year = {2002}
}
@inproceedings{Widrow1960,
author = {Widrow, Bernard and Hoff, Marcian E.},
booktitle = {IRE WESCON Convention Record 4},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Widrow, Hoff - 1960 - Adaptive switching circuits.pdf:pdf},
pages = {96--104},
title = {{Adaptive switching circuits.}},
year = {1960}
}
@article{Efron,
abstract = {In the absence of relevant prior experience, popular Bayesian estimation techniques usually begin with some form of 'uninformative' prior distribution intended to have minimal inferential influence. The Bayes rule will still produce nice looking estimates and credible intervals, but these lack the logical force that is attached to experience-based priors and require further justification. The paper concerns the frequentist assessment of Bayes estimates. A simple formula is shown to give the frequentist standard deviation of a Bayesian point estimate. The same simulations as required for the point estimate also produce the standard deviation. Exponential family models make the calculations particularly simple and bring in a connection to the parametric bootstrap.},
author = {Efron, Bradley},
file = {:Users/jkrijthe/Documents/Mendeley Desktop//Efron - 2015 - Frequentist Accuracy of Bayesian Estimates.pdf:pdf;:Users/jkrijthe/Documents/Mendeley Desktop/Efron - 2015 - Frequentist Accuracy of Bayesian Estimates.pdf:pdf},
journal = {Journal of the Royal Statistical Society. Series B},
keywords = {Approximate bootstrap confidence intervals,General accuracy formula,Hierarchical and empirical Bayes,Markov chain Monte Carlo methods,Parametric bootstrap},
number = {3},
pages = {617--646},
title = {{Frequentist Accuracy of Bayesian Estimates}},
volume = {77},
year = {2015}
}
@article{Roweis1999,
abstract = {Factor analysis, principal component analysis, mixtures of gaussian clusters, vector quantization, Kalman filter models, and hidden Markov models can all be unified as variations of unsupervised learning under a single basic generative model. This is achieved by collecting together disparate observations and derivations made by many previous authors and introducing a new way of linking discrete and continuous state models using a simple nonlinearity. Through the use of other nonlinearities, we show how independent component analysis is also a variation of the same basic generative model. We show that factor analysis and mixtures of gaussians can be implemented in autoencoder neural networks and learned using squared error plus the same regularization term. We introduce a new model for static data, known as sensible principal component analysis, as well as a novel concept of spatially adaptive observation noise. We also review some of the literature involving global and local mixtures of the basic models and provide pseudocode for inference and learning for all the basic models.},
author = {Roweis, Sam and Ghahramani, Zoubin},
doi = {10.1162/089976699300016674},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Roweis, Ghahramani - 1999 - A unifying review of linear gaussian models.pdf:pdf},
isbn = {0899766993000},
issn = {0899-7667},
journal = {Neural computation},
pages = {305--345},
pmid = {9950734},
title = {{A unifying review of linear gaussian models.}},
volume = {11},
year = {1999}
}
@article{Gomez-Chova2008,
abstract = {This letter presents a semisupervised method based on kernel machines and graph theory for remote sensing image classification. The support vector machine (SVM) is regularized with the unnormalized graph Laplacian, thus leading to the Laplacian SVM (LapSVM). The method is tested in the challenging problems of urban monitoring and cloud screening, in which an adequate exploitation of the wealth of unlabeled samples is critical. Results obtained using different sensors, and with low number of training samples, demonstrate the potential of the proposed LapSVM for remote sensing image classification.},
author = {G{\'{o}}mez-Chova, Luis and Camps-Valls, Gustavo and Mu{\~{n}}oz-Mari, Jordi and Calpe, Javier},
doi = {10.1109/LGRS.2008.916070},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/G{\'{o}}mez-Chova et al. - 2008 - Semisupervised image classification with Laplacian support vector machines.pdf:pdf},
journal = {IEEE Geoscience and Remote Sensing Letters},
keywords = {Kernel methods,Manifold learning,Regularization,Semisupervised learning (SSL),Support vector machines (SVMs)},
number = {3},
pages = {336--340},
title = {{Semisupervised image classification with Laplacian support vector machines}},
volume = {5},
year = {2008}
}
@inproceedings{Kim2014,
author = {Kim, Do-kyum and Der, Matthew and Saul, Lawrence K.},
booktitle = {AISTATS},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Kim, Der, Saul - 2014 - A Gaussian Latent Variable Model for Large Margin Classification of Labeled and Unlabeled Data.pdf:pdf},
title = {{A Gaussian Latent Variable Model for Large Margin Classification of Labeled and Unlabeled Data}},
url = {http://jmlr.org/proceedings/papers/v33/kim14a.pdf},
volume = {33},
year = {2014}
}
@inproceedings{Grandvalet2005,
address = {Cambridge, MA},
author = {Grandvalet, Yves and Bengio, Yoshua},
booktitle = {Advances in Neural Information Processing Systems 17},
editor = {Saul, L. K. and Weiss, Y. and Bottou, L.},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Grandvalet, Bengio - 2005 - Semi-supervised learning by entropy minimization.pdf:pdf},
pages = {529--536},
publisher = {MIT Press},
title = {{Semi-supervised learning by entropy minimization}},
year = {2005}
}
@article{Zhou2003,
abstract = {We consider the general problem of learning from labelled and unlabelled data, which is often called semi-supervised learning or transductive inference. A principled approach to semi-supervised learning is to design a classifying function which is sufficiently smooth with respect to the intrinsic structure collectively revealed by known labelled and unlabelled points. We present a simple algorithm to obtain such a smooth solution. Our method yields encouraging experimental results on a number of classification problems and demonstrated effective use of unlabelled data.},
author = {Zhou, Dengyong and Bousquet, Olivier and Lal, Thomas Navin and Weston, Jason and Sch{\"{o}}lkopf, Bernhard},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Zhou et al. - 2003 - Learning with Local and Global Consistency.pdf:pdf},
journal = {Advances in Neural Information Processing Systems},
pages = {595--602},
title = {{Learning with Local and Global Consistency}},
url = {http://machinelearning.wustl.edu/mlpapers/paper{\_}files/NIPS2003{\_}AA41.pdf},
volume = {1},
year = {2003}
}
@inproceedings{Bottou2011,
author = {Bottou, Leon and Bousquet, Olivier},
booktitle = {Advances in Neural Information Processing Systems 24},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Bottou, Bousquet - 2011 - The Tradeoffs of Large-Scale Learning.pdf:pdf},
title = {{The Tradeoffs of Large-Scale Learning}},
url = {http://books.google.com/books?hl=en{\&}lr={\&}id=JPQx7s2L1A8C{\&}oi=fnd{\&}pg=PA351{\&}dq=The+Tradeoffs+of+Large+Scale+Learning{\&}ots=vbhayjhcGc{\&}sig=kWCMo7N51TgoLQSVSv2f{\_}ILArjo http://books.google.com/books?hl=en{\&}lr={\&}id=JPQx7s2L1A8C{\&}oi=fnd{\&}pg=PA351{\&}dq=The+Tradeoffs+of+Large-Scale+Learning{\&}ots=vbjaAkg8Fe{\&}sig=chdz7lCKXTFdUaLPYAgH{\_}FfgLmA},
year = {2011}
}
@inproceedings{Cervone2014,
abstract = {Basketball is a game of decisions; at any moment, a player can change the character of a possession by choosing to pass, dribble, or shoot. The current state of basketball analytics, however, provides no way to quantitatively evaluate the vast majority of decisions that players make, as most metrics are driven by events that occur at or near the end of a possession, such as points, turnovers, and assists. We propose a framework for using player-tracking data to assign a point value to each moment of a possession by computing how many points the offense is expected to score by the end of the possession, a quantity we call expected possession value (EPV). EPV allows analysts to evaluate every decision made during a basketball game - whether it is to pass, dribble, or shoot - opening the door for a multitude of new metrics and analyses of basketball that quantify value in terms of points. In this paper, we propose a modeling framework for estimating EPV, present results of EPV computations performed using player-tracking data from the 2012-2013 season, and provide several examples of EPV-derived metrics that answer real basketball questions.},
author = {Cervone, Dan and D'Amour, Alexander and Bornn, Luke and Goldsberry, Kirk},
booktitle = {MIT Sloan Sports Analytics Conference},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Cervone et al. - 2014 - POINTWISE Predicting Points and Valuing Decisions in Real Time with NBA Optical Tracking Data.pdf:pdf},
pages = {1--9},
title = {{POINTWISE: Predicting Points and Valuing Decisions in Real Time with NBA Optical Tracking Data}},
url = {http://www.sloansportsconference.com/wp-content/uploads/2014/02/2014{\_}SSAC{\_}Pointwise-Predicting-Points-and-Valuing-Decisions-in-Real-Time.pdf},
year = {2014}
}
@article{Balsubramania,
author = {Balsubramani, Akshay},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Balsubramani - Unknown - Scalable Semi-Supervised Aggregation of Classifiers.pdf:pdf},
pages = {1--9},
title = {{Scalable Semi-Supervised Aggregation of Classifiers}}
}
@inproceedings{Ben-David2012,
author = {Ben-David, Shai and Loker, David and Srebro, Nathan and Sridharan, Karthik},
booktitle = {Proceedings of the 29th International Conference on Machine Learning},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Ben-David et al. - 2012 - Minimizing the misclassification error rate using a surrogate convex loss.pdf:pdf},
pages = {1863--1870},
title = {{Minimizing the misclassification error rate using a surrogate convex loss}},
year = {2012}
}
@inproceedings{Fan2008,
author = {Fan, Bin and Lei, Zhen and Li, Stan Z.},
booktitle = {The 8th International Conference on Automatic Face {\&} Gesture Recognition},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Fan, Lei, Li - 2008 - Normalized LDA for Semi-supervised Learning.pdf:pdf},
pages = {1--6},
title = {{Normalized LDA for Semi-supervised Learning}},
year = {2008}
}
@article{Varma2006,
abstract = {Cross-validation (CV) is an effective method for estimating the prediction error of a classifier. Some recent articles have proposed methods for optimizing classifiers by choosing classifier parameter values that minimize the CV error estimate. We have evaluated the validity of using the CV error estimate of the optimized classifier as an estimate of the true error expected on independent data.},
author = {Varma, Sudhir and Simon, Richard},
doi = {10.1186/1471-2105-7-91},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Varma, Simon - 2006 - Bias in error estimation when using cross-validation for model selection.pdf:pdf},
issn = {1471-2105},
journal = {BMC bioinformatics},
keywords = {Algorithms,Artificial Intelligence,Bias (Epidemiology),Computer Simulation,Data Interpretation, Statistical,Gene Expression Profiling,Gene Expression Profiling: methods,Models, Genetic,Models, Statistical,Oligonucleotide Array Sequence Analysis,Oligonucleotide Array Sequence Analysis: methods,Pattern Recognition, Automated,Pattern Recognition, Automated: methods,Reproducibility of Results,Sensitivity and Specificity},
month = {jan},
pages = {91},
pmid = {16504092},
title = {{Bias in error estimation when using cross-validation for model selection.}},
url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=1397873{\&}tool=pmcentrez{\&}rendertype=abstract},
volume = {7},
year = {2006}
}
@misc{Mitchell1980,
author = {Mitchell, Tom M.},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Mitchell - 1980 - The need for biases in learning generalizations.pdf:pdf},
title = {{The need for biases in learning generalizations}},
url = {http://dml.cs.byu.edu/{~}cgc/docs/mldm{\_}tools/Reading/Need for Bias.pdf},
year = {1980}
}
@article{Raghu2016,
archivePrefix = {arXiv},
arxivId = {1606.05336},
author = {Raghu, Maithra and Poole, Ben and Kleinberg, Jon and Ganguli, Surya and Sohl-Dickstein, Jascha},
eprint = {1606.05336},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Raghu et al. - 2016 - On the expressive power of deep neural networks.pdf:pdf},
title = {{On the expressive power of deep neural networks}},
year = {2016}
}
@unpublished{Balakrishnan,
abstract = {We develop a general framework for proving rigorous guarantees on the performance of the EM algorithm and a variant known as gradient EM. Our analysis is divided into two parts: a treatment of these algorithms at the population level (in the limit of infinite data), followed by results that apply to updates based on a finite set of samples. First, we characterize the domain of attraction of any global maximizer of the population likelihood. This characterization is based on a novel view of the EM updates as a perturbed form of likelihood ascent, or in parallel, of the gradient EM updates as a perturbed form of standard gradient ascent. Leveraging this characterization, we then provide non-asymptotic guarantees on the EM and gradient EM algorithms when applied to a finite set of samples. We develop consequences of our general theory for three canonical examples of incomplete-data problems: mixture of Gaussians, mixture of regressions, and linear regression with covariates missing completely at random. In each case, our theory guarantees that with a suitable initialization, a relatively small number of EM (or gradient EM) steps will yield (with high probability) an estimate that is within statistical error of the MLE. We provide simulations to confirm this theoretically predicted behavior.},
archivePrefix = {arXiv},
arxivId = {1408.2156},
author = {Balakrishnan, Sivaraman and Wainwright, Martin J. and Yu, Bin},
eprint = {1408.2156},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Balakrishnan, Wainwright, Yu - Unknown - Statistical guarantees for the EM algorithm From population to sample-based analysis.pdf:pdf},
title = {{Statistical guarantees for the EM algorithm: From population to sample-based analysis}}
}
@article{Amorim2016,
abstract = {The annotation of large data sets by a classifier is a problem whose challenge increases as the number of labeled samples used to train the classifier reduces in comparison to the number of unlabeled samples. In this context, semi-supervised learning methods aim at discovering and labeling informative samples among the unlabeled ones, such that their addition to the correct class in the training set can improve classification performance. We present a semi-supervised learning approach that connects unlabeled and labeled samples as nodes of a minimum-spanning tree and partitions the tree into an optimum-path forest rooted at the labeled nodes. It is suitable when most samples from a same class are more closely connected through sequences of nearby samples than samples from distinct classes, which is usually the case in data sets with a reasonable relation between number of samples and feature space dimension. The proposed solution is validated by using several data sets and state-of-the-art methods as baselines.},
author = {Amorim, Willian P. and Falc{\~{a}}o, Alexandre X. and Papa, Jo{\~{a}}o P. and Carvalho, Marcelo H.},
doi = {10.1016/j.patcog.2016.04.020},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Amorim et al. - 2016 - Improving semi-supervised learning through optimum connectivity.pdf:pdf},
issn = {00313203},
journal = {Pattern Recognition},
keywords = {Optimum-path forest classifiers,Semi-supervised learning},
pages = {72--85},
publisher = {Elsevier},
title = {{Improving semi-supervised learning through optimum connectivity}},
url = {http://dx.doi.org/10.1016/j.patcog.2016.04.020},
volume = {60},
year = {2016}
}
@inproceedings{Cortes1993,
author = {Cortes, Corinna and Jackel, L.D.},
booktitle = {Advances in Neural Information Processing Systems 6},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Cortes, Jackel - 1993 - Learning Cuves Asymptotic Values and Rate of Convergence.pdf:pdf},
pages = {327--334},
title = {{Learning Curves: Asymptotic Values and Rate of Convergence}},
url = {http://scholar.google.com/scholar?hl=en{\&}btnG=Search{\&}q=intitle:Learning+Cuves:+Asymptotic+Values+and+Rate+of+Convergence{\#}0},
year = {1993}
}
@inproceedings{Chapelle2005,
abstract = {We believe that the cluster assumption is key to successful semi-supervised learning. Based on this, we propose three semi-supervised algorithms: 1. deriving graph-based distances that emphasize low density regions between clusters, followed by training a standard SVM; 2. optimizing the Transductive SVM objective function, which places the decision boundary in low density regions, by gradient descent; 3. combining the first two to make maximum use of the cluster assumption. We compare with state of the art algorithms and demonstrate superior accuracy for the latter two methods.},
author = {Chapelle, Olivier and Zien, Alexander},
booktitle = {AISTATS},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Chapelle, Zien - 2005 - Semi-Supervised Classification by Low Density Separation.pdf:pdf},
keywords = {learning,statistics {\&} optimisation,theory {\&} algorithms},
pages = {57--64},
title = {{Semi-Supervised Classification by Low Density Separation}},
year = {2005}
}
@book{BDA2013,
author = {Gelman, Andrew and Carlin, John B. and Stern, Hal S. and Dunson, David B. and Vehtari, Aki and Rubin, Donald B.},
edition = {3},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Gelman et al. - 2013 - Bayesian Data Analysis.pdf:pdf},
publisher = {CRC Press},
title = {{Bayesian Data Analysis}},
year = {2013}
}
@article{Mhaskar2016,
abstract = {The paper briefly reviews several recent results on hierarchical architectures for learning from examples, that may formally explain the conditions under which Deep Convolutional Neural Networks perform much better in function approximation problems than shallow, one-hidden layer architectures. The paper announces new results for a non-smooth activation function - the ReLU function - used in present-day neural networks, as well as for the Gaussian networks. We propose a new definition of relative dimension to encapsulate different notions of sparsity of a function class that can possibly be exploited by deep networks but not by shallow ones to drastically reduce the complexity required for approximation and learning.},
archivePrefix = {arXiv},
arxivId = {1608.03287},
author = {Mhaskar, Hrushikesh and Poggio, Tomaso},
eprint = {1608.03287},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Mhaskar, Poggio - 2016 - Deep vs. shallow networks An approximation theory perspective.pdf:pdf},
number = {054},
pages = {1--16},
title = {{Deep vs. shallow networks: An approximation theory perspective}},
url = {http://arxiv.org/abs/1608.03287},
year = {2016}
}
@article{Cutler1994,
author = {Cutler, Adele and Breiman, Leo},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Cutler, Breiman - 1994 - Archetypal Analysis.pdf:pdf},
journal = {Technometrics},
keywords = {archetypes,convex hull,graphics,nonlinear optimization,principal},
number = {4},
pages = {338--347},
title = {{Archetypal Analysis}},
volume = {36},
year = {1994}
}
@inproceedings{Matti2006,
author = {K{\"{a}}{\"{a}}ri{\"{a}}inen, Matti},
booktitle = {International Joint Conference on Neural Networks},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Kaariainen - 2006 - Semi-Supervised Model Selection Based on Cross-Validation.pdf:pdf},
number = {510},
title = {{Semi-Supervised Model Selection Based on Cross-Validation}},
year = {2006}
}
@inproceedings{Krijthe2016a,
author = {Krijthe, Jesse Hendrik and Loog, Marco},
booktitle = {Proceedings of the 23rd International Conference on Pattern Recognition (To Appear)},
title = {{Optimistic Semi-supervised Least Squares Classification}},
year = {2016}
}
@techreport{Seeger2001,
author = {Seeger, Matthias},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Seeger - 2001 - Learning with labeled and unlabeled data.pdf:pdf},
pages = {1--62},
title = {{Learning with labeled and unlabeled data}},
year = {2001}
}
@article{Burman1989,
author = {Burman, Prabir},
doi = {10.2307/2336116},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Burman - 1989 - A Comparative Study of Ordinary Cross-Validation, v-Fold Cross-Validation and the Repeated Learning-Testing Methods.pdf:pdf},
issn = {00063444},
journal = {Biometrika},
month = {sep},
number = {3},
pages = {503--514},
title = {{A Comparative Study of Ordinary Cross-Validation, v-Fold Cross-Validation and the Repeated Learning-Testing Methods}},
url = {http://www.jstor.org/stable/2336116?origin=crossref},
volume = {76},
year = {1989}
}
@article{Hand2014,
author = {Hand, David J.},
doi = {10.1214/13-STS446},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Hand - 2014 - Wonderful Examples, but Let's not Close Our Eyes.pdf:pdf},
issn = {0883-4237},
journal = {Statistical Science},
keywords = {frequentist,hypothesis testing,likelihood inference,Neyman-Pearson,schools of inference},
month = {feb},
number = {1},
pages = {98--100},
title = {{Wonderful Examples, but Let's not Close Our Eyes}},
url = {http://projecteuclid.org/euclid.ss/1399645735},
volume = {29},
year = {2014}
}
@article{Geer2009,
archivePrefix = {arXiv},
arxivId = {arXiv:0910.0722v1},
author = {van de Geer, Sara and B{\"{u}}hlmann, Peter},
eprint = {arXiv:0910.0722v1},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Geer, B{\"{u}}hlmann - 2009 - On the conditions used to prove oracle results for the Lasso.pdf:pdf},
journal = {Electronic Journal of Statistics},
keywords = {coherence,compatibility,irrepresentable condition,lasso,restricted eigenvalue,restricted isometry,sparsity},
pages = {1--33},
title = {{On the conditions used to prove oracle results for the Lasso}},
url = {http://projecteuclid.org/euclid.ejs/1260801227},
year = {2009}
}
@article{Isaksson2008,
author = {Isaksson, Anders and Wallman, M. and G{\"{o}}ransson, H. and Gustafsson, Mats G.},
doi = {10.1016/j.patrec.2008.06.018},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Isaksson et al. - 2008 - Cross-validation and bootstrapping are unreliable in small sample classification.pdf:pdf},
issn = {01678655},
journal = {Pattern Recognition Letters},
keywords = {performance estimation,supervised classification},
month = {oct},
number = {14},
pages = {1960--1965},
title = {{Cross-validation and bootstrapping are unreliable in small sample classification}},
url = {http://linkinghub.elsevier.com/retrieve/pii/S0167865508002158},
volume = {29},
year = {2008}
}
@article{Jung2008,
abstract = {In recent years, there has been a growing interest among researchers in the use of latent class and growth mixture modeling techniques for applications in the social and psychological sciences, in part due to advances in and availability of computer software designed for this purpose (e.g., Mplus and SAS Proc Traj). Latent growth modeling approaches, such as latent class growth analysis (LCGA) and growth mixture modeling (GMM), have been increasingly recognized for their usefulness for identifying homogeneous subpopulations within the larger heterogeneous population and for the identification of meaningful groups or classes of individuals. The purpose of this paper is to provide an overview of LCGA and GMM, compare the different techniques of latent growth modeling, discuss current debates and issues, and provide readers with a practical guide for conducting LCGA and GMM using the Mplus software.},
author = {Jung, Tony and Wickrama, K. A.},
doi = {10.1111/j.1751-9004.2007.00054.x},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/Jung, Wickrama - 2008 - An introduction to latent class growth analysis and growth mixture modeling.pdf:pdf},
isbn = {1751-9004},
issn = {1751-9004},
journal = {Social and Personality Psychology Compass},
number = {1},
pages = {302--317},
title = {{An introduction to latent class growth analysis and growth mixture modeling}},
url = {http://doi.wiley.com/10.1111/j.1751-9004.2007.00054.x},
volume = {2},
year = {2008}
}
@inproceedings{McWilliams2013,
abstract = {This paper presents Correlated Nystr{\"{o}}m Views (XNV), a fast semi-supervised algorithm for regression and classification. The algorithm draws on two main ideas. First, it generates two views consisting of computationally inexpensive random features. Second, multiview regression, using Canonical Correlation Analysis (CCA) on unlabeled data, biases the regression towards useful features. It has been shown that CCA regression can substantially reduce variance with a minimal increase in bias if the views contain accurate estimators. Recent theoretical and empirical work shows that regression with random features closely approximates kernel regression, implying that the accuracy requirement holds for random views. We show that XNV consistently outperforms a state-of-the-art algorithm for semi-supervised learning: substantially improving predictive performance and reducing the variability of performance on a wide variety of real-world datasets, whilst also reducing runtime by orders of magnitude.},
archivePrefix = {arXiv},
arxivId = {arXiv:1306.5554v1},
author = {McWilliams, Brian and Balduzzi, David and Buhmann, Joachim M.},
booktitle = {Advances in Neural Information Processing Systems},
eprint = {arXiv:1306.5554v1},
file = {:Users/jkrijthe/Documents/Mendeley Desktop/McWilliams, Balduzzi, Buhmann - 2013 - Correlated random features for fast semi-supervised learning.pdf:pdf},
pages = {440--448},
title = {{Correlated random features for fast semi-supervised learning}},
year = {2013}
}
@book{Glymour2001,
author = {Glymour, Clark},