% tesis.bib
@misc{read_moa_2012,
title = {{MOA}: moa.streams.generators.multilabel.{MetaMultilabelGenerator} {Class} {Reference}},
url = {https://www.cs.waikato.ac.nz/~abifet/MOA/API/classmoa_1_1streams_1_1generators_1_1multilabel_1_1_meta_multilabel_generator.html},
urldate = {2021-02-20},
author = {Read, Jesse},
year = {2012},
}
@article{gama_evaluating_2013,
title = {On evaluating stream learning algorithms},
volume = {90},
issn = {1573-0565},
url = {https://doi.org/10.1007/s10994-012-5320-9},
doi = {10.1007/s10994-012-5320-9},
abstract = {Most streaming decision models evolve continuously over time, run in resource-aware environments, and detect and react to changes in the environment generating data. One important issue, not yet convincingly addressed, is the design of experimental work to evaluate and compare decision models that evolve over time. This paper proposes a general framework for assessing predictive stream learning algorithms. We defend the use of prequential error with forgetting mechanisms to provide reliable error estimators. We prove that, in stationary data and for consistent learning algorithms, the holdout estimator, the prequential error and the prequential error estimated over a sliding window or using fading factors, all converge to the Bayes error. The use of prequential error with forgetting mechanisms reveals to be advantageous in assessing performance and in comparing stream learning algorithms. It is also worthwhile to use the proposed methods for hypothesis testing and for change detection. In a set of experiments in drift scenarios, we evaluate the ability of a standard change detection algorithm to detect change using three prequential error estimators. These experiments point out that the use of forgetting mechanisms (sliding windows or fading factors) are required for fast and efficient change detection. In comparison to sliding windows, fading factors are faster and memoryless, both important requirements for streaming applications. Overall, this paper is a contribution to a discussion on best practice for performance assessment when learning is a continuous process, and the decision models are dynamic and evolve over time.},
language = {en},
number = {3},
urldate = {2021-02-16},
journal = {Machine Learning},
author = {Gama, João and Sebastião, Raquel and Rodrigues, Pedro Pereira},
month = mar,
year = {2013},
pages = {317--346},
}
@article{tsoumakas_mulan_2011,
title = {{MULAN}: {A} {Java} library for multi-label learning},
volume = {12},
shorttitle = {{MULAN}},
abstract = {Mulan is a Java library for learning from multi-label data. It offers a variety of classification, ranking, thresholding and dimensionality reduction algorithms, including an algorithm for learning from hierarchically structured labels. In addition, it contains an evaluation framework that calculates a rich variety of performance measures.},
journal = {Journal of Machine Learning Research},
author = {Tsoumakas, Grigorios and Spyromitros-Xioufis, Eleftherios and Vilcek, Jozef and Vlahavas, Ioannis},
month = jul,
year = {2011},
pages = {2411--2414},
}
@article{pedregosa_scikit-learn_2018,
title = {Scikit-learn: {Machine} {Learning} in {Python}},
shorttitle = {Scikit-learn},
url = {http://arxiv.org/abs/1201.0490},
abstract = {Scikit-learn is a Python module integrating a wide range of state-of-the-art machine learning algorithms for medium-scale supervised and unsupervised problems. This package focuses on bringing machine learning to non-specialists using a general-purpose high-level language. Emphasis is put on ease of use, performance, documentation, and API consistency. It has minimal dependencies and is distributed under the simplified BSD license, encouraging its use in both academic and commercial settings. Source code, binaries, and documentation can be downloaded from http://scikit-learn.org.},
urldate = {2021-02-15},
journal = {arXiv:1201.0490 [cs]},
author = {Pedregosa, Fabian and Varoquaux, Gaël and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Müller, Andreas and Nothman, Joel and Louppe, Gilles and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, Édouard},
month = jun,
year = {2018},
note = {arXiv: 1201.0490},
keywords = {Computer Science - Machine Learning, Computer Science - Mathematical Software},
annote = {Comment: Update authors list and URLs},
}
@article{bifet_moa_2010,
title = {{MOA}: massive online analysis},
volume = {11},
shorttitle = {{MOA}},
abstract = {Massive Online Analysis (MOA) is a software environment for implementing algorithms and running experiments for online learning from evolving data streams. MOA includes a collection of offline and online methods as well as tools for evaluation. In particular, it implements boosting, bagging, and Hoeffding Trees, all with and without Naïve Bayes classifiers at the leaves. MOA supports bi-directional interaction with WEKA, the Waikato Environment for Knowledge Analysis, and is released under the GNU GPL license.},
journal = {Journal of Machine Learning Research},
author = {Bifet, Albert and Holmes, Geoffrey and Kirkby, Richard and Pfahringer, Bernhard},
month = may,
year = {2010},
pages = {1601--1604},
}
@article{montiel_scikit-multiflow_2018,
title = {Scikit-{Multiflow}: {A} {Multi}-output {Streaming} {Framework}},
shorttitle = {Scikit-{Multiflow}},
url = {http://arxiv.org/abs/1807.04662},
doi = {10.5555/3291125.3309634},
abstract = {Scikit-multiflow is a multi-output/multi-label and stream data mining framework for the Python programming language. Conceived to serve as a platform to encourage democratization of stream learning research, it provides multiple state of the art methods for stream learning, stream generators and evaluators. scikit-multiflow builds upon popular open source frameworks including scikit-learn, MOA and MEKA. Development follows the FOSS principles and quality is enforced by complying with PEP8 guidelines and using continuous integration and automatic testing. The source code is publicly available at https://github.com/scikit-multiflow/scikit-multiflow.},
urldate = {2021-02-15},
journal = {arXiv:1807.04662 [cs, stat]},
author = {Montiel, Jacob and Read, Jesse and Bifet, Albert and Abdessalem, Talel},
month = jul,
year = {2018},
note = {arXiv: 1807.04662},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
annote = {Comment: 5 pages, Open Source Software},
}
@article{buyukcakir_novel_2018,
title = {A {Novel} {Online} {Stacked} {Ensemble} for {Multi}-{Label} {Stream} {Classification}},
url = {http://arxiv.org/abs/1809.09994},
doi = {10.1145/3269206.3271774},
abstract = {As data streams become more prevalent, the necessity for online algorithms that mine this transient and dynamic data becomes clearer. Multi-label data stream classification is a supervised learning problem where each instance in the data stream is classified into one or more pre-defined sets of labels. Many methods have been proposed to tackle this problem, including but not limited to ensemble-based methods. Some of these ensemble-based methods are specifically designed to work with certain multi-label base classifiers; some others employ online bagging schemes to build their ensembles. In this study, we introduce a novel online and dynamically-weighted stacked ensemble for multi-label classification, called GOOWE-ML, that utilizes spatial modeling to assign optimal weights to its component classifiers. Our model can be used with any existing incremental multi-label classification algorithm as its base classifier. We conduct experiments with 4 GOOWE-ML-based multi-label ensembles and 7 baseline models on 7 real-world datasets from diverse areas of interest. Our experiments show that GOOWE-ML ensembles yield consistently better results in terms of predictive performance in almost all of the datasets, with respect to the other prominent ensemble models.},
urldate = {2021-02-12},
journal = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management},
author = {Büyükçakır, Alican and Bonab, Hamed and Can, Fazli},
month = oct,
year = {2018},
note = {arXiv: 1809.09994},
keywords = {Computer Science - Information Retrieval, Computer Science - Machine Learning, Statistics - Machine Learning},
pages = {1063--1072},
annote = {Comment: 10 pages, 4 figures. To be appeared in ACM CIKM 2018, in Torino, Italy},
}
@inproceedings{oza_online_2005,
title = {Online bagging and boosting},
volume = {3},
doi = {10.1109/ICSMC.2005.1571498},
abstract = {Bagging and boosting are two of the most well-known ensemble learning methods due to their theoretical performance guarantees and strong experimental results. However, these algorithms have been used mainly in batch mode, i.e., they require the entire training set to be available at once and, in some cases, require random access to the data. In this paper, we present online versions of bagging and boosting that require only one pass through the training data. We build on previously presented work by describing some theoretical results. We also compare the online and batch algorithms experimentally in terms of accuracy and running time.},
booktitle = {2005 {IEEE} {International} {Conference} on {Systems}, {Man} and {Cybernetics}},
author = {Oza, N. C.},
month = oct,
year = {2005},
note = {ISSN: 1062-922X},
keywords = {Backpropagation algorithms, Bagging, batch mode, boosting, ensemble learning, Intelligent systems, learning (artificial intelligence), Learning systems, NASA, online bagging learning method, online boosting learning method, online learning, Postal services, Predictive models, Supervised learning, training data},
pages = {2340--2345},
}
@article{kolter_dynamic_2007,
title = {Dynamic {Weighted} {Majority}: {An} {Ensemble} {Method} for {Drifting} {Concepts}},
volume = {8},
issn = {1532-4435},
shorttitle = {Dynamic {Weighted} {Majority}},
abstract = {We present an ensemble method for concept drift that dynamically creates and removes weighted experts in response to changes in performance. The method, dynamic weighted majority (*DWM*), uses four mechanisms to cope with concept drift: It trains online learners of the ensemble, it weights those learners based on their performance, it removes them, also based on their performance, and it adds new experts based on the global performance of the ensemble. After an extensive evaluation---consisting of five experiments, eight learners, and thirty data sets that varied in type of target concept, size, presence of noise, and the like---we concluded that *DWM* outperformed other learners that only incrementally learn concept descriptions, that maintain and use previously encountered examples, and that employ an unweighted, fixed-size ensemble of experts.},
journal = {The Journal of Machine Learning Research},
author = {Kolter, J. Zico and Maloof, Marcus A.},
month = dec,
year = {2007},
pages = {2755--2790},
}
@book{japkowicz_evaluating_2011,
address = {Cambridge},
title = {Evaluating {Learning} {Algorithms}: {A} {Classification} {Perspective}},
isbn = {978-0-521-19600-0},
shorttitle = {Evaluating {Learning} {Algorithms}},
url = {https://www.cambridge.org/core/books/evaluating-learning-algorithms/3CB22D16AB609D1770C24CA2CB5A11BF},
abstract = {The field of machine learning has matured to the point where many sophisticated learning approaches can be applied to practical applications. Thus it is of critical importance that researchers have the proper tools to evaluate learning approaches and understand the underlying issues. This book examines various aspects of the evaluation process with an emphasis on classification algorithms. The authors describe several techniques for classifier performance assessment, error estimation and resampling, obtaining statistical significance as well as selecting appropriate domains for evaluation. They also present a unified evaluation framework and highlight how different components of evaluation are both significantly interrelated and interdependent. The techniques presented in the book are illustrated using R and WEKA, facilitating better practical insight as well as implementation. Aimed at researchers in the theory and applications of machine learning, this book offers a solid basis for conducting performance evaluations of algorithms in practical settings.},
urldate = {2021-02-02},
publisher = {Cambridge University Press},
author = {Japkowicz, Nathalie and Shah, Mohak},
year = {2011},
doi = {10.1017/CBO9780511921803},
}
@article{wickramasinghe_naive_2020,
title = {Naive {Bayes}: applications, variations and vulnerabilities: a review of literature with code snippets for implementation},
issn = {1433-7479},
shorttitle = {Naive {Bayes}},
url = {http://link.springer.com/10.1007/s00500-020-05297-6},
doi = {10.1007/s00500-020-05297-6},
language = {en},
urldate = {2021-01-11},
journal = {Soft Computing},
author = {Wickramasinghe, Indika and Kalutarage, Harsha},
month = sep,
year = {2020},
}
@incollection{hutchison_enron_2004,
address = {Berlin, Heidelberg},
title = {The {Enron} {Corpus}: {A} {New} {Dataset} for {Email} {Classification} {Research}},
volume = {3201},
isbn = {978-3-540-30115-8},
shorttitle = {The {Enron} {Corpus}},
url = {http://link.springer.com/10.1007/978-3-540-30115-8_22},
abstract = {Automated classification of email messages into user-specific folders and information extraction from chronologically ordered email streams have become interesting areas in text learning research. However, the lack of large benchmark collections has been an obstacle for studying the problems and evaluating the solutions. In this paper, we introduce the Enron corpus as a new test bed. We analyze its suitability with respect to email folder prediction, and provide the baseline results of a state-of-the-art classifier (Support Vector Machines) under various conditions, including the cases of using individual sections (From, To, Subject and body) alone as the input to the classifier, and using all the sections in combination with regression weights.},
language = {en},
urldate = {2020-06-15},
booktitle = {Machine {Learning}: {ECML} 2004},
publisher = {Springer Berlin Heidelberg},
author = {Klimt, Bryan and Yang, Yiming},
editor = {Boulicaut, Jean-François and Esposito, Floriana and Giannotti, Fosca and Pedreschi, Dino},
year = {2004},
doi = {10.1007/978-3-540-30115-8_22},
note = {Series Title: Lecture Notes in Computer Science},
keywords = {datasets, enron},
pages = {217--226},
}
@incollection{lang_newsweeder_1995,
address = {San Francisco (CA)},
title = {{NewsWeeder}: {Learning} to {Filter} {Netnews}},
isbn = {978-1-55860-377-6},
shorttitle = {{NewsWeeder}},
url = {http://www.sciencedirect.com/science/article/pii/B9781558603776500487},
abstract = {A significant problem in many information filtering systems is the dependence on the user for the creation and maintenance of a user profile, which describes the user's interests. NewsWeeder is a netnews-filtering system that addresses this problem by letting the user rate his or her interest level for each article being read (1-5), and then learning a user profile based on these ratings. This paper describes how NewsWeeder accomplishes this task, and examines the alternative learning methods used. The results show that a learning algorithm based on the Minimum Description Length (MDL) principle was able to raise the percentage of interesting articles to be shown to users from 14\% to 52\% on average. Further, this performance significantly outperformed (by 21\%) one of the most successful techniques in Information Retrieval (IR), term-frequency/inverse-document-frequency (tf-idf) weighting.},
language = {en},
urldate = {2021-02-01},
booktitle = {Machine {Learning} {Proceedings} 1995},
publisher = {Morgan Kaufmann},
author = {Lang, Ken},
editor = {Prieditis, Armand and Russell, Stuart},
month = jan,
year = {1995},
doi = {10.1016/B978-1-55860-377-6.50048-7},
keywords = {20ng},
pages = {331--339},
}
@misc{read_generating_2009,
title = {Generating {Synthetic} {Multi}-label {Data} {Streams}},
url = {https://www.semanticscholar.org/paper/Generating-Synthetic-Multi-label-Data-Streams-Read-Pfahringer/147e3bc5f3c03884a8ba6d5420dc100834424c5d},
abstract = {There are many available methods for generating synthetic data streams. Such methods have been justified by the need to study the efficacy of algorithms on a theoretically infinite stream, and also a lack of real-world data of sufficient size. Although multi-label classification has attracted considerable interest in recent years, most of this work has been carried out in the context of a batch learning environment rather than a data stream. This paper makes an in-depth analysis of multi-label data, and presents a general framework for generating synthetic multi-label data streams.},
language = {en},
urldate = {2021-01-31},
author = {Read, Jesse and Pfahringer, Bernhard and Holmes, Geoffrey},
year = {2009},
}
@inproceedings{diplaris_protein_2005,
address = {Berlin, Heidelberg},
series = {Lecture {Notes} in {Computer} {Science}},
title = {Protein {Classification} with {Multiple} {Algorithms}},
isbn = {978-3-540-32091-3},
doi = {10.1007/11573036_42},
abstract = {Nowadays, the number of protein sequences being stored in central protein databases from labs all over the world is constantly increasing. From these proteins only a fraction has been experimentally analyzed in order to detect their structure and hence their function in the corresponding organism. The reason is that experimental determination of structure is labor-intensive and quite time-consuming. Therefore there is the need for automated tools that can classify new proteins to structural families. This paper presents a comparative evaluation of several algorithms that learn such classification models from data concerning patterns of proteins with known structure. In addition, several approaches that combine multiple learning algorithms to increase the accuracy of predictions are evaluated. The results of the experiments provide insights that can help biologists and computer scientists design high-performance protein classification systems of high quality.},
language = {en},
booktitle = {Advances in {Informatics}},
publisher = {Springer},
author = {Diplaris, Sotiris and Tsoumakas, Grigorios and Mitkas, Pericles A. and Vlahavas, Ioannis},
editor = {Bozanis, Panayiotis and Houstis, Elias N.},
year = {2005},
keywords = {Classification Algorithm, Classifier Selection, Protein Classification, Sequential Minimal Optimization, Weight Vote},
pages = {448--456},
}
@article{boutell_learning_2004,
title = {Learning multi-label scene classification},
volume = {37},
doi = {10.1016/j.patcog.2004.03.009},
abstract = {In classic pattern recognition problems, classes are mutually exclusive by definition. Classification errors occur when the classes overlap in the feature space. We examine a different situation, occurring when the classes are, by definition, not mutually exclusive. Such problems arise in semantic scene and document classification and in medical diagnosis. We present a framework to handle such problems and apply it to the problem of semantic scene classification, where a natural scene may contain multiple objects such that the scene can be described by multiple class labels (e.g., a field scene with a mountain in the background). Such a problem poses challenges to the classic pattern recognition paradigm and demands a different treatment. We discuss approaches for training and testing in this scenario and introduce new metrics for evaluating individual examples, class recall and precision, and overall accuracy. Experiments show that our methods are suitable for scene classification; furthermore, our work appears to generalize to other classification problems of the same nature.},
journal = {Pattern Recognition},
author = {Boutell, Matthew and Luo, Jiebo and Shen, Xipeng and Brown, Christopher},
month = sep,
year = {2004},
pages = {1757--1771},
}
@inproceedings{domingos_mining_2000,
address = {New York, NY, USA},
series = {{KDD} '00},
title = {Mining high-speed data streams},
isbn = {978-1-58113-233-5},
url = {https://doi.org/10.1145/347090.347107},
doi = {10.1145/347090.347107},
urldate = {2021-01-30},
booktitle = {Proceedings of the sixth {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining},
publisher = {Association for Computing Machinery},
author = {Domingos, Pedro and Hulten, Geoff},
month = aug,
year = {2000},
keywords = {decision trees, disk-based algorithms, Hoeffding bounds, incremental learning, subsampling},
pages = {71--80},
}
@misc{kirkby_improving_2007,
title = {Improving {Hoeffding} {Trees}},
url = {https://www.semanticscholar.org/paper/Improving-Hoeffding-Trees-Kirkby/56283855992584581eb9c0eb4413b47be496b94e},
abstract = {Modern information technology allows information to be collected at a far greater rate than ever before. So fast, in fact, that the main problem is making sense of it all. Machine learning offers promise of a solution, but the field mainly focusses on achieving high accuracy when data supply is limited. While this has created sophisticated classification algorithms, many do not cope with increasing data set sizes. When the data set sizes get to a point where they could be considered to represent a continuous supply, or data stream, then incremental classification algorithms are required. In this setting, the effectiveness of an algorithm cannot simply be assessed by accuracy alone. Consideration needs to be given to the memory available to the algorithm and the speed at which data is processed in terms of both the time taken to predict the class of a new data sample and the time taken to include this sample in an incrementally updated classification model. The Hoeffding tree algorithm is a state-of-the-art method for inducing decision trees from data streams. The aim of this thesis is to improve this algorithm. To measure improvement, a comprehensive framework for evaluating the performance of data stream algorithms is developed. Within the framework memory size is fixed in order to simulate realistic application scenarios. In order to simulate continuous operation, classes of synthetic data are generated providing an evaluation on a large scale. Improvements to many aspects of the Hoeffding tree algorithm are demonstrated. First, a number of methods for handling continuous numeric features are compared. Second, tree prediction strategy is investigated to evaluate the utility of various methods. Finally, the possibility of improving accuracy using ensemble methods is explored. The experimental results provide meaningful comparisons of accuracy and processing speeds between different modifications of the Hoeffding tree algorithm under various memory limits. The study on numeric attributes demonstrates that sacrificing accuracy for space at the local level often results in improved global accuracy. The prediction strategy shown to perform best adaptively chooses between standard majority class and Naive Bayes prediction in the leaves. The ensemble method investigation shows that combining trees can be worthwhile, but only when sufficient memory is available, and improvement is less likely than in traditional machine learning. In particular, issues are encountered when applying the popular boosting method to streams.},
language = {en},
urldate = {2021-01-30},
note = {PhD thesis, University of Waikato},
author = {Kirkby, Richard},
year = {2007},
}
@article{domingos_mining_2002,
title = {Mining {High}-{Speed} {Data} {Streams}},
doi = {10.1145/347090.347107},
abstract = {Many organizations today have more than very large databases; they have databases that grow without limit at a rate of several million records per day. Mining these continuous data streams brings unique opportunities, but also new challenges. This paper describes and evaluates VFDT, an anytime system that builds decision trees using constant memory and constant time per example. VFDT can incorporate tens of thousands of examples per second using off-the-shelf hardware. It uses Hoeffding bounds to guarantee that its output is asymptotically nearly identical to that of a conventional learner. We study VFDT's properties and demonstrate its utility through an extensive set of experiments on synthetic data. We apply VFDT to mining the continuous stream of Web access data from the whole University of Washington main campus.},
journal = {Proceedings of the Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
author = {Domingos, Pedro and Hulten, Geoff},
month = aug,
year = {2000},
}
@book{herrera_multilabel_2016,
title = {Multilabel {Classification}},
isbn = {978-3-319-41110-1},
abstract = {This book offers a comprehensive review of multilabel techniques widely used to classify and label texts, pictures, videos and music in the Internet. A deep review of the specialized literature on the field includes the available software needed to work with this kind of data. It provides the user with the software tools needed to deal with multilabel data, as well as step by step instruction on how to use them. The main topics covered are:
• The special characteristics of multi-labeled data and the metrics available to measure them.
• The importance of taking advantage of label correlations to improve the results.
• The different approaches followed to face multi-label classification.
• The preprocessing techniques applicable to multi-label datasets.
• The available software tools to work with multi-label data.
This book is beneficial for professionals and researchers in a variety of fields because of the wide range of potential applications for multilabel classification. Besides its multiple applications to classify different types of online information, it is also useful in many other areas, such as genomics and biology. No previous knowledge about the subject is required. The book introduces all the needed concepts to understand multilabel data characterization, treatment and evaluation.},
author = {Herrera, Francisco and Charte, Francisco and Rivera Rivas, Antonio and Del Jesus, María José},
month = jan,
year = {2016},
doi = {10.1007/978-3-319-41111-8},
}
@article{tsoumakas_random_2011,
title = {Random k-{Labelsets} for {Multi}-{Label} {Classification}},
volume = {23},
doi = {10.1109/TKDE.2010.164},
abstract = {A simple yet effective multilabel learning method, called label powerset (LP), considers each distinct combination of labels that exist in the training set as a different class value of a single-label classification task. The computational efficiency and predictive performance of LP is challenged by application domains with large number of labels and training examples. In these cases, the number of classes may become very large and at the same time many classes are associated with very few training examples. To deal with these problems, this paper proposes breaking the initial set of labels into a number of small random subsets, called labelsets and employing LP to train a corresponding classifier. The labelsets can be either disjoint or overlapping depending on which of two strategies is used to construct them. The proposed method is called RAkEL (RAndom k labELsets), where k is a parameter that specifies the size of the subsets. Empirical evidence indicates that RAkEL manages to improve substantially over LP, especially in domains with large number of labels and exhibits competitive performance against other high-performing multilabel learning methods.},
journal = {IEEE Transactions on Knowledge and Data Engineering},
author = {Tsoumakas, Grigorios and Katakis, Ioannis and Vlahavas, Ioannis},
month = jul,
year = {2011},
pages = {1079--1089},
}
@article{wolpert_stacked_1992,
title = {Stacked generalization},
volume = {5},
issn = {0893-6080},
url = {http://www.sciencedirect.com/science/article/pii/S0893608005800231},
doi = {10.1016/S0893-6080(05)80023-1},
abstract = {This paper introduces stacked generalization, a scheme for minimizing the generalization error rate of one or more generalizers. Stacked generalization works by deducing the biases of the generalizer(s) with respect to a provided learning set. This deduction proceeds by generalizing in a second space whose inputs are (for example) the guesses of the original generalizers when taught with part of the learning set and trying to guess the rest of it, and whose output is (for example) the correct guess. When used with multiple generalizers, stacked generalization can be seen as a more sophisticated version of cross-validation, exploiting a strategy more sophisticated than cross-validation's crude winner-takes-all for combining the individual generalizers. When used with a single generalizer, stacked generalization is a scheme for estimating (and then correcting for) the error of a generalizer which has been trained on a particular learning set and then asked a particular question. After introducing stacked generalization and justifying its use, this paper presents two numerical experiments. The first demonstrates how stacked generalization improves upon a set of separate generalizers for the NETtalk task of translating text to phonemes. The second demonstrates how stacked generalization improves the performance of a single surface-fitter. With the other experimental evidence in the literature, the usual arguments supporting cross-validation, and the abstract justifications presented in this paper, the conclusion is that for almost any real-world generalization problem one should use some version of stacked generalization to minimize the generalization error rate. This paper ends by discussing some of the variations of stacked generalization, and how it touches on other fields like chaos theory.},
language = {en},
number = {2},
urldate = {2021-01-23},
journal = {Neural Networks},
author = {Wolpert, David H.},
month = jan,
year = {1992},
keywords = {Combining generalizers, cross-validation, Error estimation and correction, Generalization and induction, Learning set preprocessing},
pages = {241--259},
}
@article{breiman_bagging_1996,
title = {Bagging predictors},
volume = {24},
issn = {1573-0565},
url = {https://doi.org/10.1007/BF00058655},
doi = {10.1007/BF00058655},
abstract = {Bagging predictors is a method for generating multiple versions of a predictor and using these to get an aggregated predictor. The aggregation averages over the versions when predicting a numerical outcome and does a plurality vote when predicting a class. The multiple versions are formed by making bootstrap replicates of the learning set and using these as new learning sets. Tests on real and simulated data sets using classification and regression trees and subset selection in linear regression show that bagging can give substantial gains in accuracy. The vital element is the instability of the prediction method. If perturbing the learning set can cause significant changes in the predictor constructed, then bagging can improve accuracy.},
language = {en},
number = {2},
urldate = {2021-01-23},
journal = {Machine Learning},
author = {Breiman, Leo},
month = aug,
year = {1996},
pages = {123--140},
}
@book{hastie_elements_2009,
address = {New York},
edition = {2},
series = {Springer {Series} in {Statistics}},
title = {The {Elements} of {Statistical} {Learning}: {Data} {Mining}, {Inference}, and {Prediction}, {Second} {Edition}},
isbn = {978-0-387-84857-0},
shorttitle = {The {Elements} of {Statistical} {Learning}},
url = {https://www.springer.com/gp/book/9780387848570},
abstract = {During the past decade there has been an explosion in computation and information technology. With it have come vast amounts of data in a variety of fields such as medicine, biology, finance, and marketing. The challenge of understanding these data has led to the development of new tools in the field of statistics, and spawned new areas such as data mining, machine learning, and bioinformatics. Many of these tools have common underpinnings but are often expressed with different terminology. This book describes the important ideas in these areas in a common conceptual framework. While the approach is statistical, the emphasis is on concepts rather than mathematics. Many examples are given, with a liberal use of color graphics. It is a valuable resource for statisticians and anyone interested in data mining in science or industry. The book's coverage is broad, from supervised learning (prediction) to unsupervised learning. The many topics include neural networks, support vector machines, classification trees and boosting---the first comprehensive treatment of this topic in any book. This major new edition features many topics not covered in the original, including graphical models, random forests, ensemble methods, least angle regression and path algorithms for the lasso, non-negative matrix factorization, and spectral clustering. There is also a chapter on methods for ``wide'' data (p bigger than n), including multiple testing and false discovery rates. Trevor Hastie, Robert Tibshirani, and Jerome Friedman are professors of statistics at Stanford University. They are prominent researchers in this area: Hastie and Tibshirani developed generalized additive models and wrote a popular book of that title. Hastie co-developed much of the statistical modeling software and environment in R/S-PLUS and invented principal curves and surfaces. Tibshirani proposed the lasso and is co-author of the very successful An Introduction to the Bootstrap. Friedman is the co-inventor of many data-mining tools including CART, MARS, projection pursuit and gradient boosting.},
language = {en},
urldate = {2021-01-12},
publisher = {Springer-Verlag},
author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
year = {2009},
doi = {10.1007/978-0-387-84858-7},
}
@article{kalutarage_detecting_2015,
title = {Detecting stealthy attacks: {Efficient} monitoring of suspicious activities on computer networks},
volume = {47},
issn = {0045-7906},
shorttitle = {Detecting stealthy attacks},
url = {http://www.sciencedirect.com/science/article/pii/S0045790615002384},
doi = {10.1016/j.compeleceng.2015.07.007},
abstract = {Stealthy attackers move patiently through computer networks – taking days, weeks or months to accomplish their objectives in order to avoid detection. As networks scale up in size and speed, monitoring for such attack attempts is increasingly a challenge. This paper presents an efficient monitoring technique for stealthy attacks. It investigates the feasibility of proposed method under number of different test cases and examines how design of the network affects the detection. A methodological way for tracing anonymous stealthy activities to their approximate sources is also presented. The Bayesian fusion along with traffic sampling is employed as a data reduction method. The proposed method has the ability to monitor stealthy activities using 10–20\% size sampling rates without degrading the quality of detection.},
language = {en},
urldate = {2021-01-11},
journal = {Computers \& Electrical Engineering},
author = {Kalutarage, Harsha K. and Shaikh, Siraj A. and Wickramasinghe, Indika P. and Zhou, Qin and James, Anne E.},
month = oct,
year = {2015},
keywords = {Anomaly detection, Bayesian fusion, Network simulation, Stealthy attacks, Traffic sampling},
pages = {327--344},
}
@article{dulhare_prediction_2018,
title = {Prediction system for heart disease using {Naive} {Bayes} and particle swarm optimization},
volume = {29},
doi = {10.4066/biomedicalresearch.29-18-620},
abstract = {Heart attack disease is major cause of death anywhere in world. Data mining play an important role in health care industry to enable health systems to properly use the data and analytics to identify impotence that improves care with reduce costs. One of data mining technique as classification is a supervised learning used to accurately predict the target class for each case in the data. Heart disease classification involves identifying healthy and sick individuals. Linear classifier as a Naive Bayes (NB) is relatively stable with respect to small variation or changes in training data. Particle Swarm Optimization (PSO) is an efficient evolutionary computation technique which selects the most optimum features which contribute more to the result which reduces the computation time and increases the accuracy. Experimental result shows that the proposed model with PSO as feature selection increases the predictive accuracy of the Naive Bayes to classify heart disease.},
journal = {Biomedical Research},
author = {Dulhare, Uma},
month = jan,
year = {2018},
}
@article{arar_feature_2017,
title = {A feature dependent {Naive} {Bayes} approach and its application to the software defect prediction problem},
volume = {59},
issn = {1568-4946},
url = {http://www.sciencedirect.com/science/article/pii/S1568494617303083},
doi = {10.1016/j.asoc.2017.05.043},
abstract = {Naive Bayes is one of the most widely used algorithms in classification problems because of its simplicity, effectiveness, and robustness. It is suitable for many learning scenarios, such as image classification, fraud detection, web mining, and text classification. Naive Bayes is a probabilistic approach based on assumptions that features are independent of each other and that their weights are equally important. However, in practice, features may be interrelated. In that case, such assumptions may cause a dramatic decrease in performance. In this study, by following preprocessing steps, a Feature Dependent Naive Bayes (FDNB) classification method is proposed. Features are included for calculation as pairs to create dependence between one another. This method was applied to the software defect prediction problem and experiments were carried out using widely recognized NASA PROMISE data sets. The obtained results show that this new method is more successful than the standard Naive Bayes approach and that it has a competitive performance with other feature-weighting techniques. A further aim of this study is to demonstrate that to be reliable, a learning model must be constructed by using only training data, as otherwise misleading results arise from the use of the entire data set.},
language = {en},
urldate = {2021-01-11},
journal = {Applied Soft Computing},
author = {Arar, Ömer Faruk and Ayan, Kürşat},
month = oct,
year = {2017},
keywords = {Data mining, Discretization, Feature independence, Naive Bayes, Software defect prediction},
pages = {197--209},
}
@article{osojnik_multi-label_2017,
title = {Multi-label classification via multi-target regression on data streams},
volume = {106},
issn = {1573-0565},
url = {https://doi.org/10.1007/s10994-016-5613-5},
doi = {10.1007/s10994-016-5613-5},
abstract = {Multi-label classification (MLC) tasks are encountered more and more frequently in machine learning applications. While MLC methods exist for the classical batch setting, only a few methods are available for streaming setting. In this paper, we propose a new methodology for MLC via multi-target regression in a streaming setting. Moreover, we develop a streaming multi-target regressor iSOUP-Tree that uses this approach. We experimentally compare two variants of the iSOUP-Tree method (building regression and model trees), as well as ensembles of iSOUP-Trees with state-of-the-art tree and ensemble methods for MLC on data streams. We evaluate these methods on a variety of measures of predictive performance (appropriate for the MLC task). The ensembles of iSOUP-Trees perform significantly better on some of these measures, especially the ones based on label ranking, and are not significantly worse than the competitors on any of the remaining measures. We identify the thresholding problem for the task of MLC on data streams as a key issue that needs to be addressed in order to obtain even better results in terms of predictive performance.},
language = {en},
number = {6},
urldate = {2020-06-15},
journal = {Machine Learning},
author = {Osojnik, Aljaž and Panov, Panče and Džeroski, Sašo},
month = jun,
year = {2017},
keywords = {20ng, datasets, enron, evaluation, Multi-label classification, streaming, preliminares},
pages = {745--770},
}
@article{nguyen_multi-label_2019,
title = {Multi-label classification via incremental clustering on an evolving data stream},
volume = {95},
issn = {00313203},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0031320319302328},
doi = {10.1016/j.patcog.2019.06.001},
language = {en},
urldate = {2021-01-09},
journal = {Pattern Recognition},
author = {Nguyen, Tien Thanh and Dang, Manh Truong and Luong, Anh Vu and Liew, Alan Wee-Chung and Liang, Tiancai and McCall, John},
month = nov,
year = {2019},
keywords = {preliminares},
pages = {96--113},
}
@inproceedings{wang_weighted_2017,
address = {Cham},
series = {Lecture {Notes} in {Computer} {Science}},
title = {Weighted {Ensemble} {Classification} of {Multi}-label {Data} {Streams}},
isbn = {978-3-319-57529-2},
doi = {10.1007/978-3-319-57529-2_43},
abstract = {Many real world applications involve classification of multi-label data streams. However, most existing classification models mostly focused on classifying single-label data streams. Learning in multi-label data stream scenarios is more challenging, as the classification systems should be able to consider several properties, such as large data volumes, label correlations and concept drifts. In this paper, we propose an efficient and effective ensemble model for multi-label stream classification based on ML-KNN (Multi-Label KNN) [31] and propose a balance AdjustWeight function to combine the predictions which can efficiently process high-speed multi-label stream data with concept drifts. The empirical results indicate that our approach achieves a high accuracy and low storage cost, and outperforms the existing methods ML-KNN and SMART [14].},
language = {en},
booktitle = {Advances in {Knowledge} {Discovery} and {Data} {Mining}},
publisher = {Springer International Publishing},
author = {Wang, Lulu and Shen, Hong and Tian, Hui},
editor = {Kim, Jinho and Shim, Kyuseok and Cao, Longbing and Lee, Jae-Gil and Lin, Xuemin and Moon, Yang-Sae},
year = {2017},
keywords = {Data stream, preliminares, Classification, Multi-label},
pages = {551--562},
}
@article{zheng_survey_2020,
title = {A {Survey} on {Multi}-{Label} {Data} {Stream} {Classification}},
volume = {8},
issn = {2169-3536},
doi = {10.1109/ACCESS.2019.2962059},
abstract = {Nowadays, many real-world applications of our daily life generate massive volume of streaming data at a higher speed than ever before, to name a few, Web clicking data streams, sensor network data and credit transaction streams. Contrary to traditional data mining using static datasets, there are several challenges for data stream mining, for instance, finite memory, one-pass and timely reaction. In this survey, we provide a comprehensive review of existing multi-label streams mining algorithms and categorize these methods based on different perspectives, which mainly focus on the multi-label data stream classification. We first briefly summarize existing multi-label and data stream classification algorithms and discuss their merits and demerits. Secondly, we identify mining constraints on classification for multi-label streaming data, and present a comprehensive study in algorithms for multi-label data stream classification. Finally, several challenges and open issues in multi-label data stream classification are discussed, which are worthwhile to be pursued by the researchers in the future.},
journal = {IEEE Access},
author = {Zheng, Xiulin and Li, Peipei and Chu, Zhe and Hu, Xuegang},
year = {2020},
note = {Conference Name: IEEE Access},
keywords = {Classification algorithms, data mining, Data models, data stream mining, Decision trees, multi-label classification, multi-label data, multilabel data stream classification, multilabel streams mining algorithms, pattern classification, Prediction algorithms, Streaming media, Vegetation, Web clicking data streams},
pages = {1249--1275},
}
@article{polikar_polikar_2006,
title = {Ensemble based systems in decision making},
volume = {6},
doi = {10.1109/MCAS.2006.1688199},
abstract = {In matters of great importance that have financial, medical, social, or other implications, we often seek a second opinion before making a decision, sometimes a third, and sometimes many more. In doing so, we weigh the individual opinions, and combine them through some thought process to reach a final decision that is presumably the most informed one. The process of consulting "several experts" before making a final decision is perhaps second nature to us; yet, the extensive benefits of such a process in automated decision making applications have only recently been discovered by computational intelligence community. Also known under various other names, such as multiple classifier systems, committee of classifiers, or mixture of experts, ensemble based systems have shown to produce favorable results compared to those of single-expert systems for a broad range of applications and under a variety of scenarios. Design, implementation and application of such systems are the main topics of this article. Specifically, this paper reviews conditions under which ensemble based systems may be more beneficial than their single classifier counterparts, algorithms for generating individual components of the ensemble systems, and various procedures through which the individual classifiers can be combined. We discuss popular ensemble based algorithms, such as bagging, boosting, AdaBoost, stacked generalization, and hierarchical mixture of experts; as well as commonly used combination rules, including algebraic combination of outputs, voting based techniques, behavior knowledge space, and decision templates. Finally, we look at current and future research directions for novel applications of ensemble systems. Such applications include incremental learning, data fusion, feature selection, learning with missing features, confidence estimation, and error correcting output codes; all areas in which ensemble systems have shown great promise},
journal = {IEEE Circuits and Systems Magazine},
author = {Polikar, Robi},
month = oct,
year = {2006},
pages = {21--45},
}
@inproceedings{karponi_empirical_2017,
address = {Cham},
series = {Advances in {Intelligent} {Systems} and {Computing}},
title = {An {Empirical} {Comparison} of {Methods} for {Multi}-label {Data} {Stream} {Classification}},
isbn = {978-3-319-47898-2},
doi = {10.1007/978-3-319-47898-2_16},
abstract = {This paper studies the problem of multi-label classification in the context of data streams. We discuss related work in this area and present our implementation of several existing approaches as part of the Mulan software. We present empirical results on a real-world data stream concerning media monitoring and discuss and draw a number of conclusions regarding their performance.},
language = {en},
booktitle = {Advances in {Big} {Data}},
publisher = {Springer International Publishing},
author = {Karponi, Konstantina and Tsoumakas, Grigorios},
editor = {Angelov, Plamen and Manolopoulos, Yannis and Iliadis, Lazaros and Roy, Asim and Vellasco, Marley},
year = {2017},
keywords = {Multi-label learning, Classification, Data streams, Media monitoring},
pages = {151--159},
}
@article{nguyen_multi-label_2019-1,
title = {Multi-label classification via label correlation and first order feature dependance in a data stream},
volume = {90},
issn = {0031-3203},
url = {http://www.sciencedirect.com/science/article/pii/S0031320319300123},
doi = {10.1016/j.patcog.2019.01.007},
abstract = {Many batch learning algorithms have been introduced for offline multi-label classification (MLC) over the years. However, the increasing data volume in many applications such as social networks, sensor networks, and traffic monitoring has posed many challenges to batch MLC learning. For example, it is often expensive to re-train the model with the newly arrived samples, or it is impractical to learn on the large volume of data at once. The research on incremental learning is therefore applicable to a large volume of data and especially for data stream. In this study, we develop a Bayesian-based method for learning from multi-label data streams by taking into consideration the correlation between pairs of labels and the relationship between label and feature. In our model, not only the label correlation is learned with each arrived sample with ground truth labels but also the number of predicted labels are adjusted based on Hoeffding inequality and the label cardinality. We also extend the model to handle missing values, a problem common in many real-world data. To handle concept drift, we propose a decay mechanism focusing on the age of the arrived samples to incrementally adapt to the change of data. The experimental results show that our method is highly competitive compared to several well-known benchmark algorithms under both the stationary and concept drift settings.},
language = {en},
urldate = {2020-06-25},
journal = {Pattern Recognition},
author = {Nguyen, Tien Thanh and Nguyen, Thi Thu Thuy and Luong, Anh Vu and Nguyen, Quoc Viet Hung and Liew, Alan Wee-Chung and Stantic, Bela},
month = jun,
year = {2019},
keywords = {20ng, enron, Multi-label classification, Data stream, Concept drift, Feature dependence, Label correlation, Multi-label learning, Online learning},
pages = {35--51},
}
@article{trohidis_multi-label_2011,
title = {Multi-label classification of music by emotion},
volume = {2011},
issn = {1687-4722},
url = {https://doi.org/10.1186/1687-4722-2011-426793},
doi = {10.1186/1687-4722-2011-426793},
abstract = {This work studies the task of automatic emotion detection in music. Music may evoke more than one different emotion at the same time. Single-label classification and regression cannot model this multiplicity. Therefore, this work focuses on multi-label classification approaches, where a piece of music may simultaneously belong to more than one class. Seven algorithms are experimentally compared for this task. Furthermore, the predictive power of several audio features is evaluated using a new multi-label feature selection method. Experiments are conducted on a set of 593 songs with six clusters of emotions based on the Tellegen-Watson-Clark model of affect. Results show that multi-label modeling is successful and provide interesting insights into the predictive quality of the algorithms and features.},
language = {en},
number = {1},
urldate = {2020-06-24},
journal = {EURASIP Journal on Audio, Speech, and Music Processing},
author = {Trohidis, Konstantinos and Tsoumakas, Grigorios and Kalliris, George and Vlahavas, Ioannis},
month = sep,
year = {2011},
keywords = {datasets, evaluation, br, lc, music, rakel},
pages = {4},
}
@article{read_meka_2016,
title = {{MEKA}: {A} multi-label/multi-target extension to {WEKA}},
volume = {17},
issn = {1533-7928},
shorttitle = {{MEKA}},
url = {https://researchcommons.waikato.ac.nz/handle/10289/10136},
abstract = {Multi-label classification has rapidly attracted interest in the machine learning literature, and there are now a large number and considerable variety of methods for this type of learning. We present MEKA: an open-source Java framework based on the well-known WEKA library. MEKA provides interfaces to facilitate practical application, and a wealth of multi-label classifiers, evaluation metrics, and tools for multi-label experiments and development. It supports multi-label and multi-target data, including in incremental and semi-supervised contexts.},
language = {en},
number = {21},
urldate = {2020-06-24},
journal = {Journal of Machine Learning Research},
author = {Read, Jesse and Reutemann, Peter and Pfahringer, Bernhard and Holmes, Geoffrey},
year = {2016},
keywords = {meka},
pages = {1--5},
}
@book{bifet_machine_2018,
title = {Machine {Learning} for {Data} {Streams} with {Practical} {Examples} in {MOA}},
isbn = {978-0-262-03779-2},
abstract = {Today many information sources—including sensor networks, financial markets, social networks, and healthcare monitoring—are so-called data streams, arriving sequentially and at high speed. Analysis must take place in real time, with partial data and without the capacity to store the entire data set. This book presents algorithms and techniques used in data stream mining and real-time analytics. Taking a hands-on approach, the book demonstrates the techniques using MOA (Massive Online Analysis), a popular, freely available open-source software framework, allowing readers to try out the techniques after reading the explanations. The book first offers a brief introduction to the topic, covering big data mining, basic methodologies for mining data streams, and a simple example of MOA. More detailed discussions follow, with chapters on sketching techniques, change, classification, ensemble methods, regression, clustering, and frequent pattern mining. Most of these chapters include exercises, an MOA-based lab session, or both. Finally, the book discusses the MOA software, covering the MOA graphical user interface, the command line, use of its API, and the development of new methods within MOA. The book will be an essential reference for readers who want to use data stream mining as a tool, researchers in innovation or data stream mining, and programmers who want to create new algorithms for MOA.},
url = {https://moa.cms.waikato.ac.nz/book/},
publisher = {MIT Press},
author = {Bifet, Albert and Gavaldà, Ricard and Pfahringer, Bernhard and Holmes, Geoffrey},
month = mar,
year = {2018},
}
@inproceedings{snoek_challenge_2006,
address = {Santa Barbara, CA, USA},
title = {The challenge problem for automated detection of 101 semantic concepts in multimedia},
isbn = {978-1-59593-447-5},
url = {http://portal.acm.org/citation.cfm?doid=1180639.1180727},
doi = {10.1145/1180639.1180727},
abstract = {We introduce the challenge problem for generic video indexing to gain insight in intermediate steps that affect performance of multimedia analysis methods, while at the same time fostering repeatability of experiments. To arrive at a challenge problem, we provide a general scheme for the systematic examination of automated concept detection methods, by decomposing the generic video indexing problem into 2 unimodal analysis experiments, 2 multimodal analysis experiments, and 1 combined analysis experiment. For each experiment, we evaluate generic video indexing performance on 85 hours of international broadcast news data, from the TRECVID 2005/2006 benchmark, using a lexicon of 101 semantic concepts. By establishing a minimum performance on each experiment, the challenge problem allows for component-based optimization of the generic indexing issue, while simultaneously offering other researchers a reference for comparison during indexing methodology development. To stimulate further investigations in intermediate analysis steps that influence video indexing performance, the challenge offers to the research community a manually annotated concept lexicon, pre-computed low-level multimedia features, trained classifier models, and five experiments together with baseline performance, which are all available at http://www.mediamill.nl/challenge/.},
language = {en},
urldate = {2020-06-15},
booktitle = {Proceedings of the 14th annual {ACM} international conference on {Multimedia} - {MULTIMEDIA} '06},
publisher = {ACM Press},
author = {Snoek, Cees G. M. and Worring, Marcel and van Gemert, Jan C. and Geusebroek, Jan-Mark and Smeulders, Arnold W. M.},
year = {2006},
keywords = {datasets, mediamill},
pages = {421},
}
@article{maruthupandi_multi-label_2017,
title = {Multi-label text classification using optimised feature sets},
volume = {9},
abstract = {Multi-label text classification is the process of assigning multi-labels to an instance. A significant aspect of the text classification problem is the high dimensionality of the data which hinders the performance of the classifier. Hence, feature selection plays a significant role in classification process that removes the irrelevant data. In this paper, wrapper-based hybrid artificial bee colony and bacterial foraging optimisation (HABBFO) approach has been proposed to select the most appropriate feature subset for prediction. Initially, pre-processing such as tokenisation, stop word removal and stemming has been performed to extract the features (words). Experiments are conducted on the benchmark dataset and the results show that the proposed approach achieves better performance compared to the other feature selection techniques.},
journal = {International Journal of Data Mining, Modelling and Management (IJDMMM)},
author = {Maruthupandi, J and Vimala Devi, K},
month = sep,
year = {2017},
pages = {237--248},
}
@inproceedings{zhang_multi-label_2010,
title = {Multi-label learning by exploiting label dependency},
booktitle = {Proceedings of the 16th {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining - {KDD} '10},
author = {Zhang, Min-Ling and Zhang, Kun},
year = {2010},
}
@article{read_scalable_2012,
title = {Scalable and efficient multi-label classification for evolving data streams},
volume = {88},
issn = {1573-0565},
url = {https://doi.org/10.1007/s10994-012-5279-6},
doi = {10.1007/s10994-012-5279-6},
abstract = {Many challenging real world problems involve multi-label data streams. Efficient methods exist for multi-label classification in non-streaming scenarios. However, learning in evolving streaming scenarios is more challenging, as classifiers must be able to deal with huge numbers of examples and to adapt to change using limited time and memory while being ready to predict at any point.},
language = {en},
number = {1},
urldate = {2020-06-17},
journal = {Machine Learning},
author = {Read, Jesse and Bifet, Albert and Holmes, Geoff and Pfahringer, Bernhard},
month = jul,
year = {2012},
pages = {243--272},
}
@article{sousa_multi-label_2018,
title = {Multi-label classification from high-speed data streams with adaptive model rules and random rules},
journal = {Progress in Artificial Intelligence},
author = {Sousa, Ricardo and Gama, João},
year = {2018},
}
@inproceedings{hulten_mining_2001,
address = {San Francisco, California},
series = {{KDD} '01},
title = {Mining {Time}-changing {Data} {Streams}},
booktitle = {Proceedings of the {Seventh} {ACM} {SIGKDD} {International} {Conference} on {Knowledge} {Discovery} and {Data} {Mining}},
publisher = {ACM},
author = {Hulten, Geoff and Spencer, Laurie and Domingos, Pedro},
year = {2001},
pages = {97--106},
}
@book{gama_knowledge_2010,
title = {Knowledge {Discovery} from {Data} {Streams}},
author = {Gama, João},
publisher = {Chapman \& Hall/CRC},
year = {2010},
}
@inproceedings{read_journal_2011,
title = {Streaming {Multi}-label {Classification}},
booktitle = {Journal of {Machine} {Learning} {Research} - {Proceedings} {Track}},
author = {Read, Jesse and Bifet, Albert and Holmes, Geoffrey and Pfahringer, Bernhard},
year = {2011},
pages = {19--25},
}
@inproceedings{read_multi-label_2008,
title = {Multi-label {Classification} {Using} {Ensembles} of {Pruned} {Sets}},
booktitle = {2008 {Eighth} {IEEE} {International} {Conference} on {Data} {Mining}},
author = {Read, Jesse and Pfahringer, Bernhard and Holmes, Geoff},
year = {2008},
}
@article{tsoumakas_multi-label_2007,
title = {Multi-{Label} {Classification}: {An} {Overview}},
volume = {3},
number = {3},
journal = {International Journal of Data Warehousing and Mining},
author = {Tsoumakas, Grigorios and Katakis, Ioannis},
year = {2007},
pages = {1--13},
}
@article{gantz_extracting_2011,
title = {Extracting value from chaos},
journal = {IDC IView},
author = {Gantz, J and Reinsel, D},
year = {2011},
pages = {1--12},
}
@inproceedings{chen_extracting_2009,
address = {Paris, France},
title = {Extracting discriminative concepts for domain adaptation in text mining},
isbn = {978-1-60558-495-9},
url = {http://portal.acm.org/citation.cfm?doid=1557019.1557045},
doi = {10.1145/1557019.1557045},
language = {en},
urldate = {2020-03-01},
booktitle = {Proceedings of the 15th {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining - {KDD} '09},
publisher = {ACM Press},
author = {Chen, Bo and Lam, Wai and Tsang, Ivor and Wong, Tak-Lam},
year = {2009},
keywords = {20ng, datasets},
pages = {179},
annote = {Included because it describes the NG20 dataset.},
}
@inproceedings{liu_deep_2017,
address = {Shinjuku Tokyo Japan},
title = {Deep {Learning} for {Extreme} {Multi}-label {Text} {Classification}},
isbn = {978-1-4503-5022-8},
url = {https://dl.acm.org/doi/10.1145/3077136.3080834},
doi = {10.1145/3077136.3080834},
abstract = {Extreme multi-label text classification (XMTC) refers to the problem of assigning to each document its most relevant subset of class labels from an extremely large label collection, where the number of labels could reach hundreds of thousands or millions. The huge label space raises research challenges such as data sparsity and scalability. Significant progress has been made in recent years by the development of new machine learning methods, such as tree induction with large-margin partitions of the instance spaces and label-vector embedding in the target space. However, deep learning has not been explored for XMTC, despite its big successes in other related areas. This paper presents the first attempt at applying deep learning to XMTC, with a family of new Convolutional Neural Network (CNN) models which are tailored for multi-label classification in particular. With a comparative evaluation of 7 state-of-the-art methods on 6 benchmark datasets where the number of labels is up to 670,000, we show that the proposed CNN approach successfully scaled to the largest datasets, and consistently produced the best or the second best results on all the datasets. On the Wikipedia dataset with over 2 million documents and 500,000 labels in particular, it outperformed the second best method by 11.7\% ∼ 15.3\% in precision@K and by 11.5\% ∼ 11.7\% in NDCG@K for K = 1,3,5.},
language = {en},
urldate = {2020-06-15},
booktitle = {Proceedings of the 40th {International} {ACM} {SIGIR} {Conference} on {Research} and {Development} in {Information} {Retrieval}},
publisher = {ACM},
author = {Liu, Jingzhou and Chang, Wei-Cheng and Wu, Yuexin and Yang, Yiming},
month = aug,
year = {2017},
keywords = {Multi-label classification, mediamill, cnn-kim},
pages = {115--124},
}
@article{read_classifier_2011,
title = {Classifier chains for multi-label classification},
volume = {85},
number = {3},
journal = {Machine Learning},
author = {Read, Jesse and Pfahringer, Bernhard and Holmes, Geoff and Frank, Eibe},
year = {2011},
pages = {333--359},
}
@inproceedings{gargiulo_deep_2018,
title = {Deep {Convolution} {Neural} {Network} for {Extreme} {Multi}-label {Text} {Classification}},
booktitle = {Proceedings of the 11th {International} {Joint} {Conference} on {Biomedical} {Engineering} {Systems} and {Technologies}},
author = {Gargiulo, Francesco and Silvestri, Stefano and Ciampi, Mario},
year = {2018},
}
@article{pereira_categorizing_2016,
title = {Categorizing feature selection methods for multi-label classification},
volume = {49},
number = {1},
journal = {Artificial Intelligence Review},
author = {Pereira, Rafael B and Plastino, Alexandre and Zadrozny, Bianca and Merschmann, Luiz H C},
year = {2016},
pages = {57--78},
}
@article{chen_big_2014,
title = {Big data: {A} survey},
volume = {19},
journal = {Mobile Networks and Applications},
author = {Chen, Min and Mao, Shiwen and Liu, Yunhao},
year = {2014},
}
@book{mayer-schonberger_big_2013,
series = {An {Eamon} {Dolan} book},
title = {Big {Data}: {A} {Revolution} that {Will} {Transform} how {We} {Live}, {Work}, and {Think}},
publisher = {Houghton Mifflin Harcourt},
author = {Mayer-Schönberger, Viktor and Cukier, Kenneth},
year = {2013},
}
@inproceedings{bifet_big_2014,
title = {Big {Data} {Stream} {Learning} with {SAMOA}},
booktitle = {2014 {IEEE} {International} {Conference} on {Data} {Mining} {Workshop}},
author = {Bifet, Albert and De Francisci Morales, Gianmarco},
year = {2014},
}
@article{madjarov_extensive_2012,
title = {An extensive experimental comparison of methods for multi-label learning},
volume = {45},
issn = {00313203},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0031320312001203},
doi = {10.1016/j.patcog.2012.03.004},
abstract = {Multi-label learning has received significant attention in the research community over the past few years: this has resulted in the development of a variety of multi-label learning methods. In this paper, we present an extensive experimental comparison of 12 multi-label learning methods using 16 evaluation measures over 11 benchmark datasets. We selected the competing methods based on their previous usage by the community, the representation of different groups of methods and the variety of basic underlying machine learning methods. Similarly, we selected the evaluation measures to be able to assess the behavior of the methods from a variety of view-points. In order to make conclusions independent from the application domain, we use 11 datasets from different domains. Furthermore, we compare the methods by their efficiency in terms of time needed to learn a classifier and time needed to produce a prediction for an unseen example. We analyze the results from the experiments using Friedman and Nemenyi tests for assessing the statistical significance of differences in performance. The results of the analysis show that for multi-label classification the best performing methods overall are random forests of predictive clustering trees (RF-PCT) and hierarchy of multi-label classifiers (HOMER), followed by binary relevance (BR) and classifier chains (CC). Furthermore, RF-PCT exhibited the best performance according to all measures for multi-label ranking. The recommendation from this study is that when new methods for multi-label learning are proposed, they should be compared to RF-PCT and HOMER using multiple evaluation measures.},
language = {en},
number = {9},
urldate = {2020-06-15},
journal = {Pattern Recognition},
author = {Madjarov, Gjorgji and Kocev, Dragi and Gjorgjevikj, Dejan and Džeroski, Sašo},
month = sep,
year = {2012},
keywords = {datasets, enron, evaluation, Multi-label classification, mediamill},
pages = {3084--3104},
}
@book{fayyad_advances_1996,
address = {Menlo Park, CA, USA},
title = {Advances in {Knowledge} {Discovery} and {Data} {Mining}},
publisher = {American Association for Artificial Intelligence},
author = {Fayyad, Usama M and Piatetsky-Shapiro, Gregory and Smyth, Padhraic},
editor = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic and Uthurusamy, Ramasamy},
year = {1996},
note = {Section: From data mining to knowledge discovery: an overview},
}
@article{gibaja_tutorial_2015,
title = {A {Tutorial} on {Multi}-{Label} {Learning}},
volume = {47},
abstract = {Multi-label learning has become a relevant learning paradigm in the last years due to the increasing number of fields where it can be applied and also to the emerging number of techniques that are being developed. This paper presents an up-to-date tutorial about multi-label learning that introduces the paradigm and describes the main contributions developed. Evaluation measures, fields of application, trending topics and resources are also presented.},
journal = {ACM Computing Surveys},
author = {Gibaja, Eva and Ventura, Sebastian},
year = {2015},
}
@article{tanaka_multi-label_2015,
title = {A multi-label approach using binary relevance and decision trees applied to functional genomics},
volume = {54},
abstract = {Many classification problems, especially in the field of bioinformatics, are associated with more than one class, known as multi-label classification problems. In this study, we propose a new adaptation for the Binary Relevance algorithm taking into account possible relations among labels, focusing on the interpretability of the model, not only on its performance. Experiments were conducted to compare the performance of our approach against others commonly found in the literature and applied to functional genomic datasets. The experimental results show that our proposal has a performance comparable to that of other methods and that, at the same time, it provides an interpretable model from the multi-label problem.},
language = {en},
journal = {Journal of Biomedical Informatics},
author = {Tanaka, Erica Akemi and Nozawa, Sérgio Ricardo and Macedo, Alessandra Alaniz and Baranauskas, José Augusto},
year = {2015},
keywords = {Multi-label classification, Decision tree, Functional genomics},
pages = {85--95},
}
@inproceedings{goncalves_genetic_2013,
title = {A {Genetic} {Algorithm} for {Optimizing} the {Label} {Ordering} in {Multi}-label {Classifier} {Chains}},
booktitle = {2013 {IEEE} 25th {International} {Conference} on {Tools} with {Artificial} {Intelligence}},
author = {Goncalves, Eduardo Correa and Plastino, Alexandre and Freitas, Alex A},
year = {2013},
}
@article{zhang_review_2014,
title = {A {Review} {On} {Multi}-{Label} {Learning} {Algorithms}},
volume = {26},
journal = {IEEE Transactions on Knowledge and Data Engineering},
author = {Zhang, Min-Ling and Zhou, Zhi-Hua},
year = {2014},
pages = {1819--1837},
}
@inproceedings{wang_deep_2020,
title = {Deep {Streaming} {Label} {Learning}},
url = {https://proceedings.mlr.press/v119/wang20n.html},
abstract = {In multi-label learning, each instance can be associated with multiple and non-exclusive labels. Previous studies assume that all the labels in the learning process are fixed and static; however, they ignore the fact that the labels will emerge continuously in changing environments. In order to fill in these research gaps, we propose a novel deep neural network (DNN) based framework, Deep Streaming Label Learning (DSLL), to classify instances with newly emerged labels effectively. DSLL can explore and incorporate the knowledge from past labels and historical models to understand and develop emerging new labels. DSLL consists of three components: 1) a streaming label mapping to extract deep relationships between new labels and past labels with a novel label-correlation aware loss; 2) a streaming feature distillation propagating feature-level knowledge from the historical model to a new model; 3) a senior student network to model new labels with the help of knowledge learned from the past. Theoretically, we prove that DSLL admits tight generalization error bounds for new labels in the DNN framework. Experimentally, extensive empirical results show that the proposed method performs significantly better than the existing state-of-the-art multi-label learning methods to handle the continually emerging new labels.},
language = {en},
urldate = {2021-10-19},
booktitle = {Proceedings of the 37th {International} {Conference} on {Machine} {Learning}},
publisher = {PMLR},
author = {Wang, Zhen and Liu, Liu and Tao, Dacheng},
month = nov,
year = {2020},
note = {ISSN: 2640-3498},
pages = {9963--9972},
}
@inproceedings{dembczynski_bayes_2010,
title = {Bayes {Optimal} {Multilabel} {Classification} via {Probabilistic} {Classifier} {Chains}},
abstract = {In the realm of multilabel classification (MLC), it has become an opinio communis that optimal predictive performance can only be achieved by learners that explicitly take label dependence into account. The goal of this paper is to elaborate on this postulate in a critical way. To this end, we formalize and analyze MLC within a probabilistic setting. Thus, it becomes possible to look at the problem from the point of view of risk minimization and Bayes optimal prediction. Moreover, inspired by our probabilistic setting, we propose a new method for MLC that generalizes and outperforms another approach, called classifier chains, that was recently introduced in the literature.},
booktitle = {{ICML}},
author = {Dembczyński, Krzysztof and Cheng, Weiwei and Hüllermeier, Eyke},
year = {2010},
}
@inproceedings{godbole_discriminative_2004,
title = {Discriminative {Methods} for {Multi}-{Labeled} {Classification}},
volume = {3056},
series = {Lecture Notes in Computer Science},
booktitle = {Advances in {Knowledge} {Discovery} and {Data} {Mining} ({PAKDD} 2004)},
isbn = {978-3-540-22064-0},
doi = {10.1007/978-3-540-24775-3_5},
abstract = {In this paper we present methods of enhancing existing discriminative classifiers for multi-labeled predictions. Discriminative methods like support vector machines perform very well for uni-labeled text classification tasks. Multi-labeled classification is a harder task subject to relatively less attention. In the multi-labeled setting, classes are often related to each other or part of a is-a hierarchy. We present a new technique for combining text features and features indicating relationships between classes, which can be used with any discriminative algorithm.},
author = {Godbole, Shantanu and Sarawagi, Sunita},
month = aug,
year = {2004},
}
@inproceedings{bifet_adaptive_2009,
title = {Adaptive {Learning} from {Evolving} {Data} {Streams}},
isbn = {978-3-642-03914-0},
doi = {10.1007/978-3-642-03915-7_22},
abstract = {We propose and illustrate a method for developing algorithms that can adaptively learn from data streams that drift over time. As an example, we take Hoeffding Tree, an incremental decision tree inducer for data streams, and use it as a basis to build two new methods that can deal with distribution and concept drift: a sliding window-based algorithm, Hoeffding Window Tree, and an adaptive method, Hoeffding Adaptive Tree. Our methods are based on using change detectors and estimator modules at the right places; we choose implementations with theoretical guarantees in order to extend such guarantees to the resulting adaptive learning algorithm. A main advantage of our methods is that they require no guess about how fast or how often the stream will drift; other methods typically have several user-defined parameters to this effect. In our experiments, the new methods never do worse, and in some cases do much better, than CVFDT, a well-known method for tree induction on data streams with drift.},
author = {Bifet, Albert and Gavaldà, Ricard},
month = aug,
year = {2009},
pages = {249--260},
}
@inproceedings{roseberry_multi-label_2018,
title = {Multi-label {kNN} {Classifier} with {Self} {Adjusting} {Memory} for {Drifting} {Data} {Streams}},
url = {http://proceedings.mlr.press/v94/roseberry18a.html},
language = {en},
urldate = {2021-08-18},
booktitle = {Second {International} {Workshop} on {Learning} with {Imbalanced} {Domains}: {Theory} and {Applications}},
publisher = {PMLR},
author = {Roseberry, Martha and Cano, Alberto},
month = nov,
year = {2018},
note = {ISSN: 2640-3498},
pages = {23--37},
}
@book{herrera_multilabel_2016-1,
title = {Multilabel {Classification}: {Problem} {Analysis}, {Metrics} and {Techniques}},
shorttitle = {Multilabel {Classification}},
url = {https://www.semanticscholar.org/paper/Multilabel-Classification%3A-Problem-Analysis%2C-and-Herrera-Charte/90356e39f440e2229994c4690ad0c83560fce902},
abstract = {This book offers a comprehensive review of multilabel techniques widely used to classify and label texts, pictures, videos and music on the Internet, and provides the user with the software tools needed to deal with multilabel data. A deep review of the specialized literature on the field includes the available software needed to work with this kind of data, as well as step by step instructions on how to use it. The main topics covered are: the special characteristics of multi-labeled data and the metrics available to measure them; the importance of taking advantage of label correlations to improve the results; the different approaches followed to face multi-label classification; the preprocessing techniques applicable to multi-label datasets; and the available software tools to work with multi-label data. This book is beneficial for professionals and researchers in a variety of fields because of the wide range of potential applications for multilabel classification. Besides its multiple applications to classify different types of online information, it is also useful in many other areas, such as genomics and biology. No previous knowledge about the subject is required. The book introduces all the needed concepts to understand multilabel data characterization, treatment and evaluation.},
language = {en},
urldate = {2021-12-31},
publisher = {Springer},
author = {Herrera, Francisco and Charte, Francisco and Rivera, Antonio J. and del Jesus, María J.},
year = {2016},
}
@misc{noauthor_google_nodate,
title = {Google {Search} {Statistics} - {Internet} {Live} {Stats}},
url = {https://www.internetlivestats.com/google-search-statistics/},
abstract = {How many searches on Google per month, per day, per second, and year? Live counter showing estimated current searches. Historical search volume, growth rate, and Google's share of global search market. Charts, infographics, data, and interesting info.},
language = {en},
urldate = {2022-02-09},
}
@inproceedings{sehgal_sentiment_2016,
title = {Sentiment analysis of big data applications using {Twitter} {Data} with the help of {HADOOP} framework},
doi = {10.1109/SYSMART.2016.7894530},
abstract = {Twitter Data is one of the large amounts of sized data, because it is having a millions of tweets every day. It is one of the largest social media site. We are using this twitter data for the business purpose and industrial or social purpose according to our data requirement and processing the data. It is very large amount of sized data increasing every second that is known as big data. Because of large amount of data increasing every day we cannot easily analysis this data. We are using here new technology is HADOOP; with the help of HADOOP we can easily analysis the large amount of sized data. In this paper, we are using HADOOP for the analyzing the twitter data which is also known as a big data.},
booktitle = {2016 {International} {Conference} {System} {Modeling} {Advancement} in {Research} {Trends} ({SMART})},
author = {Sehgal, Divya and Agarwal, Ambuj Kumar},
month = nov,
year = {2016},
keywords = {Analysis of Data, Big Data, Business, Concepts of HDFS, File systems, HADOOP, HDFS, Industries, Map Reduce, Sentiment analysis, Twitter},
pages = {251--255},
}
@inproceedings{cardona_clasificacion_2021,
title = {Clasificación multi-etiqueta con ensamble fijo por mayoría ponderada},
url = {http://50jaiio.sadio.org.ar/pdfs/agranda/AGRANDA-15.pdf},
abstract = {Multi-label classification is a supervised learning paradigm that generalizes classical classification techniques to address problems in which each instance in a collection is associated with multiple labels. Most research in this area has been carried out in batch learning settings. Data stream environments pose new challenges due to the response-time and storage constraints they impose. Multi-label classification algorithms were applied to several unstructured benchmark data collections, from which the data streams were simulated. This work proposes an ensemble strategy for multi-label classification algorithms with the goal of improving predictive performance. The results have been encouraging, and the proposed ensembles of classical multi-label classification algorithms showed competitive performance, improving on the state of the art in several scenarios.},
booktitle = {Simposio {Argentino} de ciencia de datos y {GRANdes} {DAtos} ({AGRANDA}, 2021) - 50 {JAIIO}},
author = {Cardona, Juan and Banchero, Santiago},
month = oct,
year = {2021},
}