diff --git a/publications.bib b/publications.bib index fcb167e..8f69cc5 100644 --- a/publications.bib +++ b/publications.bib @@ -1,258 +1,167 @@ -@article{10.1007/s11192-013-1051-3, -author = {Cho, Philip S. and Do, Huy Hoang and Chandrasekaran, Muthu Kumar and Kan, Min-Yen}, -title = {Identifying research facilitators in an emerging Asian Research Area}, -year = {2013}, -issue_date = {October 2013}, -publisher = {Springer-Verlag}, -address = {Berlin, Heidelberg}, -volume = {97}, -number = {1}, -issn = {0138-9130}, -url = {https://doi.org/10.1007/s11192-013-1051-3}, -doi = {10.1007/s11192-013-1051-3}, -abstract = {We introduce a novel set of metrics for triadic closure among individuals or groups to model how co-authorship networks become more integrated over time. We call this process of triadic, third-party mediated integration, research facilitation. We apply our research facilitation or RF-metrics to the development of the Pan-Asian SNP (PASNP) Consortium, the first inter-Asian genomics network. Our aim was to examine if the consortium catalyzed research facilitation or integration among the members and the wider region. The PASNP Consortium is an ideal case study of an emerging Asian Research Area because its members themselves asserted a regional Asian identity. 
To validate our model, we developed data mining software to extract and match full author and institutional information from the PDFs of scientific papers.}, -journal = {Scientometrics}, -month = oct, -pages = {75–97}, -numpages = {23}, -keywords = {Triadic closure, Research facilitation, RF-metric, Pan-Asian SNP Consortium, Asian Research Area} -} +@inproceedings{10.1109/WI-IAT.2010.14, +author = {Tan, Yee Fan and Kan, Min-Yen}, +title = {Hierarchical Cost-Sensitive Web Resource Acquisition for Record Matching}, +year = {2010}, +isbn = {9780769541914}, +publisher = {IEEE Computer Society}, +address = {USA}, +url = {https://doi.org/10.1109/WI-IAT.2010.14}, +doi = {10.1109/WI-IAT.2010.14}, +abstract = {Web information is increasingly used as evidence in solving various problems, including record matching. However, acquiring web-based resources is slow and can incur other access costs. As such, solutions often acquire only a subset of the resources to achieve a balance between acquisition cost and benefit. Unfortunately, existing work has largely ignored the issue of which resources to acquire. They also fail to emphasize on the hierarchical nature of resource acquisitions, e.g., the search engine results for two queries must be obtained before their TF-IDF cosine similarity be computed. In this paper, we propose a framework for performing cost-sensitive acquisition of resources with hierarchical dependencies, and apply it to the web resource context. Our framework is versatile, and we show that a large variety of problems can be formulated using resource dependency graphs. We solve the resource acquisition problem by casting it as a combinatorial search problem. 
Finally, we demonstrate the effectiveness of our acquisition framework on record matching problems of different domains.}, +booktitle = {Proceedings of the 2010 IEEE/WIC/ACM International Conference on Web Intelligence and Intelligent Agent Technology - Volume 01}, +pages = {382–389}, +numpages = {8}, +series = {WI-IAT '10} +} -@article{10.1007/s10579-012-9210-3, -author = {Kim, Su Nam and Medelyan, Olena and Kan, Min-Yen and Baldwin, Timothy}, -title = {Automatic keyphrase extraction from scientific articles}, -year = {2013}, -issue_date = {September 2013}, +@inproceedings{10.5555/1875689.1875725, +author = {H\"{a}nse, Markus and Kan, Min-Yen and Karduck, Achim P.}, +title = {Kairos: proactive harvesting of research paper metadata from scientific conference web sites}, +year = {2010}, +isbn = {3642136532}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, -volume = {47}, -number = {3}, -issn = {1574-020X}, -url = {https://doi.org/10.1007/s10579-012-9210-3}, -doi = {10.1007/s10579-012-9210-3}, -abstract = {This paper describes the organization and results of the automatic keyphrase extraction task held at the Workshop on Semantic Evaluation 2010 (SemEval-2010). The keyphrase extraction task was specifically geared towards scientific articles. Systems were automatically evaluated by matching their extracted keyphrases against those assigned by the authors as well as the readers to the same documents. We outline the task, present the overall ranking of the submitted systems, and discuss the improvements to the state-of-the-art in keyphrase extraction.}, -journal = {Lang. Resour. Eval.}, -month = sep, -pages = {723–742}, -numpages = {20}, -keywords = {Keyphrase extraction, Scientific document processing, SemEval-2010, Shared task} +abstract = {We investigate the automatic harvesting of research paper metadata from recent scholarly events. 
Our system, Kairos, combines a focused crawler and an information extraction engine, to convert a list of conference websites into a index filled with fields of metadata that correspond to individual papers. Using event date metadata extracted from the conference website, Kairos proactively harvests metadata about the individual papers soon after they are made public. We use a Maximum Entropy classifier to classify uniform resource locators (URLs) as scientific conference websites and use Conditional Random Fields (CRF) to extract individual paper metadata from such websites. Experiments show an acceptable measure of classification accuracy of over 95\% for each of the two components.}, +booktitle = {Proceedings of the Role of Digital Libraries in a Time of Global Change, and 12th International Conference on Asia-Pacific Digital Libraries}, +pages = {226–235}, +numpages = {10}, +location = {Gold Coast, Australia}, +series = {ICADL'10} } -@inproceedings{10.1145/2467696.2467741, -author = {Bahrani, Bamdad and Kan, Min-Yen}, -title = {Multimodal alignment of scholarly documents and their presentations}, -year = {2013}, -isbn = {9781450320771}, +@inproceedings{10.1145/1816123.1816193, +author = {Nguyen, Thuy Dung and Kan, Min-Yen and Dang, Dinh-Trung and H\"{a}nse, Markus and Hong, Ching Hoi Andy and Luong, Minh-Thang and Gozali, Jesse Prabawa and Sugiyama, Kazunari and Tan, Yee Fan}, +title = {ForeCite: towards a reader-centric scholarly digital library}, +year = {2010}, +isbn = {9781450300858}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, -url = {https://doi.org/10.1145/2467696.2467741}, -doi = {10.1145/2467696.2467741}, -abstract = {We present a multimodal system for aligning scholarly docu- ments to corresponding presentations in a fine-grained man- ner (i.e., per presentation slide and per paper section). Our method improves upon a state-of-the-art baseline that em- ploys only textual similarity. 
Based on an analysis of base- line errors, we propose a three-pronged alignment system that combines textual, image, and ordering information to establish alignment. Our results show a statistically sig- nificant improvement of 25\%, confirming the importance of visual content in improving alignment accuracy.}, -booktitle = {Proceedings of the 13th ACM/IEEE-CS Joint Conference on Digital Libraries}, -pages = {281–284}, -numpages = {4}, -keywords = {slide presentation, slide image classification, fine-grained document alignment, digital library}, -location = {Indianapolis, Indiana, USA}, -series = {JCDL '13} +url = {https://doi.org/10.1145/1816123.1816193}, +doi = {10.1145/1816123.1816193}, +abstract = {We present ForeCite (FC), a prototype reader-centric digital library that supports the scholar in using scholarly documents. FC integrates three user interfaces: a bibliometric component, a document reader and annotation system, and a bibliographic management application.}, +booktitle = {Proceedings of the 10th Annual Joint Conference on Digital Libraries}, +pages = {387–388}, +numpages = {2}, +keywords = {ForeCite, argumentative zoning, document logical structure, scholarly digital library}, +location = {Gold Coast, Queensland, Australia}, +series = {JCDL '10} } -@inproceedings{10.1145/2467696.2467730, -author = {Gozali, Jesse Prabawa and Kan, Min-Yen and Sundaram, Hari}, -title = {Constructing an anonymous dataset from the personal digital photo libraries of mac app store users}, -year = {2013}, -isbn = {9781450320771}, +@inproceedings{10.1145/1816123.1816155, +author = {Zhao, Jin and Kan, Min-Yen}, +title = {Domain-specific iterative readability computation}, +year = {2010}, +isbn = {9781450300858}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, -url = {https://doi.org/10.1145/2467696.2467730}, -doi = {10.1145/2467696.2467730}, -abstract = {Personal digital photo libraries embody a large amount of information useful for research 
into photo organization, photo layout, and development of novel photo browser features. Even when anonymity can be ensured, amassing a sizable dataset from these libraries is still difficult due to the visibility and cost that would be required from such a study.We explore using the Mac App Store to reach more users to collect data from such personal digital photo libraries. More specifically, we compare and discuss how it differs from common data collection methods, e.g. Amazon Mechanical Turk, in terms of time, cost, quantity, and design of the data collection application.We have collected a large, openly available photo feature dataset using this manner. We illustrate the types of data that can be collected. In 60 days, we collected data from 20,778 photo sets (473,772 photos). Our study with the Mac App Store suggests that popular application distribution channels is a viable means to acquire massive data collections for researchers.}, -booktitle = {Proceedings of the 13th ACM/IEEE-CS Joint Conference on Digital Libraries}, -pages = {305–308}, -numpages = {4}, -keywords = {crowd-sourcing, data collection, ground truth, personal digital library, photography}, -location = {Indianapolis, Indiana, USA}, -series = {JCDL '13} +url = {https://doi.org/10.1145/1816123.1816155}, +doi = {10.1145/1816123.1816155}, +abstract = {We present a new algorithm to measure domain-specific readability. It iteratively computes the readability of domain-specific resources based on the difficulty of domain-specific concepts and vice versa, in a style reminiscent of other bipartite graph algorithms such as Hyperlink-Induced Topic Search (HITS) and the Stochastic Approach for Link-Structure Analysis (SALSA). While simple, our algorithm outperforms standard heuristic measures and remains competitive among supervised-learning approaches. 
Moreover, it is less domain-dependent and portable across domains as it does not rely on an annotated corpus or expensive expert knowledge that supervised or domain-specific methods require.}, +booktitle = {Proceedings of the 10th Annual Joint Conference on Digital Libraries}, +pages = {205–214}, +numpages = {10}, +keywords = {domain-specific information retrieval, graph-based algorithm, iterative computation, readability measure}, +location = {Gold Coast, Queensland, Australia}, +series = {JCDL '10} } -@inproceedings{10.1145/2467696.2467703, -author = {Do, Huy Hoang Nhat and Chandrasekaran, Muthu Kumar and Cho, Philip S. and Kan, Min Yen}, -title = {Extracting and matching authors and affiliations in scholarly documents}, -year = {2013}, -isbn = {9781450320771}, -publisher = {Association for Computing Machinery}, -address = {New York, NY, USA}, -url = {https://doi.org/10.1145/2467696.2467703}, -doi = {10.1145/2467696.2467703}, -abstract = {We introduce Enlil, an information extraction system that discovers the institutional affiliations of authors in scholarly papers. Enlil consists of two steps: one that first identifies authors and affiliations using a conditional random field; and a second support vector machine that connects authors to their affiliations. We benchmark Enlil in three separate experiments drawn from three different sources: the ACL Anthology Corpus, the ACM Digital Library, and a set of cross-disciplinary scientific journal articles acquired by querying Google Scholar. Against a state-of-the-art production baseline, Enlil reports a statistically significant improvement in F_1 of nearly 10\% (p << 0.01). In the case of multidisciplinary articles from Google Scholar, Enlil is benchmarked over both clean input (F_1 > 90\%) and automatically-acquired input (F_1 > 80\%).We have deployed Enlil in a case study involving Asian genomics research publication patterns to understand how government sponsored collaborative links evolve. 
Enlil has enabled our team to construct and validate new metrics to quantify the facilitation of research as opposed to direct publication.}, -booktitle = {Proceedings of the 13th ACM/IEEE-CS Joint Conference on Digital Libraries}, -pages = {219–228}, -numpages = {10}, -keywords = {conditional random fields, logical structure discovery, metadata extraction, rich document features, support vector machine}, -location = {Indianapolis, Indiana, USA}, -series = {JCDL '13} +@inproceedings{10.5555/1699750.1699762, +author = {Hong, Ching Hoi Andy and Gozali, Jesse Prabawa and Kan, Min-Yen}, +title = {FireCite: lightweight real-time reference string extraction from webpages}, +year = {2009}, +isbn = {9781932432589}, +publisher = {Association for Computational Linguistics}, +address = {USA}, +abstract = {We present FireCite, a Mozilla Firefox browser extension that helps scholars assess and manage scholarly references on the web by automatically detecting and parsing such reference strings in real-time. FireCite has two main components: 1) a reference string recognizer that has a high recall of 96\%, and 2) a reference string parser that can process HTML web pages with an overall F1 of 87.8 and plaintext reference strings with an overall F1 of 97. In our preliminary evaluation, we presented our FireCite prototype to four academics in separate unstructured interviews. 
Their positive feedback gives evidence to the desirability of FireCite's citation management capabilities.}, +booktitle = {Proceedings of the 2009 Workshop on Text and Citation Analysis for Scholarly Digital Libraries}, +pages = {71–79}, +numpages = {9}, +location = {Suntec, Singapore}, +series = {NLPIR4DL '09} } -@article{10.1007/s10579-012-9176-1, -author = {Wang, Aobo and Hoang, Cong Duy and Kan, Min-Yen}, -title = {Perspectives on crowdsourcing annotations for natural language processing}, -year = {2013}, -issue_date = {March 2013}, +@inproceedings{10.5555/2039901.2039911, +author = {Teufel, Simone and Kan, Min-Yen}, +title = {Robust argumentative zoning for sensemaking in scholarly documents}, +year = {2009}, +isbn = {9783642231599}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, -volume = {47}, -number = {1}, -issn = {1574-020X}, -url = {https://doi.org/10.1007/s10579-012-9176-1}, -doi = {10.1007/s10579-012-9176-1}, -abstract = {Crowdsourcing has emerged as a new method for obtaining annotations for training models for machine learning. While many variants of this process exist, they largely differ in their methods of motivating subjects to contribute and the scale of their applications. To date, there has yet to be a study that helps the practitioner to decide what form an annotation application should take to best reach its objectives within the constraints of a project. To fill this gap, we provide a faceted analysis of crowdsourcing from a practitioner's perspective, and show how our facets apply to existing published crowdsourced annotation applications. We then summarize how the major crowdsourcing genres fill different parts of this multi-dimensional space, which leads to our recommendations on the potential opportunities crowdsourcing offers to future annotation efforts.}, -journal = {Lang. Resour. 
Eval.}, -month = mar, -pages = {9–31}, -numpages = {23}, -keywords = {Wikipedia, NLP, Mechanical Turk, Human computation, Games with a purpose, Crowdsourcing, Annotation} +abstract = {We present an automated approach to classify sentences of scholarly work with respect to their rhetorical function. While previous work that achieves this task of argumentative zoning requires richly annotated input, our approach is robust to noise and can process raw text. Even in cases where the input has noise (as it is obtained from optical character recognition or text extraction from PDF files), our robust classifier is largely accurate. We perform an in-depth study of our system both with clean and noisy inputs. We also give preliminary results from in situ acceptability testing when the classifier is embedded within a digital library reading environment.}, +booktitle = {Proceedings of the 2009 International Conference on Advanced Language Technologies for Digital Libraries}, +pages = {154–170}, +numpages = {17}, +location = {Viareggio, Italy}, +series = {NLP4DL'09/AT4DL'09} } -@inproceedings{10.1007/978-3-642-33290-6_12, -author = {Cui, Anqi and Yang, Liner and Hou, Dejun and Kan, Min-Yen and Liu, Yiqun and Zhang, Min and Ma, Shaoping}, -title = {PrEV: preservation explorer and vault for web 2.0 user-generated content}, -year = {2012}, -isbn = {9783642332890}, +@inproceedings{10.1007/978-3-540-89533-6_33, +author = {Dang, Dinh-Trung and Tan, Yee Fan and Kan, Min-Yen}, +title = {Towards a Webpage-Based Bibliographic Manager}, +year = {2008}, +isbn = {9783540895329}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, -url = {https://doi.org/10.1007/978-3-642-33290-6_12}, -doi = {10.1007/978-3-642-33290-6_12}, -abstract = {We present the Pr eservation E xplorer and V ault (PrEV) system, a city-centric multilingual digital library that archives and makes available Web 2.0 resources, and aims to store a comprehensive record of what urban lifestyle is like. 
To match the current state of the digital environment, a key architectural design choice in PrEV is to archive not only Web 1.0 web pages, but also Web 2.0 multilingual resources that include multimedia, real-time microblog content, as well as mobile application descriptions (e.g., iPhone app) in a collaborative manner. PrEV performs the preservation of such resources for posterity, and makes them available for programmatic retrieval by third party agents, and for exploration by scholars with its user interface.}, -booktitle = {Proceedings of the Second International Conference on Theory and Practice of Digital Libraries}, -pages = {101–112}, -numpages = {12}, -keywords = {web 2.0, user-generated content, preservation, archive visualization, PrEV, NExT, API}, -location = {Paphos, Cyprus}, -series = {TPDL'12} -} - -@inproceedings{10.1109/ICMEW.2012.12, -author = {Gozali, Jesse Prabawa and Kan, Min-Yen and Sundaram, Hari}, -title = {Hidden Markov Model for Event Photo Stream Segmentation}, -year = {2012}, -isbn = {9780769547299}, -publisher = {IEEE Computer Society}, -address = {USA}, -url = {https://doi.org/10.1109/ICMEW.2012.12}, -doi = {10.1109/ICMEW.2012.12}, -abstract = {A photo stream is a chronological sequence of photos. Most existing photo stream segmentation methods assume that a photo stream comprises of photos from multiple events and their goal is to produce groups of photos, each corresponding to an event, i.e. they perform automatic albuming. Even if these photos are grouped by event, sifting through the abundance of photos in each event is cumbersome. To help make photos of each event more manageable, we propose a photo stream segmentation method for an event photo stream--the chronological sequence of photos of a single event--to produce groups of photos, each corresponding to a photo-worthy moment in the event. 
Our method is based on a hidden Markov model with parameters learned from time, EXIF metadata, and visual information from 1) training data of unlabelled, unsegmented event photo streams and 2) the event photo stream we want to segment. In an experiment with over 5000 photos from 28 personal photo sets, our method outperformed all six baselines with statistical significance (p}, -booktitle = {Proceedings of the 2012 IEEE International Conference on Multimedia and Expo Workshops}, -pages = {25–30}, -numpages = {6}, -keywords = {Event photo stream segmentation, digital photo library, hidden Markov model}, -series = {ICMEW '12} -} - -@inproceedings{10.1145/2325296.2325328, -author = {Poon, Jonathan Y.H. and Sugiyama, Kazunari and Tan, Yee Fan and Kan, Min-Yen}, -title = {Instructor-centric source code plagiarism detection and plagiarism corpus}, -year = {2012}, -isbn = {9781450312462}, -publisher = {Association for Computing Machinery}, -address = {New York, NY, USA}, -url = {https://doi.org/10.1145/2325296.2325328}, -doi = {10.1145/2325296.2325328}, -abstract = {Existing source code plagiarism systems focus on the problem of identifying plagiarism between pairs of submissions. The task of detection, while essential, is only a small part of managing plagiarism in an instructional setting. Holistic plagiarism detection and management requires coordination and sharing of assignment similarity -- elevating plagiarism detection from pairwise similarity to cluster-based similarity; from a single assignment to a sequence of assignments in the same course, and even among instructors of different courses.To address these shortcomings, we have developed Student Submissions Integrity Diagnosis (SSID), an open-source system that provides holistic plagiarism detection in an instructor-centric way. SSID's visuals show overviews of plagiarism clusters throughout all assignments in a course as well as highlighting most-similar submissions on any specific student. 
SSID supports plagiarism detection workflows; e.g., allowing student assistants to flag suspicious assignments for later review and confirmation by an instructor with proper authority. Evidence is automatically entered into SSID's logs and shared among instructors.We have additionally collected a source code plagiarism corpus, which we employ to identify and correct shortcomings of previous plagiarism detection engines and to optimize parameter tuning for SSID deployment. Since its deployment, SSID's workflow enhancements have made plagiarism detection in our faculty less tedious and more successful.}, -booktitle = {Proceedings of the 17th ACM Annual Conference on Innovation and Technology in Computer Science Education}, -pages = {122–127}, -numpages = {6}, -keywords = {user interface, similarity, programming, plagiarism detection, plagiarism assessment, corpus studies}, -location = {Haifa, Israel}, -series = {ITiCSE '12} -} - -@inproceedings{10.1145/2232817.2232875, -author = {Gozali, Jesse Prabawa and Kan, Min-Yen and Sundaram, Hari}, -title = {How do people organize their photos in each event and how does it affect storytelling, searching and interpretation tasks?}, -year = {2012}, -isbn = {9781450311540}, -publisher = {Association for Computing Machinery}, -address = {New York, NY, USA}, -url = {https://doi.org/10.1145/2232817.2232875}, -doi = {10.1145/2232817.2232875}, -abstract = {This paper explores photo organization within an event photo stream, i.e. the chronological sequence of photos from a single event. The problem is important: with the advent of inexpensive, easy-to-use photo capture devices, people can take a large number of photos per event. A family trip, for example, may include hundreds of photos. In this work, we have developed a photo browser that uses automatically segmented groups of photos---referred to as chapters---to organize such photos. 
The photo browser also affords users with a drag-and-drop interface to refine the chapter groupings.We conducted an exploratory study of 23 college students with their 8096 personal photos from 92 events, to understand the role of different spatial organization strategies in our chapter-based photo browser, in performing storytelling, photo search and photo set interpretation tasks. We also report novel insights on how the subjects organized their photos into chapters. We tested three layout strategies: bi-level, grid-stacking and space-filling, against a baseline plain grid layout. We found that subjects value the chronological order of the chapters more than maximizing screen space usage and that they value chapter consistency more than the chronological order of the photos. For automatic chapter groupings, having low chapter boundary misses is more important than having low chapter boundary false alarms; the choice of chapter criteria and granularity for chapter groupings are very subjective; and subjects found that chapter-based photo organization helps in all three tasks of the user study. Users preferred the chapter-based layout strategies to the baseline at a statistically significant level, with the grid-stacking strategy preferred the most.}, -booktitle = {Proceedings of the 12th ACM/IEEE-CS Joint Conference on Digital Libraries}, -pages = {315–324}, -numpages = {10}, -keywords = {event photo stream segmentation, photo browser, photo digital library, photo layouts}, -location = {Washington, DC, USA}, -series = {JCDL '12} +url = {https://doi.org/10.1007/978-3-540-89533-6_33}, +doi = {10.1007/978-3-540-89533-6_33}, +abstract = {We present ForeCiteNote, an application that organizes personal digital collections of research articles. It is architected as a single HTML page with embedded Javascript that runs within a web browser. 
On top of standard annotation and tagging functionality, it also supports both online and offline usage patterns, including local storage of the paper collection.}, +booktitle = {Proceedings of the 11th International Conference on Asian Digital Libraries: Universal and Ubiquitous Access to Information}, +pages = {313–316}, +numpages = {4}, +location = {Bali, Indonesia}, +series = {ICADL 08} } -@inproceedings{10.1145/1998076.1998134, -author = {Ly, Duy Khang and Sugiyama, Kazunari and Lin, Ziheng and Kan, Min-Yen}, -title = {Product review summarization from a deeper perspective}, -year = {2011}, -isbn = {9781450307444}, -publisher = {Association for Computing Machinery}, -address = {New York, NY, USA}, -url = {https://doi.org/10.1145/1998076.1998134}, -doi = {10.1145/1998076.1998134}, -abstract = {With product reviews growing in depth and becoming more numerous, it is growing challenge to acquire a comprehensive understanding of their contents, for both customers and product manufacturers. We built a system that automatically summarizes a large collection of product reviews to generate a concise summary. Importantly, our system not only extracts the review sentiments but also the underlying justification for their opinion. 
We solve this problem through a novel application of clustering and validate our approach through an empirical study, obtaining good performance as judged by F-measure (the harmonic mean of purity and inverse purity).}, -booktitle = {Proceedings of the 11th Annual International ACM/IEEE Joint Conference on Digital Libraries}, -pages = {311–314}, -numpages = {4}, -keywords = {summarization, sentiment analysis, clustering}, -location = {Ottawa, Ontario, Canada}, -series = {JCDL '11} +@article{10.1007/s00799-008-0042-0, +author = {Kan, Min-Yen and Lee, Dongwon and Lim, Ee-Peng}, +title = {Scholarly digital libraries at scale: introduction to the special issue on very large digital libraries}, +year = {2008}, +issue_date = {November 2008}, +publisher = {Springer-Verlag}, +address = {Berlin, Heidelberg}, +volume = {9}, +number = {2}, +issn = {1432-5012}, +url = {https://doi.org/10.1007/s00799-008-0042-0}, +doi = {10.1007/s00799-008-0042-0}, +journal = {Int. J. Digit. Libr.}, +month = nov, +pages = {81–82}, +numpages = {2} } -@inproceedings{10.1145/1998076.1998134, -author = {Ly, Duy Khang and Sugiyama, Kazunari and Lin, Ziheng and Kan, Min-Yen}, -title = {Product review summarization from a deeper perspective}, -year = {2011}, -isbn = {9781450307444}, +@inproceedings{10.1145/1378889.1378951, +author = {Liew, Guo Min and Kan, Min-Yen}, +title = {Slide image retrieval: a preliminary study}, +year = {2008}, +isbn = {9781595939982}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, -url = {https://doi.org/10.1145/1998076.1998134}, -doi = {10.1145/1998076.1998134}, -abstract = {With product reviews growing in depth and becoming more numerous, it is growing challenge to acquire a comprehensive understanding of their contents, for both customers and product manufacturers. We built a system that automatically summarizes a large collection of product reviews to generate a concise summary. 
Importantly, our system not only extracts the review sentiments but also the underlying justification for their opinion. We solve this problem through a novel application of clustering and validate our approach through an empirical study, obtaining good performance as judged by F-measure (the harmonic mean of purity and inverse purity).}, -booktitle = {Proceedings of the 11th Annual International ACM/IEEE Joint Conference on Digital Libraries}, -pages = {311–314}, +url = {https://doi.org/10.1145/1378889.1378951}, +doi = {10.1145/1378889.1378951}, +abstract = {We consider the task of automatic slide image retrieval, in which slide images are ranked for relevance against a textual query. Our implemented system, SLIDIR caters specifically for this task using features specifically designed for synthetic images embedded within slide presentation. We show promising results in both the ranking and binary relevance task and analyze the contribution of different features in the task performance.}, +booktitle = {Proceedings of the 8th ACM/IEEE-CS Joint Conference on Digital Libraries}, +pages = {359–362}, numpages = {4}, -keywords = {summarization, sentiment analysis, clustering}, -location = {Ottawa, Ontario, Canada}, -series = {JCDL '11} +keywords = {synthetic images, slidir, slides, presentations, image retrieval}, +location = {Pittsburgh PA, PA, USA}, +series = {JCDL '08} } -@inproceedings{10.1145/1998076.1998133, -author = {Sugiyama, Kazunari and Kan, Min-Yen}, -title = {Serendipitous recommendation for scholarly papers considering relations among researchers}, -year = {2011}, -isbn = {9781450307444}, +@inproceedings{10.1145/1378889.1378951, +author = {Liew, Guo Min and Kan, Min-Yen}, +title = {Slide image retrieval: a preliminary study}, +year = {2008}, +isbn = {9781595939982}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, -url = {https://doi.org/10.1145/1998076.1998133}, -doi = {10.1145/1998076.1998133}, -abstract = {Serendipity 
occurs when one finds an interesting discovery while searching for something else. While search engines seek to report work relevant to a targeted query, recommendation engines are particularly well-suited for serendipitous recommendations as such processes do not need to fulfill a targeted query. Junior researchers can use such an engine to broaden their horizon and learn new areas, while senior researchers can discover interdisciplinary frontiers to apply integrative research. We adapt a state-of-the-art scholarly paper recommendation system's user profile construction to make use of information drawn from 1) dissimilar users and 2) co-authors to specifically target serendipitous recommendation.}, -booktitle = {Proceedings of the 11th Annual International ACM/IEEE Joint Conference on Digital Libraries}, -pages = {307–310}, +url = {https://doi.org/10.1145/1378889.1378951}, +doi = {10.1145/1378889.1378951}, +abstract = {We consider the task of automatic slide image retrieval, in which slide images are ranked for relevance against a textual query. Our implemented system, SLIDIR caters specifically for this task using features specifically designed for synthetic images embedded within slide presentation. 
We show promising results in both the ranking and binary relevance task and analyze the contribution of different features in the task performance.}, +booktitle = {Proceedings of the 8th ACM/IEEE-CS Joint Conference on Digital Libraries}, +pages = {359–362}, numpages = {4}, -keywords = {digital library, information retrieval, recommendation, serendipity, user modeling}, -location = {Ottawa, Ontario, Canada}, -series = {JCDL '11} -} - -@article{10.4018/jdls.2010100101, -author = {Kan, Min-Yen and Luong, Minh-Thang and Nguyen, Thuy Dung}, -title = {Logical Structure Recovery in Scholarly Articles with Rich Document Features}, -year = {2010}, -issue_date = {October 2010}, -publisher = {IGI Global}, -address = {USA}, -volume = {1}, -number = {4}, -issn = {1947-9077}, -url = {https://doi.org/10.4018/jdls.2010100101}, -doi = {10.4018/jdls.2010100101}, -abstract = {Scholarly digital libraries increasingly provide analytics to information within documents themselves. This includes information about the logical document structure of use to downstream components, such as search, navigation, and summarization. In this paper, the authors describe SectLabel, a module that further develops existing software to detect the logical structure of a document from existing PDF files, using the formalism of conditional random fields. While previous work has assumed access only to the raw text representation of the document, a key aspect of this work is to integrate the use of a richer representation of the document that includes features from optical character recognition OCR, such as font size and text position. Experiments reveal that using such rich features improves logical structure detection by a significant 9 F1 points, over a suitable baseline, motivating the use of richer document representations in other digital library applications.}, -journal = {Int. J. Digit. 
Library Syst.}, -month = oct, -pages = {1–23}, -numpages = {23}, -keywords = {Rich Document Features, ParsCit, Metadata Extraction, Logical Structure Discovery, Conditional Random Fields} +keywords = {synthetic images, slidir, slides, presentations, image retrieval}, +location = {Pittsburgh, PA, USA}, +series = {JCDL '08} } \ No newline at end of file