@article{aishwaryaComputationallyEfficientSpeech2024,
title = {A Computationally Efficient Speech Emotion Recognition System Employing Machine Learning Classifiers and Ensemble Learning},
author = {Aishwarya, N. and Kaur, Kanwaljeet and Seemakurthy, Karthik},
year = {2024},
month = mar,
journal = {International Journal of Speech Technology},
volume = {27},
number = {1},
pages = {239--254},
issn = {1572-8110},
doi = {10.1007/s10772-024-10095-8},
urldate = {2024-09-26},
abstract = {Speech Emotion Recognition (SER) is the process of recognizing and classifying emotions expressed through speech. SER greatly facilitates personalized and empathetic interactions, enhances user experiences, enables sentiment analysis, and finds applications in psychology, healthcare, entertainment, and gaming industries. However, accurately detecting and classifying emotions is a highly challenging task for machines due to the complexity and multifaceted nature of emotions. This work gives a comparative analysis of two approaches for emotion recognition based on original and augmented speech signals. The first approach involves extracting 39 Mel Frequency Cepstrum Coefficients (MFCC) features, while the second approach involves using MFCC spectrograms and extracting features using deep learning models such as MobileNet V2, VGG16, Inception V3, VGG19 and ResNet 50. These features are then tested on Machine learning classifiers such as SVM, Linear SVM, Naive Bayes, k-Nearest Neighbours, Logistic Regression and Random Forest. From the experiments, it is observed that the SVM classifier works best with all the feature extraction techniques. Furthermore, to enhance the results, ensembling techniques involving CatBoost, and the Voting classifier along with SVM were utilized, resulting in improved test accuracies of 97.04\% on the RAVDESS dataset, 93.24\% on the SAVEE dataset, and 99.83\% on the TESS dataset, respectively. It is worth noting that both approaches are computationally efficient as they required no training time.},
langid = {english},
keywords = {Artificial Intelligence,Ensemble learning,Machine learning classifiers,MFCC,Pre-trained models,Speech emotion recognition},
file = {/Users/timokoch/Zotero/storage/LRDTLAD6/Aishwarya et al. - 2024 - A computationally efficient speech emotion recogni.pdf}
}
@article{akcaySpeechEmotionRecognition2020,
title = {Speech Emotion Recognition: {{Emotional}} Models, Databases, Features, Preprocessing Methods, Supporting Modalities, and Classifiers},
shorttitle = {Speech Emotion Recognition},
author = {Ak{\c c}ay, Mehmet Berkehan and O{\u g}uz, Kaya},
year = {2020},
month = jan,
journal = {Speech Communication},
volume = {116},
pages = {56--76},
issn = {0167-6393},
doi = {10.1016/j.specom.2019.12.001},
urldate = {2024-06-10},
abstract = {Speech is the most natural way of expressing ourselves as humans. It is only natural then to extend this communication medium to computer applications. We define speech emotion recognition (SER) systems as a collection of methodologies that process and classify speech signals to detect the embedded emotions. SER is not a new field, it has been around for over two decades, and has regained attention thanks to the recent advancements. These novel studies make use of the advances in all fields of computing and technology, making it necessary to have an update on the current methodologies and techniques that make SER possible. We have identified and discussed distinct areas of SER, provided a detailed survey of current literature of each, and also listed the current challenges.},
keywords = {Classification,Speech databases,Speech emotion recognition,Speech features,Survey},
file = {/Users/timokoch/Zotero/storage/X7U6ZWRU/S0167639319302262.html}
}
@misc{APAPsycNetFullTextHTML,
title = {{{APA PsycNet FullTextHTML}} Page},
urldate = {2024-04-25},
howpublished = {https://psycnet.apa.org/fulltext/2023-87986-001.html},
file = {/Users/timokoch/Zotero/storage/66TFJANB/2023-87986-001.html}
}
@article{aucouturierCovertDigitalManipulation2016,
title = {Covert Digital Manipulation of Vocal Emotion Alter Speakers' Emotional States in a Congruent Direction},
author = {Aucouturier, Jean-Julien and Johansson, Petter and Hall, Lars and Segnini, Rodrigo and Mercadi{\'e}, Lolita and Watanabe, Katsumi},
year = {2016},
month = jan,
journal = {Proceedings of the National Academy of Sciences},
volume = {113},
number = {4},
pages = {948--953},
publisher = {Proceedings of the National Academy of Sciences},
doi = {10.1073/pnas.1506552113},
urldate = {2024-11-18},
abstract = {Research has shown that people often exert control over their emotions. By modulating expressions, reappraising feelings, and redirecting attention, they can regulate their emotional experience. These findings have contributed to a blurring of the traditional boundaries between cognitive and emotional processes, and it has been suggested that emotional signals are produced in a goal-directed way and monitored for errors like other intentional actions. However, this interesting possibility has never been experimentally tested. To this end, we created a digital audio platform to covertly modify the emotional tone of participants' voices while they talked in the direction of happiness, sadness, or fear. The result showed that the audio transformations were being perceived as natural examples of the intended emotions, but the great majority of the participants, nevertheless, remained unaware that their own voices were being manipulated. This finding indicates that people are not continuously monitoring their own voice to make sure that it meets a predetermined emotional target. Instead, as a consequence of listening to their altered voices, the emotional state of the participants changed in congruence with the emotion portrayed, which was measured by both self-report and skin conductance level. This change is the first evidence, to our knowledge, of peripheral feedback effects on emotional experience in the auditory domain. As such, our result reinforces the wider framework of self-perception theory: that we often use the same inferential strategies to understand ourselves as those that we use to understand others.},
file = {/Users/timokoch/Zotero/storage/LYZGZNZ2/Aucouturier et al. - 2016 - Covert digital manipulation of vocal emotion alter.pdf}
}
@misc{auGroupedFeatureImportance2021,
title = {Grouped {{Feature Importance}} and {{Combined Features Effect Plot}}},
author = {Au, Quay and Herbinger, Julia and Stachl, Clemens and Bischl, Bernd and Casalicchio, Giuseppe},
year = {2021},
month = apr,
abstract = {Interpretable machine learning has become a very active area of research due to the rising popularity of machine learning algorithms and their inherently challenging interpretability. Most work in this area has been focused on the interpretation of single features in a model. However, for researchers and practitioners, it is often equally important to quantify the importance or visualize the effect of feature groups. To address this research gap, we provide a comprehensive overview of how existing model-agnostic techniques can be defined for feature groups to assess the grouped feature importance, focusing on permutation-based, refitting, and Shapley-based methods. We also introduce an importance-based sequential procedure that identifies a stable and well-performing combination of features in the grouped feature space. Furthermore, we introduce the combined features effect plot, which is a technique to visualize the effect of a group of features based on a sparse, interpretable linear combination of features. We used simulation studies and a real data example from computational psychology to analyze, compare, and discuss these methods.},
file = {/Users/timokoch/Zotero/storage/EI4CD9QA/Au et al. - 2021 - Grouped Feature Importance and Combined Features E.pdf}
}
@article{auGroupedFeatureImportance2022,
title = {Grouped Feature Importance and Combined Features Effect Plot},
author = {Au, Quay and Herbinger, Julia and Stachl, Clemens and Bischl, Bernd and Casalicchio, Giuseppe},
year = {2022},
month = jul,
journal = {Data Mining and Knowledge Discovery},
volume = {36},
number = {4},
pages = {1401--1450},
issn = {1573-756X},
doi = {10.1007/s10618-022-00840-5},
urldate = {2022-07-31},
abstract = {Interpretable machine learning has become a very active area of research due to the rising popularity of machine learning algorithms and their inherently challenging interpretability. Most work in this area has been focused on the interpretation of single features in a model. However, for researchers and practitioners, it is often equally important to quantify the importance or visualize the effect of feature groups. To address this research gap, we provide a comprehensive overview of how existing model-agnostic techniques can be defined for feature groups to assess the grouped feature importance, focusing on permutation-based, refitting, and Shapley-based methods. We also introduce an importance-based sequential procedure that identifies a stable and well-performing combination of features in the grouped feature space. Furthermore, we introduce the combined features effect plot, which is a technique to visualize the effect of a group of features based on a sparse, interpretable linear combination of features. We used simulation studies and real data examples to analyze, compare, and discuss these methods.},
langid = {english},
keywords = {Combined features effects,Dimension reduction,Grouped feature importance,Interpretable machine learning},
file = {/Users/timokoch/Zotero/storage/C27N2C7B/Au et al. - 2022 - Grouped feature importance and combined features e.pdf}
}
@article{ayuso-mateosMultiCountryEvaluationAffective2013,
title = {Multi-{{Country Evaluation}} of {{Affective Experience}}: {{Validation}} of an {{Abbreviated Version}} of the {{Day Reconstruction Method}} in {{Seven Countries}}},
shorttitle = {Multi-{{Country Evaluation}} of {{Affective Experience}}},
author = {{Ayuso-Mateos}, Jos{\'e} Luis and Miret, Marta and Caballero, Francisco F{\'e}lix and Olaya, Beatriz and Haro, Josep Maria and Kowal, Paul and Chatterji, Somnath},
year = {2013},
month = apr,
journal = {PLOS ONE},
volume = {8},
number = {4},
pages = {e61534},
publisher = {Public Library of Science},
issn = {1932-6203},
doi = {10.1371/journal.pone.0061534},
urldate = {2024-06-12},
abstract = {Background The Day Reconstruction Method (DRM) was developed to assess affective states as measures of experienced well-being. The present study aimed to validate an abbreviated version of the DRM in a representative sample of the population in seven countries (China, Ghana, India, Mexico, Russia, South Africa, and Spain), and to examine whether there are country differences in affect and in the relationships among the activities based on the similarity of the affect associated with each of them. Methods Interviews were conducted with 47,222 non-institutionalized adults from seven countries, using an abbreviated version of the DRM. A cluster analysis was carried out to classify activities on the basis of the similarity of the associated affect. In each country, the factorial structure of the affect adjectives was tested through Confirmatory Factor Analysis. Internal consistency and construct validity were also assessed. Moreover, the differences in affect across countries and the diurnal cycles of affect were evaluated. Results The DRM showed adequate psychometric properties regarding reliability and construct validity in all countries. Respondents from Ghana and South Africa reported more positive net affect whereas Indian respondents reported less positive net affect. Most of the countries showed a similar diurnal variation of affect, which tended to improve throughout the day. Conclusions The results show that this abbreviated version of the DRM is a useful tool for multi-country evaluation of experienced well-being.},
langid = {english},
keywords = {Clustering algorithms,Ghana,Global health,India,Mexico,Russia,South Africa,Spain},
file = {/Users/timokoch/Zotero/storage/FBE24DB3/Ayuso-Mateos et al. - 2013 - Multi-Country Evaluation of Affective Experience .pdf}
}
@misc{baevskiWav2vec20Framework2020,
title = {Wav2vec 2.0: {{A Framework}} for {{Self-Supervised Learning}} of {{Speech Representations}}},
shorttitle = {Wav2vec 2.0},
author = {Baevski, Alexei and Zhou, Henry and Mohamed, Abdelrahman and Auli, Michael},
year = {2020},
month = oct,
number = {arXiv:2006.11477},
eprint = {2006.11477},
publisher = {arXiv},
doi = {10.48550/arXiv.2006.11477},
urldate = {2024-11-18},
abstract = {We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks the speech input in the latent space and solves a contrastive task defined over a quantization of the latent representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech recognition with limited amounts of labeled data.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Sound,Electrical Engineering and Systems Science - Audio and Speech Processing},
file = {/Users/timokoch/Zotero/storage/V7DC5MLX/Baevski et al. - 2020 - wav2vec 2.0 A Framework for Self-Supervised Learn.pdf;/Users/timokoch/Zotero/storage/ST8YYBH5/2006.html}
}
@article{banseAcousticProfilesVocal1996,
title = {Acoustic Profiles in Vocal Emotion Expression},
author = {Banse, Rainer and Scherer, Klaus R.},
year = {1996},
journal = {Journal of Personality and Social Psychology},
volume = {70},
number = {3},
pages = {614--636},
publisher = {American Psychological Association},
address = {US},
issn = {1939-1315},
doi = {10.1037/0022-3514.70.3.614},
abstract = {Professional actors' portrayals of 14 emotions varying in intensity and valence were presented to judges. The results on decoding replicated earlier findings on the ability of judges to infer vocally expressed emotions with much-better-than-chance accuracy, including consistently found differences in the recognizability of different emotions. A total of 224 portrayals were subjected to digital acoustical analysis to obtain profiles of vocal parameters for different emotions. The data suggest that vocal parameters not only index the degree of intensity typical for different emotions but also differentiate valence or quality aspects. The data are also used to test theoretical predictions on vocal patterning based on the component process of model of emotion (K. R. Scherer, see record 1986-16849-001). Although most hypotheses are supported, some need to be revised on the basis of the empirical evidence. Discriminant analysis and jackknifing show remarkably high hit rates and patterns of confusion that closely mirror those found for listener-judges. (PsycINFO Database Record (c) 2016 APA, all rights reserved)},
keywords = {Emotional States,Inference,Oral Communication,Speech Characteristics,Speech Perception},
file = {/Users/timokoch/Zotero/storage/DSELHD5X/banse1996.pdf;/Users/timokoch/Zotero/storage/UL6FT9LI/Banse und Scherer - 1996 - Acoustic profiles in vocal emotion expression.pdf;/Users/timokoch/Zotero/storage/IFWHPSAM/1996-03014-015.html}
}
@article{banzigerIntroducingGenevaMultimodal2012,
title = {Introducing the {{Geneva Multimodal}} Expression Corpus for Experimental Research on Emotion Perception},
author = {B{\"a}nziger, Tanja and Mortillaro, Marcello and Scherer, Klaus R.},
year = {2012},
month = oct,
journal = {Emotion},
volume = {12},
number = {5},
pages = {1161--1179},
issn = {1931-1516},
doi = {10.1037/a0025827},
abstract = {Research on the perception of emotional expressions in faces and voices is exploding in psychology, the neurosciences, and affective computing. This article provides an overview of some of the major emotion expression (EE) corpora currently available for empirical research and introduces a new, dynamic, multimodal corpus of emotion expressions, the Geneva Multimodal Emotion Portrayals Core Set (GEMEP-CS). The design features of the corpus are outlined and justified, and detailed validation data for the core set selection are presented and discussed. Finally, an associated database with microcoded facial, vocal, and body action elements, as well as observer ratings, is introduced.},
langid = {english},
pmid = {22081890},
keywords = {Emotions,Facial Expression,Humans,Research,Voice}
}
@article{barrettEmotionalExpressionsReconsidered2019,
title = {Emotional {{Expressions Reconsidered}}: {{Challenges}} to {{Inferring Emotion From Human Facial Movements}}},
shorttitle = {Emotional {{Expressions Reconsidered}}},
author = {Barrett, Lisa Feldman and Adolphs, Ralph and Marsella, Stacy and Martinez, Aleix M. and Pollak, Seth D.},
year = {2019},
month = jul,
journal = {Psychological Science in the Public Interest},
volume = {20},
number = {1},
pages = {1--68},
publisher = {SAGE Publications Inc},
issn = {1529-1006},
doi = {10.1177/1529100619832930},
urldate = {2021-12-14},
abstract = {It is commonly assumed that a person's emotional state can be readily inferred from his or her facial movements, typically called emotional expressions or facial expressions. This assumption influences legal judgments, policy decisions, national security protocols, and educational practices; guides the diagnosis and treatment of psychiatric illness, as well as the development of commercial applications; and pervades everyday social interactions as well as research in other scientific fields such as artificial intelligence, neuroscience, and computer vision. In this article, we survey examples of this widespread assumption, which we refer to as the common view, and we then examine the scientific evidence that tests this view, focusing on the six most popular emotion categories used by consumers of emotion research: anger, disgust, fear, happiness, sadness, and surprise. The available scientific evidence suggests that people do sometimes smile when happy, frown when sad, scowl when angry, and so on, as proposed by the common view, more than what would be expected by chance. Yet how people communicate anger, disgust, fear, happiness, sadness, and surprise varies substantially across cultures, situations, and even across people within a single situation. Furthermore, similar configurations of facial movements variably express instances of more than one emotion category. In fact, a given configuration of facial movements, such as a scowl, often communicates something other than an emotional state. Scientists agree that facial movements convey a range of information and are important for social communication, emotional or otherwise. But our review suggests an urgent need for research that examines how people actually move their faces to express emotions and other social information in the variety of contexts that make up everyday life, as well as careful study of the mechanisms by which people perceive instances of emotion in one another. We make specific research recommendations that will yield a more valid picture of how people move their faces to express emotions and how they infer emotional meaning from facial movements in situations of everyday life. This research is crucial to provide consumers of emotion research with the translational information they require.},
langid = {english},
keywords = {emotion perception,emotion recognition,emotional expression},
file = {/Users/timokoch/Zotero/storage/LNM8BUUY/Barrett et al. - 2019 - Emotional Expressions Reconsidered Challenges to .pdf}
}
@incollection{batlinerAutomaticRecognitionEmotions2011,
title = {The {{Automatic Recognition}} of {{Emotions}} in {{Speech}}},
booktitle = {Cognitive {{Technologies}}},
author = {Batliner, Anton and Schuller, Bj{\"o}rn and Seppi, Dino and Steidl, Stefan and Devillers, Laurence and Vidrascu, Laurence and Vogt, Thurid and Aharonson, Vered and Amir, Noam},
year = {2011},
month = jan,
pages = {71--99},
doi = {10.1007/978-3-642-15184-2_6},
abstract = {In this chapter, we focus on the automatic recognition of emotional states using acoustic and linguistic parameters as features and classifiers as tools to predict the `correct' emotional states. We first sketch history and state of the art in this field; then we describe the process of `corpus engineering', i.e. the design and the recording of databases, the annotation of emotional states, and further processing such as manual or automatic segmentation. Next, we present an overview of acoustic and linguistic features that are extracted automatically or manually. In the section on classifiers, we deal with topics such as the curse of dimensionality and the sparse data problem, classifiers, and evaluation. At the end of each section, we point out important aspects that should be taken into account for the planning or the assessment of studies. The subject area of this chapter is not emotions in some narrow sense but in a wider sense encompassing emotion-related states such as moods, attitudes, or interpersonal stances as well. We do not aim at an in-depth treatise of some specific aspects or algorithms but at an overview of approaches and strategies that have been used or should be used.},
file = {/Users/timokoch/Zotero/storage/RZJEXMEE/Batliner et al. - 2011 - The Automatic Recognition of Emotions in Speech.pdf}
}
@article{ben-davidProsodySemanticsAre2016,
title = {Prosody and {{Semantics Are Separate}} but {{Not Separable Channels}} in the {{Perception}} of {{Emotional Speech}}: {{Test}} for {{Rating}} of {{Emotions}} in {{Speech}}},
shorttitle = {Prosody and {{Semantics Are Separate}} but {{Not Separable Channels}} in the {{Perception}} of {{Emotional Speech}}},
author = {{Ben-David}, Boaz and Multani, Namita and Shakuf, Vered and Rudzicz, Frank and {van Lieshout}, Pascal H. H. M.},
year = {2016},
month = feb,
journal = {Journal of Speech, Language, and Hearing Research},
volume = {59},
number = {1},
pages = {72--89},
publisher = {American Speech-Language-Hearing Association},
doi = {10.1044/2015_JSLHR-H-14-0323},
urldate = {2020-11-20},
abstract = {Purpose Our aim is to explore the complex interplay of prosody (tone of speech) and semantics (verbal content) in the perception of discrete emotions in speech. Method We implement a novel tool, the Test for Rating of Emotions in Speech. Eighty native English speakers were presented with spoken sentences made of different combinations of 5 discrete emotions (anger, fear, happiness, sadness, and neutral) presented in prosody and semantics. Listeners were asked to rate the sentence as a whole, integrating both speech channels, or to focus on one channel only (prosody or semantics). Results We observed supremacy of congruency, failure of selective attention, and prosodic dominance. Supremacy of congruency means that a sentence that presents the same emotion in both speech channels was rated highest; failure of selective attention means that listeners were unable to selectively attend to one channel when instructed; and prosodic dominance means that prosodic information plays a larger role than semantics in processing emotional speech. Conclusions Emotional prosody and semantics are separate but not separable channels, and it is difficult to perceive one without the influence of the other. Our findings indicate that the Test for Rating of Emotions in Speech can reveal specific aspects in the processing of emotional speech and may in the future prove useful for understanding emotion-processing deficits in individuals with pathologies.},
file = {/Users/timokoch/Zotero/storage/Z5B7T3YI/Ben-David Boaz M. et al. - 2016 - Prosody and Semantics Are Separate but Not Separab.pdf;/Users/timokoch/Zotero/storage/8DXCIQCI/2015_JSLHR-H-14-0323.html}
}
@article{biecekDALEXExplainersComplex2018,
title = {{{DALEX}}: {{Explainers}} for {{Complex Predictive Models}} in {{R}}},
shorttitle = {{{DALEX}}},
author = {Biecek, Przemyslaw},
year = {2018},
journal = {Journal of Machine Learning Research},
volume = {19},
number = {84},
pages = {1--5},
issn = {1533-7928},
urldate = {2020-12-04},
file = {/Users/timokoch/Zotero/storage/2GLJRE4D/Biecek - 2018 - DALEX Explainers for Complex Predictive Models in.pdf;/Users/timokoch/Zotero/storage/9X4J9LJ9/18-416.html;/Users/timokoch/Zotero/storage/Q73NDGYV/18-416.html}
}
@article{bischlResamplingMethodsMetaModel2012,
title = {Resampling {{Methods}} for {{Meta-Model Validation}} with {{Recommendations}} for {{Evolutionary Computation}}},
author = {Bischl, B. and Mersmann, O. and Trautmann, H. and Weihs, C.},
year = {2012},
month = jun,
journal = {Evolutionary Computation},
volume = {20},
number = {2},
pages = {249--275},
issn = {1063-6560, 1530-9304},
doi = {10.1162/EVCO_a_00069},
urldate = {2019-09-18},
abstract = {Meta-modeling has become a crucial tool in solving expensive optimization problems. Much of the work in the past has focused on finding a good regression method to model the fitness function. Examples include classical linear regression, splines, neural networks, Kriging and support vector regression. This paper specifically draws attention to the fact that assessing model accuracy is a crucial aspect in the meta-modeling framework. Resampling strategies such as cross-validation, subsampling, bootstrapping, and nested resampling are prominent methods for model validation and are systematically discussed with respect to possible pitfalls, shortcomings, and specific features. A survey of meta-modeling techniques within evolutionary optimization is provided. In addition, practical examples illustrating some of the pitfalls associated with model selection and performance assessment are presented. Finally, recommendations are given for choosing a model validation technique for a particular setting.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/TSY5U6LF/Bischl et al. - 2012 - Resampling Methods for Meta-Model Validation with .pdf}
}
@book{bolgerIntensiveLongitudinalMethods2013,
title = {Intensive Longitudinal Methods: {{An}} Introduction to Diary and Experience Sampling Research},
shorttitle = {Intensive Longitudinal Methods},
author = {Bolger, Niall and Laurenceau, Jean-Philippe},
year = {2013},
series = {Intensive Longitudinal Methods: {{An}} Introduction to Diary and Experience Sampling Research},
pages = {xv, 256},
publisher = {Guilford Press},
address = {New York, NY, US},
abstract = {A complete, practical guide to planning and executing an intensive longitudinal study, this book provides the tools for understanding within-subject social, psychological, and physiological processes in everyday contexts. Intensive longitudinal studies involve many repeated measurements taken on individuals, dyads, or groups, and include diary and experience sampling studies. A range of engaging, worked-through research examples with datasets are featured. Coverage includes how to: select the best intensive longitudinal design for a particular research question, model within-subject change processes for continuous and categorical outcomes, distinguish within-subject from between-subjects effects, assess the reliability of within-subject changes, assure sufficient statistical power, and more. Several end-of-chapter write-ups illustrate effective ways to present study findings for publication. (PsycINFO Database Record (c) 2016 APA, all rights reserved)},
isbn = {978-1-4625-0678-1 978-1-4625-0692-7},
keywords = {Longitudinal Studies,Measurement,Methodology,Psychophysiology,Social Processes},
file = {/Users/timokoch/Zotero/storage/DM3HS94F/2012-17340-000.html}
}
@article{boydPersonalityPanoramaConceptualizing2020,
title = {The {{Personality Panorama}}: {{Conceptualizing Personality}} through {{Big Behavioural Data}}},
shorttitle = {The {{Personality Panorama}}},
author = {Boyd, Ryan L. and Pasca, Paola and Lanning, Kevin},
year = {2020},
month = sep,
journal = {European Journal of Personality},
volume = {34},
number = {5},
pages = {599--612},
publisher = {SAGE Publications Ltd},
issn = {0890-2070},
doi = {10.1002/per.2254},
urldate = {2022-12-16},
abstract = {Personality psychology has long been grounded in data typologies, particularly in the delineation of behavioural, life outcome, informant-report, and self-report sources of data from one another. Such data typologies are becoming obsolete in the face of new methods, technologies, and data philosophies. In this article, we discuss personality psychology's historical thinking about data, modern data theory's place in personality psychology, and several qualities of big data that urge a rethinking of personality itself. We call for a move away from self-report questionnaires and a reprioritization of the study of behaviour within personality science. With big data and behavioural assessment, we have the potential to witness the confluence of situated, seamlessly interacting psychological processes, forming an inclusive, dynamic, multiangle view of personality. However, big behavioural data come hand in hand with important ethical considerations, and our emerging ability to create a `personality panopticon' requires careful and thoughtful navigation. For our research to improve and thrive in partnership with new technologies, we must not only wield our new tools thoughtfully, but humanely. Through discourse and collaboration with other disciplines and the general public, we can foster mutual growth and ensure that humanity's burgeoning technological capabilities serve, rather than control, the public interest. {\copyright} 2020 European Association of Personality Psychology},
langid = {english},
file = {/Users/timokoch/Zotero/storage/RY3DHZLG/Boyd et al. - 2020 - The Personality Panorama Conceptualizing Personal.pdf}
}
@article{breimanRandomForests2001,
title = {Random Forests},
author = {Breiman, Leo},
year = {2001},
journal = {Machine Learning},
volume = {45},
number = {1},
pages = {5--32},
issn = {0885-6125}
}
@article{brooksDeepLearningReveals2023,
title = {Deep Learning Reveals What Vocal Bursts Express in Different Cultures},
author = {Brooks, Jeffrey A. and Tzirakis, Panagiotis and Baird, Alice and Kim, Lauren and Opara, Michael and Fang, Xia and Keltner, Dacher and Monroy, Maria and Corona, Rebecca and Metrick, Jacob and Cowen, Alan S.},
year = {2023},
month = feb,
journal = {Nature Human Behaviour},
volume = {7},
number = {2},
pages = {240--250},
publisher = {Nature Publishing Group},
issn = {2397-3374},
doi = {10.1038/s41562-022-01489-2},
urldate = {2023-03-01},
abstract = {Human social life is rich with sighs, chuckles, shrieks and other emotional vocalizations, called `vocal bursts'. Nevertheless, the meaning of vocal bursts across cultures is only beginning to be understood. Here, we combined large-scale experimental data collection with deep learning to reveal the shared and culture-specific meanings of vocal bursts. A total of n\,=\,4,031 participants in China, India, South Africa, the USA and Venezuela mimicked vocal bursts drawn from 2,756 seed recordings. Participants also judged the emotional meaning of each vocal burst. A deep neural network tasked with predicting the culture-specific meanings people attributed to vocal bursts while disregarding context and speaker identity discovered 24 acoustic dimensions, or kinds, of vocal expression with distinct emotion-related meanings. The meanings attributed to these complex vocal modulations were 79\% preserved across the five countries and three languages. These results reveal the underlying dimensions of human emotional vocalization in remarkable detail.},
copyright = {2022 The Author(s), under exclusive licence to Springer Nature Limited},
langid = {english},
keywords = {Emotion,Human behaviour}
}
@inproceedings{burkhardtDatabaseGermanEmotional2005,
title = {A Database of {{German}} Emotional Speech},
booktitle = {Interspeech},
author = {Burkhardt, Felix and Paeschke, Astrid and Rolfes, Miriam and Sendlmeier, Walter F. and Weiss, Benjamin},
year = {2005},
volume = {5},
pages = {1517--1520}
}
@inproceedings{burkhardtDatabaseGermanEmotional2005a,
title = {A Database of {{German}} Emotional Speech},
author = {Burkhardt, Felix and Paeschke, Astrid and Rolfes, M. and Sendlmeier, Walter and Weiss, Benjamin},
year = {2005},
month = sep,
booktitle = {9th European Conference on Speech Communication and Technology},
volume = {5},
pages = {1520},
doi = {10.21437/Interspeech.2005-446},
abstract = {The article describes a database of emotional speech. Ten actors (5 female and 5 male) simulated the emotions, producing 10 German utterances (5 short and 5 longer sentences) which could be used in everyday communication and are interpretable in all applied emotions. The recordings were taken in an anechoic chamber with high-quality recording equipment. In addition to the sound electro-glottograms were recorded. The speech material comprises about 800 sentences (seven emotions * ten actors * ten sentences + some second versions). The complete database was evaluated in a perception test regarding the recognisability of emotions and their naturalness. Utterances recognised better than 80\% and judged as natural by more than 60\% of the listeners were phonetically labelled in a narrow transcription with special markers for voice-quality, phonatory and articulatory settings and articulatory features. The database can be accessed by the public via the internet (http://www.expressive-speech.net/emodb/).}
}
@article{cambriaAffectiveComputingSentiment2016,
title = {Affective {{Computing}} and {{Sentiment Analysis}}},
author = {Cambria, E.},
year = {2016},
month = mar,
journal = {IEEE Intelligent Systems},
volume = {31},
number = {2},
pages = {102--107},
issn = {1941-1294},
doi = {10.1109/MIS.2016.31},
abstract = {Understanding emotions is an important aspect of personal development and growth, and as such it is a key tile for the emulation of human intelligence. Besides being important for the advancement of AI, emotion processing is also important for the closely related task of polarity detection. The opportunity to automatically capture the general public's sentiments about social events, political movements, marketing campaigns, and product preferences has raised interest in both the scientific community, for the exciting open challenges, and the business world, for the remarkable fallouts in marketing and financial market prediction. This has led to the emerging fields of affective computing and sentiment analysis, which leverage human-computer interaction, information retrieval, and multimodal signal processing for distilling people's sentiments from the ever-growing amount of online social data.},
keywords = {affective computing,Affective computing,affective reasoning,emotion,emotion processing,emotion understanding,financial market prediction,human computer interaction,human intelligence emulation,human-computer interaction,information retrieval,intelligent systems,Knowledge based systems,marketing campaigns,multimodal signal processing,online social data,polarity detection,political movements,Pragmatics,public sentiments,scientific community,Semantics,sentiment analysis,Sentiment analysis,social events,social networking (online),Statistical analysis,Videos},
file = {/Users/timokoch/Zotero/storage/IZMLJUQY/Cambria - 2016 - Affective Computing and Sentiment Analysis.pdf;/Users/timokoch/Zotero/storage/DFCY8SQ5/7435182.html}
}
@article{carlierSearchStateTrait2022,
title = {In {{Search}} of {{State}} and {{Trait Emotion Markers}} in {{Mobile-Sensed Language}}: {{Field Study}}},
shorttitle = {In {{Search}} of {{State}} and {{Trait Emotion Markers}} in {{Mobile-Sensed Language}}},
author = {Carlier, Chiara and Niemeijer, Koen and Mestdagh, Merijn and Bauwens, Michael and Vanbrabant, Peter and Geurts, Luc and van Waterschoot, Toon and Kuppens, Peter},
year = {2022},
month = feb,
journal = {JMIR Mental Health},
volume = {9},
number = {2},
pages = {e31724},
publisher = {JMIR Publications Inc., Toronto, Canada},
doi = {10.2196/31724},
urldate = {2022-07-15},
abstract = {Background: Emotions and mood are important for overall well-being. Therefore, the search for continuous, effortless emotion prediction methods is an important field of study. Mobile sensing provides a promising tool and can capture one of the most telling signs of emotion: language. Objective: The aim of this study is to examine the separate and combined predictive value of mobile-sensed language data sources for detecting both momentary emotional experience as well as global individual differences in emotional traits and depression. Methods: In a 2-week experience sampling method study, we collected self-reported emotion ratings and voice recordings 10 times a day, continuous keyboard activity, and trait depression severity. We correlated state and trait emotions and depression and language, distinguishing between speech content (spoken words), speech form (voice acoustics), writing content (written words), and writing form (typing dynamics). We also investigated how well these features predicted state and trait emotions using cross-validation to select features and a hold-out set for validation. Results: Overall, the reported emotions and mobile-sensed language demonstrated weak correlations. The most significant correlations were found between speech content and state emotions and between speech form and state emotions, ranging up to 0.25. Speech content provided the best predictions for state emotions. None of the trait emotion--language correlations remained significant after correction. Among the emotions studied, valence and happiness displayed the most significant correlations and the highest predictive performance. Conclusions: Although using mobile-sensed language as an emotion marker shows some promise, correlations and predictive R2 values are low.},
copyright = {Unless stated otherwise, all articles are open-access distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work ("first published in the Journal of Medical Internet Research...") is properly cited with original URL and bibliographic citation information. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/A66EL3FT/Carlier et al. - 2022 - In Search of State and Trait Emotion Markers in Mo.pdf;/Users/timokoch/Zotero/storage/IUI62U7D/e31724.html}
}
@article{cowenMapping24Emotions2019,
title = {Mapping 24 Emotions Conveyed by Brief Human Vocalization},
author = {Cowen, Alan S. and Elfenbein, Hillary Anger and Laukka, Petri and Keltner, Dacher},
year = {2019},
month = sep,
journal = {The American Psychologist},
volume = {74},
number = {6},
pages = {698--712},
issn = {1935-990X},
doi = {10.1037/amp0000399},
abstract = {Emotional vocalizations are central to human social life. Recent studies have documented that people recognize at least 13 emotions in brief vocalizations. This capacity emerges early in development, is preserved in some form across cultures, and informs how people respond emotionally to music. What is poorly understood is how emotion recognition from vocalization is structured within what we call a semantic space, the study of which addresses questions critical to the field: How many distinct kinds of emotions can be expressed? Do expressions convey emotion categories or affective appraisals (e.g., valence, arousal)? Is the recognition of emotion expressions discrete or continuous? Guided by a new theoretical approach to emotion taxonomies, we apply large-scale data collection and analysis techniques to judgments of 2,032 emotional vocal bursts produced in laboratory settings (Study 1) and 48 found in the real world (Study 2) by U.S. English speakers (N = 1,105). We find that vocal bursts convey at least 24 distinct kinds of emotion. Emotion categories (sympathy, awe), more so than affective appraisals (including valence and arousal), organize emotion recognition. In contrast to discrete emotion theories, the emotion categories conveyed by vocal bursts are bridged by smooth gradients with continuously varying meaning. We visualize the complex, high-dimensional space of emotion conveyed by brief human vocalization within an online interactive map. (PsycINFO Database Record (c) 2019 APA, all rights reserved).},
langid = {english},
pmcid = {PMC6586540},
pmid = {30570267},
keywords = {Adolescent,Adult,Aged,Communication,Emotions,Female,Humans,Male,Middle Aged,Recognition Psychology,Semantics,Social Perception,Voice,Young Adult},
file = {/Users/timokoch/Zotero/storage/8H4LMREM/Cowen et al. - 2019 - Mapping 24 emotions conveyed by brief human vocali.pdf}
}
@article{cowenPrimacyCategoriesRecognition2019,
title = {The Primacy of Categories in the Recognition of 12 Emotions in Speech Prosody across Two Cultures},
author = {Cowen, Alan S. and Laukka, Petri and Elfenbein, Hillary Anger and Liu, Runjing and Keltner, Dacher},
year = {2019},
month = apr,
journal = {Nature Human Behaviour},
volume = {3},
number = {4},
pages = {369--382},
issn = {2397-3374},
doi = {10.1038/s41562-019-0533-6},
urldate = {2024-04-08},
abstract = {Central to emotion science is the degree to which categories, such as awe, or broader affective features, such as valence, underlie the recognition of emotional expression. To explore the processes by which people recognize emotion from prosody, US and Indian participants were asked to judge the emotion categories or affective features communicated by 2,519 speech samples produced by 100 actors from five cultures. With large-scale statistical inference methods, we find that prosody can communicate at least 12 distinct kinds of emotion that are preserved across the two cultures. Analyses of the semantic and acoustic structure of emotion recognition reveal that emotion categories drive emotion recognition more so than affective features, including valence. In contrast to discrete emotion theories, however, emotion categories are bridged by gradients representing blends of emotions. Our findings, visualized within an interactive map (https://s3-us-west-1.amazonaws.com/venec/map.html), reveal a complex, high-dimensional space of emotional states recognized cross-culturally in speech prosody.},
pmcid = {PMC6687085},
pmid = {30971794},
file = {/Users/timokoch/Zotero/storage/KXQDLLSH/Cowen et al. - 2019 - The primacy of categories in the recognition of 12.pdf}
}
@article{critchleyInteroceptionEmotion2017,
title = {Interoception and Emotion},
author = {Critchley, Hugo D and Garfinkel, Sarah N},
year = {2017},
month = oct,
journal = {Current Opinion in Psychology},
series = {Emotion},
volume = {17},
pages = {7--14},
issn = {2352-250X},
doi = {10.1016/j.copsyc.2017.04.020},
urldate = {2022-12-16},
abstract = {Influential theories suggest emotional feeling states arise from physiological changes from within the body. Interoception describes the afferent signalling, central processing, and neural and mental representation of internal bodily signals. Recent progress is made in conceptualizing interoception and its neural underpinnings. These developments are supported by empirical data concerning interoceptive mechanisms and their contribution to emotion. Fresh insights include description of short-term interoceptive effects on neural and mental processes (including fear-specific cardiac effects), the recognition of dissociable psychological dimensions of interoception, and models of interoceptive predictive coding that explain emotions and selfhood (reinforced by structural anatomical models and brain and experimental findings). This growing grasp of interoception is enriching our understanding of emotion and its disorders.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/9T6ZUA49/Critchley und Garfinkel - 2017 - Interoception and emotion.pdf}
}
@inproceedings{defrenEmotionalSpeechPerception2018,
title = {Emotional {{Speech Perception}}: {{A}} Set of Semantically Validated {{German}} Neutral and Emotionally Affective Sentences},
shorttitle = {Emotional {{Speech Perception}}},
booktitle = {9th {{International Conference}} on {{Speech Prosody}} 2018},
author = {Defren, Sabrina and {de Brito Castilho Wesseling}, Patricia and Allen, Shanley and Shakuf, Vered and {Ben-David}, Boaz and Lachmann, Thomas},
year = {2018},
month = jun,
pages = {714--718},
publisher = {ISCA},
doi = {10.21437/SpeechProsody.2018-145},
urldate = {2020-03-19},
abstract = {In order to address the complex interplay of prosody and semantics, a set of sentences were generated, suitable for investigating emotional speech perception in German. Forty-seven German native speakers rated the emotional content of sentences on a 6-point Likert scale. From a set of 54 sentences, 10-11 each could reliably be associated with one of four distinct emotions. The remaining 11 were assessed as neutral (expressing no emotion). The unambiguous assignment of semantic (emotional) content enables the study of prosody as an independent factor. Moreover, the sentences were balanced regarding average word frequency, average phonological neighborhood density, and number of syllables per sentence. This linguistic balance enables an unbiased evaluation of the roles of semantic content and prosody in emotional speech.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/7X46GSUP/Defren et al. - 2018 - Emotional Speech Perception A set of semantically.pdf}
}
@article{dejonckheereAssessingReliabilitySingleitem2022,
title = {Assessing the Reliability of Single-Item Momentary Affective Measurements in Experience Sampling.},
author = {Dejonckheere, Egon and Demeyer, Febe and Geusens, Birte and Piot, Maarten and Tuerlinckx, Francis and Verdonck, Stijn and Mestdagh, Merijn},
year = {2022},
month = dec,
journal = {Psychological Assessment},
volume = {34},
number = {12},
pages = {1138--1154},
issn = {1939-134X, 1040-3590},
doi = {10.1037/pas0001178},
urldate = {2023-01-24},
abstract = {In research on emotions in daily life, measurement error is often ignored because emotions are assessed with a single item to reduce participant burden. We introduce two retests procedures to determine how reliable such emotion ratings are and show that measurement error variance is too substantial to simply disregard.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/Q62ZVHN7/Dejonckheere et al. - 2022 - Assessing the reliability of single-item momentary.pdf}
}
@incollection{demetriouSelfReportQuestionnaires2015,
title = {Self-{{Report Questionnaires}}},
booktitle = {The {{Encyclopedia}} of {{Clinical Psychology}}},
author = {Demetriou, Constantina and Ozer, Bilge Uzun and Essau, Cecilia A.},
editor = {Cautin, Robin L. and Lilienfeld, Scott O.},
year = {2015},
month = jan,
pages = {1--6},
publisher = {John Wiley \& Sons, Inc.},
address = {Hoboken, NJ, USA},
doi = {10.1002/9781118625392.wbecp507},
urldate = {2019-03-07},
isbn = {978-1-118-62539-2},
langid = {english},
file = {/Users/timokoch/Zotero/storage/ASZUM2V3/Demetriou et al. - 2015 - Self-Report Questionnaires.pdf}
}
@misc{dengInterpretingTreeEnsembles2014,
title = {Interpreting {{Tree Ensembles}} with {{inTrees}}},
author = {Deng, Houtao},
year = {2014},
month = aug,
publisher = {arXiv},
eprint = {1408.5456},
primaryclass = {cs, stat},
urldate = {2021-06-18},
abstract = {Tree ensembles such as random forests and boosted trees are accurate but difficult to understand, debug and deploy. In this work, we provide the inTrees (interpretable trees) framework that extracts, measures, prunes and selects rules from a tree ensemble, and calculates frequent variable interactions. A rule-based learner, referred to as the simplified tree ensemble learner (STEL), can also be formed and used for future prediction. The inTrees framework can be applied to both classification and regression problems, and is applicable to many types of tree ensembles, e.g., random forests, regularized random forests, and boosted trees. We implemented the inTrees algorithms in the "inTrees" R package.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
file = {/Users/timokoch/Zotero/storage/GSLI2G3J/Deng - 2014 - Interpreting Tree Ensembles with inTrees.pdf;/Users/timokoch/Zotero/storage/X8DY5AFX/1408.html}
}
@article{dmelloReviewMetaAnalysisMultimodal2015,
title = {A {{Review}} and {{Meta-Analysis}} of {{Multimodal Affect Detection Systems}}},
author = {D'Mello, Sidney K. and Kory, Jacqueline},
year = {2015},
month = apr,
journal = {ACM Computing Surveys},
volume = {47},
number = {3},
pages = {1--36},
issn = {0360-0300, 1557-7341},
doi = {10.1145/2682899},
urldate = {2024-07-29},
abstract = {Affect detection is an important pattern recognition problem that has inspired researchers from several areas. The field is in need of a systematic review due to the recent influx of Multimodal (MM) affect detection systems that differ in several respects and sometimes yield incompatible results. This article provides such a survey via a quantitative review and meta-analysis of 90 peer-reviewed MM systems. The review indicated that the state of the art mainly consists of person-dependent models (62.2\% of systems) that fuse audio and visual (55.6\%) information to detect acted (52.2\%) expressions of basic emotions and simple dimensions of arousal and valence (64.5\%) with feature- (38.9\%) and decision-level (35.6\%) fusion techniques. However, there were also person-independent systems that considered additional modalities to detect nonbasic emotions and complex dimensions using model-level fusion techniques. The meta-analysis revealed that MM systems were consistently (85\% of systems) more accurate than their best unimodal counterparts, with an average improvement of 9.83\% (median of 6.60\%). However, improvements were three times lower when systems were trained on natural (4.59\%) versus acted data (12.7\%). Importantly, MM accuracy could be accurately predicted (cross-validated R2 of 0.803) from unimodal accuracies and two system-level factors. Theoretical and applied implications and recommendations are discussed.},
langid = {english}
}
@inproceedings{dubeyBigEARInferringAmbient2016,
title = {{{BigEAR}}: {{Inferring}} the {{Ambient}} and {{Emotional Correlates}} from {{Smartphone-Based Acoustic Big Data}}},
shorttitle = {{{BigEAR}}},
booktitle = {2016 {{IEEE First International Conference}} on {{Connected Health}}: {{Applications}}, {{Systems}} and {{Engineering Technologies}} ({{CHASE}})},
author = {Dubey, Harishchandra and Mehl, Matthias R. and Mankodiya, Kunal},
year = {2016},
month = jun,
pages = {78--83},
doi = {10.1109/CHASE.2016.46},
abstract = {This paper presents a novel BigEAR big data framework that employs psychological audio processing chain (PAPC) to process smartphone-based acoustic big data collected when the user performs social conversations in naturalistic scenarios. The overarching goal of BigEAR is to identify moods of the wearer from various activities such as laughing, singing, crying, arguing, and sighing. These annotations are based on ground truth relevant for psychologists who intend to monitor/infer the social context of individuals coping with breast cancer. We pursued a case study on couples coping with breast cancer to know how the conversations affect emotional and social well being. In the state-of-the-art methods, psychologists and their team have to hear the audio recordings for making these inferences by subjective evaluations that not only are time-consuming and costly, but also demand manual data coding for thousands of audio files. The BigEAR framework automates the audio analysis. We computed the accuracy of BigEAR with respect to the ground truth obtained from a human rater. Our approach yielded overall average accuracy of 88.76\% on real-world data from couples coping with breast cancer.},
keywords = {Acoustics,Big data,Breast cancer,Feature extraction,Mood,Speech},
file = {/Users/timokoch/Zotero/storage/CXDEKMQV/Dubey et al. - 2016 - BigEAR Inferring the Ambient and Emotional Correl.pdf;/Users/timokoch/Zotero/storage/RFFHULEF/7545817.html}
}
@article{dukesRiseAffectivism2021,
title = {The Rise of Affectivism},
author = {Dukes, Daniel and Abrams, Kathryn and Adolphs, Ralph and Ahmed, Mohammed E. and Beatty, Andrew and Berridge, Kent C. and Broomhall, Susan and Brosch, Tobias and Campos, Joseph J. and Clay, Zanna and Cl{\'e}ment, Fabrice and Cunningham, William A. and Damasio, Antonio and Damasio, Hanna and D'Arms, Justin and Davidson, Jane W. and de Gelder, Beatrice and Deonna, Julien and de Sousa, Ronnie and Ekman, Paul and Ellsworth, Phoebe C. and Fehr, Ernst and Fischer, Agneta and Foolen, Ad and Frevert, Ute and Grandjean, Didier and Gratch, Jonathan and Greenberg, Leslie and Greenspan, Patricia and Gross, James J. and Halperin, Eran and Kappas, Arvid and Keltner, Dacher and Knutson, Brian and Konstan, David and Kret, Mariska E. and LeDoux, Joseph E. and Lerner, Jennifer S. and Levenson, Robert W. and Loewenstein, George and Manstead, Antony S. R. and Maroney, Terry A. and Moors, Agnes and Niedenthal, Paula and Parkinson, Brian and Pavlidis, Ioannis and Pelachaud, Catherine and Pollak, Seth D. and Pourtois, Gilles and {Roettger-Roessler}, Birgitt and Russell, James A. and Sauter, Disa and Scarantino, Andrea and Scherer, Klaus R. and Stearns, Peter and Stets, Jan E. and Tappolet, Christine and Teroni, Fabrice and Tsai, Jeanne and Turner, Jonathan and Reekum, Carien Van and Vuilleumier, Patrik and Wharton, Tim and Sander, David},
year = {2021},
month = jun,
journal = {Nature Human Behaviour},
pages = {1--5},
publisher = {Nature Publishing Group},
issn = {2397-3374},
doi = {10.1038/s41562-021-01130-8},
urldate = {2021-06-15},
abstract = {Research over the past decades has demonstrated the explanatory power of emotions, feelings, motivations, moods, and other affective processes when trying to understand and predict how we think and behave. In this consensus article, we ask: has the increasingly recognized impact of affective phenomena ushered in a new era, the era of affectivism?},
copyright = {2021 Springer Nature Limited},
langid = {english},
file = {/Users/timokoch/Zotero/storage/R4JKCVNS/Dukes et al. - 2021 - The rise of affectivism.pdf;/Users/timokoch/Zotero/storage/HQZ8GRRA/s41562-021-01130-8.html}
}
@article{ekmanArgumentBasicEmotions1992,
title = {An Argument for Basic Emotions},
author = {Ekman, Paul},
year = {1992},
month = may,
journal = {Cognition and Emotion},
volume = {6},
number = {3-4},
pages = {169--200},
publisher = {Routledge},
issn = {0269-9931},
doi = {10.1080/02699939208411068},
urldate = {2021-11-24},
abstract = {Emotions are viewed as having evolved through their adaptive value in dealing with fundamental life-tasks. Each emotion has unique features: signal, physiology, and antecedent events. Each emotion also has characteristics in common with other emotions: rapid onset, short duration, unbidden occurrence, automatic appraisal, and coherence among responses. These shared and unique characteristics are the product of our evolution, and distinguish emotions from other affective phenomena.},
file = {/Users/timokoch/Zotero/storage/8APFPSCL/02699939208411068.html}
}
@article{ekmanRepertoireNonverbalBehavior1969,
title = {The {{Repertoire}} of {{Nonverbal Behavior}}: {{Categories}}, {{Origins}}, {{Usage}}, and {{Coding}}},
shorttitle = {The {{Repertoire}} of {{Nonverbal Behavior}}},
author = {Ekman, Paul and Friesen, Wallace V.},
year = {1969},
month = jan,
journal = {Semiotica},
volume = {1},
number = {1},
pages = {49--98},
publisher = {De Gruyter Mouton},
issn = {1613-3692},
doi = {10.1515/semi.1969.1.1.49},
urldate = {2024-09-26},
abstract = {The article The Repertoire of Nonverbal Behavior: Categories, Origins, Usage, and Coding was published on January 1, 1969, in the journal Semiotica (Volume 1, Issue 1).},
copyright = {De Gruyter expressly reserves the right to use all content for commercial text and data mining within the meaning of Section 44b of the German Copyright Act.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/V2JLAI64/Ekman and Friesen - 1969 - The Repertoire of Nonverbal Behavior Categories, .pdf}
}
@article{elayadiSurveySpeechEmotion2011,
title = {Survey on Speech Emotion Recognition: {{Features}}, Classification Schemes, and Databases},
shorttitle = {Survey on Speech Emotion Recognition},
author = {El Ayadi, Moataz and Kamel, Mohamed S. and Karray, Fakhri},
year = {2011},
month = mar,
journal = {Pattern Recognition},
volume = {44},
number = {3},
pages = {572--587},
issn = {0031-3203},
doi = {10.1016/j.patcog.2010.09.020},
urldate = {2024-09-17},
abstract = {Recently, increasing attention has been directed to the study of the emotional content of speech signals, and hence, many systems have been proposed to identify the emotional content of a spoken utterance. This paper is a survey of speech emotion classification addressing three important aspects of the design of a speech emotion recognition system. The first one is the choice of suitable features for speech representation. The second issue is the design of an appropriate classification scheme and the third issue is the proper preparation of an emotional speech database for evaluating system performance. Conclusions about the performance and limitations of current speech emotion recognition systems are discussed in the last section of this survey. This section also suggests possible ways of improving speech emotion recognition systems.},
keywords = {Archetypal emotions,Dimensionality reduction techniques,Emotional speech databases,Speech emotion recognition,Statistical classifiers},
file = {/Users/timokoch/Zotero/storage/APM9HW8Y/S0031320310004619.html}
}
@article{eybenGenevaMinimalisticAcoustic2016,
title = {The {{Geneva Minimalistic Acoustic Parameter Set}} ({{GeMAPS}}) for {{Voice Research}} and {{Affective Computing}}},
author = {Eyben, Florian and Scherer, Klaus R. and Schuller, Bj{\"o}rn and Sundberg, Johan and Andre, Elisabeth and Busso, Carlos and Devillers, Laurence Y. and Epps, Julien and Laukka, Petri and Narayanan, Shrikanth S. and Truong, Khiet P.},
year = {2016},
month = apr,
journal = {IEEE Transactions on Affective Computing},
volume = {7},
number = {2},
pages = {190--202},
issn = {1949-3045},
doi = {10.1109/TAFFC.2015.2457417},
urldate = {2019-02-20},
abstract = {Work on voice sciences over recent decades has led to a proliferation of acoustic parameters that are used quite selectively and are not always extracted in a similar fashion. With many independent teams working in different research areas, shared standards become an essential safeguard to ensure compliance with state-of-the-art methods allowing appropriate comparison of results across studies and potential integration and combination of extraction and recognition systems. In this paper we propose a basic standard acoustic parameter set for various areas of automatic voice analysis, such as paralinguistic or clinical speech analysis. In contrast to a large brute-force parameter set, we present a minimalistic set of voice parameters here. These were selected based on a) their potential to index affective physiological changes in voice production, b) their proven value in former studies as well as their automatic extractability, and c) their theoretical significance. The set is intended to provide a common baseline for evaluation of future research and eliminate differences caused by varying parameter sets or even different implementations of the same parameters. Our implementation is publicly available with the openSMILE toolkit. Comparative evaluations of the proposed feature set and large baseline feature sets of INTERSPEECH challenges show a high performance of the proposed set in relation to its size.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/CJPU7SQG/Eyben et al. - 2016 - The Geneva Minimalistic Acoustic Parameter Set (Ge.pdf}
}
@inproceedings{eybenOpensmileMunichVersatile2010,
title = {{{openSMILE}}: The Munich Versatile and Fast Open-Source Audio Feature Extractor},
shorttitle = {{{openSMILE}}},
booktitle = {Proceedings of the International Conference on {{Multimedia}} - {{MM}} '10},
author = {Eyben, Florian and W{\"o}llmer, Martin and Schuller, Bj{\"o}rn},
year = {2010},
pages = {1459--1462},
publisher = {ACM Press},
address = {Firenze, Italy},
doi = {10.1145/1873951.1874246},
urldate = {2021-11-09},
abstract = {We introduce the openSMILE feature extraction toolkit, which unites feature extraction algorithms from the speech processing and the Music Information Retrieval communities. Audio low-level descriptors such as CHROMA and CENS features, loudness, Mel-frequency cepstral coefficients, perceptual linear predictive cepstral coefficients, linear predictive coefficients, line spectral frequencies, fundamental frequency, and formant frequencies are supported. Delta regression and various statistical functionals can be applied to the low-level descriptors. openSMILE is implemented in C++ with no third-party dependencies for the core functionality. It is fast, runs on Unix and Windows platforms, and has a modular, component based architecture which makes extensions via plug-ins easy. It supports on-line incremental processing for all implemented features as well as off-line and batch processing. Numeric compatibility with future versions is ensured by means of unit tests. openSMILE can be downloaded from http://opensmile.sourceforge.net/.},
isbn = {978-1-60558-933-6},
langid = {english},
file = {/Users/timokoch/Zotero/storage/F5KPA499/Eyben et al. - 2010 - Opensmile the munich versatile and fast open-sour.pdf}
}
@inproceedings{eybenRecentDevelopmentsOpenSMILE2013,
title = {Recent {{Developments}} in {{openSMILE}}, the {{Munich Open-source Multimedia Feature Extractor}}},
booktitle = {Proceedings of the 21st {{ACM International Conference}} on {{Multimedia}}},
author = {Eyben, Florian and Weninger, Felix and Gross, Florian and Schuller, Bj{\"o}rn},
year = {2013},
series = {{{MM}} '13},
pages = {835--838},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/2502081.2502224},
urldate = {2018-11-21},
abstract = {We present recent developments in the openSMILE feature extraction toolkit. Version 2.0 now unites feature extraction paradigms from speech, music, and general sound events with basic video features for multi-modal processing. Descriptors from audio and video can be processed jointly in a single framework allowing for time synchronization of parameters, on-line incremental processing as well as off-line and batch processing, and the extraction of statistical functionals (feature summaries), such as moments, peaks, regression parameters, etc. Postprocessing of the features includes statistical classifiers such as support vector machine models or file export for popular toolkits such as Weka or HTK. Available low-level descriptors include popular speech, music and video features including Mel-frequency and similar cepstral and spectral coefficients, Chroma, CENS, auditory model based loudness, voice quality, local binary pattern, color, and optical flow histograms. Besides, voice activity detection, pitch tracking and face detection are supported. openSMILE is implemented in C++, using standard open source libraries for on-line audio and video input. It is fast, runs on Unix and Windows platforms, and has a modular, component based architecture which makes extensions via plug-ins easy. openSMILE 2.0 is distributed under a research license and can be downloaded from http://opensmile.sourceforge.net/.},
isbn = {978-1-4503-2404-5},
keywords = {acoustic features,affect recognition,affective computing,audio features,computational paralinguistics,feature extraction,machine learning,multimedia analysis,openSMILE,video features,visual features},
file = {/Users/timokoch/Zotero/storage/892EJHYH/Eyben et al. - 2013 - Recent Developments in openSMILE, the Munich Open-.pdf}
}
@article{fairbanksExperimentalStudyPitch1939,
title = {An Experimental Study of the Pitch Characteristics of the Voice during the Expression of Emotions},
author = {Fairbanks, G. and Pronovost, W.},
year = {1939},
journal = {Speech Monographs},
volume = {6},
pages = {87--104},
publisher = {Taylor \& Francis},
address = {United Kingdom},
issn = {0038-7169},
doi = {10.1080/03637753909374863},
abstract = {The pitch characteristics of simulated emotions were investigated to determine their distinguishing characteristics. Six actors read for recording selections whose content facilitated the expression of anger, contempt, fear, grief, and indifference. All selections contained the same brief test section. Although not itself having a single inherent affective meaning, this recorded test section, when separated from the main selection, was accurately identified by high percentages of a group of 64 observers, averaging 84\% for contempt, 78\% for anger, 66\% for fear, 78\% for grief, and 88\% for indifference. The most accurately identified example for each emotion was analyzed. These five were all identified by 94\% or more of the subjects. Tables and graphs are presented to demonstrate that measurable pitch characteristics distinguish these expressions of emotion from each other. Variations in pitch level, inflections, shifts, and frequency of pitch changes are described. (PsycINFO Database Record (c) 2017 APA, all rights reserved)},
file = {/Users/timokoch/Zotero/storage/Y3S3GGQ5/1940-02497-001.html}
}
@article{fanHowWellCan2023,
title = {How Well Can an {{AI}} Chatbot Infer Personality? {{Examining}} Psychometric Properties of Machine-Inferred Personality Scores},
shorttitle = {How Well Can an {{AI}} Chatbot Infer Personality?},
author = {Fan, Jinyan and Sun, Tianjun and Liu, Jiayi and Zhao, Teng and Zhang, Bo and Chen, Zheng and Glorioso, Melissa and Hack, Elissa},
year = {2023},
journal = {Journal of Applied Psychology},
publisher = {American Psychological Association},
address = {US},
issn = {1939-1854},
doi = {10.1037/apl0001082},
abstract = {The present study explores the plausibility of measuring personality indirectly through an artificial intelligence (AI) chatbot. This chatbot mines various textual features from users' free text responses collected during an online conversation/interview and then uses machine learning algorithms to infer personality scores. We comprehensively examine the psychometric properties of the machine-inferred personality scores, including reliability (internal consistency, split-half, and test--retest), factorial validity, convergent and discriminant validity, and criterion-related validity. Participants were undergraduate students (n = 1,444) enrolled in a large southeastern public university in the United States who completed a self-report Big Five personality measure (IPIP-300) and engaged with an AI chatbot for approximately 20--30 min. In a subsample (n = 407), we obtained participants' cumulative grade point averages from the University Registrar and had their peers rate their college adjustment. In an additional sample (n = 61), we obtained test--retest data. Results indicated that machine-inferred personality scores (a) had overall acceptable reliability at both the domain and facet levels, (b) yielded a comparable factor structure to self-reported questionnaire-derived personality scores, (c) displayed good convergent validity but relatively poor discriminant validity (averaged convergent correlations = .48 vs. averaged machine-score correlations = .35 in the test sample), (d) showed low criterion-related validity, and (e) exhibited incremental validity over self-reported questionnaire-derived personality scores in some analyses. In addition, there was strong evidence for cross-sample generalizability of psychometric properties of machine scores. Theoretical implications, future research directions, and practical considerations are discussed. (PsycInfo Database Record (c) 2023 APA, all rights reserved)},
keywords = {Artificial Intelligence,Convergent Validity,Conversational Agents,Criterion Validity,Discriminant Validity,Factorial Validity,Internal Consistency,Machine Learning,Personality,Split-Half Reliability,Test Reliability,Test Scores,Test Validity,Test-Retest Reliability},
file = {/Users/timokoch/Zotero/storage/B2ZFHA52/2023-43379-001.html}
}
@article{frickCommunicatingEmotionRole,
title = {Communicating {{Emotion}}: {{The Role}} of {{Prosodic Features}}},
author = {Frick, Robert W.},
year = {1985},
journal = {Psychological Bulletin},
volume = {97},
pages = {412--429},
langid = {english},
file = {/Users/timokoch/Zotero/storage/A9XPRU8L/Frick - Communicating Emotion The Role of Prosodic Featur.pdf}
}
@article{friedmanRegularizationPathsGeneralized2010,
title = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob},
year = {2010},
journal = {Journal of Statistical Software},
volume = {33},
number = {1},
pages = {1--22}
}
@inproceedings{gaoInvestigatingReliabilitySelfreport2021,
title = {Investigating the {{Reliability}} of {{Self-report Data}} in the {{Wild}}: {{The Quest}} for {{Ground Truth}}},
shorttitle = {Investigating the {{Reliability}} of {{Self-report Data}} in the {{Wild}}},
booktitle = {Adjunct {{Proceedings}} of the 2021 {{ACM International Joint Conference}} on {{Pervasive}} and {{Ubiquitous Computing}} and {{Proceedings}} of the 2021 {{ACM International Symposium}} on {{Wearable Computers}}},
author = {Gao, Nan and Saiedur Rahaman, Mohammad and Shao, Wei and Salim, Flora D},
year = {2021},
month = sep,
series = {{{UbiComp}} '21},
pages = {237--242},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
doi = {10.1145/3460418.3479338},
urldate = {2022-12-16},
abstract = {Inferring human mental state (e.g., emotion, depression, engagement) with sensing technology is one of the most valuable challenges in the affective computing area, which has a profound impact in all industries interacting with humans. Self-report is the most common way to quantify how people think, but prone to subjectivity and various responses bias. It is usually used as the ground truth for human mental state prediction. In recent years, many data-driven machine learning models are built based on self-report annotations as the target value. In this research, we investigate the reliability of self-report data in the wild by studying the confidence level of responses and survey completion time. We conduct a case study (i.e., student engagement inference) by recruiting 23 students in a high school setting over a period of 4 weeks. Overall, our participants volunteered 488 self-reported responses and sensing data from smart wristbands. We find that the physiologically measured student engagement and perceived student engagement are not always consistent. The findings from this research have great potential to benefit future studies in predicting engagement, depression, stress, and other emotion-related states in the field of affective computing and sensing technologies.},
isbn = {978-1-4503-8461-2},
keywords = {Ecological Momentary Assessment,Emotion Prediction,Field Study,Ground Truth,Physiological Signals,Reliability,Self-report Measures},
file = {/Users/timokoch/Zotero/storage/4VUXHE2Z/Gao et al. - 2021 - Investigating the Reliability of Self-report Data .pdf;/Users/timokoch/Zotero/storage/FTGIAIKH/Gao et al. - 2021 - Investigating the Reliability of Self-report Data .pdf}
}
@article{geldhofReliabilityEstimationMultilevel2014,
title = {Reliability Estimation in a Multilevel Confirmatory Factor Analysis Framework.},
author = {Geldhof, G. John and Preacher, Kristopher J. and Zyphur, Michael J.},
year = {2014},
month = mar,
journal = {Psychological Methods},
volume = {19},
number = {1},
pages = {72--91},
issn = {1939-1463, 1082-989X},
doi = {10.1037/a0032138},
urldate = {2021-11-26},
abstract = {Scales with varying degrees of measurement reliability are often used in the context of multistage sampling, where variance exists at multiple levels of analysis (e.g., individual and group). Because methodological guidance on assessing and reporting reliability at multiple levels of analysis is currently lacking, we discuss the importance of examining level-specific reliability. We present a simulation study and an applied example showing different methods for estimating multilevel reliability using multilevel confirmatory factor analysis and provide supporting Mplus program code. We conclude that (a) single-level estimates will not reflect a scale's actual reliability unless reliability is identical at each level of analysis, (b) 2-level alpha and composite reliability (omega) perform relatively well in most settings, (c) estimates of maximal reliability (H) were more biased when estimated using multilevel data than either alpha or omega, and (d) small cluster size can lead to overestimates of reliability at the between level of analysis. We also show that Monte Carlo confidence intervals and Bayesian credible intervals closely reflect the sampling distribution of reliability estimates under most conditions. We discuss the estimation of credible intervals using Mplus and provide R code for computing Monte Carlo confidence intervals.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/3IFXKWIB/Geldhof et al. - 2014 - Reliability estimation in a multilevel confirmator.pdf}
}
@article{giordanoRepresentationalDynamicsPerceived2021,
title = {The Representational Dynamics of Perceived Voice Emotions Evolve from Categories to Dimensions},
author = {Giordano, Bruno L. and Whiting, Caroline and Kriegeskorte, Nikolaus and Kotz, Sonja A. and Gross, Joachim and Belin, Pascal},
year = {2021},
month = sep,
journal = {Nature Human Behaviour},
volume = {5},
number = {9},
pages = {1203--1213},
publisher = {Nature Publishing Group},
issn = {2397-3374},
doi = {10.1038/s41562-021-01073-0},
urldate = {2021-11-23},
abstract = {Long-standing affective science theories conceive the perception of emotional stimuli either as discrete categories (for example, an angry voice) or continuous dimensional attributes (for example, an intense and negative vocal emotion). Which position provides a better account is still widely debated. Here we contrast the positions to account for acoustics-independent perceptual and cerebral representational geometry of perceived voice emotions. We combined multimodal imaging of the cerebral response to heard vocal stimuli (using functional magnetic resonance imaging and magneto-encephalography) with post-scanning behavioural assessment of voice emotion perception. By using representational similarity analysis, we find that categories prevail in perceptual and early (less than 200\,ms) frontotemporal cerebral representational geometries and that dimensions impinge predominantly on a later limbic--temporal network (at 240\,ms and after 500\,ms). These results reconcile the two opposing views by reframing the perception of emotions as the interplay of cerebral networks with different representational dynamics that emphasize either categories or dimensions.},
copyright = {2021 The Author(s), under exclusive licence to Springer Nature Limited},
langid = {english},
keywords = {Human behaviour,Limbic system},
file = {/Users/timokoch/Zotero/storage/2UQVIJNH/Giordano et al. - 2021 - The representational dynamics of perceived voice e.pdf;/Users/timokoch/Zotero/storage/4W7IG5DY/s41562-021-01073-0.html}
}
@inproceedings{goronImprovingDomainGeneralization2024,
title = {Improving {{Domain Generalization}} in {{Speech Emotion Recognition}} with {{Whisper}}},
booktitle = {{{ICASSP}} 2024 - 2024 {{IEEE International Conference}} on {{Acoustics}}, {{Speech}} and {{Signal Processing}} ({{ICASSP}})},
author = {Goron, Erik and Asai, Lena and Rut, Elias and Dinov, Martin},
year = {2024},
month = apr,
pages = {11631--11635},
issn = {2379-190X},
doi = {10.1109/ICASSP48485.2024.10446997},
urldate = {2024-11-18},
abstract = {Transformers have been used successfully in a variety of settings, including Speech Emotion Recognition (SER). However, use of the latest transformer base models in domain generalization (DG) settings has mostly been unexplored or only weakly explored. We present here our state-of-the-art results in discrete emotion recognition across a variety of datasets, including acted and non-acted datasets, showing that Whisper is a powerful base Transformer model for this task. We show that our approach to DG with Whisper results in accuracy surpassing all previously published results, with an Unweighted Average Recall (UAR) of 74.5\% averaged across the 6 distinct datasets used. We discuss some of the possible reasons behind Whisper's superior performance to other Transformer models, though all 3 Transformer models evaluated here (HuBERT, WavLM, Whisper) show an ability to generalize as well as learn paralinguistic information successfully through fine-tuning with relatively few examples.},
keywords = {Data models,Domain Generalization,Emotion recognition,Explainable AI,HuBERT,Signal processing,Signal processing algorithms,Speech Emotion Recognition (SER),Speech recognition,Transformers,Whisper},
file = {/Users/timokoch/Zotero/storage/XNCE4III/Goron et al. - 2024 - Improving Domain Generalization in Speech Emotion .pdf}
}
@article{gotzUsersMainSmartphone2017,
title = {Users of the Main Smartphone Operating Systems ({{iOS}}, {{Android}}) Differ Only Little in Personality},
author = {G{\"o}tz, Friedrich M. and Stieger, Stefan and Reips, Ulf-Dietrich},
year = {2017},
month = may,
journal = {PLOS ONE},
volume = {12},
number = {5},
pages = {e0176921},
publisher = {Public Library of Science},
issn = {1932-6203},
doi = {10.1371/journal.pone.0176921},
urldate = {2023-02-21},
abstract = {The increasingly widespread use of mobile phone applications (apps) as research tools and cost-effective means of vast data collection raises new methodological challenges. In recent years, it has become a common practice for scientists to design apps that run only on a single operating system, thereby excluding large numbers of users who use a different operating system. However, empirical evidence investigating any selection biases that might result thereof is scarce. Henceforth, we conducted two studies drawing from a large multi-national (Study 1; N = 1,081) and a German-speaking sample (Study 2; N = 2,438). As such Study 1 compared iOS and Android users across an array of key personality traits (i.e., well-being, self-esteem, willingness to take risks, optimism, pessimism, Dark Triad, and the Big Five). Focusing on Big Five personality traits in a broader scope, in addition to smartphone users, Study 2 also examined users of the main computer operating systems (i.e., Mac OS, Windows). In both studies, very few significant differences were found, all of which were of small or even tiny effect size mostly disappearing after sociodemographics had been controlled for. Taken together, minor differences in personality seem to exist, but they are of small to negligible effect size (ranging from OR = 0.919 to 1.344 (Study 1), {$\eta_p^2$} = .005 to .036 (Study 2), respectively) and may reflect differences in sociodemographic composition, rather than operating system of smartphone users.},
langid = {english},
keywords = {Apps,Cell phones,Educational attainment,Operating systems,Personality,Personality traits,Psychology,Questionnaires},
file = {/Users/timokoch/Zotero/storage/LLGKVI2U/Götz et al. - 2017 - Users of the main smartphone operating systems (iO.pdf}
}
@inproceedings{grimmVeraAmMittag2008,
title = {The {{Vera}} Am {{Mittag German}} Audio-Visual Emotional Speech Database},
booktitle = {2008 {{IEEE International Conference}} on {{Multimedia}} and {{Expo}}},
author = {Grimm, Michael and Kroschel, Kristian and Narayanan, Shrikanth},
year = {2008},
month = jun,
pages = {865--868},
issn = {1945-788X},
doi = {10.1109/ICME.2008.4607572},
abstract = {The lack of publicly available annotated databases is one of the major barriers to research advances on emotional information processing. In this contribution we present a recently collected database of spontaneous emotional speech in German which is being made available to the research community. The database consists of 12 hours of audio-visual recordings of the German TV talk show ``Vera am Mittag'', segmented into broadcasts, dialogue acts and utterances. This corpus contains spontaneous and very emotional speech recorded from unscripted, authentic discussions between the guests of the talk show. In addition to the audio-visual data and the segmented utterances we provide emotion labels for a great part of the data. The emotion labels are given on a continuous valued scale for three emotion primitives: valence, activation and dominance, using a large number of human evaluators. Such data is of great interest to all research groups working on spontaneous speech analysis, emotion recognition in both speech and facial expression, natural language understanding, and robust speech recognition.},
keywords = {Data acquisition,Databases,Emotion recognition,Histograms,Speech,Speech analysis,Speech processing,Speech recognition,TV,Video signal processing},
file = {/Users/timokoch/Zotero/storage/DYTHT82Y/4607572.html}
}
@article{grossDissociationEmotionExpression2000,
title = {The {{Dissociation}} of {{Emotion Expression}} from {{Emotion Experience}}: {{A Personality Perspective}}},
shorttitle = {The {{Dissociation}} of {{Emotion Expression}} from {{Emotion Experience}}},
author = {Gross, James J. and John, Oliver P. and Richards, Jane M.},
year = {2000},
month = aug,
journal = {Personality and Social Psychology Bulletin},
volume = {26},
number = {6},
pages = {712--726},
issn = {0146-1672, 1552-7433},
doi = {10.1177/0146167200268006},
urldate = {2021-11-23},
abstract = {When we want to know what others are feeling, we look to the face for clues. However, individual differences matter: Some faces are more expressive than others. Do both emotion experience and dispositional expressivity predict emotion expression? Based on an analysis of display rules, the authors hypothesized that expressivity would moderate the relation between experience and expression for negative, but not for positive, emotion. Study 1 examined the relation between habitual emotion experience and peer-rated expressive behavior and showed the predicted moderator effect for negative emotion: Experience was related to expression only for dispositionally high-expressivity participants, not for low-expressivity participants. For positive emotion, however, experience was related to expression for both groups. Study 2 replicated these findings using momentary emotion experience and objectively coded expressive behavior during films that elicited amusement and sadness. Results are interpreted in terms of low-expressivity individuals' propensity to dynamically regulate negative emotion-expressive behavior.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/ELFDBIQF/Gross et al. - 2000 - The Dissociation of Emotion Expression from Emotio.pdf}
}
@article{grossedetersKeepScrollingUsing2024,
title = {Keep on Scrolling? {{Using}} Intensive Longitudinal Smartphone Sensing Data to Assess How Everyday Smartphone Usage Behaviors Are Related to Well-Being},
shorttitle = {Keep on Scrolling?},
author = {{gro{\ss}e Deters}, Fenne and Schoedel, Ramona},
year = {2024},
month = jan,
journal = {Computers in Human Behavior},
volume = {150},
pages = {107977},
issn = {0747-5632},
doi = {10.1016/j.chb.2023.107977},
urldate = {2024-04-25},
abstract = {Smartphones are an integral part of daily life for many people worldwide. However, concerns have been raised that long usage times and the fragmentation of daily life through smartphone usage are detrimental to well-being. This preregistered study assesses (1) whether differences in smartphone usage behaviors between individuals predict differences in a variety of well-being measures (between-person effects) and (2) whether differences in smartphone usage behaviors between situations predict whether an individual is feeling better or worse (within-person effects). In addition to total usage time, several indicators capturing the fragmentation of usage/nonusage time were developed. The study combines objectively measured smartphone usage with self-reports of well-being in surveys (N~=~236) and an experience sampling period (N~=~378, n~=~5775 datapoints). To ensure the robustness of the results, we replicated our analyses in a second measurement period (surveys: N~=~305; experience sampling: N~=~534, n~=~7287 datapoints) and considered the pattern of effects across different operational definitions and constructs. Results show that individuals who use their smartphone more report slightly lower well-being (between-person effect) but no evidence for within-person effects of total usage time emerged. With respect to fragmentation, we found no robust association with well-being.},
keywords = {Experience sampling,Fragmentation,Psychological well-being,Smartphone sensing,Smartphone usage},
file = {/Users/timokoch/Zotero/storage/76K5IPUM/S074756322300328X.html}
}
@article{grossRevealingFeelingsFacets1997,
title = {Revealing Feelings: {{Facets}} of Emotional Expressivity in Self-Reports, Peer Ratings, and Expressive Behavior},
shorttitle = {Revealing Feelings},
author = {Gross, James J. and John, Oliver P.},
year = {1997},
journal = {Journal of Personality and Social Psychology},
pages = {434--447},
abstract = {Drawing on an explicit model of emotion, we propose a multifaceted approach to emotional expressivity, defined as the behavioral (e.g., facial, postural) changes associated with emotion. Study 1 shows that self-reported expressivity has 3 facets (Impulse Strength, Negative Expressivity, Positive Expressivity). Study 2 shows that the same 3 facets emerge in peer ratings and that there are robust relations between self- and peer-rated expressivity. In Study 3, emotion-expressive behavior was videotaped and related to expressivity self-reports obtained several months earlier. As expected, Negative Expressivity predicted behavioral expressions of sadness (but not amusement), whereas Positive Expressivity predicted amusement (but not sadness). These relations remained even when subjective emotional experience and physiological response were controlled. These studies demonstrate the importance of a multifaceted approach to emotional expressivity and have implications for the understanding of personality and emotion.},
file = {/Users/timokoch/Zotero/storage/FD7GEKYZ/Gross und John - 1997 - Revealing feelings Facets of emotional expressivi.pdf;/Users/timokoch/Zotero/storage/3VIGAGEF/summary.html}
}
@article{hanelStudentSamplesProvide2016,
title = {Do {{Student Samples Provide}} an {{Accurate Estimate}} of the {{General Public}}?},
author = {Hanel, Paul H. P. and Vione, Katia C.},
year = {2016},
month = dec,
journal = {PLoS ONE},
volume = {11},
number = {12},
pages = {e0168354},
issn = {1932-6203},
doi = {10.1371/journal.pone.0168354},
urldate = {2023-02-15},
abstract = {Most psychological studies rely on student samples. Students are usually considered as more homogenous than representative samples both within and across countries. However, little is known about the nature of the differences between student and representative samples. This is an important gap, also because knowledge about the degree of difference between student and representative samples may allow to infer from the former to the latter group. Across 59 countries and 12 personality (Big-5) and attitudinal variables we found that differences between students and general public were partly substantial, incoherent, and contradicted previous findings. Two often used cultural variables, embeddedness and intellectual autonomy, failed to explain the differences between both groups across countries. We further found that students vary as much as the general population both between and within countries. In summary, our results indicate that generalizing from students to the general public can be problematic when personal and attitudinal variables are used, as students vary mostly randomly from the general public. Findings are also discussed in terms of the replication crisis within psychology.},
pmcid = {PMC5176168},
pmid = {28002494},
file = {/Users/timokoch/Zotero/storage/MGHRSBWN/Hanel und Vione - 2016 - Do Student Samples Provide an Accurate Estimate of.pdf}
}
@article{harariUsingSmartphonesCollect2016,
title = {Using {{Smartphones}} to {{Collect Behavioral Data}} in {{Psychological Science}}: {{Opportunities}}, {{Practical Considerations}}, and {{Challenges}}},
shorttitle = {Using {{Smartphones}} to {{Collect Behavioral Data}} in {{Psychological Science}}},
author = {Harari, Gabriella M. and Lane, Nicholas D. and Wang, Rui and Crosier, Benjamin S. and Campbell, Andrew T. and Gosling, Samuel D.},
year = {2016},
month = nov,
journal = {Perspectives on Psychological Science},
volume = {11},
number = {6},
pages = {838--854},
issn = {1745-6916},
doi = {10.1177/1745691616650285},
urldate = {2021-06-30},
abstract = {Smartphones now offer the promise of collecting behavioral data unobtrusively, in situ, as it unfolds in the course of daily life. Data can be collected from the onboard sensors and other phone logs embedded in today's off-the-shelf smartphone devices. These data permit fine-grained, continuous collection of people's social interactions (e.g., speaking rates in conversation, size of social groups, calls, and text messages), daily activities (e.g., physical activity and sleep), and mobility patterns (e.g., frequency and duration of time spent at various locations). In this article, we have drawn on the lessons from the first wave of smartphone-sensing research to highlight areas of opportunity for psychological research, present practical considerations for designing smartphone studies, and discuss the ongoing methodological and ethical challenges associated with research in this domain. It is our hope that these practical guidelines will facilitate the use of smartphones as a behavioral observation tool in psychological science.},
pmcid = {PMC5572675},
pmid = {27899727},
file = {/Users/timokoch/Zotero/storage/RVQ8I8KL/Harari et al. - 2016 - Using Smartphones to Collect Behavioral Data in Ps.pdf}
}
@article{heavenWhyFacesDon2020,
title = {Why Faces Don't Always Tell the Truth about Feelings},
author = {Heaven, Douglas},
year = {2020},
month = feb,
journal = {Nature},
volume = {578},
number = {7796},
pages = {502--504},
publisher = {Nature Publishing Group},
doi = {10.1038/d41586-020-00507-5},
urldate = {2021-12-14},
abstract = {Although AI companies market software for recognizing emotions in faces, psychologists debate whether expressions can be read so easily.},
copyright = {2021 Nature},
langid = {english},
keywords = {Computer science,Psychology,Society},
file = {/Users/timokoch/Zotero/storage/AW2CR8IT/Heaven - 2020 - Why faces don’t always tell the truth about feelin.pdf}
}
@article{henrichWeirdestPeopleWorld2010,
title = {The Weirdest People in the World?},
author = {Henrich, Joseph and Heine, Steven J. and Norenzayan, Ara},
year = {2010},
month = jun,
journal = {The Behavioral and Brain Sciences},
volume = {33},
number = {2-3},
pages = {61--83; discussion 83--135},
issn = {1469-1825},
doi = {10.1017/S0140525X0999152X},
abstract = {Behavioral scientists routinely publish broad claims about human psychology and behavior in the world's top journals based on samples drawn entirely from Western, Educated, Industrialized, Rich, and Democratic (WEIRD) societies. Researchers - often implicitly - assume that either there is little variation across human populations, or that these "standard subjects" are as representative of the species as any other population. Are these assumptions justified? Here, our review of the comparative database from across the behavioral sciences suggests both that there is substantial variability in experimental results across populations and that WEIRD subjects are particularly unusual compared with the rest of the species - frequent outliers. The domains reviewed include visual perception, fairness, cooperation, spatial reasoning, categorization and inferential induction, moral reasoning, reasoning styles, self-concepts and related motivations, and the heritability of IQ. The findings suggest that members of WEIRD societies, including young children, are among the least representative populations one could find for generalizing about humans. Many of these findings involve domains that are associated with fundamental aspects of psychology, motivation, and behavior - hence, there are no obvious a priori grounds for claiming that a particular behavioral phenomenon is universal based on sampling from a single subpopulation. Overall, these empirical patterns suggests that we need to be less cavalier in addressing questions of human nature on the basis of data drawn from this particularly thin, and rather unusual, slice of humanity. We close by proposing ways to structurally re-organize the behavioral sciences to best tackle these challenges.},
langid = {english},
pmid = {20550733},
keywords = {Behavioral Sciences,Cognition,Cross-Cultural Comparison,Decision Making,Humans,Morals,Population Groups,Visual Perception},
file = {/Users/timokoch/Zotero/storage/C7SQDQVH/Henrich et al. - 2010 - The weirdest people in the world.pdf}
}
@article{hildebrandVoiceAnalyticsBusiness2020,
title = {Voice Analytics in Business Research: {{Conceptual}} Foundations, Acoustic Feature Extraction, and Applications},
shorttitle = {Voice Analytics in Business Research},
author = {Hildebrand, Christian and Efthymiou, Fotis and Busquet, Francesc and Hampton, William H. and Hoffman, Donna L. and Novak, Thomas P.},
year = {2020},
month = dec,
journal = {Journal of Business Research},
volume = {121},
pages = {364--374},
issn = {0148-2963},
doi = {10.1016/j.jbusres.2020.09.020},
urldate = {2021-11-16},
abstract = {Recent advances in artificial intelligence and natural language processing are gradually transforming how humans search, shop, and express their preferences. Leveraging the new affordances and modalities of human--machine interaction through voice-controlled interfaces will require a nuanced understanding of the physics and psychology of speech formation as well as the systematic extraction and analysis of vocal features from the human voice. In this paper, we first develop a conceptual framework linking vocal features in the human voice to experiential outcomes and emotional states. We then illustrate the effective processing, editing, analysis, and visualization of voice data based on an Amazon Alexa user interaction, utilizing state-of-the-art signal-processing packages in R. Finally, we offer novel insight into the ways in which business research might employ voice and sound analytics moving forward, including a discussion of the ethical implications of building multi-modal databases for business and society.},
langid = {english},
keywords = {Acoustic markers of emotion,Emotion detection,Natural language processing,Voice Analytics,Voice-controlled interfaces},
file = {/Users/timokoch/Zotero/storage/PRRC3NCW/Hildebrand et al. - 2020 - Voice analytics in business research Conceptual f.pdf}
}
@article{hoemannContextawareExperienceSampling2020,
title = {Context-Aware Experience Sampling Reveals the Scale of Variation in Affective Experience},
author = {Hoemann, Katie and Khan, Zulqarnain and Feldman, Mallory J. and Nielson, Catie and Devlin, Madeleine and Dy, Jennifer and Barrett, Lisa Feldman and Wormwood, Jolie B. and Quigley, Karen S.},
year = {2020},
month = jul,
journal = {Scientific Reports},
volume = {10},
pages = {12459},
issn = {2045-2322},
doi = {10.1038/s41598-020-69180-y},
urldate = {2022-01-25},
abstract = {Emotion research typically searches for consistency and specificity in physiological activity across instances of an emotion category, such as anger or fear, yet studies to date have observed more variation than expected. In the present study, we adopt an alternative approach, searching inductively for structure within variation, both within and across participants. Following a novel, physiologically-triggered experience sampling procedure, participants' self-reports and peripheral physiological activity were recorded when substantial changes in cardiac activity occurred in the absence of movement. Unsupervised clustering analyses revealed variability in the number and nature of patterns of physiological activity that recurred within individuals, as well as in the affect ratings and emotion labels associated with each pattern. There were also broad patterns that recurred across individuals. These findings support a constructionist account of emotion which, drawing on Darwin, proposes that emotion categories are populations of variable instances tied to situation-specific needs.},
pmcid = {PMC7385108},
pmid = {32719368},
file = {/Users/timokoch/Zotero/storage/DVXXQMAG/Hoemann et al. - 2020 - Context-aware experience sampling reveals the scal.pdf}
}
@misc{holtAmazonAlexaGet,
title = {Amazon's {{Alexa Is About To Get More Emotional}}},
author = {Holt, Kris},
year = {2019},
month = nov,
journal = {Forbes},
urldate = {2021-06-30},
abstract = {Developers can tap into Alexa's new excited and disappointed voice tones.},
chapter = {Consumer Tech},
howpublished = {https://www.forbes.com/sites/krisholt/2019/11/27/amazons-alexa-is-about-to-get-more-emotional/},
langid = {english},
file = {/Users/timokoch/Zotero/storage/F6P9Q3P4/amazons-alexa-is-about-to-get-more-emotional.html}
}
@misc{hsuHuBERTSelfSupervisedSpeech2021,
title = {{{HuBERT}}: {{Self-Supervised Speech Representation Learning}} by {{Masked Prediction}} of {{Hidden Units}}},
shorttitle = {{{HuBERT}}},
author = {Hsu, Wei-Ning and Bolte, Benjamin and Tsai, Yao-Hung Hubert and Lakhotia, Kushal and Salakhutdinov, Ruslan and Mohamed, Abdelrahman},
year = {2021},
month = jun,
number = {arXiv:2106.07447},
eprint = {2106.07447},
publisher = {arXiv},
doi = {10.48550/arXiv.2106.07447},
urldate = {2024-11-18},
abstract = {Self-supervised approaches for speech representation learning are challenged by three unique problems: (1) there are multiple sound units in each input utterance, (2) there is no lexicon of input sound units during the pre-training phase, and (3) sound units have variable lengths with no explicit segmentation. To deal with these three problems, we propose the Hidden-Unit BERT (HuBERT) approach for self-supervised speech representation learning, which utilizes an offline clustering step to provide aligned target labels for a BERT-like prediction loss. A key ingredient of our approach is applying the prediction loss over the masked regions only, which forces the model to learn a combined acoustic and language model over the continuous inputs. HuBERT relies primarily on the consistency of the unsupervised clustering step rather than the intrinsic quality of the assigned cluster labels. Starting with a simple k-means teacher of 100 clusters, and using two iterations of clustering, the HuBERT model either matches or improves upon the state-of-the-art wav2vec 2.0 performance on the Librispeech (960h) and Libri-light (60,000h) benchmarks with 10min, 1h, 10h, 100h, and 960h fine-tuning subsets. Using a 1B parameter model, HuBERT shows up to 19\% and 13\% relative WER reduction on the more challenging dev-other and test-other evaluation subsets.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,Electrical Engineering and Systems Science - Audio and Speech Processing},
file = {/Users/timokoch/Zotero/storage/4QL4FQF4/Hsu et al. - 2021 - HuBERT Self-Supervised Speech Representation Lear.pdf;/Users/timokoch/Zotero/storage/ANEUEL22/2106.html}
}
@article{huangPredictionEmotionChange2018,
title = {Prediction of {{Emotion Change From Speech}}},
author = {Huang, Zhaocheng and Epps, Julien},
year = {2018},
journal = {Frontiers in ICT},
volume = {5},
issn = {2297-198X},
urldate = {2023-02-13},
abstract = {The fact that emotions are dynamic in nature and evolve across time has been explored relatively less often in automatic emotion recognition systems to date. Although within-utterance information about emotion changes recently has received some attention, there remain open questions unresolved, such as how to approach delta emotion ground truth, how to predict the extent of emotion change from speech, and how well change can be predicted relative to absolute emotion ratings. In this article, we investigate speech-based automatic systems for continuous prediction of the extent of emotion changes in arousal/valence. We propose the use of regression (smoothed) deltas as ground truth for emotion change, which yielded considerably higher inter-rater reliability than first-order deltas, a commonly used approach in previous research, and represent a more appropriate approach to derive annotations for emotion change research, findings which are applicable beyond speech-based systems. In addition, the first system design for continuous emotion change prediction from speech is explored. Experimental results under the Output-Associative Relevance Vector Machine framework interestingly show that changes in emotion ratings may be better predicted than absolute emotion ratings on the RECOLA database, achieving 0.74 vs. 0.71 for arousal and 0.41 vs. 0.37 for valence in concordance correlation coefficients. However, further work is needed to achieve effective emotion change prediction performances on the SEMAINE database, due to the large number of non-change frames in the absolute emotion ratings.},
file = {/Users/timokoch/Zotero/storage/B2J4KMUW/Huang und Epps - 2018 - Prediction of Emotion Change From Speech.pdf}