@article{aishwaryaComputationallyEfficientSpeech2024,
title = {A Computationally Efficient Speech Emotion Recognition System Employing Machine Learning Classifiers and Ensemble Learning},
author = {Aishwarya, N. and Kaur, Kanwaljeet and Seemakurthy, Karthik},
year = {2024},
month = mar,
journal = {International Journal of Speech Technology},
volume = {27},
number = {1},
pages = {239--254},
issn = {1572-8110},
doi = {10.1007/s10772-024-10095-8},
urldate = {2024-09-26},
abstract = {Speech Emotion Recognition (SER) is the process of recognizing and classifying emotions expressed through speech. SER greatly facilitates personalized and empathetic interactions, enhances user experiences, enables sentiment analysis, and finds applications in psychology, healthcare, entertainment, and gaming industries. However, accurately detecting and classifying emotions is a highly challenging task for machines due to the complexity and multifaceted nature of emotions. This work gives a comparative analysis of two approaches for emotion recognition based on original and augmented speech signals. The first approach involves extracting 39 Mel Frequency Cepstrum Coefficients (MFCC) features, while the second approach involves using MFCC spectrograms and extracting features using deep learning models such as MobileNet V2, VGG16, Inception V3, VGG19 and ResNet 50. These features are then tested on Machine learning classifiers such as SVM, Linear SVM, Naive Bayes, k-Nearest Neighbours, Logistic Regression and Random Forest. From the experiments, it is observed that the SVM classifier works best with all the feature extraction techniques. Furthermore, to enhance the results, ensembling techniques involving CatBoost, and the Voting classifier along with SVM were utilized, resulting in improved test accuracies of 97.04\% on the RAVDESS dataset, 93.24\% on the SAVEE dataset, and 99.83\% on the TESS dataset, respectively. It is worth noting that both approaches are computationally efficient as they required no training time.},
langid = {english},
keywords = {Artificial Intelligence,Ensemble learning,Machine learning classifiers,MFCC,Pre-trained models,Speech emotion recognition},
file = {/Users/timokoch/Zotero/storage/LRDTLAD6/Aishwarya et al. - 2024 - A computationally efficient speech emotion recogni.pdf}
}
@article{akcaySpeechEmotionRecognition2020,
title = {Speech Emotion Recognition: {{Emotional}} Models, Databases, Features, Preprocessing Methods, Supporting Modalities, and Classifiers},
shorttitle = {Speech Emotion Recognition},
author = {Ak{\c c}ay, Mehmet Berkehan and O{\u g}uz, Kaya},
year = {2020},
month = jan,
journal = {Speech Communication},
volume = {116},
pages = {56--76},
issn = {0167-6393},
doi = {10.1016/j.specom.2019.12.001},
urldate = {2024-06-10},
abstract = {Speech is the most natural way of expressing ourselves as humans. It is only natural then to extend this communication medium to computer applications. We define speech emotion recognition (SER) systems as a collection of methodologies that process and classify speech signals to detect the embedded emotions. SER is not a new field, it has been around for over two decades, and has regained attention thanks to the recent advancements. These novel studies make use of the advances in all fields of computing and technology, making it necessary to have an update on the current methodologies and techniques that make SER possible. We have identified and discussed distinct areas of SER, provided a detailed survey of current literature of each, and also listed the current challenges.},
keywords = {Classification,Speech databases,Speech emotion recognition,Speech features,Survey},
file = {/Users/timokoch/Zotero/storage/X7U6ZWRU/S0167639319302262.html}
}
@misc{APAPsycNetFullTextHTML,
title = {{{APA PsycNet FullTextHTML}} Page},
urldate = {2024-04-25},
howpublished = {https://psycnet.apa.org/fulltext/2023-87986-001.html},
file = {/Users/timokoch/Zotero/storage/66TFJANB/2023-87986-001.html}
}
@article{aucouturierCovertDigitalManipulation2016,
title = {Covert Digital Manipulation of Vocal Emotion Alter Speakers' Emotional States in a Congruent Direction},
author = {Aucouturier, Jean-Julien and Johansson, Petter and Hall, Lars and Segnini, Rodrigo and Mercadi{\'e}, Lolita and Watanabe, Katsumi},
year = {2016},
month = jan,
journal = {Proceedings of the National Academy of Sciences},
volume = {113},
number = {4},
pages = {948--953},
publisher = {Proceedings of the National Academy of Sciences},
doi = {10.1073/pnas.1506552113},
urldate = {2024-11-18},
abstract = {Research has shown that people often exert control over their emotions. By modulating expressions, reappraising feelings, and redirecting attention, they can regulate their emotional experience. These findings have contributed to a blurring of the traditional boundaries between cognitive and emotional processes, and it has been suggested that emotional signals are produced in a goal-directed way and monitored for errors like other intentional actions. However, this interesting possibility has never been experimentally tested. To this end, we created a digital audio platform to covertly modify the emotional tone of participants' voices while they talked in the direction of happiness, sadness, or fear. The result showed that the audio transformations were being perceived as natural examples of the intended emotions, but the great majority of the participants, nevertheless, remained unaware that their own voices were being manipulated. This finding indicates that people are not continuously monitoring their own voice to make sure that it meets a predetermined emotional target. Instead, as a consequence of listening to their altered voices, the emotional state of the participants changed in congruence with the emotion portrayed, which was measured by both self-report and skin conductance level. This change is the first evidence, to our knowledge, of peripheral feedback effects on emotional experience in the auditory domain. As such, our result reinforces the wider framework of self-perception theory: that we often use the same inferential strategies to understand ourselves as those that we use to understand others.},
file = {/Users/timokoch/Zotero/storage/LYZGZNZ2/Aucouturier et al. - 2016 - Covert digital manipulation of vocal emotion alter.pdf}
}
@misc{auGroupedFeatureImportance2021,
title = {Grouped {{Feature Importance}} and {{Combined Features Effect Plot}}},
author = {Au, Quay and Herbinger, Julia and Stachl, Clemens and Bischl, Bernd and Casalicchio, Giuseppe},
year = {2021},
month = apr,
abstract = {Interpretable machine learning has become a very active area of research due to the rising popularity of machine learning algorithms and their inherently challenging interpretability. Most work in this area has been focused on the interpretation of single features in a model. However, for researchers and practitioners, it is often equally important to quantify the importance or visualize the effect of feature groups. To address this research gap, we provide a comprehensive overview of how existing model-agnostic techniques can be defined for feature groups to assess the grouped feature importance, focusing on permutation-based, refitting, and Shapley-based methods. We also introduce an importance-based sequential procedure that identifies a stable and well-performing combination of features in the grouped feature space. Furthermore, we introduce the combined features effect plot, which is a technique to visualize the effect of a group of features based on a sparse, interpretable linear combination of features. We used simulation studies and a real data example from computational psychology to analyze, compare, and discuss these methods.},
file = {/Users/timokoch/Zotero/storage/EI4CD9QA/Au et al. - 2021 - Grouped Feature Importance and Combined Features E.pdf}
}
@article{auGroupedFeatureImportance2022,
title = {Grouped Feature Importance and Combined Features Effect Plot},
author = {Au, Quay and Herbinger, Julia and Stachl, Clemens and Bischl, Bernd and Casalicchio, Giuseppe},
year = {2022},
month = jul,
journal = {Data Mining and Knowledge Discovery},
volume = {36},
number = {4},
pages = {1401--1450},
issn = {1573-756X},
doi = {10.1007/s10618-022-00840-5},
urldate = {2022-07-31},
abstract = {Interpretable machine learning has become a very active area of research due to the rising popularity of machine learning algorithms and their inherently challenging interpretability. Most work in this area has been focused on the interpretation of single features in a model. However, for researchers and practitioners, it is often equally important to quantify the importance or visualize the effect of feature groups. To address this research gap, we provide a comprehensive overview of how existing model-agnostic techniques can be defined for feature groups to assess the grouped feature importance, focusing on permutation-based, refitting, and Shapley-based methods. We also introduce an importance-based sequential procedure that identifies a stable and well-performing combination of features in the grouped feature space. Furthermore, we introduce the combined features effect plot, which is a technique to visualize the effect of a group of features based on a sparse, interpretable linear combination of features. We used simulation studies and real data examples to analyze, compare, and discuss these methods.},
langid = {english},
keywords = {Combined features effects,Dimension reduction,Grouped feature importance,Interpretable machine learning},
file = {/Users/timokoch/Zotero/storage/C27N2C7B/Au et al. - 2022 - Grouped feature importance and combined features e.pdf}
}
@article{ayuso-mateosMultiCountryEvaluationAffective2013,
title = {Multi-{{Country Evaluation}} of {{Affective Experience}}: {{Validation}} of an {{Abbreviated Version}} of the {{Day Reconstruction Method}} in {{Seven Countries}}},
shorttitle = {Multi-{{Country Evaluation}} of {{Affective Experience}}},
author = {{Ayuso-Mateos}, Jos{\'e} Luis and Miret, Marta and Caballero, Francisco F{\'e}lix and Olaya, Beatriz and Haro, Josep Maria and Kowal, Paul and Chatterji, Somnath},
year = {2013},
month = apr,
journal = {PLOS ONE},
volume = {8},
number = {4},
pages = {e61534},
publisher = {Public Library of Science},
issn = {1932-6203},
doi = {10.1371/journal.pone.0061534},
urldate = {2024-06-12},
abstract = {Background The Day Reconstruction Method (DRM) was developed to assess affective states as measures of experienced well-being. The present study aimed to validate an abbreviated version of the DRM in a representative sample of the population in seven countries (China, Ghana, India, Mexico, Russia, South Africa, and Spain), and to examine whether there are country differences in affect and in the relationships among the activities based on the similarity of the affect associated with each of them. Methods Interviews were conducted with 47,222 non-institutionalized adults from seven countries, using an abbreviated version of the DRM. A cluster analysis was carried out to classify activities on the basis of the similarity of the associated affect. In each country, the factorial structure of the affect adjectives was tested through Confirmatory Factor Analysis. Internal consistency and construct validity were also assessed. Moreover, the differences in affect across countries and the diurnal cycles of affect were evaluated. Results The DRM showed adequate psychometric properties regarding reliability and construct validity in all countries. Respondents from Ghana and South Africa reported more positive net affect whereas Indian respondents reported less positive net affect. Most of the countries showed a similar diurnal variation of affect, which tended to improve throughout the day. Conclusions The results show that this abbreviated version of the DRM is a useful tool for multi-country evaluation of experienced well-being.},
langid = {english},
keywords = {Clustering algorithms,Ghana,Global health,India,Mexico,Russia,South Africa,Spain},
file = {/Users/timokoch/Zotero/storage/FBE24DB3/Ayuso-Mateos et al. - 2013 - Multi-Country Evaluation of Affective Experience .pdf}
}
@misc{baevskiWav2vec20Framework2020,
title = {Wav2vec 2.0: {{A Framework}} for {{Self-Supervised Learning}} of {{Speech Representations}}},
shorttitle = {Wav2vec 2.0},
author = {Baevski, Alexei and Zhou, Henry and Mohamed, Abdelrahman and Auli, Michael},
year = {2020},
month = oct,
number = {arXiv:2006.11477},
eprint = {2006.11477},
publisher = {arXiv},
doi = {10.48550/arXiv.2006.11477},
urldate = {2024-11-18},
abstract = {We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks the speech input in the latent space and solves a contrastive task defined over a quantization of the latent representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech recognition with limited amounts of labeled data.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Sound,Electrical Engineering and Systems Science - Audio and Speech Processing},
file = {/Users/timokoch/Zotero/storage/V7DC5MLX/Baevski et al. - 2020 - wav2vec 2.0 A Framework for Self-Supervised Learn.pdf;/Users/timokoch/Zotero/storage/ST8YYBH5/2006.html}
}
@article{banseAcousticProfilesVocal1996,
title = {Acoustic Profiles in Vocal Emotion Expression},
author = {Banse, Rainer and Scherer, Klaus R.},
year = {1996},
journal = {Journal of Personality and Social Psychology},
volume = {70},
number = {3},
pages = {614--636},
publisher = {American Psychological Association},
address = {US},
issn = {1939-1315},
doi = {10.1037/0022-3514.70.3.614},
abstract = {Professional actors' portrayals of 14 emotions varying in intensity and valence were presented to judges. The results on decoding replicated earlier findings on the ability of judges to infer vocally expressed emotions with much-better-than-chance accuracy, including consistently found differences in the recognizability of different emotions. A total of 224 portrayals were subjected to digital acoustical analysis to obtain profiles of vocal parameters for different emotions. The data suggest that vocal parameters not only index the degree of intensity typical for different emotions but also differentiate valence or quality aspects. The data are also used to test theoretical predictions on vocal patterning based on the component process of model of emotion (K. R. Scherer, see record 1986-16849-001). Although most hypotheses are supported, some need to be revised on the basis of the empirical evidence. Discriminant analysis and jackknifing show remarkably high hit rates and patterns of confusion that closely mirror those found for listener-judges. (PsycINFO Database Record (c) 2016 APA, all rights reserved)},
keywords = {Emotional States,Inference,Oral Communication,Speech Characteristics,Speech Perception},
file = {/Users/timokoch/Zotero/storage/DSELHD5X/banse1996.pdf;/Users/timokoch/Zotero/storage/UL6FT9LI/Banse und Scherer - 1996 - Acoustic profiles in vocal emotion expression.pdf;/Users/timokoch/Zotero/storage/IFWHPSAM/1996-03014-015.html}
}
@article{banzigerIntroducingGenevaMultimodal2012,
title = {Introducing the {{Geneva Multimodal}} Expression Corpus for Experimental Research on Emotion Perception},
author = {B{\"a}nziger, Tanja and Mortillaro, Marcello and Scherer, Klaus R.},
year = {2012},
month = oct,
journal = {Emotion},
volume = {12},
number = {5},
pages = {1161--1179},
issn = {1931-1516},
doi = {10.1037/a0025827},
abstract = {Research on the perception of emotional expressions in faces and voices is exploding in psychology, the neurosciences, and affective computing. This article provides an overview of some of the major emotion expression (EE) corpora currently available for empirical research and introduces a new, dynamic, multimodal corpus of emotion expressions, the Geneva Multimodal Emotion Portrayals Core Set (GEMEP-CS). The design features of the corpus are outlined and justified, and detailed validation data for the core set selection are presented and discussed. Finally, an associated database with microcoded facial, vocal, and body action elements, as well as observer ratings, is introduced.},
langid = {english},
pmid = {22081890},
keywords = {Emotions,Facial Expression,Humans,Research,Voice}
}
@article{barrettEmotionalExpressionsReconsidered2019,
title = {Emotional {{Expressions Reconsidered}}: {{Challenges}} to {{Inferring Emotion From Human Facial Movements}}},
shorttitle = {Emotional {{Expressions Reconsidered}}},
author = {Barrett, Lisa Feldman and Adolphs, Ralph and Marsella, Stacy and Martinez, Aleix M. and Pollak, Seth D.},
year = {2019},
month = jul,
journal = {Psychological Science in the Public Interest},
volume = {20},
number = {1},
pages = {1--68},
publisher = {SAGE Publications Inc},
issn = {1529-1006},
doi = {10.1177/1529100619832930},
urldate = {2021-12-14},
abstract = {It is commonly assumed that a person's emotional state can be readily inferred from his or her facial movements, typically called emotional expressions or facial expressions. This assumption influences legal judgments, policy decisions, national security protocols, and educational practices; guides the diagnosis and treatment of psychiatric illness, as well as the development of commercial applications; and pervades everyday social interactions as well as research in other scientific fields such as artificial intelligence, neuroscience, and computer vision. In this article, we survey examples of this widespread assumption, which we refer to as the common view, and we then examine the scientific evidence that tests this view, focusing on the six most popular emotion categories used by consumers of emotion research: anger, disgust, fear, happiness, sadness, and surprise. The available scientific evidence suggests that people do sometimes smile when happy, frown when sad, scowl when angry, and so on, as proposed by the common view, more than what would be expected by chance. Yet how people communicate anger, disgust, fear, happiness, sadness, and surprise varies substantially across cultures, situations, and even across people within a single situation. Furthermore, similar configurations of facial movements variably express instances of more than one emotion category. In fact, a given configuration of facial movements, such as a scowl, often communicates something other than an emotional state. Scientists agree that facial movements convey a range of information and are important for social communication, emotional or otherwise. But our review suggests an urgent need for research that examines how people actually move their faces to express emotions and other social information in the variety of contexts that make up everyday life, as well as careful study of the mechanisms by which people perceive instances of emotion in one another. We make specific research recommendations that will yield a more valid picture of how people move their faces to express emotions and how they infer emotional meaning from facial movements in situations of everyday life. This research is crucial to provide consumers of emotion research with the translational information they require.},
langid = {english},
keywords = {emotion perception,emotion recognition,emotional expression},
file = {/Users/timokoch/Zotero/storage/LNM8BUUY/Barrett et al. - 2019 - Emotional Expressions Reconsidered Challenges to .pdf}
}
@incollection{batlinerAutomaticRecognitionEmotions2011,
title = {The {{Automatic Recognition}} of {{Emotions}} in {{Speech}}},
booktitle = {Cognitive {{Technologies}}},
author = {Batliner, Anton and Schuller, Bj{\"o}rn and Seppi, Dino and Steidl, Stefan and Devillers, Laurence and Vidrascu, Laurence and Vogt, Thurid and Aharonson, Vered and Amir, Noam},
year = {2011},
month = jan,
pages = {71--99},
doi = {10.1007/978-3-642-15184-2_6},
abstract = {In this chapter, we focus on the automatic recognition of emotional states using acoustic and linguistic parameters as features and classifiers as tools to predict the `correct' emotional states. We first sketch history and state of the art in this field; then we describe the process of `corpus engineering', i.e. the design and the recording of databases, the annotation of emotional states, and further processing such as manual or automatic segmentation. Next, we present an overview of acoustic and linguistic features that are extracted automatically or manually. In the section on classifiers, we deal with topics such as the curse of dimensionality and the sparse data problem, classifiers, and evaluation. At the end of each section, we point out important aspects that should be taken into account for the planning or the assessment of studies. The subject area of this chapter is not emotions in some narrow sense but in a wider sense encompassing emotion-related states such as moods, attitudes, or interpersonal stances as well. We do not aim at an in-depth treatise of some specific aspects or algorithms but at an overview of approaches and strategies that have been used or should be used.},
file = {/Users/timokoch/Zotero/storage/RZJEXMEE/Batliner et al. - 2011 - The Automatic Recognition of Emotions in Speech.pdf}
}
@article{ben-davidProsodySemanticsAre2016,
title = {Prosody and {{Semantics Are Separate}} but {{Not Separable Channels}} in the {{Perception}} of {{Emotional Speech}}: {{Test}} for {{Rating}} of {{Emotions}} in {{Speech}}},
shorttitle = {Prosody and {{Semantics Are Separate}} but {{Not Separable Channels}} in the {{Perception}} of {{Emotional Speech}}},
author = {{Ben-David}, Boaz and Multani, Namita and Shakuf, Vered and Rudzicz, Frank and {van Lieshout}, Pascal H. H. M.},
year = {2016},
month = feb,
journal = {Journal of Speech, Language, and Hearing Research},
volume = {59},
number = {1},
pages = {72--89},
publisher = {American Speech-Language-Hearing Association},
doi = {10.1044/2015_JSLHR-H-14-0323},
urldate = {2020-11-20},
abstract = {Purpose Our aim is to explore the complex interplay of prosody (tone of speech) and semantics (verbal content) in the perception of discrete emotions in speech. Method We implement a novel tool, the Test for Rating of Emotions in Speech. Eighty native English speakers were presented with spoken sentences made of different combinations of 5 discrete emotions (anger, fear, happiness, sadness, and neutral) presented in prosody and semantics. Listeners were asked to rate the sentence as a whole, integrating both speech channels, or to focus on one channel only (prosody or semantics). Results We observed supremacy of congruency, failure of selective attention, and prosodic dominance. Supremacy of congruency means that a sentence that presents the same emotion in both speech channels was rated highest; failure of selective attention means that listeners were unable to selectively attend to one channel when instructed; and prosodic dominance means that prosodic information plays a larger role than semantics in processing emotional speech. Conclusions Emotional prosody and semantics are separate but not separable channels, and it is difficult to perceive one without the influence of the other. Our findings indicate that the Test for Rating of Emotions in Speech can reveal specific aspects in the processing of emotional speech and may in the future prove useful for understanding emotion-processing deficits in individuals with pathologies.},
file = {/Users/timokoch/Zotero/storage/Z5B7T3YI/Ben-David Boaz M. et al. - 2016 - Prosody and Semantics Are Separate but Not Separab.pdf;/Users/timokoch/Zotero/storage/8DXCIQCI/2015_JSLHR-H-14-0323.html}
}
@article{biecekDALEXExplainersComplex2018,
title = {{{DALEX}}: {{Explainers}} for {{Complex Predictive Models}} in {{R}}},
shorttitle = {{{DALEX}}},
author = {Biecek, Przemyslaw},
year = {2018},
journal = {Journal of Machine Learning Research},
volume = {19},
number = {84},
pages = {1--5},
issn = {1533-7928},
urldate = {2020-12-04},
file = {/Users/timokoch/Zotero/storage/2GLJRE4D/Biecek - 2018 - DALEX Explainers for Complex Predictive Models in.pdf;/Users/timokoch/Zotero/storage/9X4J9LJ9/18-416.html;/Users/timokoch/Zotero/storage/Q73NDGYV/18-416.html}
}
@article{bischlResamplingMethodsMetaModel2012,
title = {Resampling {{Methods}} for {{Meta-Model Validation}} with {{Recommendations}} for {{Evolutionary Computation}}},
author = {Bischl, B. and Mersmann, O. and Trautmann, H. and Weihs, C.},
year = {2012},
month = jun,
journal = {Evolutionary Computation},
volume = {20},
number = {2},
pages = {249--275},
issn = {1063-6560, 1530-9304},
doi = {10.1162/EVCO_a_00069},
urldate = {2019-09-18},
abstract = {Meta-modeling has become a crucial tool in solving expensive optimization problems. Much of the work in the past has focused on finding a good regression method to model the fitness function. Examples include classical linear regression, splines, neural networks, Kriging and support vector regression. This paper specifically draws attention to the fact that assessing model accuracy is a crucial aspect in the meta-modeling framework. Resampling strategies such as cross-validation, subsampling, bootstrapping, and nested resampling are prominent methods for model validation and are systematically discussed with respect to possible pitfalls, shortcomings, and specific features. A survey of meta-modeling techniques within evolutionary optimization is provided. In addition, practical examples illustrating some of the pitfalls associated with model selection and performance assessment are presented. Finally, recommendations are given for choosing a model validation technique for a particular setting.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/TSY5U6LF/Bischl et al. - 2012 - Resampling Methods for Meta-Model Validation with .pdf}
}
@book{bolgerIntensiveLongitudinalMethods2013,
title = {Intensive Longitudinal Methods: {{An}} Introduction to Diary and Experience Sampling Research},
shorttitle = {Intensive Longitudinal Methods},
author = {Bolger, Niall and Laurenceau, Jean-Philippe},
year = {2013},
series = {Intensive Longitudinal Methods: {{An}} Introduction to Diary and Experience Sampling Research},
pages = {xv, 256},
publisher = {Guilford Press},
address = {New York, NY, US},
abstract = {A complete, practical guide to planning and executing an intensive longitudinal study, this book provides the tools for understanding within-subject social, psychological, and physiological processes in everyday contexts. Intensive longitudinal studies involve many repeated measurements taken on individuals, dyads, or groups, and include diary and experience sampling studies. A range of engaging, worked-through research examples with datasets are featured. Coverage includes how to: select the best intensive longitudinal design for a particular research question, model within-subject change processes for continuous and categorical outcomes, distinguish within-subject from between-subjects effects, assess the reliability of within-subject changes, assure sufficient statistical power, and more. Several end-of-chapter write-ups illustrate effective ways to present study findings for publication. (PsycINFO Database Record (c) 2016 APA, all rights reserved)},
isbn = {978-1-4625-0678-1 978-1-4625-0692-7},
keywords = {Longitudinal Studies,Measurement,Methodology,Psychophysiology,Social Processes},
file = {/Users/timokoch/Zotero/storage/DM3HS94F/2012-17340-000.html}
}
@article{boydPersonalityPanoramaConceptualizing2020,
title = {The {{Personality Panorama}}: {{Conceptualizing Personality}} through {{Big Behavioural Data}}},
shorttitle = {The {{Personality Panorama}}},
author = {Boyd, Ryan L. and Pasca, Paola and Lanning, Kevin},
year = {2020},
month = sep,
journal = {European Journal of Personality},
volume = {34},
number = {5},
pages = {599--612},
publisher = {SAGE Publications Ltd},
issn = {0890-2070},
doi = {10.1002/per.2254},
urldate = {2022-12-16},
abstract = {Personality psychology has long been grounded in data typologies, particularly in the delineation of behavioural, life outcome, informant-report, and self-report sources of data from one another. Such data typologies are becoming obsolete in the face of new methods, technologies, and data philosophies. In this article, we discuss personality psychology's historical thinking about data, modern data theory's place in personality psychology, and several qualities of big data that urge a rethinking of personality itself. We call for a move away from self-report questionnaires and a reprioritization of the study of behaviour within personality science. With big data and behavioural assessment, we have the potential to witness the confluence of situated, seamlessly interacting psychological processes, forming an inclusive, dynamic, multiangle view of personality. However, big behavioural data come hand in hand with important ethical considerations, and our emerging ability to create a `personality panopticon' requires careful and thoughtful navigation. For our research to improve and thrive in partnership with new technologies, we must not only wield our new tools thoughtfully, but humanely. Through discourse and collaboration with other disciplines and the general public, we can foster mutual growth and ensure that humanity's burgeoning technological capabilities serve, rather than control, the public interest. {\copyright} 2020 European Association of Personality Psychology},
langid = {english},
file = {/Users/timokoch/Zotero/storage/RY3DHZLG/Boyd et al. - 2020 - The Personality Panorama Conceptualizing Personal.pdf}
}
@article{breimanRandomForests2001,
title = {Random Forests},
author = {Breiman, Leo},
year = {2001},
journal = {Machine Learning},
volume = {45},
number = {1},
pages = {5--32},
issn = {0885-6125}
}
@article{brooksDeepLearningReveals2023,
title = {Deep Learning Reveals What Vocal Bursts Express in Different Cultures},
author = {Brooks, Jeffrey A. and Tzirakis, Panagiotis and Baird, Alice and Kim, Lauren and Opara, Michael and Fang, Xia and Keltner, Dacher and Monroy, Maria and Corona, Rebecca and Metrick, Jacob and Cowen, Alan S.},
year = {2023},
month = feb,
journal = {Nature Human Behaviour},
volume = {7},
number = {2},
pages = {240--250},
publisher = {Nature Publishing Group},
issn = {2397-3374},
doi = {10.1038/s41562-022-01489-2},
urldate = {2023-03-01},
abstract = {Human social life is rich with sighs, chuckles, shrieks and other emotional vocalizations, called `vocal bursts'. Nevertheless, the meaning of vocal bursts across cultures is only beginning to be understood. Here, we combined large-scale experimental data collection with deep learning to reveal the shared and culture-specific meanings of vocal bursts. A total of n\,=\,4,031 participants in China, India, South Africa, the USA and Venezuela mimicked vocal bursts drawn from 2,756 seed recordings. Participants also judged the emotional meaning of each vocal burst. A deep neural network tasked with predicting the culture-specific meanings people attributed to vocal bursts while disregarding context and speaker identity discovered 24 acoustic dimensions, or kinds, of vocal expression with distinct emotion-related meanings. The meanings attributed to these complex vocal modulations were 79\% preserved across the five countries and three languages. These results reveal the underlying dimensions of human emotional vocalization in remarkable detail.},
copyright = {2022 The Author(s), under exclusive licence to Springer Nature Limited},
langid = {english},
keywords = {Emotion,Human behaviour}
}
@inproceedings{burkhardtDatabaseGermanEmotional2005,
title = {A Database of {{German}} Emotional Speech},
booktitle = {Interspeech},
author = {Burkhardt, Felix and Paeschke, Astrid and Rolfes, Miriam and Sendlmeier, Walter F. and Weiss, Benjamin},
year = {2005},
volume = {5},
pages = {1517--1520}
}
@inproceedings{burkhardtDatabaseGermanEmotional2005a,
title = {A Database of {{German}} Emotional Speech},
author = {Burkhardt, Felix and Paeschke, Astrid and Rolfes, M. and Sendlmeier, Walter and Weiss, Benjamin},
year = {2005},
month = sep,
booktitle = {9th European Conference on Speech Communication and Technology},
volume = {5},
pages = {1520},
doi = {10.21437/Interspeech.2005-446},
abstract = {The article describes a database of emotional speech. Ten actors (5 female and 5 male) simulated the emotions, producing 10 German utterances (5 short and 5 longer sentences) which could be used in everyday communication and are interpretable in all applied emotions. The recordings were taken in an anechoic chamber with high-quality recording equipment. In addition to the sound electro-glottograms were recorded. The speech material comprises about 800 sentences (seven emotions * ten actors * ten sentences + some second versions). The complete database was evaluated in a perception test regarding the recognisability of emotions and their naturalness. Utterances recognised better than 80\% and judged as natural by more than 60\% of the listeners were phonetically labelled in a narrow transcription with special markers for voice-quality, phonatory and articulatory settings and articulatory features. The database can be accessed by the public via the internet (http://www.expressive-speech.net/emodb/).}
}
@article{cambriaAffectiveComputingSentiment2016,
title = {Affective {{Computing}} and {{Sentiment Analysis}}},
author = {Cambria, E.},
year = {2016},
month = mar,
journal = {IEEE Intelligent Systems},
volume = {31},
number = {2},
pages = {102--107},
issn = {1941-1294},
doi = {10.1109/MIS.2016.31},
abstract = {Understanding emotions is an important aspect of personal development and growth, and as such it is a key tile for the emulation of human intelligence. Besides being important for the advancement of AI, emotion processing is also important for the closely related task of polarity detection. The opportunity to automatically capture the general public's sentiments about social events, political movements, marketing campaigns, and product preferences has raised interest in both the scientific community, for the exciting open challenges, and the business world, for the remarkable fallouts in marketing and financial market prediction. This has led to the emerging fields of affective computing and sentiment analysis, which leverage human-computer interaction, information retrieval, and multimodal signal processing for distilling people's sentiments from the ever-growing amount of online social data.},
keywords = {affective computing,Affective computing,affective reasoning,emotion,emotion processing,emotion understanding,financial market prediction,human computer interaction,human intelligence emulation,human-computer interaction,information retrieval,intelligent systems,Knowledge based systems,marketing campaigns,multimodal signal processing,online social data,polarity detection,political movements,Pragmatics,public sentiments,scientific community,Semantics,sentiment analysis,Sentiment analysis,social events,social networking (online),Statistical analysis,Videos},
file = {/Users/timokoch/Zotero/storage/IZMLJUQY/Cambria - 2016 - Affective Computing and Sentiment Analysis.pdf;/Users/timokoch/Zotero/storage/DFCY8SQ5/7435182.html}
}
@article{carlierSearchStateTrait2022,
title = {In {{Search}} of {{State}} and {{Trait Emotion Markers}} in {{Mobile-Sensed Language}}: {{Field Study}}},
shorttitle = {In {{Search}} of {{State}} and {{Trait Emotion Markers}} in {{Mobile-Sensed Language}}},
author = {Carlier, Chiara and Niemeijer, Koen and Mestdagh, Merijn and Bauwens, Michael and Vanbrabant, Peter and Geurts, Luc and van Waterschoot, Toon and Kuppens, Peter},
year = {2022},
month = feb,
journal = {JMIR Mental Health},
volume = {9},
number = {2},
pages = {e31724},
publisher = {JMIR Publications Inc., Toronto, Canada},
doi = {10.2196/31724},
urldate = {2022-07-15},
abstract = {Background: Emotions and mood are important for overall well-being. Therefore, the search for continuous, effortless emotion prediction methods is an important field of study. Mobile sensing provides a promising tool and can capture one of the most telling signs of emotion: language. Objective: The aim of this study is to examine the separate and combined predictive value of mobile-sensed language data sources for detecting both momentary emotional experience as well as global individual differences in emotional traits and depression. Methods: In a 2-week experience sampling method study, we collected self-reported emotion ratings and voice recordings 10 times a day, continuous keyboard activity, and trait depression severity. We correlated state and trait emotions and depression and language, distinguishing between speech content (spoken words), speech form (voice acoustics), writing content (written words), and writing form (typing dynamics). We also investigated how well these features predicted state and trait emotions using cross-validation to select features and a hold-out set for validation. Results: Overall, the reported emotions and mobile-sensed language demonstrated weak correlations. The most significant correlations were found between speech content and state emotions and between speech form and state emotions, ranging up to 0.25. Speech content provided the best predictions for state emotions. None of the trait emotion--language correlations remained significant after correction. Among the emotions studied, valence and happiness displayed the most significant correlations and the highest predictive performance. Conclusions: Although using mobile-sensed language as an emotion marker shows some promise, correlations and predictive R2 values are low.},
copyright = {Unless stated otherwise, all articles are open-access distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work ("first published in the Journal of Medical Internet Research...") is properly cited with original URL and bibliographic citation information. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/A66EL3FT/Carlier et al. - 2022 - In Search of State and Trait Emotion Markers in Mo.pdf;/Users/timokoch/Zotero/storage/IUI62U7D/e31724.html}
}
@article{cowenMapping24Emotions2019,
title = {Mapping 24 Emotions Conveyed by Brief Human Vocalization},
author = {Cowen, Alan S. and Elfenbein, Hillary Anger and Laukka, Petri and Keltner, Dacher},
year = {2019},
month = sep,
journal = {The American Psychologist},
volume = {74},
number = {6},
pages = {698--712},
issn = {1935-990X},
doi = {10.1037/amp0000399},
abstract = {Emotional vocalizations are central to human social life. Recent studies have documented that people recognize at least 13 emotions in brief vocalizations. This capacity emerges early in development, is preserved in some form across cultures, and informs how people respond emotionally to music. What is poorly understood is how emotion recognition from vocalization is structured within what we call a semantic space, the study of which addresses questions critical to the field: How many distinct kinds of emotions can be expressed? Do expressions convey emotion categories or affective appraisals (e.g., valence, arousal)? Is the recognition of emotion expressions discrete or continuous? Guided by a new theoretical approach to emotion taxonomies, we apply large-scale data collection and analysis techniques to judgments of 2,032 emotional vocal bursts produced in laboratory settings (Study 1) and 48 found in the real world (Study 2) by U.S. English speakers (N = 1,105). We find that vocal bursts convey at least 24 distinct kinds of emotion. Emotion categories (sympathy, awe), more so than affective appraisals (including valence and arousal), organize emotion recognition. In contrast to discrete emotion theories, the emotion categories conveyed by vocal bursts are bridged by smooth gradients with continuously varying meaning. We visualize the complex, high-dimensional space of emotion conveyed by brief human vocalization within an online interactive map. (PsycINFO Database Record (c) 2019 APA, all rights reserved).},
langid = {english},
pmcid = {PMC6586540},
pmid = {30570267},
keywords = {Adolescent,Adult,Aged,Communication,Emotions,Female,Humans,Male,Middle Aged,Recognition Psychology,Semantics,Social Perception,Voice,Young Adult},
file = {/Users/timokoch/Zotero/storage/8H4LMREM/Cowen et al. - 2019 - Mapping 24 emotions conveyed by brief human vocali.pdf}
}
@article{cowenPrimacyCategoriesRecognition2019,
title = {The Primacy of Categories in the Recognition of 12 Emotions in Speech Prosody across Two Cultures},
author = {Cowen, Alan S. and Laukka, Petri and Elfenbein, Hillary Anger and Liu, Runjing and Keltner, Dacher},
year = {2019},
month = apr,
journal = {Nature Human Behaviour},
volume = {3},
number = {4},
pages = {369--382},
issn = {2397-3374},
doi = {10.1038/s41562-019-0533-6},
urldate = {2024-04-08},
abstract = {Central to emotion science is the degree to which categories, such as awe, or broader affective features, such as valence, underlie the recognition of emotional expression. To explore the processes by which people recognize emotion from prosody, US and Indian participants were asked to judge the emotion categories or affective features communicated by 2,519 speech samples produced by 100 actors from five cultures. With large-scale statistical inference methods, we find that prosody can communicate at least 12 distinct kinds of emotion that are preserved across the two cultures. Analyses of the semantic and acoustic structure of emotion recognition reveal that emotion categories drive emotion recognition more so than affective features, including valence. In contrast to discrete emotion theories, however, emotion categories are bridged by gradients representing blends of emotions. Our findings, visualized within an interactive map (https://s3-us-west-1.amazonaws.com/venec/map.html), reveal a complex, high-dimensional space of emotional states recognized cross-culturally in speech prosody.},
pmcid = {PMC6687085},
pmid = {30971794},
file = {/Users/timokoch/Zotero/storage/KXQDLLSH/Cowen et al. - 2019 - The primacy of categories in the recognition of 12.pdf}
}
@article{critchleyInteroceptionEmotion2017,
title = {Interoception and Emotion},
author = {Critchley, Hugo D and Garfinkel, Sarah N},
year = {2017},
month = oct,
journal = {Current Opinion in Psychology},
series = {Emotion},
volume = {17},
pages = {7--14},
issn = {2352-250X},
doi = {10.1016/j.copsyc.2017.04.020},
urldate = {2022-12-16},
abstract = {Influential theories suggest emotional feeling states arise from physiological changes from within the body. Interoception describes the afferent signalling, central processing, and neural and mental representation of internal bodily signals. Recent progress is made in conceptualizing interoception and its neural underpinnings. These developments are supported by empirical data concerning interoceptive mechanisms and their contribution to emotion. Fresh insights include description of short-term interoceptive effects on neural and mental processes (including fear-specific cardiac effects), the recognition of dissociable psychological dimensions of interoception, and models of interoceptive predictive coding that explain emotions and selfhood (reinforced by structural anatomical models and brain and experimental findings). This growing grasp of interoception is enriching our understanding of emotion and its disorders.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/9T6ZUA49/Critchley und Garfinkel - 2017 - Interoception and emotion.pdf}
}
@inproceedings{defrenEmotionalSpeechPerception2018,
title = {Emotional {{Speech Perception}}: {{A}} Set of Semantically Validated {{German}} Neutral and Emotionally Affective Sentences},
shorttitle = {Emotional {{Speech Perception}}},
booktitle = {9th {{International Conference}} on {{Speech Prosody}} 2018},
author = {Defren, Sabrina and {de Brito Castilho Wesseling}, Patricia and Allen, Shanley and Shakuf, Vered and {Ben-David}, Boaz and Lachmann, Thomas},
year = {2018},
month = jun,
pages = {714--718},
publisher = {ISCA},
doi = {10.21437/SpeechProsody.2018-145},
urldate = {2020-03-19},
abstract = {In order to address the complex interplay of prosody and semantics, a set of sentences were generated, suitable for investigating emotional speech perception in German. Forty-seven German native speakers rated the emotional content of sentences on a 6-point Likert scale. From a set of 54 sentences, 10-11 each could reliably be associated with one of four distinct emotions. The remaining 11 were assessed as neutral (expressing no emotion). The unambiguous assignment of semantic (emotional) content enables the study of prosody as an independent factor. Moreover, the sentences were balanced regarding average word frequency, average phonological neighborhood density, and number of syllables per sentence. This linguistic balance enables an unbiased evaluation of the roles of semantic content and prosody in emotional speech.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/7X46GSUP/Defren et al. - 2018 - Emotional Speech Perception A set of semantically.pdf}
}
@article{dejonckheereAssessingReliabilitySingleitem2022,
title = {Assessing the Reliability of Single-Item Momentary Affective Measurements in Experience Sampling.},
author = {Dejonckheere, Egon and Demeyer, Febe and Geusens, Birte and Piot, Maarten and Tuerlinckx, Francis and Verdonck, Stijn and Mestdagh, Merijn},
year = {2022},
month = dec,
journal = {Psychological Assessment},
volume = {34},
number = {12},
pages = {1138--1154},
issn = {1939-134X, 1040-3590},
doi = {10.1037/pas0001178},
urldate = {2023-01-24},
abstract = {In research on emotions in daily life, measurement error is often ignored because emotions are assessed with a single item to reduce participant burden. We introduce two retests procedures to determine how reliable such emotion ratings are and show that measurement error variance is too substantial to simply disregard.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/Q62ZVHN7/Dejonckheere et al. - 2022 - Assessing the reliability of single-item momentary.pdf}
}
@incollection{demetriouSelfReportQuestionnaires2015,
title = {Self-{{Report Questionnaires}}},
booktitle = {The {{Encyclopedia}} of {{Clinical Psychology}}},
author = {Demetriou, Constantina and Ozer, Bilge Uzun and Essau, Cecilia A.},
editor = {Cautin, Robin L. and Lilienfeld, Scott O.},
year = {2015},
month = jan,
pages = {1--6},
publisher = {John Wiley \& Sons, Inc.},
address = {Hoboken, NJ, USA},
doi = {10.1002/9781118625392.wbecp507},
urldate = {2019-03-07},
isbn = {978-1-118-62539-2},
langid = {english},
file = {/Users/timokoch/Zotero/storage/ASZUM2V3/Demetriou et al. - 2015 - Self-Report Questionnaires.pdf}
}
@misc{dengInterpretingTreeEnsembles2014,
title = {Interpreting {{Tree Ensembles}} with {{inTrees}}},
author = {Deng, Houtao},
year = {2014},
month = aug,
publisher = {arXiv},
eprint = {1408.5456},
primaryclass = {cs, stat},
urldate = {2021-06-18},
abstract = {Tree ensembles such as random forests and boosted trees are accurate but difficult to understand, debug and deploy. In this work, we provide the inTrees (interpretable trees) framework that extracts, measures, prunes and selects rules from a tree ensemble, and calculates frequent variable interactions. A rule-based learner, referred to as the simplified tree ensemble learner (STEL), can also be formed and used for future prediction. The inTrees framework can be applied to both classification and regression problems, and is applicable to many types of tree ensembles, e.g., random forests, regularized random forests, and boosted trees. We implemented the inTrees algorithms in the "inTrees" R package.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
file = {/Users/timokoch/Zotero/storage/GSLI2G3J/Deng - 2014 - Interpreting Tree Ensembles with inTrees.pdf;/Users/timokoch/Zotero/storage/X8DY5AFX/1408.html}
}
@article{dmelloReviewMetaAnalysisMultimodal2015,
title = {A {{Review}} and {{Meta-Analysis}} of {{Multimodal Affect Detection Systems}}},
author = {D'Mello, Sidney K. and Kory, Jacqueline},
year = {2015},
month = apr,
journal = {ACM Computing Surveys},
volume = {47},
number = {3},
pages = {1--36},
issn = {0360-0300, 1557-7341},
doi = {10.1145/2682899},
urldate = {2024-07-29},
abstract = {Affect detection is an important pattern recognition problem that has inspired researchers from several areas. The field is in need of a systematic review due to the recent influx of Multimodal (MM) affect detection systems that differ in several respects and sometimes yield incompatible results. This article provides such a survey via a quantitative review and meta-analysis of 90 peer-reviewed MM systems. The review indicated that the state of the art mainly consists of person-dependent models (62.2\% of systems) that fuse audio and visual (55.6\%) information to detect acted (52.2\%) expressions of basic emotions and simple dimensions of arousal and valence (64.5\%) with feature- (38.9\%) and decision-level (35.6\%) fusion techniques. However, there were also person-independent systems that considered additional modalities to detect nonbasic emotions and complex dimensions using model-level fusion techniques. The meta-analysis revealed that MM systems were consistently (85\% of systems) more accurate than their best unimodal counterparts, with an average improvement of 9.83\% (median of 6.60\%). However, improvements were three times lower when systems were trained on natural (4.59\%) versus acted data (12.7\%). Importantly, MM accuracy could be accurately predicted (cross-validated R2 of 0.803) from unimodal accuracies and two system-level factors. Theoretical and applied implications and recommendations are discussed.},
langid = {english}
}
@inproceedings{dubeyBigEARInferringAmbient2016,
title = {{{BigEAR}}: {{Inferring}} the {{Ambient}} and {{Emotional Correlates}} from {{Smartphone-Based Acoustic Big Data}}},
shorttitle = {{{BigEAR}}},
booktitle = {2016 {{IEEE First International Conference}} on {{Connected Health}}: {{Applications}}, {{Systems}} and {{Engineering Technologies}} ({{CHASE}})},
author = {Dubey, Harishchandra and Mehl, Matthias R. and Mankodiya, Kunal},
year = {2016},
month = jun,
pages = {78--83},
doi = {10.1109/CHASE.2016.46},
abstract = {This paper presents a novel BigEAR big data framework that employs psychological audio processing chain (PAPC) to process smartphone-based acoustic big data collected when the user performs social conversations in naturalistic scenarios. The overarching goal of BigEAR is to identify moods of the wearer from various activities such as laughing, singing, crying, arguing, and sighing. These annotations are based on ground truth relevant for psychologists who intend to monitor/infer the social context of individuals coping with breast cancer. We pursued a case study on couples coping with breast cancer to know how the conversations affect emotional and social well being. In the state-of-the-art methods, psychologists and their team have to hear the audio recordings for making these inferences by subjective evaluations that not only are time-consuming and costly, but also demand manual data coding for thousands of audio files. The BigEAR framework automates the audio analysis. We computed the accuracy of BigEAR with respect to the ground truth obtained from a human rater. Our approach yielded overall average accuracy of 88.76\% on real-world data from couples coping with breast cancer.},
keywords = {Acoustics,Big data,Breast cancer,Feature extraction,Mood,Speech},
file = {/Users/timokoch/Zotero/storage/CXDEKMQV/Dubey et al. - 2016 - BigEAR Inferring the Ambient and Emotional Correl.pdf;/Users/timokoch/Zotero/storage/RFFHULEF/7545817.html}
}
@article{dukesRiseAffectivism2021,
title = {The Rise of Affectivism},
author = {Dukes, Daniel and Abrams, Kathryn and Adolphs, Ralph and Ahmed, Mohammed E. and Beatty, Andrew and Berridge, Kent C. and Broomhall, Susan and Brosch, Tobias and Campos, Joseph J. and Clay, Zanna and Cl{\'e}ment, Fabrice and Cunningham, William A. and Damasio, Antonio and Damasio, Hanna and D'Arms, Justin and Davidson, Jane W. and de Gelder, Beatrice and Deonna, Julien and de Sousa, Ronnie and Ekman, Paul and Ellsworth, Phoebe C. and Fehr, Ernst and Fischer, Agneta and Foolen, Ad and Frevert, Ute and Grandjean, Didier and Gratch, Jonathan and Greenberg, Leslie and Greenspan, Patricia and Gross, James J. and Halperin, Eran and Kappas, Arvid and Keltner, Dacher and Knutson, Brian and Konstan, David and Kret, Mariska E. and LeDoux, Joseph E. and Lerner, Jennifer S. and Levenson, Robert W. and Loewenstein, George and Manstead, Antony S. R. and Maroney, Terry A. and Moors, Agnes and Niedenthal, Paula and Parkinson, Brian and Pavlidis, Ioannis and Pelachaud, Catherine and Pollak, Seth D. and Pourtois, Gilles and {Roettger-Roessler}, Birgitt and Russell, James A. and Sauter, Disa and Scarantino, Andrea and Scherer, Klaus R. and Stearns, Peter and Stets, Jan E. and Tappolet, Christine and Teroni, Fabrice and Tsai, Jeanne and Turner, Jonathan and Reekum, Carien Van and Vuilleumier, Patrik and Wharton, Tim and Sander, David},
year = {2021},
month = jun,
journal = {Nature Human Behaviour},
pages = {1--5},
publisher = {Nature Publishing Group},
issn = {2397-3374},
doi = {10.1038/s41562-021-01130-8},
urldate = {2021-06-15},
abstract = {Research over the past decades has demonstrated the explanatory power of emotions, feelings, motivations, moods, and other affective processes when trying to understand and predict how we think and behave. In this consensus article, we ask: has the increasingly recognized impact of affective phenomena ushered in a new era, the era of affectivism?},
copyright = {2021 Springer Nature Limited},
langid = {english},
file = {/Users/timokoch/Zotero/storage/R4JKCVNS/Dukes et al. - 2021 - The rise of affectivism.pdf;/Users/timokoch/Zotero/storage/HQZ8GRRA/s41562-021-01130-8.html}
}
@article{ekmanArgumentBasicEmotions1992,
title = {An Argument for Basic Emotions},
author = {Ekman, Paul},
year = {1992},
month = may,
journal = {Cognition and Emotion},
volume = {6},
number = {3-4},
pages = {169--200},
publisher = {Routledge},
issn = {0269-9931},
doi = {10.1080/02699939208411068},
urldate = {2021-11-24},
abstract = {Emotions are viewed as having evolved through their adaptive value in dealing with fundamental life-tasks. Each emotion has unique features: signal, physiology, and antecedent events. Each emotion also has characteristics in common with other emotions: rapid onset, short duration, unbidden occurrence, automatic appraisal, and coherence among responses. These shared and unique characteristics are the product of our evolution, and distinguish emotions from other affective phenomena.},
file = {/Users/timokoch/Zotero/storage/8APFPSCL/02699939208411068.html}
}
@article{ekmanRepertoireNonverbalBehavior1969,
title = {The {{Repertoire}} of {{Nonverbal Behavior}}: {{Categories}}, {{Origins}}, {{Usage}}, and {{Coding}}},
shorttitle = {The {{Repertoire}} of {{Nonverbal Behavior}}},
author = {Ekman, Paul and Friesen, Wallace V.},
year = {1969},
month = jan,
journal = {Semiotica},
volume = {1},
number = {1},
pages = {49--98},
publisher = {De Gruyter Mouton},
issn = {1613-3692},
doi = {10.1515/semi.1969.1.1.49},
urldate = {2024-09-26},
abstract = {The article The Repertoire of Nonverbal Behavior: Categories, Origins, Usage, and Coding was published on January 1, 1969, in the journal Semiotica (Volume 1, Issue 1).},
copyright = {De Gruyter expressly reserves the right to use all content for commercial text and data mining within the meaning of Section 44b of the German Copyright Act.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/V2JLAI64/Ekman and Friesen - 1969 - The Repertoire of Nonverbal Behavior Categories, .pdf}
}
@article{elayadiSurveySpeechEmotion2011,
title = {Survey on Speech Emotion Recognition: {{Features}}, Classification Schemes, and Databases},
shorttitle = {Survey on Speech Emotion Recognition},
author = {El Ayadi, Moataz and Kamel, Mohamed S. and Karray, Fakhri},
year = {2011},
month = mar,
journal = {Pattern Recognition},
volume = {44},
number = {3},
pages = {572--587},
issn = {0031-3203},
doi = {10.1016/j.patcog.2010.09.020},
urldate = {2024-09-17},
abstract = {Recently, increasing attention has been directed to the study of the emotional content of speech signals, and hence, many systems have been proposed to identify the emotional content of a spoken utterance. This paper is a survey of speech emotion classification addressing three important aspects of the design of a speech emotion recognition system. The first one is the choice of suitable features for speech representation. The second issue is the design of an appropriate classification scheme and the third issue is the proper preparation of an emotional speech database for evaluating system performance. Conclusions about the performance and limitations of current speech emotion recognition systems are discussed in the last section of this survey. This section also suggests possible ways of improving speech emotion recognition systems.},
keywords = {Archetypal emotions,Dimensionality reduction techniques,Emotional speech databases,Speech emotion recognition,Statistical classifiers},
file = {/Users/timokoch/Zotero/storage/APM9HW8Y/S0031320310004619.html}
}
@article{eybenGenevaMinimalisticAcoustic2016,
title = {The {{Geneva Minimalistic Acoustic Parameter Set}} ({{GeMAPS}}) for {{Voice Research}} and {{Affective Computing}}},
author = {Eyben, Florian and Scherer, Klaus R. and Schuller, Bj{\"o}rn and Sundberg, Johan and Andre, Elisabeth and Busso, Carlos and Devillers, Laurence Y. and Epps, Julien and Laukka, Petri and Narayanan, Shrikanth S. and Truong, Khiet P.},
year = {2016},
month = apr,
journal = {IEEE Transactions on Affective Computing},
volume = {7},
number = {2},
pages = {190--202},
issn = {1949-3045},
doi = {10.1109/TAFFC.2015.2457417},
urldate = {2019-02-20},
abstract = {Work on voice sciences over recent decades has led to a proliferation of acoustic parameters that are used quite selectively and are not always extracted in a similar fashion. With many independent teams working in different research areas, shared standards become an essential safeguard to ensure compliance with state-of-the-art methods allowing appropriate comparison of results across studies and potential integration and combination of extraction and recognition systems. In this paper we propose a basic standard acoustic parameter set for various areas of automatic voice analysis, such as paralinguistic or clinical speech analysis. In contrast to a large brute-force parameter set, we present a minimalistic set of voice parameters here. These were selected based on a) their potential to index affective physiological changes in voice production, b) their proven value in former studies as well as their automatic extractability, and c) their theoretical significance. The set is intended to provide a common baseline for evaluation of future research and eliminate differences caused by varying parameter sets or even different implementations of the same parameters. Our implementation is publicly available with the openSMILE toolkit. Comparative evaluations of the proposed feature set and large baseline feature sets of INTERSPEECH challenges show a high performance of the proposed set in relation to its size.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/CJPU7SQG/Eyben et al. - 2016 - The Geneva Minimalistic Acoustic Parameter Set (Ge.pdf}
}
@inproceedings{eybenOpensmileMunichVersatile2010,
title = {{{openSMILE}}: The Munich Versatile and Fast Open-Source Audio Feature Extractor},
shorttitle = {{{openSMILE}}},
booktitle = {Proceedings of the International Conference on {{Multimedia}} - {{MM}} '10},
author = {Eyben, Florian and W{\"o}llmer, Martin and Schuller, Bj{\"o}rn},
year = {2010},
pages = {1459--1462},
publisher = {ACM Press},
address = {Firenze, Italy},
doi = {10.1145/1873951.1874246},
urldate = {2021-11-09},
abstract = {We introduce the openSMILE feature extraction toolkit, which unites feature extraction algorithms from the speech processing and the Music Information Retrieval communities. Audio low-level descriptors such as CHROMA and CENS features, loudness, Mel-frequency cepstral coefficients, perceptual linear predictive cepstral coefficients, linear predictive coefficients, line spectral frequencies, fundamental frequency, and formant frequencies are supported. Delta regression and various statistical functionals can be applied to the low-level descriptors. openSMILE is implemented in C++ with no third-party dependencies for the core functionality. It is fast, runs on Unix and Windows platforms, and has a modular, component based architecture which makes extensions via plug-ins easy. It supports on-line incremental processing for all implemented features as well as off-line and batch processing. Numeric compatibility with future versions is ensured by means of unit tests. openSMILE can be downloaded from http://opensmile.sourceforge.net/.},
isbn = {978-1-60558-933-6},
langid = {english},
file = {/Users/timokoch/Zotero/storage/F5KPA499/Eyben et al. - 2010 - Opensmile the munich versatile and fast open-sour.pdf}
}
@inproceedings{eybenRecentDevelopmentsOpenSMILE2013,
title = {Recent {{Developments}} in {{openSMILE}}, the {{Munich Open-source Multimedia Feature Extractor}}},
booktitle = {Proceedings of the 21st {{ACM International Conference}} on {{Multimedia}}},
author = {Eyben, Florian and Weninger, Felix and Gross, Florian and Schuller, Bj{\"o}rn},
year = {2013},
series = {{{MM}} '13},
pages = {835--838},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/2502081.2502224},
urldate = {2018-11-21},
abstract = {We present recent developments in the openSMILE feature extraction toolkit. Version 2.0 now unites feature extraction paradigms from speech, music, and general sound events with basic video features for multi-modal processing. Descriptors from audio and video can be processed jointly in a single framework allowing for time synchronization of parameters, on-line incremental processing as well as off-line and batch processing, and the extraction of statistical functionals (feature summaries), such as moments, peaks, regression parameters, etc. Postprocessing of the features includes statistical classifiers such as support vector machine models or file export for popular toolkits such as Weka or HTK. Available low-level descriptors include popular speech, music and video features including Mel-frequency and similar cepstral and spectral coefficients, Chroma, CENS, auditory model based loudness, voice quality, local binary pattern, color, and optical flow histograms. Besides, voice activity detection, pitch tracking and face detection are supported. openSMILE is implemented in C++, using standard open source libraries for on-line audio and video input. It is fast, runs on Unix and Windows platforms, and has a modular, component based architecture which makes extensions via plug-ins easy. openSMILE 2.0 is distributed under a research license and can be downloaded from http://opensmile.sourceforge.net/.},
isbn = {978-1-4503-2404-5},
keywords = {acoustic features,affect recognition,affective computing,audio features,computational paralinguistics,feature extraction,machine learning,multimedia analysis,openSMILE,video features,visual features},
file = {/Users/timokoch/Zotero/storage/892EJHYH/Eyben et al. - 2013 - Recent Developments in openSMILE, the Munich Open-.pdf}
}
@article{fairbanksExperimentalStudyPitch1939,
title = {An Experimental Study of the Pitch Characteristics of the Voice during the Expression of Emotions},
author = {Fairbanks, G. and Pronovost, W.},
year = {1939},
journal = {Speech Monographs},
volume = {6},
pages = {87--104},
publisher = {Taylor \& Francis},
address = {United Kingdom},
issn = {0038-7169},
doi = {10.1080/03637753909374863},
abstract = {The pitch characteristics of simulated emotions were investigated to determine their distinguishing characteristics. Six actors read for recording selections whose content facilitated the expression of anger, contempt, fear, grief, and indifference. All selections contained the same brief test section. Although not itself having a single inherent affective meaning, this recorded test section, when separated from the main selection, was accurately identified by high percentages of a group of 64 observers, averaging 84\% for contempt, 78\% for anger, 66\% for fear, 78\% for grief, and 88\% for indifference. The most accurately identified example for each emotion was analyzed. These five were all identified by 94\% or more of the subjects. Tables and graphs are presented to demonstrate that measurable pitch characteristics distinguish these expressions of emotion from each other. Variations in pitch level, inflections, shifts, and frequency of pitch changes are described. (PsycINFO Database Record (c) 2017 APA, all rights reserved)},
file = {/Users/timokoch/Zotero/storage/Y3S3GGQ5/1940-02497-001.html}
}
@article{fanHowWellCan2023,
title = {How Well Can an {{AI}} Chatbot Infer Personality? {{Examining}} Psychometric Properties of Machine-Inferred Personality Scores},
shorttitle = {How Well Can an {{AI}} Chatbot Infer Personality?},
author = {Fan, Jinyan and Sun, Tianjun and Liu, Jiayi and Zhao, Teng and Zhang, Bo and Chen, Zheng and Glorioso, Melissa and Hack, Elissa},
year = {2023},
journal = {Journal of Applied Psychology},
publisher = {American Psychological Association},
address = {US},
issn = {1939-1854},
doi = {10.1037/apl0001082},
abstract = {The present study explores the plausibility of measuring personality indirectly through an artificial intelligence (AI) chatbot. This chatbot mines various textual features from users' free text responses collected during an online conversation/interview and then uses machine learning algorithms to infer personality scores. We comprehensively examine the psychometric properties of the machine-inferred personality scores, including reliability (internal consistency, split-half, and test--retest), factorial validity, convergent and discriminant validity, and criterion-related validity. Participants were undergraduate students (n = 1,444) enrolled in a large southeastern public university in the United States who completed a self-report Big Five personality measure (IPIP-300) and engaged with an AI chatbot for approximately 20--30 min. In a subsample (n = 407), we obtained participants' cumulative grade point averages from the University Registrar and had their peers rate their college adjustment. In an additional sample (n = 61), we obtained test--retest data. Results indicated that machine-inferred personality scores (a) had overall acceptable reliability at both the domain and facet levels, (b) yielded a comparable factor structure to self-reported questionnaire-derived personality scores, (c) displayed good convergent validity but relatively poor discriminant validity (averaged convergent correlations = .48 vs. averaged machine-score correlations = .35 in the test sample), (d) showed low criterion-related validity, and (e) exhibited incremental validity over self-reported questionnaire-derived personality scores in some analyses. In addition, there was strong evidence for cross-sample generalizability of psychometric properties of machine scores. Theoretical implications, future research directions, and practical considerations are discussed. (PsycInfo Database Record (c) 2023 APA, all rights reserved)},
keywords = {Artificial Intelligence,Convergent Validity,Conversational Agents,Criterion Validity,Discriminant Validity,Factorial Validity,Internal Consistency,Machine Learning,Personality,Split-Half Reliability,Test Reliability,Test Scores,Test Validity,Test-Retest Reliability},
file = {/Users/timokoch/Zotero/storage/B2ZFHA52/2023-43379-001.html}
}
@article{frickCommunicatingEmotionRole,
title = {Communicating {{Emotion}}: {{The Role}} of {{Prosodic Features}}},
author = {Frick, Robert W.},
year = {1985},
journal = {Psychological Bulletin},
volume = {97},
pages = {412--429},
langid = {english},
file = {/Users/timokoch/Zotero/storage/A9XPRU8L/Frick - Communicating Emotion The Role of Prosodic Featur.pdf}
}
@article{friedmanRegularizationPathsGeneralized2010,
title = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob},
year = {2010},
journal = {Journal of Statistical Software},
volume = {33},
number = {1},
pages = {1--22}
}
@inproceedings{gaoInvestigatingReliabilitySelfreport2021,
title = {Investigating the {{Reliability}} of {{Self-report Data}} in the {{Wild}}: {{The Quest}} for {{Ground Truth}}},
shorttitle = {Investigating the {{Reliability}} of {{Self-report Data}} in the {{Wild}}},
booktitle = {Adjunct {{Proceedings}} of the 2021 {{ACM International Joint Conference}} on {{Pervasive}} and {{Ubiquitous Computing}} and {{Proceedings}} of the 2021 {{ACM International Symposium}} on {{Wearable Computers}}},
author = {Gao, Nan and Saiedur Rahaman, Mohammad and Shao, Wei and Salim, Flora D},
year = {2021},
month = sep,
series = {{{UbiComp}} '21},
pages = {237--242},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
doi = {10.1145/3460418.3479338},
urldate = {2022-12-16},
abstract = {Inferring human mental state (e.g., emotion, depression, engagement) with sensing technology is one of the most valuable challenges in the affective computing area, which has a profound impact in all industries interacting with humans. Self-report is the most common way to quantify how people think, but prone to subjectivity and various responses bias. It is usually used as the ground truth for human mental state prediction. In recent years, many data-driven machine learning models are built based on self-report annotations as the target value. In this research, we investigate the reliability of self-report data in the wild by studying the confidence level of responses and survey completion time. We conduct a case study (i.e., student engagement inference) by recruiting 23 students in a high school setting over a period of 4 weeks. Overall, our participants volunteered 488 self-reported responses and sensing data from smart wristbands. We find that the physiologically measured student engagement and perceived student engagement are not always consistent. The findings from this research have great potential to benefit future studies in predicting engagement, depression, stress, and other emotion-related states in the field of affective computing and sensing technologies.},
isbn = {978-1-4503-8461-2},
keywords = {Ecological Momentary Assessment,Emotion Prediction,Field Study,Ground Truth,Physiological Signals,Reliability,Self-report Measures},
file = {/Users/timokoch/Zotero/storage/4VUXHE2Z/Gao et al. - 2021 - Investigating the Reliability of Self-report Data .pdf;/Users/timokoch/Zotero/storage/FTGIAIKH/Gao et al. - 2021 - Investigating the Reliability of Self-report Data .pdf}
}
@article{geldhofReliabilityEstimationMultilevel2014,
title = {Reliability Estimation in a Multilevel Confirmatory Factor Analysis Framework.},
author = {Geldhof, G. John and Preacher, Kristopher J. and Zyphur, Michael J.},
year = {2014},
month = mar,
journal = {Psychological Methods},
volume = {19},
number = {1},
pages = {72--91},
issn = {1939-1463, 1082-989X},
doi = {10.1037/a0032138},
urldate = {2021-11-26},
abstract = {Scales with varying degrees of measurement reliability are often used in the context of multistage sampling, where variance exists at multiple levels of analysis (e.g., individual and group). Because methodological guidance on assessing and reporting reliability at multiple levels of analysis is currently lacking, we discuss the importance of examining level-specific reliability. We present a simulation study and an applied example showing different methods for estimating multilevel reliability using multilevel confirmatory factor analysis and provide supporting Mplus program code. We conclude that (a) single-level estimates will not reflect a scale's actual reliability unless reliability is identical at each level of analysis, (b) 2-level alpha and composite reliability (omega) perform relatively well in most settings, (c) estimates of maximal reliability (H) were more biased when estimated using multilevel data than either alpha or omega, and (d) small cluster size can lead to overestimates of reliability at the between level of analysis. We also show that Monte Carlo confidence intervals and Bayesian credible intervals closely reflect the sampling distribution of reliability estimates under most conditions. We discuss the estimation of credible intervals using Mplus and provide R code for computing Monte Carlo confidence intervals.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/3IFXKWIB/Geldhof et al. - 2014 - Reliability estimation in a multilevel confirmator.pdf}
}
@article{giordanoRepresentationalDynamicsPerceived2021,
title = {The Representational Dynamics of Perceived Voice Emotions Evolve from Categories to Dimensions},
author = {Giordano, Bruno L. and Whiting, Caroline and Kriegeskorte, Nikolaus and Kotz, Sonja A. and Gross, Joachim and Belin, Pascal},
year = {2021},
month = sep,
journal = {Nature Human Behaviour},
volume = {5},
number = {9},
pages = {1203--1213},
publisher = {Nature Publishing Group},
issn = {2397-3374},
doi = {10.1038/s41562-021-01073-0},
urldate = {2021-11-23},
abstract = {Long-standing affective science theories conceive the perception of emotional stimuli either as discrete categories (for example, an angry voice) or continuous dimensional attributes (for example, an intense and negative vocal emotion). Which position provides a better account is still widely debated. Here we contrast the positions to account for acoustics-independent perceptual and cerebral representational geometry of perceived voice emotions. We combined multimodal imaging of the cerebral response to heard vocal stimuli (using functional magnetic resonance imaging and magneto-encephalography) with post-scanning behavioural assessment of voice emotion perception. By using representational similarity analysis, we find that categories prevail in perceptual and early (less than 200\,ms) frontotemporal cerebral representational geometries and that dimensions impinge predominantly on a later limbic--temporal network (at 240\,ms and after 500\,ms). These results reconcile the two opposing views by reframing the perception of emotions as the interplay of cerebral networks with different representational dynamics that emphasize either categories or dimensions.},
copyright = {2021 The Author(s), under exclusive licence to Springer Nature Limited},
langid = {english},
keywords = {Human behaviour,Limbic system},
file = {/Users/timokoch/Zotero/storage/2UQVIJNH/Giordano et al. - 2021 - The representational dynamics of perceived voice e.pdf;/Users/timokoch/Zotero/storage/4W7IG5DY/s41562-021-01073-0.html}
}
@inproceedings{goronImprovingDomainGeneralization2024,
title = {Improving {{Domain Generalization}} in {{Speech Emotion Recognition}} with {{Whisper}}},
booktitle = {{{ICASSP}} 2024 - 2024 {{IEEE International Conference}} on {{Acoustics}}, {{Speech}} and {{Signal Processing}} ({{ICASSP}})},
author = {Goron, Erik and Asai, Lena and Rut, Elias and Dinov, Martin},
year = {2024},
month = apr,
pages = {11631--11635},
issn = {2379-190X},
doi = {10.1109/ICASSP48485.2024.10446997},
urldate = {2024-11-18},
abstract = {Transformers have been used successfully in a variety of settings, including Speech Emotion Recognition (SER). However, use of the latest transformer base models in domain generalization (DG) settings has mostly been unexplored or only weakly explored. We present here our state-of-the-art results in discrete emotion recognition across a variety of datasets, including acted and non-acted datasets, showing that Whisper is a powerful base Transformer model for this task. We show that our approach to DG with Whisper results in accuracy surpassing all previously published results, with an Unweighted Average Recall (UAR) of 74.5\% averaged across the 6 distinct datasets used. We discuss some of the possible reasons behind Whisper's superior performance to other Transformer models, though all 3 Transformer models evaluated here (HuBERT, WavLM, Whisper) show an ability to generalize as well as learn paralinguistic information successfully through fine-tuning with relatively few examples.},
keywords = {Data models,Domain Generalization,Emotion recognition,Explainable AI,HuBERT,Signal processing,Signal processing algorithms,Speech Emotion Recognition (SER),Speech recognition,Transformers,Whisper},
file = {/Users/timokoch/Zotero/storage/XNCE4III/Goron et al. - 2024 - Improving Domain Generalization in Speech Emotion .pdf}
}
@article{gotzUsersMainSmartphone2017,
title = {Users of the Main Smartphone Operating Systems ({{iOS}}, {{Android}}) Differ Only Little in Personality},
author = {G{\"o}tz, Friedrich M. and Stieger, Stefan and Reips, Ulf-Dietrich},
year = {2017},
month = may,
journal = {PLOS ONE},
volume = {12},
number = {5},
pages = {e0176921},
publisher = {Public Library of Science},
issn = {1932-6203},
doi = {10.1371/journal.pone.0176921},
urldate = {2023-02-21},
abstract = {The increasingly widespread use of mobile phone applications (apps) as research tools and cost-effective means of vast data collection raises new methodological challenges. In recent years, it has become a common practice for scientists to design apps that run only on a single operating system, thereby excluding large numbers of users who use a different operating system. However, empirical evidence investigating any selection biases that might result thereof is scarce. Henceforth, we conducted two studies drawing from a large multi-national (Study 1; N = 1,081) and a German-speaking sample (Study 2; N = 2,438). As such Study 1 compared iOS and Android users across an array of key personality traits (i.e., well-being, self-esteem, willingness to take risks, optimism, pessimism, Dark Triad, and the Big Five). Focusing on Big Five personality traits in a broader scope, in addition to smartphone users, Study 2 also examined users of the main computer operating systems (i.e., Mac OS, Windows). In both studies, very few significant differences were found, all of which were of small or even tiny effect size mostly disappearing after sociodemographics had been controlled for. Taken together, minor differences in personality seem to exist, but they are of small to negligible effect size (ranging from OR = 0.919 to 1.344 (Study 1), {$\eta_p^2$} = .005 to .036 (Study 2), respectively) and may reflect differences in sociodemographic composition, rather than operating system of smartphone users.},
langid = {english},
keywords = {Apps,Cell phones,Educational attainment,Operating systems,Personality,Personality traits,Psychology,Questionnaires},
file = {/Users/timokoch/Zotero/storage/LLGKVI2U/Götz et al. - 2017 - Users of the main smartphone operating systems (iO.pdf}
}
@inproceedings{grimmVeraAmMittag2008,
title = {The {{Vera}} Am {{Mittag German}} Audio-Visual Emotional Speech Database},
booktitle = {2008 {{IEEE International Conference}} on {{Multimedia}} and {{Expo}}},
author = {Grimm, Michael and Kroschel, Kristian and Narayanan, Shrikanth},
year = {2008},
month = jun,
pages = {865--868},
issn = {1945-788X},
doi = {10.1109/ICME.2008.4607572},
abstract = {The lack of publicly available annotated databases is one of the major barriers to research advances on emotional information processing. In this contribution we present a recently collected database of spontaneous emotional speech in German which is being made available to the research community. The database consists of 12 hours of audio-visual recordings of the German TV talk show ``Vera am Mittag'', segmented into broadcasts, dialogue acts and utterances. This corpus contains spontaneous and very emotional speech recorded from unscripted, authentic discussions between the guests of the talk show. In addition to the audio-visual data and the segmented utterances we provide emotion labels for a great part of the data. The emotion labels are given on a continuous valued scale for three emotion primitives: valence, activation and dominance, using a large number of human evaluators. Such data is of great interest to all research groups working on spontaneous speech analysis, emotion recognition in both speech and facial expression, natural language understanding, and robust speech recognition.},
keywords = {Data acquisition,Databases,Emotion recognition,Histograms,Speech,Speech analysis,Speech processing,Speech recognition,TV,Video signal processing},
file = {/Users/timokoch/Zotero/storage/DYTHT82Y/4607572.html}
}
@article{grossDissociationEmotionExpression2000,
title = {The {{Dissociation}} of {{Emotion Expression}} from {{Emotion Experience}}: {{A Personality Perspective}}},
shorttitle = {The {{Dissociation}} of {{Emotion Expression}} from {{Emotion Experience}}},
author = {Gross, James J. and John, Oliver P. and Richards, Jane M.},
year = {2000},
month = aug,
journal = {Personality and Social Psychology Bulletin},
volume = {26},
number = {6},
pages = {712--726},
issn = {0146-1672, 1552-7433},
doi = {10.1177/0146167200268006},
urldate = {2021-11-23},
abstract = {When we want to know what others are feeling, we look to the face for clues. However, individual differences matter: Some faces are more expressive than others. Do both emotion experience and dispositional expressivity predict emotion expression? Based on an analysis of display rules, the authors hypothesized that expressivity would moderate the relation between experience and expression for negative, but not for positive, emotion. Study 1 examined the relation between habitual emotion experience and peer-rated expressive behavior and showed the predicted moderator effect for negative emotion: Experience was related to expression only for dispositionally high-expressivity participants, not for low-expressivity participants. For positive emotion, however, experience was related to expression for both groups. Study 2 replicated these findings using momentary emotion experience and objectively coded expressive behavior during films that elicited amusement and sadness. Results are interpreted in terms of low-expressivity individuals' propensity to dynamically regulate negative emotion-expressive behavior.},
langid = {english},
file = {/Users/timokoch/Zotero/storage/ELFDBIQF/Gross et al. - 2000 - The Dissociation of Emotion Expression from Emotio.pdf}
}
@article{grossedetersKeepScrollingUsing2024,
title = {Keep on Scrolling? {{Using}} Intensive Longitudinal Smartphone Sensing Data to Assess How Everyday Smartphone Usage Behaviors Are Related to Well-Being},
shorttitle = {Keep on Scrolling?},
author = {{gro{\ss}e Deters}, Fenne and Schoedel, Ramona},
year = {2024},
month = jan,
journal = {Computers in Human Behavior},
volume = {150},
pages = {107977},
issn = {0747-5632},
doi = {10.1016/j.chb.2023.107977},
urldate = {2024-04-25},
abstract = {Smartphones are an integral part of daily life for many people worldwide. However, concerns have been raised that long usage times and the fragmentation of daily life through smartphone usage are detrimental to well-being. This preregistered study assesses (1) whether differences in smartphone usage behaviors between individuals predict differences in a variety of well-being measures (between-person effects) and (2) whether differences in smartphone usage behaviors between situations predict whether an individual is feeling better or worse (within-person effects). In addition to total usage time, several indicators capturing the fragmentation of usage/nonusage time were developed. The study combines objectively measured smartphone usage with self-reports of well-being in surveys (N~=~236) and an experience sampling period (N~=~378, n~=~5775 datapoints). To ensure the robustness of the results, we replicated our analyses in a second measurement period (surveys: N~=~305; experience sampling: N~=~534, n~=~7287 datapoints) and considered the pattern of effects across different operational definitions and constructs. Results show that individuals who use their smartphone more report slightly lower well-being (between-person effect) but no evidence for within-person effects of total usage time emerged. With respect to fragmentation, we found no robust association with well-being.},
keywords = {Experience sampling,Fragmentation,Psychological well-being,Smartphone sensing,Smartphone usage},
file = {/Users/timokoch/Zotero/storage/76K5IPUM/S074756322300328X.html}
}
@article{grossRevealingFeelingsFacets1997,
title = {Revealing Feelings: {{Facets}} of Emotional Expressivity in Self-Reports, Peer Ratings, and Expressive Behavior},
shorttitle = {Revealing Feelings},
author = {Gross, James J. and John, Oliver P.},
year = {1997},
journal = {Journal of Personality and Social Psychology},
pages = {434--447},
abstract = {Drawing on an explicit model of emotion, we propose a multifaceted approach to emotional expressivity, defined as the behavioral (e.g., facial, postural) changes associated with emotion. Study 1 shows that self-reported expressivity has 3 facets (Impulse Strength, Negative Expressivity, Positive Expressivity). Study 2 shows that the same 3 facets emerge in peer ratings and that there are robust relations between self- and peer-rated expressivity. In Study 3, emotion-expressive behavior was videotaped and related to expressivity self-reports obtained several months earlier. As expected, Negative Expressivity predicted behavioral expressions of sadness (but not amusement), whereas Positive Expressivity predicted amusement (but not sadness). These relations remained even when subjective emotional experience and physiological response were controlled. These studies demonstrate the importance of a multifaceted approach to emotional expressivity and have implications for the understanding of personality and emotion.},
file = {/Users/timokoch/Zotero/storage/FD7GEKYZ/Gross und John - 1997 - Revealing feelings Facets of emotional expressivi.pdf;/Users/timokoch/Zotero/storage/3VIGAGEF/summary.html}
}
@article{hanelStudentSamplesProvide2016,
title = {Do {{Student Samples Provide}} an {{Accurate Estimate}} of the {{General Public}}?},
author = {Hanel, Paul H. P. and Vione, Katia C.},
year = {2016},
month = dec,
journal = {PLoS ONE},
volume = {11},
number = {12},
pages = {e0168354},
issn = {1932-6203},
doi = {10.1371/journal.pone.0168354},
urldate = {2023-02-15},
abstract = {Most psychological studies rely on student samples. Students are usually considered as more homogenous than representative samples both within and across countries. However, little is known about the nature of the differences between student and representative samples. This is an important gap, also because knowledge about the degree of difference between student and representative samples may allow to infer from the former to the latter group. Across 59 countries and 12 personality (Big-5) and attitudinal variables we found that differences between students and general public were partly substantial, incoherent, and contradicted previous findings. Two often used cultural variables, embeddedness and intellectual autonomy, failed to explain the differences between both groups across countries. We further found that students vary as much as the general population both between and within countries. In summary, our results indicate that generalizing from students to the general public can be problematic when personal and attitudinal variables are used, as students vary mostly randomly from the general public. Findings are also discussed in terms of the replication crisis within psychology.},
pmcid = {PMC5176168},
pmid = {28002494},
file = {/Users/timokoch/Zotero/storage/MGHRSBWN/Hanel und Vione - 2016 - Do Student Samples Provide an Accurate Estimate of.pdf}
}
@article{harariUsingSmartphonesCollect2016,
title = {Using {{Smartphones}} to {{Collect Behavioral Data}} in {{Psychological Science}}: {{Opportunities}}, {{Practical Considerations}}, and {{Challenges}}},
shorttitle = {Using {{Smartphones}} to {{Collect Behavioral Data}} in {{Psychological Science}}},
author = {Harari, Gabriella M. and Lane, Nicholas D. and Wang, Rui and Crosier, Benjamin S. and Campbell, Andrew T. and Gosling, Samuel D.},
year = {2016},
month = nov,
journal = {Perspectives on Psychological Science},
volume = {11},
number = {6},
pages = {838--854},
issn = {1745-6916},
doi = {10.1177/1745691616650285},
urldate = {2021-06-30},
abstract = {Smartphones now offer the promise of collecting behavioral data unobtrusively, in situ, as it unfolds in the course of daily life. Data can be collected from the onboard sensors and other phone logs embedded in today's off-the-shelf smartphone devices. These data permit fine-grained, continuous collection of people's social interactions (e.g., speaking rates in conversation, size of social groups, calls, and text messages), daily activities (e.g., physical activity and sleep), and mobility patterns (e.g., frequency and duration of time spent at various locations). In this article, we have drawn on the lessons from the first wave of smartphone-sensing research to highlight areas of opportunity for psychological research, present practical considerations for designing smartphone studies, and discuss the ongoing methodological and ethical challenges associated with research in this domain. It is our hope that these practical guidelines will facilitate the use of smartphones as a behavioral observation tool in psychological science.},
pmcid = {PMC5572675},
pmid = {27899727},
file = {/Users/timokoch/Zotero/storage/RVQ8I8KL/Harari et al. - 2016 - Using Smartphones to Collect Behavioral Data in Ps.pdf}
}
@article{heavenWhyFacesDon2020,
title = {Why Faces Don't Always Tell the Truth about Feelings},
author = {Heaven, Douglas},
year = {2020},
month = feb,
journal = {Nature},
volume = {578},
number = {7796},
pages = {502--504},
publisher = {Nature Publishing Group},
doi = {10.1038/d41586-020-00507-5},
urldate = {2021-12-14},
abstract = {Although AI companies market software for recognizing emotions in faces, psychologists debate whether expressions can be read so easily.},
copyright = {2021 Nature},
langid = {english},
keywords = {Computer science,Psychology,Society},
file = {/Users/timokoch/Zotero/storage/AW2CR8IT/Heaven - 2020 - Why faces don’t always tell the truth about feelin.pdf}
}
@article{henrichWeirdestPeopleWorld2010,
title = {The Weirdest People in the World?},
author = {Henrich, Joseph and Heine, Steven J. and Norenzayan, Ara},
year = {2010},
month = jun,
journal = {The Behavioral and Brain Sciences},
volume = {33},
number = {2-3},
pages = {61--83; discussion 83--135},
issn = {1469-1825},
doi = {10.1017/S0140525X0999152X},
abstract = {Behavioral scientists routinely publish broad claims about human psychology and behavior in the world's top journals based on samples drawn entirely from Western, Educated, Industrialized, Rich, and Democratic (WEIRD) societies. Researchers - often implicitly - assume that either there is little variation across human populations, or that these "standard subjects" are as representative of the species as any other population. Are these assumptions justified? Here, our review of the comparative database from across the behavioral sciences suggests both that there is substantial variability in experimental results across populations and that WEIRD subjects are particularly unusual compared with the rest of the species - frequent outliers. The domains reviewed include visual perception, fairness, cooperation, spatial reasoning, categorization and inferential induction, moral reasoning, reasoning styles, self-concepts and related motivations, and the heritability of IQ. The findings suggest that members of WEIRD societies, including young children, are among the least representative populations one could find for generalizing about humans. Many of these findings involve domains that are associated with fundamental aspects of psychology, motivation, and behavior - hence, there are no obvious a priori grounds for claiming that a particular behavioral phenomenon is universal based on sampling from a single subpopulation. Overall, these empirical patterns suggests that we need to be less cavalier in addressing questions of human nature on the basis of data drawn from this particularly thin, and rather unusual, slice of humanity. We close by proposing ways to structurally re-organize the behavioral sciences to best tackle these challenges.},
langid = {english},
pmid = {20550733},
keywords = {Behavioral Sciences,Cognition,Cross-Cultural Comparison,Decision Making,Humans,Morals,Population Groups,Visual Perception},
file = {/Users/timokoch/Zotero/storage/C7SQDQVH/Henrich et al. - 2010 - The weirdest people in the world.pdf}
}
@article{hildebrandVoiceAnalyticsBusiness2020,
title = {Voice Analytics in Business Research: {{Conceptual}} Foundations, Acoustic Feature Extraction, and Applications},
shorttitle = {Voice Analytics in Business Research},
author = {Hildebrand, Christian and Efthymiou, Fotis and Busquet, Francesc and Hampton, William H. and Hoffman, Donna L. and Novak, Thomas P.},
year = {2020},
month = dec,
journal = {Journal of Business Research},
volume = {121},
pages = {364--374},
issn = {0148-2963},
doi = {10.1016/j.jbusres.2020.09.020},
urldate = {2021-11-16},
abstract = {Recent advances in artificial intelligence and natural language processing are gradually transforming how humans search, shop, and express their preferences. Leveraging the new affordances and modalities of human--machine interaction through voice-controlled interfaces will require a nuanced understanding of the physics and psychology of speech formation as well as the systematic extraction and analysis of vocal features from the human voice. In this paper, we first develop a conceptual framework linking vocal features in the human voice to experiential outcomes and emotional states. We then illustrate the effective processing, editing, analysis, and visualization of voice data based on an Amazon Alexa user interaction, utilizing state-of-the-art signal-processing packages in R. Finally, we offer novel insight into the ways in which business research might employ voice and sound analytics moving forward, including a discussion of the ethical implications of building multi-modal databases for business and society.},
langid = {english},
keywords = {Acoustic markers of emotion,Emotion detection,Natural language processing,Voice Analytics,Voice-controlled interfaces},
file = {/Users/timokoch/Zotero/storage/PRRC3NCW/Hildebrand et al. - 2020 - Voice analytics in business research Conceptual f.pdf}
}
@article{hoemannContextawareExperienceSampling2020,
title = {Context-Aware Experience Sampling Reveals the Scale of Variation in Affective Experience},
author = {Hoemann, Katie and Khan, Zulqarnain and Feldman, Mallory J. and Nielson, Catie and Devlin, Madeleine and Dy, Jennifer and Barrett, Lisa Feldman and Wormwood, Jolie B. and Quigley, Karen S.},
year = {2020},
month = jul,
journal = {Scientific Reports},
volume = {10},
pages = {12459},
issn = {2045-2322},
doi = {10.1038/s41598-020-69180-y},
urldate = {2022-01-25},
abstract = {Emotion research typically searches for consistency and specificity in physiological activity across instances of an emotion category, such as anger or fear, yet studies to date have observed more variation than expected. In the present study, we adopt an alternative approach, searching inductively for structure within variation, both within and across participants. Following a novel, physiologically-triggered experience sampling procedure, participants' self-reports and peripheral physiological activity were recorded when substantial changes in cardiac activity occurred in the absence of movement. Unsupervised clustering analyses revealed variability in the number and nature of patterns of physiological activity that recurred within individuals, as well as in the affect ratings and emotion labels associated with each pattern. There were also broad patterns that recurred across individuals. These findings support a constructionist account of emotion which, drawing on Darwin, proposes that emotion categories are populations of variable instances tied to situation-specific needs.},
pmcid = {PMC7385108},
pmid = {32719368},
file = {/Users/timokoch/Zotero/storage/DVXXQMAG/Hoemann et al. - 2020 - Context-aware experience sampling reveals the scal.pdf}
}
@misc{holtAmazonAlexaGet,
title = {Amazon's {{Alexa Is About To Get More Emotional}}},
author = {Holt, Kris},
year = {2019},
month = nov,
journal = {Forbes},
urldate = {2021-06-30},
abstract = {Developers can tap into Alexa's new excited and disappointed voice tones.},
chapter = {Consumer Tech},
howpublished = {https://www.forbes.com/sites/krisholt/2019/11/27/amazons-alexa-is-about-to-get-more-emotional/},
langid = {english},
file = {/Users/timokoch/Zotero/storage/F6P9Q3P4/amazons-alexa-is-about-to-get-more-emotional.html}
}
@misc{hsuHuBERTSelfSupervisedSpeech2021,
title = {{{HuBERT}}: {{Self-Supervised Speech Representation Learning}} by {{Masked Prediction}} of {{Hidden Units}}},
shorttitle = {{{HuBERT}}},
author = {Hsu, Wei-Ning and Bolte, Benjamin and Tsai, Yao-Hung Hubert and Lakhotia, Kushal and Salakhutdinov, Ruslan and Mohamed, Abdelrahman},
year = {2021},
month = jun,
number = {arXiv:2106.07447},
eprint = {2106.07447},
publisher = {arXiv},
doi = {10.48550/arXiv.2106.07447},
urldate = {2024-11-18},
abstract = {Self-supervised approaches for speech representation learning are challenged by three unique problems: (1) there are multiple sound units in each input utterance, (2) there is no lexicon of input sound units during the pre-training phase, and (3) sound units have variable lengths with no explicit segmentation. To deal with these three problems, we propose the Hidden-Unit BERT (HuBERT) approach for self-supervised speech representation learning, which utilizes an offline clustering step to provide aligned target labels for a BERT-like prediction loss. A key ingredient of our approach is applying the prediction loss over the masked regions only, which forces the model to learn a combined acoustic and language model over the continuous inputs. HuBERT relies primarily on the consistency of the unsupervised clustering step rather than the intrinsic quality of the assigned cluster labels. Starting with a simple k-means teacher of 100 clusters, and using two iterations of clustering, the HuBERT model either matches or improves upon the state-of-the-art wav2vec 2.0 performance on the Librispeech (960h) and Libri-light (60,000h) benchmarks with 10min, 1h, 10h, 100h, and 960h fine-tuning subsets. Using a 1B parameter model, HuBERT shows up to 19\% and 13\% relative WER reduction on the more challenging dev-other and test-other evaluation subsets.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,Electrical Engineering and Systems Science - Audio and Speech Processing},
file = {/Users/timokoch/Zotero/storage/4QL4FQF4/Hsu et al. - 2021 - HuBERT Self-Supervised Speech Representation Lear.pdf;/Users/timokoch/Zotero/storage/ANEUEL22/2106.html}
}
@article{huangPredictionEmotionChange2018,
title = {Prediction of {{Emotion Change From Speech}}},
author = {Huang, Zhaocheng and Epps, Julien},
year = {2018},
journal = {Frontiers in ICT},
volume = {5},
issn = {2297-198X},
urldate = {2023-02-13},
abstract = {The fact that emotions are dynamic in nature and evolve across time has been explored relatively less often in automatic emotion recognition systems to date. Although within-utterance information about emotion changes recently has received some attention, there remain open questions unresolved, such as how to approach delta emotion ground truth, how to predict the extent of emotion change from speech, and how well change can be predicted relative to absolute emotion ratings. In this article, we investigate speech-based automatic systems for continuous prediction of the extent of emotion changes in arousal/valence. We propose the use of regression (smoothed) deltas as ground truth for emotion change, which yielded considerably higher inter-rater reliability than first-order deltas, a commonly used approach in previous research, and represent a more appropriate approach to derive annotations for emotion change research, findings which are applicable beyond speech-based systems. In addition, the first system design for continuous emotion change prediction from speech is explored. Experimental results under the Output-Associative Relevance Vector Machine framework interestingly show that changes in emotion ratings may be better predicted than absolute emotion ratings on the RECOLA database, achieving 0.74 vs. 0.71 for arousal and 0.41 vs. 0.37 for valence in concordance correlation coefficients. However, further work is needed to achieve effective emotion change prediction performances on the SEMAINE database, due to the large number of non-change frames in the absolute emotion ratings.},
file = {/Users/timokoch/Zotero/storage/B2J4KMUW/Huang und Epps - 2018 - Prediction of Emotion Change From Speech.pdf}