diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
deleted file mode 100644
index c78502f4..00000000
--- a/.github/FUNDING.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-# These are supported funding model platforms
-
-github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
-patreon: # Replace with a single Patreon username
-open_collective: # Replace with a single Open Collective username
-ko_fi: alshedivat
-tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
-community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
-liberapay: # Replace with a single Liberapay username
-issuehunt: # Replace with a single IssueHunt username
-otechie: # Replace with a single Otechie username
-custom: # ['https://www.buymeacoffee.com/TkFxuKo']
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
deleted file mode 100644
index 1cc44f62..00000000
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ /dev/null
@@ -1,38 +0,0 @@
----
-name: Bug report
-about: Create a report to help us improve
-title: ''
-labels: bug
-assignees: ''
-
----
-
-**Acknowledge the following**
-- [ ] I carefully read and followed the [Getting Started](https://github.com/alshedivat/al-folio#getting-started) guide.
-- [ ] I read through [FAQ](https://github.com/alshedivat/al-folio#faq) and searched through the [past issues](https://github.com/alshedivat/al-folio/issues), none of which addressed my issue.
-- [ ] The issue I am raising is a potential bug in al-folio and not just a usage question. [For usage questions, please use [gitter chat](https://gitter.im/alshedivat/al-folio) instead of raising an issue.]
-
-**Describe the bug**
-A clear and concise description of what the bug is.
-
-**To Reproduce**
-Steps to reproduce the behavior:
-1. Go to '...'
-2. Click on '....'
-3. Scroll down to '....'
-4. See error
-
-**Expected behavior**
-A clear and concise description of what you expected to happen.
-
-**Screenshots**
-If applicable, add screenshots to help explain your problem.
-
-**System (please complete the following information):**
- - OS: [e.g. iOS]
 - Browser (and its version) [e.g. chrome, safari]
 - Jekyll version [e.g. 3.8.7]
-- Ruby version [e.g. 2.6.5]
-
-**Additional context**
-Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
deleted file mode 100644
index 11fc491e..00000000
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-name: Feature request
-about: Suggest an idea for this project
-title: ''
-labels: enhancement
-assignees: ''
-
----
-
-**Is your feature request related to a problem? Please describe.**
-A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
-
-**Describe the solution you'd like**
-A clear and concise description of what you want to happen.
-
-**Describe alternatives you've considered**
-A clear and concise description of any alternative solutions or features you've considered.
-
-**Additional context**
-Add any other context or screenshots about the feature request here.
diff --git a/.github/stale.yml b/.github/stale.yml
deleted file mode 100644
index 8ec2004d..00000000
--- a/.github/stale.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-# Number of days of inactivity before an issue becomes stale
-daysUntilStale: 60
-# Number of days of inactivity before a stale issue is closed
-daysUntilClose: 7
-# Issues with these labels will never be considered stale
-exemptLabels:
-  - pinned
-  - security
-  - enhancement
-# Label to use when marking an issue as stale
-staleLabel: wontfix
-# Comment to post when marking an issue as stale. Set to `false` to disable
-markComment: >
-  This issue has been automatically marked as stale because it has not had
-  recent activity. It will be closed if no further activity occurs. Thank you
-  for your contributions.
-# Comment to post when closing a stale issue. Set to `false` to disable
-closeComment: false
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
deleted file mode 100644
index 63729038..00000000
--- a/.github/workflows/deploy.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-name: Deploy
-
-on:
-  push:
-    branches:
-      - master
-      - source
-  pull_request:
-    branches:
-      - master
-      - source
-
-permissions:
-  contents: write
-
-jobs:
-  deploy:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-      - name: Setup Ruby
-        uses: ruby/setup-ruby@v1
-        with:
-          ruby-version: '3.2.2'
-          bundler-cache: true
-      - name: Enable bundler cache
-        uses: actions/cache@v2
-        with:
-          path: vendor/bundle
-          key: ${{ runner.os }}-gems-${{ hashFiles('**/Gemfile.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-gems-
-      - name: Install deps
-        run: |
-          gem install bundler
-          bundle config path vendor/bundle
-          bundle install --jobs 4 --retry 3
-      - name: Setup deploy options
-        id: setup
-        run: |
-          git config --global user.name "GitHub Action"
-          git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          if [[ ${GITHUB_REF} = refs/pull/*/merge ]]; then # pull request
-            echo "::set-output name=SRC_BRANCH::${GITHUB_HEAD_REF}"
-            echo "::set-output name=NO_PUSH::--no-push"
-          elif [[ ${GITHUB_REF} = refs/heads/* ]]; then # branch, e.g. master, source etc
-            echo "::set-output name=SRC_BRANCH::${GITHUB_REF#refs/heads/}"
-          fi
-          if [[ ${{ github.repository }} = *.github.io ]]; then # user/org repo
-            echo "::set-output name=DEPLOY_BRANCH::master"
-          else
-            echo "::set-output name=DEPLOY_BRANCH::gh-pages"
-          fi
-      - name: Deploy website
-        run: yes | bin/deploy --verbose ${{ steps.setup.outputs.NO_PUSH }}
-                   --src ${{ steps.setup.outputs.SRC_BRANCH }}
-                   --deploy ${{ steps.setup.outputs.DEPLOY_BRANCH }}
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 328023ab..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-language: ruby
-rvm:
-  - 2.4.1
-
-# Assume bundler is being used, therefore
-# the `install` step will run `bundle install` by default.
-script: ./bin/cibuild
-
-env:
-  global:
-    - NOKOGIRI_USE_SYSTEM_LIBRARIES=true # speeds up installation of html-proofer
-
-sudo: false # route your build to the container-based infrastructure for a faster build
-
-cache: bundler # caching bundler gem packages will speed up build
-
-# Optional: disable email notifications about the outcome of your builds
-notifications:
-  email: false
diff --git a/.tweet-cache/02b591d77a446cb7531ab71b75d3d2bc.cache b/.tweet-cache/02b591d77a446cb7531ab71b75d3d2bc.cache
deleted file mode 100644
index feab9070..00000000
--- a/.tweet-cache/02b591d77a446cb7531ab71b75d3d2bc.cache
+++ /dev/null
@@ -1 +0,0 @@
-{"url":"https://twitter.com/rubygems/status/518821243320287232","author_name":"RubyGems","author_url":"https://twitter.com/rubygems","html":"
jekyll-twitter-plugin (1.0.0): A Liquid tag plugin for Jekyll that renders Tweets from Twitter API http://t.co/m4EIQPM9h4
— RubyGems (@rubygems) October 5, 2014
\n\n","width":550,"height":null,"type":"rich","cache_age":"3153600000","provider_name":"Twitter","provider_url":"https://twitter.com","version":"1.0"} \ No newline at end of file diff --git a/.tweet-cache/f18f38b6b6bb712c5873a899905f747c.cache b/.tweet-cache/f18f38b6b6bb712c5873a899905f747c.cache deleted file mode 100644 index 6f431ee4..00000000 --- a/.tweet-cache/f18f38b6b6bb712c5873a899905f747c.cache +++ /dev/null @@ -1 +0,0 @@ -{"url":"https://twitter.com/jekyllrb","title":"","html":"Tweets by jekyllrb\n\n","width":500,"height":null,"type":"rich","cache_age":"3153600000","provider_name":"Twitter","provider_url":"https://twitter.com","version":"1.0"} \ No newline at end of file diff --git a/Gemfile b/Gemfile deleted file mode 100644 index 03511a98..00000000 --- a/Gemfile +++ /dev/null @@ -1,12 +0,0 @@ -source 'https://rubygems.org' -group :jekyll_plugins do - gem 'jekyll' - gem 'jekyll-email-protect' - gem 'jekyll-github-metadata' - gem 'jekyll-paginate-v2' - gem 'jekyll-scholar' - gem 'jekyll-twitter-plugin' - gem 'jemoji' - gem 'unicode_utils' - gem 'webrick' -end diff --git a/_bibliography/papers.bib b/_bibliography/papers.bib deleted file mode 100644 index 22b220b6..00000000 --- a/_bibliography/papers.bib +++ /dev/null @@ -1,3039 +0,0 @@ ---- -@string{interspeech = {Proceedings of Interspeech}} -@string{ICASSP = {Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}} -@string{SLT = {Proceedings of IEEE Spoken Language Technology Workshop (SLT)}} -@string{ACL = {Proceedings of the Annual Meeting of the Association for Computational Linguistics}} -@string{ACLFindings = {Proceedings of Findings of the Annual Meeting of the Association for Computational Linguistics}} -@string{EACL = {Proceedings of the Conference of the European Chapter of the Association for Computational Linguistics}} -@string{NAACL = {Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}} -@string{TASLP = {IEEE/ACM Transactions on Audio, Speech, and Language Processing}} -@string{IWSLT = {Proceedings of the 18th International Conference on Spoken Language Translation (IWSLT)}} -@string{VCC = {Voice Conversion Challenge}} -@string{ASRU = {IEEE Automatic Speech Recogiton and Understanding Workshop (ASRU)}} -@string{WASPAA = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)}} -@string{APSIPA = {Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)}} -@string{ICML = {Proceedings of the International Conference on Machine Learning (ICML)}} -@string{ICLR = {Proceedings of the International Conference on Learning Representations (ICLR)}} -@string{NeurIPS = {Proceedings of the Conference on Neural Information Processing Systems}} - - - -@inproceedings{wu_icassp2024, - abbr={SE}, - abbr_publisher={ICASSP}, -author={Shilong Wu and Chenxi Wang and Hang Chen and Yusheng Dai and Chenyue Zhang and Ruoyu Wang and Hongbo Lan and Jun Du and Chin-hui Lee and Jingdong Chen and Shinji Watanabe and Sabato Marco Siniscalchi and Odette Scharenborg and Zhong-Qiu Wang and Jia Pan and Jianqing Gao}, -title={The Multimodal Information Based Speech Processing (MISP) 2023 Challenge: Audio-Visual Target Speaker Extraction}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{yang_icassp2024, - abbr={Audio}, - abbr_publisher={ICASSP}, -author={Muqiao Yang and Umberto Cappellazzo and Xiang Li and Shinji Watanabe and Bhiksha Raj}, 
-title={Improving Continual Learning of Acoustic Scene Classification via Mutual Information Optimization}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{tang_icassp2024, - abbr={ASR}, - abbr_publisher={ICASSP}, -author={Jiyang Tang and Kwangyoun Kim and Suwon Shon and Felix Wu and Prashant Sridhar and Shinji Watanabe}, -title={Improving ASR Contextual Biasing with Guided Attention}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{jung_icassp2024, - abbr={SLU}, - abbr_publisher={ICASSP}, -author={Jee-weon Jung and Roshan Sharma and William Chen and Bhiksha Raj and Shinji Watanabe}, -title={AugSumm: Towards Generalizable Speech Summarization Using Synthetic Labels from Large Language Models}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{maiti_icassp2024, - abbr={ASR&TTS}, - abbr_publisher={ICASSP}, -author={Soumi Maiti and Yifan Peng and Shukjae Choi and Jee-weon Jung and Xuankai Chang and Shinji Watanabe}, -title={Voxtlm: Unified Decoder-Only Models for Consolidating Speech Recognition, Synthesis and Speech, Text Continuation Tasks}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{hussein1_icassp2024, - abbr={ASR}, - abbr_publisher={ICASSP}, -author={Amir Hussein and Dorsa Zeinali and Ondřej Klejch and Matthew Wiesner and Brian Yan and Shammur Chowdhury and Ahmed Ali and Shinji Watanabe and Sanjeev Khudanpur}, -title={Speech Collage: Code-Switched Audio Generation by Collaging Monolingual Corpora}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{hussein2_icassp2024, - abbr={ST}, - abbr_publisher={ICASSP}, -author={Amir Hussein and Brian Yan and Antonios Anastasopoulos and Shinji Watanabe and Sanjeev Khudanpur}, -title={Enhancing End-to-End Conversational Speech Translation Through Target Language Context Utilization}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{medina_icassp2024, - abbr={ASR}, - abbr_publisher={ICASSP}, -author={Salvador Medina and Sarah Taylor and Carsten Stoll and Gareth Edwards and Alex Hauptmann and Shinji Watanabe and Iain Matthews}, -title={Phisanet: Phonetically Informed Speech Animation Network}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{cornell_icassp2024, - abbr={SD&ASR}, - abbr_publisher={ICASSP}, -author={Samuele Cornell and Jee-weon Jung and Shinji Watanabe and Stefano Squartini}, -title={One Model to Rule Them All? 
Towards End-to-End Joint Speaker Diarization and Speech Recognition}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{huang_icassp2024, - abbr={ASR}, - abbr_publisher={ICASSP}, -author={Ruizhe Huang and Xiaohui Zhang and Zhaoheng Ni and Li Sun and Moto Hira and Jeff Hwang and Vimal Manohar and Vineel Pratap and Shinji Watanabe and Daniel Povey and Sanjeev Khudanpur}, -title={Less Peaky and More Accurate CTC Forced Alignment by Pruned CTC Loss and Label Priors}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{maekaku_icassp2024, - abbr={SSL}, - abbr_publisher={ICASSP}, -author={Takashi Maekaku and Jiatong Shi and Xuankai Chang and Yuya Fujita and Shinji Watanabe}, -title={HuberTopic: Enhancing Semantic Representation of Hubert Through Self-Supervision Utilizing Topic Model}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{chang_icassp2024, - abbr={ASR&ST&SLU}, - abbr_publisher={ICASSP}, -author={Xuankai Chang and Brian Yan and Kwanghee Choi and Jee-weon Jung and Yichen Lu and Soumi Maiti and Roshan Sharma and Jiatong Shi and Jinchuan Tian and Shinji Watanabe and Yuya Fujita and Takashi Maekaku and Pengcheng Guo and Yao-Fei Cheng and Pavel Denisov and Kohei Saijo and Hsiu-Hsuan Wang}, -title={Exploring Speech Recognition, Translation, and Understanding with Discrete Speech Units: A Comparative Study}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{huang_chienyu_icassp2024, - abbr={LLM&SLU}, - abbr_publisher={ICASSP}, -author={Chien-yu Huang and Ke-Han Lu and Shih-Heng Wang and Chun-Yi Kuan and Chi-Yuan Hsiao and Haibin Wu and Siddhant Arora and Kai-Wei Chang and Jiatong Shi and Yifan Peng and Roshan Sharma and Shinji Watanabe and Bhiksha Ramakrishnan and Shady Shehata and Hung-yi Lee}, -title={Dynamic-Superb: Towards a Dynamic, Collaborative, and Comprehensive Instruction-Tuning Benchmark for Speech}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{yan_brian_icassp2024, - abbr={ST}, - abbr_publisher={ICASSP}, -author={Brian Yan and Xuankai Chang and Antonios Anastasopoulos and Yuya Fujita and Shinji Watanabe}, -title={Cross-Modal Multi-Tasking for Speech-to-Text Translation via Hard Parameter Sharing}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{arora_siddhant_icassp2024, - abbr={ASR}, - abbr_publisher={ICASSP}, -author={Siddhant Arora and George Saon and Shinji Watanabe and Brian Kingsbury}, -title={Semi-Autoregressive Streaming ASR with Label Context}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{shon_icassp2024, - abbr={SSL}, - abbr_publisher={ICASSP}, -author={Suwon Shon and Kwangyoun Kim and Prashant Sridhar and Yi-Te Hsu and Shinji Watanabe and Karen Livescu}, -title={Generative Context-Aware Fine-Tuning of Self-Supervised Speech Models}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{sudo_icassp2024, - abbr={ASR}, - abbr_publisher={ICASSP}, -author={Yui Sudo and Shakeel Muhammad and Yosuke Fukumoto and Yifan Peng and Shinji Watanabe}, -title={Contextualized Automatic Speech Recognition with Attention-Based Bias Phrase Boosted Beam Search}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{chen_william_icassp2024, - abbr={SSL}, - abbr_publisher={ICASSP}, -author={William Chen and Takatomo Kano and Atsunori Ogawa and Marc Delcroix and Shinji Watanabe}, -title={Train Long and Test Long: Leveraging Full Document Contexts in Speech Processing}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{zhang_icassp2024, - abbr={SE}, - abbr_publisher={ICASSP}, -author={Wangyou Zhang and Jee-weon Jung and Shinji Watanabe and Yanmin Qian}, 
-title={Improving Design of Input Condition Invariant Speech Enhancement}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{futami_icassp2024, - abbr={ASR}, - abbr_publisher={ICASSP}, -author={Hayato Futami and Emiru Tsunoo and Yosuke Kashiwagi and Hiroaki Ogawa and Siddhant Arora and Shinji Watanabe}, -title={Phoneme-Aware Encoding for Prefix-Tree-Based Contextual ASR}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{lee_younglo_icassp2024, - abbr={SS}, - abbr_publisher={ICASSP}, -author={Younglo Lee and Shukjae Choi and Byeong-Yeol Kim and Zhong-Qiu Wang and Shinji Watanabe}, -title={Boosting Unknown-Number Speaker Separation with Transformer Decoder-Based Attractor}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{yeo_icassp2024, - abbr={ASR}, - abbr_publisher={ICASSP}, -author={Jeong Hun Yeo and Minsu Kim and Shinji Watanabe and Yong Man Ro}, -title={Visual Speech Recognition for Low-Resource Languages with Automatic Labels from Whisper Model}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{kim_minsu_icassp2024, - abbr={Caption}, - abbr_publisher={ICASSP}, -author={Minsu Kim and Jeongsoo Choi and Soumi Maiti and Jeong Hun Yeo and Shinji Watanabe and Yong Man Ro}, -title={Towards Practical and Efficient Image-to-Speech Captioning with Vision-Language Pre-Training and Multi-Modal Tokens}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{choi_kwanghee_icassp2024, - abbr={SSL}, - abbr_publisher={ICASSP}, -author={Kwanghee Choi and Jee-weon Jung and Shinji Watanabe}, -title={Understanding Probe Behaviors Through Variational Bounds of Mutual Information}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{wu_shihlun_icassp2024, - abbr={Caption}, - abbr_publisher={ICASSP}, -author={Shih-Lun Wu and Xuankai Chang and Gordon Wichern and Jee-weon Jung and François Germain and Jonathan Le Roux and Shinji Watanabe}, -title={Improving Audio Captioning Models with Fine-Grained Audio Features, Text Embedding Supervision, and LLM Mix-Up Augmentation}, -booktitle=ICASSP, -year={2024} -} - -@inproceedings{tseng_icassp2024, - abbr={SSL}, - abbr_publisher={ICASSP}, -author={Yuan Tseng and Layne Berry and Yi-Ting Chen and I-Hsiang Chiu and Hsuan-Hao Lin and Max Liu and Puyuan Peng and Yi-Jen Shih and Hung-Yu Wang and Haibin Wu and Po-Yao Huang and Chun-Mao Lai and Shang-Wen Li and David Harwath and Yu Tsao and Shinji Watanabe and Abdelrahman Mohamed and Chi Luen Feng and Hung-yi Lee}, -title={AV-Superb: A Multi-Task Evaluation Benchmark for Audio-Visual Representation Models}, -booktitle=ICASSP, -year={2024} -} - - -@inproceedings{chou2023evaluating, - abbr={ASR}, - abbr_publisher={ASRU}, - title={Evaluating Self-supervised Speech Models on a Taiwanese Hokkien Corpus}, - author={Yi-Hui Chou and Kalvin Chang and Meng-Ju Wu and Winston Ou and Alice Wen-Hsin Bi and Carol Yang and Bryan Y. 
Chen and Rong-Wei Pai and Po-Yen Yeh and Jo-Peng Chiang and Lu-Tshiann Phoann and Winnie Chang and Chenxuan Cui and Noel Chen and Jiatong Shi}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, - -} - -@inproceedings{huang2023singing, - abbr={SVC}, - abbr_publisher={ASRU}, - title={The Singing Voice Conversion Challenge 2023}, - author={Wen-Chin Huang and Lester Phillip Violeta and Songxiang Liu and Jiatong Shi and Tomoki Toda}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{shiohara2023domain, - abbr={ASR}, - abbr_publisher={ASRU}, - title={Domain Adaptation by Data Distribution Matching via Submodularity for Speech Recognition}, - author={Yusuke Shinohara and Shinji Watanabe}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{kano2023summarize, - abbr={Summarization&ST}, - abbr_publisher={ASRU}, - title={Summarize while Translating: Universal Model with Parallel Decoding for Summarization and Translation}, - author={Takatomo Kano and Atsunori Ogawa and Marc Delcroix and Kohei Matsuura and Takanori Ashihara and William Chen and Shinji Watanabe}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{li2023yodas, - abbr={ASR}, - abbr_publisher={ASRU}, - title={YODAS: Youtube-Oriented Dataset for Audio and Speech}, - author={Xinjian Li and Shinnosuke Takamichi and Takaaki Saeki and William Chen and Sayaka Shiota and Shinji Watanabe}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{kohei2023single, - abbr={SE&SS}, - abbr_publisher={ASRU}, - title={A Single Speech Enhancement Model Unifying Dereverberation, Denoising, Speaker Counting, Separation, and Extraction}, - author={Kohei Saijo and Wangyou Zhang and Zhong-Qiu Wang and Shinji Watanabe and Tetsunori Kobayashi and Tetsuji Ogawa}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{wang2023torchaudio, - abbr={ASR&SSL}, - abbr_publisher={ASRU}, - title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch}, - author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{zhang2023toward, - abbr={SE}, - abbr_publisher={ASRU}, - title={Toward Universal Speech Enhancement For Diverse Input Conditions}, - author={Wangyou Zhang and Kohei Saijo and Zhong-Qiu Wang and Shinji Watanabe and Yanmin Qian}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{shi2023findings, - abbr={ASR}, - abbr_publisher={ASRU}, - title={Findings of the 2023 ML-SUPERB Challenge: Pre-Training and Evaluation over More Languages and Beyond}, - author={Jiatong Shi and William Chen and Dan Berrebbi and Hsiu-Hsuan Wang and Wei Ping Huang and En Pei Hu and ho lam Chung and Xuankai Chang and Yuxun Tang and Shang-Wen Li and Abdelrahman Mohamed and Hung-yi Lee and Shinji Watanabe}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{chen2023joint, - abbr={SSL}, - abbr_publisher={ASRU}, - title={Joint Prediction and Denoising for Large-Scale Multilingual Self-Supervised Learning}, - author={William Chen and Jiatong Shi and Brian Yan and Dan Berrebbi and Wangyou Zhang and Yifan Peng and Xuankai Chang and Soumi Maiti 
and Shinji Watanabe}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{someki2023segment, - abbr={ASR}, - abbr_publisher={ASRU}, - title={Segment-Level Vectorized Beam Search Based on Partially Autoregressive Inference}, - author={Masao Someki and Nicholas Eng and Yosuke Higuchi and Shinji Watanabe}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{peng2023reproducing, - abbr={ASR&ST}, - abbr_publisher={ASRU}, - title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data}, - author={Yifan Peng and Jinchuan Tian and Brian Yan and Dan Berrebbi and Xuankai Chang and Xinjian Li and Jiatong Shi and Siddhant Arora and William Chen and Roshan Sharma and Wangyou Zhang and Yui Sudo and Muhammad Shakeel and Jee-weon Jung and Soumi Maiti and Shinji Watanabe}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{roshan2023espnet, - abbr={Summarization}, - abbr_publisher={ASRU}, - title={ESPNet-SUMM: Introducing a novel large dataset, toolkit, and a cross-corpora evaluation of speech summarization systems}, - author={Roshan Sharma and William Chen and Takatomo Kano and Ruchira Sharma and Atsunori Ogawa and Siddhant Arora and Marc Delcroix and Rita Singh and Shinji Watanabe and Bhiksha Raj}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - - -@inproceedings{fujita2023lvctc, - abbr={ASR}, - abbr_publisher={ASRU}, - title={LV-CTC: Non-autoregressive ASR with CTC and latent variable models}, - author={Yuya Fujita and Shinji Watanabe and Xuankai Chang and Takashi Maekaku}, - booktitle=ASRU, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{wang2023unssor, - abbr={SS}, - abbr_publisher={NeurIPS}, - title={UNSSOR: Unsupervised Neural Speech Separation by Leveraging Over-determined Training Mixtures}, - author={Zhong-Qiu Wang and Shinji Watanabe}, - booktitle=NeurIPS, - year={2023}, -} - -@inproceedings{masuyama2023exploring, - abbr={SS}, - abbr_publisher={WASPAA}, - title={Exploring the Integration of Speech Separation and Recognition with Self-Supervised Learning Representation}, - author={Yoshiki Masuyama and Xuankai Chang and Wangyou Zhang and Samuele Cornell and Zhong-Qiu Wang and Nobutaka Ono and Yanmin Qian and Shinji Watanabe}, - booktitle=WASPAA, - year={2023}, -} - - -@article{maciejewski2023adilemma, - abbr={SS}, - abbr_publisher={CSL}, - title={Dilemma of Ground Truth in Noisy Speech Separation and an Approach to Lessen the Impact of Imperfect Training Data}, - author={Matthew Maciejewski and Jing Shi and Shinji Watanabe and Sanjeev Khudanpur}, - journal=TASLP, - year={2023}, - publisher={IEEE}, -} - -@article{horiguchi2023online, - abbr={SD}, - abbr_publisher={TASLP}, - title={Online Neural Diarization of Unlimited Numbers of Speakers Using Global and Local Attractors}, - author={Shota Horiguchi and Shinji Watanabe and Paola Garcia and Yuki Takashima and Yohei Kawaguchi}, - journal=TASLP, - year={2023}, - publisher={IEEE}, -} - -@article{dalmia2023legonn, - abbr={MT&ASR}, - abbr_publisher={TASLP}, - title={LegoNN: Building Modular Encoder-Decoder Models}, - author={Siddharth Dalmia and Dmytro Okhonko and Mike Lewis and Sergey Edunov and Shinji Watanabe and Florian Metze and Luke Zettlemoyer and Abdelrahman Mohamed}, - journal=TASLP, - year={2023}, - publisher={IEEE}, -} - -@inproceedings{yan20203espnet-st-v2, - abbr={ST}, - abbr_publisher={ACL(demo)}, - title={ESPnet-ST-v2: Multipurpose Spoken Language Translation Toolkit}, - author={Brian Yan and Jiatong Shi and Yun Tang 
and Hirofumi Inaguma and Yifan Peng and Siddharth Dalmia and Peter Polak and Patrick Fernandes and Dan Berrebbi and Tomoki Hayashi and Xiaohui Zhang and Zhaoheng Ni and Moto Hira and Soumi Maiti and Juan Pino and Shinji Watanabe}, - booktitle=ACL, - year={2023}, -} - -@inproceedings{inaguma2023unity, - abbr={ST}, - abbr_publisher={ACL}, - title={UnitY: Two-pass Direct Speech-to-speech Translation with Discrete Units}, - author={Hirofumi Inaguma and Sravya Popuri and Ilia Kulikov and Peng-Jen Chen and Changhan Wang and Yu-An Chung and Yun Tang and Ann Lee and Shinji Watanabe and Juan Pino}, - booktitle=ACL, - year={2023}, -} - -@inproceedings{xu2023efficient, - abbr={ASR}, - abbr_publisher={ICML}, - title={Efficient Sequence Transduction by Jointly Predicting Tokens and Durations}, - author={Hainan Xu and Fei Jia and Somshubra Majumdar and He Huang and Shinji Watanabe and Boris Ginsburg}, - booktitle=ICML, - year={2023}, -} - -@inproceedings{saeki2023learning, - abbr={TTS}, - abbr_publisher={IJCAI}, - title={Learning to Speak from Text: Zero-Shot Multilingual Text-to-Speech with Unsupervised Text Pretraining}, - author={Takaaki Saeki and Soumi Maiti and Xinjian Li and Shinji Watanabe and Shinnosuke Takamichi and Hiroshi Saruwatari}, - booktitle={IJCAI}, - year={2023}, -} - - -@inproceedings{Wu_is2023, - abbr={TTS}, - abbr_publisher={Interspeech}, - title={Deep Speech Synthesis from MRI-Based Articulatory Representations}, - author={Peter Wu and Tingle Li and Yijing Lu and Yubin Zhang and Jiachen Lian and Alan Black and Louis Goldstein and Shinji Watanabe and Gopala Krishna Anumanchipalli}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Tang_is2023, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={A New Benchmark of Aphasia Speech Recognition and Detection Based on E-Branchformer and Multi-task Learning}, - author={Jiyang Tang and William Chen and Xuankai Chang and Shinji Watanabe and Brian MacWhinney}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Xuankai_is2023, - abbr={ASR&SSL}, - abbr_publisher={Interspeech}, - title={Exploration of Efficient End-to-End ASR using Discretized Input from Self-Supervised Learning}, - author={Xuankai Chang and Brian Yan and Yuya Fujita and Takashi Maekaku and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Peng_is2023_3, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot Task Generalization}, - author={Puyuan Peng and Brian Yan and Shinji Watanabe and David Harwath}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Arora_is2023, - abbr={ASR&SLU}, - abbr_publisher={Interspeech}, - title={Integrating Pretrained ASR and LM to perform Sequence Generation for Spoken Language Understanding}, - author={Siddhant Arora and Hayato Futami and Yosuke Kashiwagi and Emiru Tsunoo and Brian Yan and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Tsunoo_is2023, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Integration of Frame- and Label-synchronous Beam Search for Streaming Encoder--decoder Speech Recognition}, - author={Emiru Tsunoo and Hayato Futami and Yosuke Kashiwagi and Siddhant Arora and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Tian_is2023, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Bayes Risk Transducer: Transducer with Controllable Alignment Prediction}, - author={Jinchuan Tian and Jianwei Yu and Hangting Chen and Brian Yan and 
Chao Weng andDong Yu and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Jiatong_is2023_2, - abbr={SSL}, - abbr_publisher={Interspeech}, - title={Exploration on HuBERT with Multiple Resolution}, - author={Jiatong Shi and Yun Tang and HIrofumi Inaguma and Hongyu Gong and Juan Pino and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Sudo_is2023_2, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Time-synchronous one-pass Beam Search for Parallel Online and Offline Transducers with Dynamic Block Training}, - author={Yui Sudo and Muhammad Shakeel and Yifan Peng and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Jiatong_is2023, - abbr={ASR&SSL}, - abbr_publisher={Interspeech}, - title={ML-SUPERB: Multilingual Speech Universal PERformance Benchmark}, - author={Jiatong Shi and Dan Berrebbi and William Chen and En Pei Hu and Wei-Ping Huang and ho lam Chung and Xuankai Chang and Shang-Wen Li and, Abdelrahman Mohamed and Hung-yi Lee and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Kashiwagi_is2023, - abbr={SLU}, - abbr_publisher={Interspeech}, - title={Tensor Decomposition for Minimization of E2E SLU Model Toward On-Device Processing}, - author={Yosuke Kashiwagi and Siddhant Arora and Hayato Futami and Jessica Huynh and Shih-Lun Wu and Yifan Peng and Brian Yan and Emiru Tsunoo and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Sudo_is2023, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={4D: Joint modeling of CTC, Attention, Transducer, and Mask-Predict decoders}, - author={Yui Sudo and Muhammad Shakeel and Brian Yan and Jiatong Shi and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Peng_is2023, - abbr={SSL}, - abbr_publisher={Interspeech}, - title={DPHuBERT: Joint Distillation and Pruning of Self-Supervised Speech Models}, - author={Yifan Peng and Yui Sudo and Muhammad Shakeel and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Pend_is2023_2, - abbr={ASR&ST}, - abbr_publisher={Interspeech}, - title={lA Comparative Study on E-Branchformer vs Conformer in Speech Recognition, Translation, and Understanding Tasks}, - author={Yifan Peng andKwangyoun Kim and Felix Wu and Brian Yan and Siddhant Arora and William Chen and Jiyang Tang and Suwon Shon and Prashant Sridhar and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Chen_is2023, - abbr={SSL}, - abbr_publisher={Interspeech}, - title={Reducing Barriers to Self-Supervised Learning: HuBERT Pre-training with Academic Compute}, - author={William Chen and Xuankai Chang and Yifan Peng and Zhaoheng Ni and Soumi Maiti and Shinji Watanabe}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{Sharma_is2023, - abbr={Summarization}, - abbr_publisher={Interspeech}, - title={BASS: Block-wise Adaptation for Speech Summarization}, - author={Roshan Sharma and Siddhant Arora and Kenneth Zheng and Shinji Watanabe and Rita Singh and Bhiksha Raj}, - booktitle=interspeech, - year={2023}, -} - -@inproceedings{yan_eacl2023, - abbr={ST}, - abbr_publisher={EACL}, - title={CTC Alignments Improve Autoregressive Translation}, - author={Brian Yan and Siddharth Dalmia and Yosuke Higuchi and Graham Neubig and Florian Metze and Alan W Black and Shinji Watanabe}, - booktitle=EACL, - year={2023}, -} - -@inproceedings{dan_iclr2023, - abbr={ASR}, - abbr_publisher={ICLR}, - title={Continuous Pseudo-Labeling from the 
Start}, - author={Dan Berrebbi and Ronan Collobert and Samy Bengio and Navdeep Jaitly and Tatiana Likhomanenko}, - booktitle=ICLR, - year={2023}, -} - - -@inproceedings{hainan_icassp2023, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Multi-blank Transducers for Speech Recognition}, - author={Hainan Xu and Fei Jia and Somshubra Majumdar and Shinji Watanabe and and Boris Ginsburg}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{muqiao_icassp2023, - abbr={SE}, - abbr_publisher={ICASSP}, - title={PAAPLoss: A Phonetic-Aligned Acoustic Parameter Loss for Speech Enhancement}, - author={Muqiao Yang and Joseph Konan and David Bick and Yunyang Zeng and Shuo Han and Anurag Kumar and Shinji Watanabe and and Bhiksha Raj}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{jee_icassp2023, - abbr={SD}, - abbr_publisher={ICASSP}, - title={In search of strong embedding extractors for speaker diarisation}, - author={Jee-weon Jung and Hee-Soo Heo and Bong-Jin Lee and Jaesung Huh and Andrew Brown and Youngki Kwon and Shinji Watanabe and and Joon Son Chung}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{felix_icassp2023, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Wav2Seq: Pre-training Speech-to-Text Encoder-Decoder Models Using Pseudo Languages}, - author={Felix Wu and Kwangyoun Kim and Shinji Watanabe and Kyu J. Han and Ryan McDonald and Kilian Q. Weinberger and and Yoav Artzi}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{yosuke_icassp2023, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={BECTRA: Transducer-based End-to-End ASR with BERT-Enhanced Encoder}, - author={Yosuke Higuchi and Tetsuji Ogawa and Tetsunori Kobayashi and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{yousuke_icassp2023, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={InterMPL: Momentum Pseudo-Labeling with Intermediate CTC Loss}, - author={Yosuke Higuchi and Tetsuji Ogawa and Tetsunori Kobayashi and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{liwei_icassp2023, - abbr={TTS&SSL}, - abbr_publisher={ICASSP}, - title={A Unified One-Shot Prosody and Speaker Conversion System with Self-Supervised Discrete Speech Units}, - author={Li-Wei Chen and Shinji Watanabe and and Alexander Rudnicky}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{yunyang_icassp2023, - abbr={SE}, - abbr_publisher={ICASSP}, - title={TAPLoss: A Temporal Acoustic Parameter Loss for Speech Enhancement}, - author={Yunyang Zeng and Joseph Konan and Shuo Han and David Bick and Muqiao Yang and Anurag Kumar and Shinji Watanabe and and Bhiksha Raj}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{jiatong2_icassp2023, - abbr={SSL&SLU}, - abbr_publisher={ICASSP}, - title={Bridging Speech and Text Pre-trained Models with Unsupervised ASR}, - author={Jiatong Shi and Chan-Jan Hsu and Holam Chung and Dongji Gao and Paola Garcia and Shinji Watanabe and Ann Lee and and Hung-yi Lee}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{jiatong3_icassp2023, - abbr={Music}, - abbr_publisher={ICASSP}, - title={PHONEix: Acoustic Feature Processing Strategy for Enhanced Singing Pronunciation with Phoneme Distribution Predictor}, - author={Yuning Wu and Jiatong Shi and Tao Qian and and Qin Jin}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{takatomo_icassp2023, - abbr={SLU}, - abbr_publisher={ICASSP}, - title={Speech summarization of long spoken document: Improving memory efficiency of speech/text encoders}, - author={Takatomo Kano and Atsunori Ogawa and Marc 
Delcroix and Roshan Sharma and Kohei Matsuura and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{suwon_icassp2023, - abbr={SSL}, - abbr_publisher={ICASSP}, - title={Context-Aware Fine-Tuning of Self-Supervised Speech Models}, - author={Suwon Shon and Felix Wu and Kwangyoun Kim and Prashant Sridhar and Karen Livescu and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{jiatong1_icassp2023, - abbr={S2ST}, - abbr_publisher={ICASSP}, - title={Enhancing Speech-To-Speech Translation with Multiple TTS Targets}, - author={Jiatong Shi and Yun Tang and Ann Lee and Hirofumi Inaguma and Changhan Wang and Juan Pino and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{_icassp2023, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Streaming Joint Speech Recognition and Disfluency Detection}, - author={Hayato Futami and Emiru Tsunoo and Kentaro Shibata and Yosuke Kashiwagi and Takao Okuda and Siddhant Arora and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{brian_icassp2023, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Towards Zero-Shot Code-Switched Speech Recognition}, - author={Brian Yan and Matthew Wiesner and Ondrej Klejch and Preethi Jyothi and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{motoi_icassp2023, - abbr={ST}, - abbr_publisher={ICASSP}, - title={Align and Write and Re-order: Explainable End-to-End Speech Translation via Operation Sequence Generation}, - author={Motoi Omachi and Brian Yan and Siddharth Dalmia and Yuya Fujita and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{william_icassp2023, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Improving Massively Multilingual ASR With Auxiliary CTC Objectives}, - author={William Chen and Brian Yan and Jiatong Shi and Yifan Peng and Soumi Maiti and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{soumi_icassp2023, - abbr={SSL}, - abbr_publisher={ICASSP}, - title={SpeechLMScore: Evaluating Speech Generation Using Speech Language Model}, - author={Soumi Maiti and Yifan Peng and Takaaki Saeki and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{peter_icassp2023, - abbr={TTS}, - abbr_publisher={ICASSP}, - title={Speaker-Independent Acoustic-to-Articulatory Speech Inversion}, - author={Peter Wu and Li-Wei Chen and Cheol Jun Cho and Shinji Watanabe and Louis Goldstein and Alan W. Black and and Gopala K. 
Anumanchipalli}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{siddhant_icassp2023, - abbr={SLU}, - abbr_publisher={ICASSP}, - title={Joint Modelling of Spoken Language Understanding Tasks with Integrated Dialog History}, - author={Siddhant Arora and Hayato Futami and Emiru Tsunoo and Brian Yan and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{zhongqiu2_icassp2023, - abbr={SS}, - abbr_publisher={ICASSP}, - title={TF-GridNet: Making Time-Frequency Domain Models Great Again for Monaural Speaker Separation}, - author={Zhong-Qiu Wang and Samuele Cornell and Shukjae Choi and Younglo Lee and Byeong-Yeol Kim and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{dongji_icassp2023, - abbr={ASR&SSL}, - abbr_publisher={ICASSP}, - title={EURO: ESPnet Unsupervised ASR Open-Source Toolkit}, - author={Dongji Gao and Jiatong Shi and Shun-Po Chuang and Leibny Paola Garcia and Hung-yi Lee and Shinji Watanabe and and Sanjeev Khudanpur}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{zhongqiu_icassp2023, - abbr={SE}, - abbr_publisher={ICASSP}, - title={Neural Speech Enhancement with Very Low Algorithmic Latency and Complexity via Integrated Full- and Sub-Band Modeling}, - author={Zhong-Qiu Wang and Samuele Cornell and Shukjae Choi and Younglo Lee and Byeong-Yeol Kim and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{dan_icassp2023, - abbr={ASR&SSL}, - abbr_publisher={ICASSP}, - title={Avoid Overthinking in Self-Supervised Models for Speech Recognition}, - author={Dan Berrebbi and Brian Yan and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{jianchen_icassp2023, - abbr={TTS}, - abbr_publisher={ICASSP}, - title={Articulatory Representation Learning Via Joint Factor Analysis and Neural Matrix Factorization}, - author={Jiachen Lian and Alan W Black and Yijing Lu and Louis Goldstein and Shinji Watanabe and and Gopala K. 
Anumanchipalli}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{yifan_icassp2023, - abbr={ASR&SLU&SSL}, - abbr_publisher={ICASSP}, - title={Structured Pruning of Self-Supervised Pre-trained Models for Speech Recognition and Understanding}, - author={Yifan Peng and Kwangyoun Kim and Felix Wu and Prashant Sridhar and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{takashi_icassp2023, - abbr={SSL}, - abbr_publisher={ICASSP}, - title={Fully Unsupervised Topic Clustering of Unlabelled Spoken Audio Using Self-Supervised Representation Learning and Topic Model}, - author={Takashi Maekaku and Yuya Fujita and Xuankai Chang and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{zhe_icassp2023, - abbr={MultiModal}, - abbr_publisher={ICASSP}, - title={The Multimodal Information Based Speech Processing (MISP) 2022 Challenge: Audio-Visual Diarization and Recognition}, - author={Zhe Wang and Shilong Wu and Hang Chen and Mao-Kui He and Jun Du and Chin-Hui Lee and Jingdong Chen and Shinji Watanabe and Sabato Siniscalchi and Odette Scharenborg and Diyuan Liu and Baocai Yin and Jia Pan and Jianqing Gao and and Cong Liu}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{junwei_icassp2023, - abbr={SSL&ASR}, - abbr_publisher={ICASSP}, - title={FINDADAPTNET: Find and Insert Adapters by Learned Layer Importance}, - author={Junwei Huang and Karthik Ganesan and Soumi Maiti and Young Min Kim and Xuankai Chang and Paul Liang and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - -@inproceedings{yifan_icassp2023, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={I3D: Transformer architectures with input-dependent dynamic depth for speech recognition}, - author={Yifan Peng and Jaesong Lee and and Shinji Watanabe}, - booktitle=ICASSP, - year={2023} -} - - -@inproceedings{liwei_aaai2022, - abbr={TTS}, - abbr_publisher={AAAI}, - title={A Vector Quantized Approach for Text to Speech Synthesis on Real-World Spontaneous Speech}, - author={Li-Wei Chen and Alexander Rudnicky and Shinji Watanabe}, - booktitle={Proceedings of AAAI}, - year={2022}, -} - -@inproceedings{yosuke_emnlp2022, - abbr={ASR}, - abbr_publisher={EMNLP}, - title={BERT Meets CTC: New Formulation of End-to-End Speech Recognition with Pre-trained Masked Language Model}, - author={Yosuke Higuchi and Brian Yan and Siddhant Arora and Tetsuji Ogawa and Tetsunori Kobayashi and Shinji Watanabe}, - booktitle={Proceedings of Findings of EMNLP}, - year={2022}, -} - -@inproceedings{siddhant_emnlp2022, - abbr={SLU}, - abbr_publisher={EMNLP}, - title={Token-level Sequence Labeling for Spoken Language Understanding using Compositional End-to-End Models}, - author={Siddhant Arora and Siddharth Dalmia and Brian Yan and Florian Metze and Alan W Black and Shinji Watanabe}, - booktitle={Proceedings of Findings of EMNLP}, - year={2022}, -} - -@inproceedings{shota_taslp2022-2, - abbr={SD}, - abbr_publisher={TASLP}, - title={Online Neural Diarization of Unlimited Numbers of Speakers Using Global and Local Attractors}, - author={Shota Horiguchi and Shinji Watanabe and Paola Garcia and Yuki Takashima and Yohei Kawaguchi}, - booktitle=TASLP, - year={2022}, -} - -@inproceedings{matthew_csl2022, - abbr={SE}, - abbr_publisher={CSL}, - title={A Dilemma of Ground Truth in Noisy Speech Separation and an Approach to Lessen the Impact of Imperfect Training Data}, - author={Matthew Maciejewski and Jing Shi and Shinji Watanabe and Sanjeev Khudanpur}, - booktitle={Computer Speech & Language}, - year={2022}, -} - 
-@inproceedings{wangyou_taslp2022, - abbr={SE}, - abbr_publisher={TASLP}, - title={End-to-End Dereverberation, Beamforming, and Speech Recognition in A Cocktail Party}, - author={Wangyou Zhang and Xuankai Chang and Christoph Boeddeker and Tomohiro Nakatani and Shinji Watanabe and Yanmin Qian}, - booktitle=TASLP, - year={2022}, -} - -@inproceedings{zhongqiu_spl2022, - abbr={SE}, - abbr_publisher={SPL}, - title={Improving Frame-Online Neural Speech Enhancement with Overlapped-Frame Prediction}, - author={Zhong-Qiu Wang and Shinji Watanabe}, - booktitle={IEEE Signal Processing Letters}, - year={2022}, -} - -@inproceedings{shota_taslp2022, - abbr={SD}, - abbr_publisher={TASLP}, - title={Encoder-Decoder Based Attractors for End-to-End Neural Diarization}, - author={Shota Horiguchi and Yusuke Fujita and Shinji Watanabe and Yawen Xue and Paola Garcia}, - booktitle=TASLP, - year={2022}, -} - -@inproceedings{abdel_jstsp2022, - abbr={ASR}, - abbr_publisher={JSTSP}, - title={Self-Supervised Speech Representation Learning: A Review}, - author={Abdelrahman Mohamed and Hung-yi Lee and Lasse Borgholt and Jakob D. Havtorn and Joakim Edin and Christian Igel and Katrin Kirchhoff and Shang-Wen Li and Karen Livescu and Lars Maaløe and Tara N. Sainath and Shinji Watanabe}, - booktitle={IEEE Journal of Selected Topics in Signal Processing}, - year={2022}, -} - -@inproceedings{antonios_iwslt2022, - abbr={ST}, - abbr_publisher={IWSLT}, - title={Findings of the IWSLT 2022 Evaluation Campaign}, - author={Antonios Anastasopoulos and Loïc Barrault and Luisa Bentivogli and Marcely Zanon Boito and Ondřej Bojar and Roldano Cattoni and Anna Currey and Georgiana Dinu and Kevin Duh and Maha Elbayad and Clara Emmanuel and Yannick Estève and Marcello Federico and Christian Federmann and Souhir Gahbiche and Hongyu Gong and Roman Grundkiewicz and Barry Haddow and Benjamin Hsu and Dávid Javorský and Vĕra Kloudová and Surafel Lakew and Xutai Ma and Prashant Mathur and Paul McNamee and Kenton Murray and Maria Nǎdejde and Satoshi Nakamura and Matteo Negri and Jan Niehues and Xing Niu and John Ortega and Juan Pino and Elizabeth Salesky and Jiatong Shi and Matthias Sperber and Sebastian Stüker and Katsuhito Sudoh and Marco Turchi and Yogesh Virkar and Alexander Waibel and Changhan Wang and Shinji Watanabe}, - booktitle=IWSLTT, - year={2022}, -} - -@inproceedings{yushi_slt2022, - abbr={SD&SS}, - abbr_publisher={SLT}, - title={EEND-SS: Joint End-to-End Neural Speaker Diarization and Speech Separation for Flexible Number of Speakers}, - author={Yushi Ueda and Soumi Maiti and Shinji Watanabe and Chunlei Zhang and Meng Yu and Shi-Xiong Zhang and Yong Xu}, - booktitle=SLT, - year={2022}, -} - -@inproceedings{tzuhsun_slt2022, - abbr={ASR&SD&SLU&ER}, - abbr_publisher={SLT}, - title={SUPERB @ SLT 2022: Challenge on Generalization and Efficiency of Self-Supervised Speech Representation Learning}, - author={Tzu-hsun Feng and Annie Dong and Ching-Feng Yeh and Shu-wen Yang and Tzu-Quan Lin and Jiatong Shi and Kai-Wei Chang and Zili Huang and Haibin Wu and Xuankai Chang and Shinji Watanabe and Abdel-rahman Mohamed and Shang-Wen Li and Hung-yi Lee}, - booktitle=SLT, - year={2022}, -} - -@inproceedings{kwangyoun_slt2022, - abbr={ASR}, - abbr_publisher={SLT}, - title={E-Branchformer: Branchformer with Enhanced merging for speech recognition}, - author={Kwangyoun Kim and Felix Wu and Yifan Peng and Jing Pan and Prashant Sridhar and Kyu Jeong Han and Shinji Watanabe}, - booktitle=SLT, - year={2022}, -} - -@inproceedings{yifan_slt2022, - 
abbr={ASR&SLU}, - abbr_publisher={SLT}, - title={A Study on the Integration of Pre-Trained SSL and ASR and LM and SLU Models for Spoken Language Understanding}, - author={Yifan Peng and Siddhant Arora and Yosuke Higuchi and Yushi Ueda and Sujay Kumar and Karthik Ganesan and Siddharth Dalmia and Xuankai Chang and Shinji Watanabe}, - booktitle=SLT, - year={2022}, -} - -@inproceedings{yen_slt2022, - abbr={ASR&SSL}, - abbr_publisher={SLT}, - title={On Compressing Sequences for Self-Supervised Speech Models}, - author={Yen Meng and Hsuan-Jui Chen and Jiatong Shi and Shinji Watanabe and Paola Garcia and Hung-yi Lee and Hao Tang}, - booktitle=SLT, - year={2022}, -} - -@inproceedings{yoshiki_slt2022, - abbr={ASR&SE&SSL}, - abbr_publisher={SLT}, - title={End-to-End Integration of Speech Recognition and Dereverberation and Beamforming and Self-Supervised Learning Representation}, - author={Yoshiki Masuyama and Xuankai Chang and Samuele Cornell and Shinji Watanabe and Nobutaka Ono}, - booktitle=SLT, - year={2022}, -} - -@inproceedings{shota_slt2022, - abbr={SE}, - abbr_publisher={SLT}, - title={Mutual Learning of Single- and Multi-Channel End-to-End Neural Diarization}, - author={Shota Horiguchi and Yuki Takashima and Shinji Watanabe and Paola Garcia}, - booktitle=SLT, - year={2022}, -} - -@inproceedings{robin_slt2022, - abbr={ASR}, - abbr_publisher={SLT}, - title={End-to-End Multi-speaker ASR with Independent Vector Analysis}, - author={Robin Scheibler and Wangyou Zhang and Xuankai Chang and Shinji Watanabe and Yanmin Qian}, - booktitle=SLT, - year={2022}, -} - -@inproceedings{jiatong_a_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={VQ-T: RNN Transducers using Vector-Quantized Prediction Network States}, - author={Jiatong Shi and George Saon and David Haws and Shinji Watanabe and Brian Kingsbury}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{jaesong_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Memory-Efficient Training of RNN-Transducer with Sampled Softmax}, - author={Jaesong Lee and Lukas Lee and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{keqi_interspeech2022, - abbr={SLU&ST}, - abbr_publisher={Interspeech}, - title={Blockwise Streaming Transformer for Spoken Language Understanding and Simultaneous Speech Translation}, - author={Keqi Deng and Shinji Watanabe and Jiatong Shi and Siddhant Arora}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{shuai_interspeech2022, - abbr={Music}, - abbr_publisher={Interspeech}, - title={SingAug: Data Augmentation for Singing Voice Synthesis with Cycle-consistent Training Strategy}, - author={Shuai Guo and Jiatong Shi and Tao Qian and Shinji Watanabe and Qin Jin}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{jiatong_b_interspeech2022, - abbr={Music}, - abbr_publisher={Interspeech}, - title={Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}, - author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{hang_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Audio-Visual Speech Recognition in MISP2021 Challenge: Dataset Release and Deep Analysis}, - author={Hang Chen and Jun Du and Yusheng Dai and Chin-Hui Lee and Sabato Marco Siniscalchi and Shinji Watanabe and Odette Scharenborg and Jingdong Chen and Baocai Yin 
and Jia Pan}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{hengshun_interspeech2022, - abbr={KWS}, - abbr_publisher={Interspeech}, - title={Audio-Visual Wake Word Spotting in MISP2021 Challenge: Dataset Release and Deep Analysis}, - author={Hengshun Zhou and Jun Du and Gongzhen Zou and Zhaoxu Nian and Chin-Hui Lee and Sabato Marco Siniscalchi and Shinji Watanabe and Odette Scharenborg and Jingdong Chen and Shifu Xiong and Jian-Qing Gao}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{xijian_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={ASR2K: Speech Recognition for Around 2000 Languages without Audio}, - author={Xinjian Li and Florian Metze and David R. Mortensen and Alan W Black and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{yenju_interspeech2022, - abbr={SE}, - abbr_publisher={Interspeech}, - title={ESPnet-SE++: Speech Enhancement for Robust Speech Recognition and Translation and and Understanding}, - author={Yen-Ju Lu and Xuankai Chang and Chenda Li and Wangyou Zhang and Samuele Cornell and Zhaoheng Ni and Yoshiki Masuyama and Brian Yan and Robin Scheibler and Zhong-Qiu Wang and Yu Tsao and Yanmin Qian and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{siddhant_interspeech2022, - abbr={SLU}, - abbr_publisher={Interspeech}, - title={Two-Pass Low Latency End-to-End Spoken Language Understanding}, - author={Siddhant Arora and Siddharth Dalmia and Xuankai Chang and Brian Yan and Alan W Black and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{peter_interspeech2022, - abbr={TTS}, - abbr_publisher={Interspeech}, - title={Deep Speech Synthesis from Articulatory Representations}, - author={Peter Wu and Shinji Watanabe and Louis Goldstein and Alan W Black and Gopala Krishna Anumanchipalli}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{yusuke_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Minimum latency training of sequence transducers for streaming end-to-end speech recognition}, - author={Yusuke Shinohara and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{yui_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - author={Yui Sudo and Shakeel Muhammad and Kazuhiro Nakadai and Jiatong Shi and Shinji Watanabe}, - title={Streaming Automatic Speech Recognition with Re-blocking Processing Based on Integrated Voice Activity Detection}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{tatsuya_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Better Intermediates Improve CTC Inference}, - author={Tatsuya Komatsu and Yusuke Fujita and Jaesong Lee and Lukas Lee and Shinji Watanabe and Yusuke Kida}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{yuki_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Updating Only Encoders Prevents Catastrophic Forgetting of End-to-End ASR Models}, - author={Yuki Takashima and Shota Horiguchi and Shinji Watanabe and Leibny Paola Garcia Perera and Yohei Kawaguchi}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{takashi_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Attention Weight Smoothing Using Prior Distributions for Transformer-Based End-to-End ASR}, - author={Takashi Maekaku and Yuya Fujita and Yifan Peng and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{emiru_interspeech2022, - abbr={ASR}, - 
abbr_publisher={Interspeech}, - title={Residual Language Model for End-to-end Speech Recognition}, - author={Emiru Tsunoo and Yosuke Kashiwagi and Chaitanya Prasad Narisetty and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{nathaniel_interspeech2022, - abbr={TTS}, - abbr_publisher={Interspeech}, - title={When Is TTS Augmentation Through a Pivot Language Useful?}, - author={Nathaniel Romney Robinson and Perez Ogayo and Swetha R. Gangu and David R. Mortensen and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{yooncheol_interspeech2022, - abbr={TTS}, - abbr_publisher={Interspeech}, - title={TriniTTS: Pitch-controllable End-to-end TTS without External Aligner}, - author={Yooncheol Ju and Ilhwan Kim and Hongsun Yang and Ji-Hoon Kim and Byeongyeol Kim and Soumi Maiti and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{muqiao_interspeech2022, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Online Continual Learning of End-to-End Speech Recognition Models}, - author={Muqiao Yang and Ian Lane and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{muqiao_interspeech2022, - abbr={SE}, - abbr_publisher={Interspeech}, - title={Improving Speech Enhancement through Fine-Grained Speech Characteristics}, - author={Muqiao Yang and Joseph Konan and David Bick and Anurag Kumar and Shinji Watanabe and Bhiksha Raj}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{xuankai_interspeech2022, - abbr={ASR&SE&SSL}, - abbr_publisher={Interspeech}, - title={End-to-End Integration of Speech Recognition, Speech Enhancement, and Self-Supervised Learning Representation}, - author={Xuankai Chang and Takashi Maekaku and Yuya Fujita and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{dan_interspeech2022, - abbr={ASR&SSL}, - abbr_publisher={Interspeech}, - title={Combining Spectral and Self-Supervised Features for Low Resource Speech Recognition and Translation}, - author={Dan Berrebbi and Jiatong Shi and Brian Yan and Osbel López-Francisco and Jonathan Amith and Shinji Watanabe}, - booktitle=interspeech, - year={2022}, -} - -@inproceedings{peng2022icml, - abbr={ASR&SLU&MT}, - abbr_publisher={ICML}, - title={Branchformer: Parallel MLP-Attention Architectures to Capture Local and Global Context for Speech Recognition and Understanding}, - author={Peng, Yifan and Dalmia, Siddharth and Lane, Ian and Watanabe, Shinji}, - booktitle=ICML, - year={2022}, -} - -@inproceedings{li2022aclfindings, - abbr={Linguistic}, - abbr_publisher={ACL}, - title={Zero-shot Learning for Grapheme to Phoneme Conversion with Language Ensemble}, - author={Xinjian Li and Florian Metze and David R Mortensen and Shinji Watanabe and Alan Black}, - booktitle=ACLFindings, - year={2022} -} - -@inproceedings{tsai2022acl, - abbr={SE&VC&ST}, - abbr_publisher={ACL}, - title={SUPERB-SG: Enhanced Speech processing Universal PERformance Benchmark for Semantic and Generative Capabilities}, - author={Hsiang-Sheng Tsai and Heng-Jui Chang and Wen-Chin Huang and Zili Huang and Kushal Lakhotia and Shu-wen Yang and Shuyan Dong and Andy T. 
Liu and Cheng-I Lai and Jiatong Shi and Xuankai Chang and Phil Hall and Hsuan-Jui Chen and Shang-Wen Li and Shinji Watanabe and Abdelrahman Mohamed and Hung-yi Lee}, - booktitle=ACL, - year={2022} -} - -@article{subramanian2022deep, - abbr={SE&ASR}, - abbr_publisher={CSL}, - title={Deep learning based multi-source localization with source splitting and its effectiveness in multi-talker speech recognition}, - author={Subramanian, Aswin Shanmugam and Weng, Chao and Watanabe, Shinji and Yu, Meng and Yu, Dong}, - journal={Computer Speech \& Language}, - volume={75}, - pages={101360}, - year={2022}, - publisher={Elsevier} -} - -@article{park2022review, - abbr={SD}, - abbr_publisher={CSL}, - title={A review of speaker diarization: Recent advances with deep learning}, - author={Park, Tae Jin and Kanda, Naoyuki and Dimitriadis, Dimitrios and Han, Kyu J and Watanabe, Shinji and Narayanan, Shrikanth}, - journal={Computer Speech \& Language}, - volume={72}, - pages={101317}, - year={2022}, - publisher={Elsevier}, - selected={true}, -} - -@article{huang2022joint, - abbr={SE&ASR}, - abbr_publisher={CSL}, - title={Joint speaker diarization and speech recognition based on region proposal networks}, - author={Huang, Zili and Delcroix, Marc and Garcia, Leibny Paola and Watanabe, Shinji and Raj, Desh and Khudanpur, Sanjeev}, - journal={Computer Speech \& Language}, - volume={72}, - pages={101316}, - year={2022}, - publisher={Elsevier} -} - -@article{hussein2022arabic, - abbr={ASR}, - abbr_publisher={CSL}, - title={Arabic speech recognition by end-to-end, modular systems and human}, - author={Hussein, Amir and Watanabe, Shinji and Ali, Ahmed}, - journal={Computer Speech \& Language}, - volume={71}, - pages={101272}, - year={2022}, - publisher={Elsevier} -} - - -@inproceedings{lu2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={TOWARDS LOW-DISTORTION MULTI-CHANNEL SPEECH ENHANCEMENT: THE ESPNET-SE SUBMISSION TO THE L3DAS22 CHALLENGE}, - author={Yen-Ju Lu and Samuele Cornell and Xuankai Chang and Wangyou Zhang and Chenda Li and Zhaoheng Ni and Zhong-Qiu Wang and Shinji Watanabe}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{chen2022icassp, - abbr={Multimodal}, - abbr_publisher={ICASSP}, - title={THE FIRST MULTIMODAL INFORMATION BASED SPEECH PROCESSING (MISP) CHALLENGE: DATA, TASKS, BASELINES AND RESULTS}, - author={Hang Chen and Hengshun Zhou and Jun Du and Chin-Hui Lee and Jingdong Chen and Shinji Watanabe and Sabato Marco Siniscalchi and Odette Scharenborg and Di-Yuan Liu and Bao-Cai Yin and Jia Pan and Jian-Qing Gao and Cong Liu}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{motoi2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={NON-AUTOREGRESSIVE END-TO-END AUTOMATIC SPEECH RECOGNITION INCORPORATING DOWNSTREAM NATURAL LANGUAGE PROCESSING}, - author={Motoi Omachi and Yuya Fujita and Shinji Watanabe and Tianzi Wang}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{takeshi2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={AN EXPLORATION OF HUBERT WITH LARGE NUMBER OF CLUSTER UNITS AND MODEL ASSESSMENT USING BAYESIAN INFORMATION CRITERION}, - author={Takashi Maekaku and Xuankai Chang and Yuya Fujita and Shinji Watanabe}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{zili2022icassp, - abbr={SE&SSL}, - abbr_publisher={ICASSP}, - title={INVESTIGATING SELF-SUPERVISED LEARNING FOR SPEECH ENHANCEMENT AND SEPARATION}, - author={Zili Huang and Shinji Watanabe and Shu-wen Yang and Paola Garcia and Sanjeev Khudanpur}, - booktitle=ICASSP, - year={2022} 
-} - -@inproceedings{yenju2022icassp, - abbr={SE}, - abbr_publisher={ICASSP}, - title={CONDITIONAL DIFFUSION PROBABILISTIC MODEL FOR SPEECH ENHANCEMENT}, - author={Yen-Ju Lu and Zhong-Qiu Wang and Shinji Watanabe and Alexander Richard and Cheng Yu and Yu Tsao}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{keqi2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={IMPROVING NON-AUTOREGRESSIVE END-TO-END SPEECH RECOGNITION WITH PRE-TRAINED ACOUSTIC AND LANGUAGE MODELS}, - author={Keqi Deng and Zehui Yang and Shinji Watanabe and Yosuke Higuchi and Gaofeng Cheng and Pengyuan Zhang}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{jingpan2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - author={Jing Pan and Tao Lei and Kwangyoun Kim and Kyu Han and Shinji Watanabe}, - title={SRU++: PIONEERING FAST RECURRENCE WITH ATTENTION FOR SPEECH RECOGNITION}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{taketomo2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Integrating multiple ASR systems into NLP backend with attention fusion}, - author={Takatomo Kano and Atsunori Ogawa and Marc Delcroix and Shinji Watanabe}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{siddhant2022icassp, - abbr={SLU}, - abbr_publisher={ICASSP}, - title={ESPNET-SLU: ADVANCING SPOKEN LANGUAGE UNDERSTANDING THROUGH ESPNET}, - author={Siddhant Arora and Siddharth Dalmia and Pavel Denisov and Xuankai Chang and Yushi Ueda and Yifan Peng and Yuekai Zhang and Sujay Kumar and Karthik Ganesan and Brian Yan and Ngoc Thang Vu and Alan W Black and Shinji Watanabe}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{brian2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={JOINT MODELING OF CODE-SWITCHED AND MONOLINGUAL ASR VIA CONDITIONAL FACTORIZATION}, - author={Brian Yan and Chunlei Zhang and Meng Yu and Shi-Xiong Zhang and Siddharth Dalmia and Dan Berrebbi and Chao Weng and Shinji Watanabe and Dong Yu}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{xuankai2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={EXTENDED GRAPH TEMPORAL CLASSIFICATION FOR MULTI-SPEAKER END-TO-END ASR}, - author={Xuankai Chang and Niko Moritz and Takaaki Hori and Shinji Watanabe and Jonathan Le Roux}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{niko2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Sequence Transduction with Graph-based Supervision}, - author={Niko Moritz and Takaaki Hori and Shinji Watanabe and Jonathan Le Roux}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{emiru2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={RUN-AND-BACK STITCH SEARCH: NOVEL BLOCK SYNCHRONOUS DECODING FOR STREAMING ENCODER-DECODER ASR}, - author={Emiru Tsunoo and Chaitanya Narisetty and Michael Hentschel and Yosuke Kashiwagi and Shinji Watanabe}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{whenchin2022icassp, - abbr={VC&SSL}, - abbr_publisher={ICASSP}, - title={S3PRL-VC: OPEN-SOURCE VOICE CONVERSION FRAMEWORK WITH SELF-SUPERVISED SPEECH REPRESENTATIONS}, - author={Wen-Chin Huang and Shu-wen Yang and Tomoki Hayashi and Hung-yi Lee and Shinji Watanabe and Tomoki Toda}, - booktitle=ICASSP, - year={2022} -} - - -@inproceedings{chaitanya2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={JOINT SPEECH RECOGNITION AND AUDIO CAPTIONING}, - author={Chaitanya Narisetty and Emiru Tsunoo and Xuankai Chang and Yosuke Kashiwagi and Michael Hentschel and Shinji Watanabe}, - booktitle=ICASSP, - year={2022} -} - 
-@inproceedings{shota2022icassp, - abbr={SD}, - abbr_publisher={ICASSP}, - title={MULTI-CHANNEL END-TO-END NEURAL DIARIZATION WITH DISTRIBUTED MICROPHONES}, - author={Shota Horiguchi and Yuki Takashima and Paola Garcia and Shinji Watanabe and Yohei Kawaguchi}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{yaoyuan2022icassp, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={TORCHAUDIO: BUILDING BLOCKS FOR AUDIO AND SPEECH PROCESSING}, - author={Yao-Yuan Yang and Moto Hira and Zhaoheng Ni and Artyom Astafurov and Caroline Chen and Christian Puhrsch and David Pollack and Dmitriy Genzel and Donny Greenberg and Edward Yang and Jason Lian and Jeff Hwang and Ji Chen and Peter Goldsborough and Sean Narenthiran and Shinji Watanabe and Soumith Chintala and Vincent Quenneville-Bélair}, - booktitle=ICASSP, - year={2022} -} - -@inproceedings{chunlei2022icassp, - abbr={SD}, - abbr_publisher={ICASSP}, - title={Towards End-to-End Speaker Diarization with Generalized Neural Speaker Clustering}, - author={Chunlei Zhang and Jiatong Shi and Chao Weng and Meng Yu and Dong Yu}, - booktitle=ICASSP, - year={2022} -} - - -@inproceedings{tao2022icassp, - abbr={Music}, - abbr_publisher={ICASSP}, - title={TRAINING STRATEGIES FOR AUTOMATIC SONG WRITING: A UNIFIED FRAMEWORK PERSPECTIVE}, - author={Tao Qian and Jiatong Shi and Shuai Guo and Peter Wu and Qin Jin}, - booktitle=ICASSP, - year={2022} -} - - -@article{SHI2022101327, - abbr={SE+ASR}, - abbr_publisher={CSL}, - title = {An investigation of neural uncertainty estimation for target speaker extraction equipped RNN transducer}, - journal = {Computer Speech & Language}, - volume = {73}, - pages = {101327}, - year = {2022}, - issn = {0885-2308}, - doi = {https://doi.org/10.1016/j.csl.2021.101327}, - url = {https://www.sciencedirect.com/science/article/pii/S0885230821001200}, - author = {Jiatong Shi and Chunlei Zhang and Chao Weng and Shinji Watanabe and Meng Yu and Dong Yu}, - keywords = {Target-speaker speech recognition, Target-speaker speech extraction, Uncertainty estimation}, - abstract = {Target-speaker speech recognition aims to recognize the speech of an enrolled speaker from an environment with background noise and interfering speakers. This study presents a joint framework that combines time-domain target speaker extraction and recurrent neural network transducer (RNN-T) for speech recognition. To alleviate the adverse effects of residual noise and artifacts introduced by the target speaker extraction module to the speech recognition back-end, we explore to training the target speaker extraction and RNN-T jointly. We find a multi-stage training strategy that pre-trains and fine-tunes each module before joint training is crucial in stabilizing the training process. In addition, we propose a novel neural uncertainty estimation that leverages useful information from the target speaker extraction module to further improve the back-end speech recognizer (i.e., speaker identity uncertainty and speech enhancement uncertainty). Compared to a recognizer with target speech extraction front-end, our experiments show that joint-training and the neural uncertainty module reduce 7% and 17% relative character error rate (CER) on multi-talker simulation data, respectively. The multi-condition experiments indicate that our method can reduce 9% relative CER in the noisy condition without losing performance in the clean condition. 
We also observe consistent improvements in further evaluation of real-world data based on vehicular speech.} -} - -@inproceedings{huang_asru2021, - abbr={ASR+TTS}, - abbr_publisher={ASRU}, - title={On Prosody Modeling for ASR+TTS based Voice Conversion}, - author={Wen-Chin Huang and Tomoki Hayashi and Xinjian Li and Shinji Watanabe and Tomoki Toda}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{kano_asru2021, - abbr={SLU}, - abbr_publisher={ASRU}, - title={Attention-based Multi-hypothesis Fusion for Speech Summarization}, - author={Takatomo Kano and Atsunori Ogawa and Marc Delcroix and Shinji Watanabe}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{inaguma_asru2021, - abbr={ST}, - abbr_publisher={ASRU}, - title={Fast-MD: Fast Multi-Decoder End-to-End Speech Translation with Non-Autoregressive Hidden Intermediates}, - author={Hirofumi Inaguma and Siddharth Dalmia and Brian Yan and Shinji Watanabe}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{horiguchi_asru2021, - abbr={SD}, - abbr_publisher={ASRU}, - title={Towards Neural Diarization for Unlimited Numbers of Speakers using Global and Local Attractors}, - author={Shota Horiguchi and Shinji Watanabe and Paola Garcia and Yawen Xue and Yuki Takashima and Yohei Kawaguchi}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{boyer_asru2021, - abbr={ASR}, - abbr_publisher={ASRU}, - title={A Study of Transducer based End-to-end ASR with ESPNet: Architecture, Auxiliary Loss and Decoding Strategies}, - author={Florian Boyer and Yusuke Shinohara and Takaaki Ishii and Hirofumi Inaguma and Shinji Watanabe}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{higuchi_asru2021, - abbr={ASR}, - abbr_publisher={ASRU}, - title={A Comparative Study on Non-autoregressive Modelings for Speech-to-text Generation}, - author={Yosuke Higuchi and Nanxin Chen and Yuya Fujita and Hirofumi Inaguma and Tatsuya Komatsu and Jaesong Lee and Jumon Nozaki and Tianzi Wang and Shinji Watanabe}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{rao_asru2021, - abbr={SE}, - abbr_publisher={ASRU}, - title={ConferencingSpeech Challenge: Towards Far-field Multi-channel Speech Enhancement for Video Conferencing}, - author={Wei Rao and Yihui Fu and Yanxin Hu and Xin Xu and Yvkai Jv and Jiangyu Han and Zhongjie Jiang and Lei Xie and Yannan Wang and Shinji Watanabe and Zheng-Hua Tan and Hui Bu and Tao Yu and Shidong Shang}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{wu_asru2021, - abbr={ASR+TTS}, - abbr_publisher={ASRU}, - title={Cross-lingual Transfer for Speech Processing using Acoustic Language Similarity}, - author={Peter Wu and Jiatong Shi and Yifan Zhong and Shinji Watanabe and Alan Black}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{chang_asru2021, - abbr={ASR&SSL}, - abbr_publisher={ASRU}, - title={An Exploration of Self-supervised Pretrained Representations for End-to-end Speech Recognition}, - author={Xuankai Chang and Takashi Maekaku and Pengcheng Guo and Jing Shi and Yen-Ju Lu and Aswin Shanmugam Subramanian and Tianzi Wang and Shu-wen Yang and Yu Tsao and Hung-yi Lee and Shinji Watanabe}, - booktitle=ASRU, - year={2021} -} - -@inproceedings{wu_apsipa2021, - abbr={VC}, - abbr_publisher={APSIPA}, - title={Understanding the Tradeoffs in Client-side Privacy for Downstream Speech Tasks}, - author={Peter Wu and Paul Pu Liang and Jiatong Shi and Ruslan Salakhutdinov and Shinji Watanabe and Louis-Philippe Morency}, - booktitle=APSIPA, - year={2021} -} - -@inproceedings{inaguma2021iwslt, - abbr={ST}, - abbr_publisher={IWSLT}, - 
title={ESPnet-ST IWSLT 2021 Offline Speech Translation System}, - author={Inaguma, Hirofumi and Yan, Brian and Dalmia, Siddharth and Guo, Pengcheng and Shi, Jiatong and Duh, Kevin and Watanabe, Shinji}, - booktitle=IWSLT, - pages={100--109}, - year={2021} -} - -@inproceedings{chen2021giga, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio}, - author={Chen, Guoguo and Chai, Shuzhou and Wang, Guanbo and Du, Jiayu and Zhang, Wei-Qiang and Weng, Chao and Su, Dan and Povey, Daniel and Trmal, Jan and Zhang, Junbo and Jin, Mingjie and Khudanpur, Sanjeev and Watanabe, Shinji and Zhao, Shuaijiang and Zou, Wei and Li, Xiangang and Yao, Xuchen and Wang, Yongqing and You, Zhao and Yan, Zhiyong}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{komatsu2021chains, - abbr={AED}, - abbr_publisher={Interspeech}, - title={Acoustic Event Detection with Classifier Chains}, - author={Komatsu, Tatsuya and Watanabe, Shinji and Miyazaki, Koichi and Hayashi, Tomoki}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{guo2021combine, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Multi-Speaker ASR Combining Non-Autoregressive Conformer CTC and Conditional Speaker Chain}, - author={Guo, Pengcheng and Chang, Xuankai and Watanabe, Shinji and Xie, Lei}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{kim2021transducer, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Multi-mode Transformer Transducer with Stochastic Future Context}, - author={Kim, Kwangyoun and Wu, Felix and Sridhar, Prashant and Han, Kyu and Watanabe, Shinji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{yan2021allophone, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Differentiable Allophone Graphs for Language Universal Speech Recognition}, - author={Yan, Brian and Dalmia, Siddharth and Mortensen, David R. 
and Metze, Florian and Watanabe, Shinji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{maciejewski2021verification, - abbr={SE}, - abbr_publisher={Interspeech}, - title={Speaker Verification-Based Evaluation of Single-Channel Speech Separation}, - author={Maciejewski, Matthew and Watanabe, Shinji and Khudanpur, Sanjeev}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{neill2021financial, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={SPGISpeech: 5,000 hours of transcribed financial audio for fully formatted end-to-end speech recognition}, - author={O'Neill, Patrick and Lavrukhin, Vitaly and Majumdar, Somshubra and Noroozi, Vahid and Zhang, Yuekai and Kuchaiev, Oleksii and Balam, Jagadeesh and Dovzhenko, Yuliya and Freyberg, Keenan and Shulman, Michael and Ginsburg, Boris and Watanabe, Shinji and Kucsko, Georg}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{yang2021superb, - abbr={ASR&SD&SLU&ER}, - abbr_publisher={Interspeech}, - title={SUPERB: Speech processing Universal PERformance Benchmark}, - author={Yang, Shu-wen and Chi, Po-Han and Chuang, Yung-Sung and Lai, Cheng-I and Lakhotia, Kushal and Lin, Yist Y. and Liu, Andy T. and Shi, Jiatong and Chang, Xuankai and Lin, Guan-Ting and Huang, Tzu-Hsien and Tseng, Wei-Cheng and Lee, Ko-tik and Liu, Da-Rong and Huang, Zili and Dong, Shuyan and Li, Shang-Wen and Watanabe, Shinji and Mohamed, Abdelrahman and Lee, Hung-yi}, - booktitle=interspeech, - year={2021}, - arxiv={2105.01051}, - selected={true}, - pdf={https://arxiv.org/pdf/2105.01051.pdf}, - html={https://www.isca-speech.org/archive/interspeech_2021/yang21c_interspeech.html}, -} - -@inproceedings{shon2021sentiment, - abbr={SSA}, - abbr_publisher={Interspeech}, - title={Leveraging Pre-trained Language Model for Speech Sentiment Analysis}, - author={Shon, Suwon and Brusco, Pablo and Pan, Jing and Han, Kyu and Watanabe, Shinji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{wong2021E2EASR, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Streaming End-to-End ASR based on Blockwise Non-Autoregressive Models}, - author={Wang, Tianzi and Fujita, Yuya and Chang, Xuankai and Watanabe, Shinji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{arora2021SLU, - abbr={SLU}, - abbr_publisher={Interspeech}, - title={Rethinking End-to-End Evaluation of Decomposable Tasks: A Case Study on Spoken Language Understanding}, - author={Arora, Siddhant and Ostapenko, Alissa and Viswanathan, Vijay and Dalmia, Siddharth and Metze, Florian and Watanabe, Shinji and Black, Alan W.}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{meekaku2021ZeroSpeech, - abbr={ASR & SpeDialog}, - abbr_publisher={Interspeech}, - title={Speech Representation Learning Combining Conformer CPC with Deep Cluster for the ZeroSpeech Challenge 2021}, - author={Maekaku, Takashi and Chang, Xuankai and Fujita, Yuya and Chen, Li-Wei and Watanabe, Shinji and Rudnicky, Alexander}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{lee2021CTC, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Layer Pruning on Demand with Intermediate CTC}, - author={Lee, Jaesong and Kang, Jingu and Watanabe, Shinji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{fujita2021insertion, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Toward Streaming ASR with Non-autoregressive Insertion-based Model}, - author={Fujita, Yuya and Wang, Tianzi and Watanabe, Shinji and Omachi, Motoi}, - booktitle=interspeech, - year={2021} -} - 
-@inproceedings{zmolikova2021weaksupervision, - abbr={SE&ASR}, - abbr_publisher={Interspeech}, - title={Auxiliary loss function for target speech extraction and recognition with weak supervision based on speaker characteristics}, - author={Zmolikova, Katerina and Delcroix, Marc and Raj, Desh and Watanabe, Shinji and Honza Černocký, Jan}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{tsunoo2021DataAug, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Data Augmentation Methods for End-to-end Speech Recognition on Distant-talk Scenarios}, - author={Tsunoo, Emiru and Shibata, Kentaro and Narisetty, Chaitanya and Kashiwagi, Yosuke and Watanabe, Shinji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{he2021TSVAD, - abbr={SD}, - abbr_publisher={Interspeech}, - title={Target-Speaker Voice Activity Detection with Improved I-Vector Estimation for Unknown Number of Speaker}, - author={He, Mao-Kui and Raj, Desh and Huang, Zili and Du, Jun and Chen, Zhuo and Watanabe, Shinji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{watanabe2021espnet, - abbr={SE&ASR&ST}, - abbr_publisher={DSLW}, - title={The 2020 ESPnet update: new features, broadened applications, performance improvements, and future plans}, - author={Shinji Watanabe and Florian Boyer and Xuankai Chang and Pengcheng Guo and Tomoki Hayashi and Yosuke Higuchi and Takaaki Hori and Wen-Chin Huang and Hirofumi Inaguma and Naoyuki Kamo and Shigeki Karita and Chenda Li and Jing Shi and Aswin Shanmugam Subramanian and Wangyou Zhang}, - booktitle={Proceedings of 2021 IEEE Data Science and Learning Workshop}, - year={2021}, - organization={IEEE} -} - -@inproceedings{li2021dual, - abbr={SE}, - abbr_publisher={SLT}, - title={Dual-path RNN for long recording speech separation}, - author={Li, Chenda and Luo, Yi and Han, Cong and Li, Jinyu and Yoshioka, Takuya and Zhou, Tianyan and Delcroix, Marc and Kinoshita, Keisuke and Boeddeker, Christoph and Qian, Yanmin and Watanabe, Shinji and Chen, Zhuo}, - booktitle=SLT, - pages={865--872}, - year={2021}, - organization={IEEE} -} - -@inproceedings{takashima2021end, - abbr={SD}, - abbr_publisher={SLT}, - title={End-to-End Speaker Diarization Conditioned on Speech Activity and Overlap Detection}, - author={Takashima, Yuki and Fujita, Yusuke and Watanabe, Shinji and Horiguchi, Shota and Garc{\'\i}a, Paola and Nagamatsu, Kenji}, - booktitle=SLT, - pages={849--856}, - year={2021}, - organization={IEEE} -} - -@inproceedings{tsunoo2021streaming, - abbr={ASR}, - abbr_publisher={SLT}, - title={Streaming Transformer ASR with blockwise synchronous beam search}, - author={Tsunoo, Emiru and Kashiwagi, Yosuke and Watanabe, Shinji}, - booktitle=SLT, - pages={22--29}, - year={2021}, - organization={IEEE} -} - -@inproceedings{wang2021sequential, - abbr={SE}, - abbr_publisher={SLT}, - title={Sequential multi-frame neural beamforming for speech separation and enhancement}, - author={Wang, Zhong-Qiu and Erdogan, Hakan and Wisdom, Scott and Wilson, Kevin and Raj, Desh and Watanabe, Shinji and Chen, Zhuo and Hershey, John R}, - booktitle=SLT, - pages={905--911}, - year={2021}, - organization={IEEE} -} - -@inproceedings{raj2021dover, - abbr={SD}, - abbr_publisher={SLT}, - title={DOVER-Lap: A Method for Combining Overlap-aware Diarization Outputs}, - author={Raj, Desh and Garcia-Perera, Leibny Paola and Huang, Zili and Watanabe, Shinji and Povey, Daniel and Stolcke, Andreas and Khudanpur, Sanjeev}, - booktitle=SLT, - pages={881--888}, - year={2021}, - organization={IEEE} -} - 
-@inproceedings{raj2021integration, - abbr={SE&SD&ASR}, - abbr_publisher={SLT}, - title={Integration of speech separation, diarization, and recognition for multi-speaker meetings: System description, comparison, and analysis}, - author={Raj, Desh and Denisov, Pavel and Chen, Zhuo and Erdogan, Hakan and Huang, Zili and He, Maokui and Watanabe, Shinji and Du, Jun and Yoshioka, Takuya and Luo, Yi and others}, - booktitle=SLT, - pages={897--904}, - year={2021}, - organization={IEEE} -} - -@inproceedings{xue2021online, - abbr={SD}, - abbr_publisher={SLT}, - title={Online end-to-end neural diarization with speaker-tracing buffer}, - author={Xue, Yawen and Horiguchi, Shota and Fujita, Yusuke and Watanabe, Shinji and Garc{\'\i}a, Paola and Nagamatsu, Kenji}, - booktitle=SLT, - pages={841--848}, - year={2021}, - organization={IEEE} -} - -@inproceedings{shi2021highland, - abbr={ST}, - abbr_publisher={AmericasNLP}, - title={Highland Puebla Nahuatl Speech Translation Corpus for Endangered Language Documentation}, - author={Shi, Jiatong and Amith, Jonathan D and Chang, Xuankai and Dalmia, Siddharth and Yan, Brian and Watanabe, Shinji}, - booktitle={Proceedings of the First Workshop on Natural Language Processing for Indigenous Languages of the Americas}, - pages={53--63}, - year={2021} -} - -@inproceedings{amith2021end, - abbr={ASR}, - abbr_publisher={AmericasNLP}, - title={End-to-End Automatic Speech Recognition: Its Impact on the Workflow in Documenting Yolox{\'o}chitl Mixtec}, - author={Amith, Jonathan D and Shi, Jiatong and Garc{\'\i}a, Rey Castillo}, - booktitle={Proceedings of the First Workshop on Natural Language Processing for Indigenous Languages of the Americas}, - pages={64--80}, - year={2021} -} - -@inproceedings{omachi2021end, - abbr={ASR}, - abbr_publisher={NAACL}, - title={End-to-end ASR to jointly predict transcriptions and linguistic annotations}, - author={Omachi, Motoi and Fujita, Yuya and Watanabe, Shinji and Wiesner, Matthew}, - booktitle=NAACL, - pages={1861--1871}, - year={2021} -} - -@inproceedings{dalmia2021searchable, - abbr={ST}, - abbr_publisher={NAACL}, - title={Searchable Hidden Intermediates for End-to-End Models of Decomposable Sequence Tasks}, - author={Dalmia, Siddharth and Yan, Brian and Raunak, Vikas and Metze, Florian and Watanabe, Shinji}, - booktitle=NAACL, - pages={1882--1896}, - year={2021} -} - -@inproceedings{inaguma2021source, - abbr={ST}, - abbr_publisher={NAACL}, - title={Source and Target Bidirectional Knowledge Distillation for End-to-end Speech Translation}, - author={Inaguma, Hirofumi and Kawahara, Tatsuya and Watanabe, Shinji}, - booktitle=NAACL, - pages={1872--1881}, - year={2021} -} - -@inproceedings{shi2021leveraging, - abbr={ASR}, - abbr_publisher={EACL}, - title={Leveraging End-to-End ASR for Endangered Language Documentation: An Empirical Study on Yol{\'o}xochitl Mixtec}, - author={Shi, Jiatong and Amith, Jonathan D and Garc{\'\i}a, Rey Castillo and Sierra, Esteban Guadalupe and Duh, Kevin and Watanabe, Shinji}, - booktitle=EACL, - pages={1134--1145}, - year={2021} -} - - -@inproceedings{xue2021Online, - abbr={SD}, - abbr_publisher={Interspeech}, - title={Online Streaming End-to-End Neural Diarization Handling Overlapping Speech and Flexible Numbers of Speakers}, - author={Xue, Yawen and Horiguchi, Shota and Fujita, Yusuke and Takashima, Yuki and Watanabe, Shinji and Garcia Perera, Leibny Paola and Nagamatsu, Kenji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{takashima2021SemiSup, - abbr={SD}, - abbr_publisher={Interspeech}, - 
title={Semi-Supervised Training with Pseudo-Labeling for End-to-End Neural Diarization}, - author={Takashima, Yuki and Fujita, Yusuke and Horiguchi, Shota and Watanabe, Shinji and Garcia Perera, Leibny Paola and Nagamatsu, Kenji}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{han2021Cont, - abbr={SE}, - abbr_publisher={Interspeech}, - title={Continuous speech separation using speaker inventory for long recording}, - author={Han, Cong and Luo, Yi and Li, Chenda and Zhou, Tianyan and Kinoshita, Keisuke and Watanabe, Shinji and Delcroix, Marc and Erdogan, Hakan and Hershey, John and Mesgarani, Nima and Chen, Zhuo}, - booktitle=interspeech, - year={2021} -} - -@inproceedings{maiti2021end, - abbr={SD}, - abbr_publisher={ICASSP}, - title={End-To-End Diarization for Variable Number of Speakers with Local-Global Networks and Discriminative Speaker Embeddings}, - author={Maiti, Soumi and Erdogan, Hakan and Wilson, Kevin and Wisdom, Scott and Watanabe, Shinji and Hershey, John R}, - booktitle=ICASSP, - pages={7183--7187}, - year={2021}, - organization={IEEE} -} - -@inproceedings{li2021dualpath, - abbr={SE}, - abbr_publisher={ICASSP}, - title={Dual-Path Modeling for Long Recording Speech Separation in Meetings}, - author={Li, Chenda and Chen, Zhuo and Luo, Yi and Han, Cong and Zhou, Tianyan and Kinoshita, Keisuke and Delcroix, Marc and Watanabe, Shinji and Qian, Yanmin}, - booktitle=ICASSP, - year={2021}, - organization={IEEE} -} - -@inproceedings{guo2021recent, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Recent developments on espnet toolkit boosted by conformer}, - author={Guo, Pengcheng and Boyer, Florian and Chang, Xuankai and Hayashi, Tomoki and Higuchi, Yosuke and Inaguma, Hirofumi and Kamo, Naoyuki and Li, Chenda and Garcia-Romero, Daniel and Shi, Jiatong and others}, - booktitle=ICASSP, - pages={5874--5878}, - year={2021}, - organization={IEEE} -} - -@inproceedings{zhang2021end, - abbr={SE&ASR}, - abbr_publisher={ICASSP}, - title={End-to-end dereverberation, beamforming, and speech recognition with improved numerical stability and advanced frontend}, - author={Zhang, Wangyou and Boeddeker, Christoph and Watanabe, Shinji and Nakatani, Tomohiro and Delcroix, Marc and Kinoshita, Keisuke and Ochiai, Tsubasa and Kamo, Naoyuki and Haeb-Umbach, Reinhold and Qian, Yanmin}, - booktitle=ICASSP, - pages={6898--6902}, - year={2021}, - organization={IEEE} -} - -@inproceedings{horiguchi2021end, - abbr={SD}, - abbr_publisher={ICASSP}, - title={End-to-end speaker diarization as post-processing}, - author={Horiguchi, Shota and Garc{\'\i}a, Paola and Fujita, Yusuke and Watanabe, Shinji and Nagamatsu, Kenji}, - booktitle=ICASSP, - pages={7188--7192}, - year={2021}, - organization={IEEE} -} - -@inproceedings{higuchi2021improved, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Improved Mask-CTC for Non-Autoregressive End-to-End ASR}, - author={Higuchi, Yosuke and Inaguma, Hirofumi and Watanabe, Shinji and Ogawa, Tetsuji and Kobayashi, Tetsunori}, - booktitle=ICASSP, - pages={8363--8367}, - year={2021}, - organization={IEEE} -} - -@inproceedings{lee2021intermediate, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Intermediate Loss Regularization for CTC-Based Speech Recognition}, - author={Lee, Jaesong and Watanabe, Shinji}, - booktitle=ICASSP, - pages={6224--6228}, - year={2021}, - organization={IEEE} -} - -@inproceedings{inaguma2021orthros, - abbr={ST}, - abbr_publisher={ICASSP}, - title={Orthros: Non-autoregressive end-to-end speech translation with dual-decoder}, - author={Inaguma, Hirofumi and Higuchi, 
Yosuke and Duh, Kevin and Kawahara, Tatsuya and Watanabe, Shinji}, - booktitle=ICASSP, - pages={7503--7507}, - year={2021}, - organization={IEEE} -} - -@inproceedings{subramanian2021directional, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Directional ASR: A new paradigm for E2E multi-speaker speech recognition with source localization}, - author={Subramanian, Aswin Shanmugam and Weng, Chao and Watanabe, Shinji and Yu, Meng and Xu, Yong and Zhang, Shi-Xiong and Yu, Dong}, - booktitle=ICASSP, - pages={8433--8437}, - year={2021}, - organization={IEEE} -} - -@inproceedings{baskar2021eat, - abbr={ASR&TTS&SSL}, - abbr_publisher={ICASSP}, - title={Eat: Enhanced ASR-TTS for Self-Supervised Speech Recognition}, - author={Baskar, Murali Karthick and Burget, Luk{\'a}{\v{s}} and Watanabe, Shinji and Astudillo, Ramon Fernandez and others}, - booktitle=ICASSP, - pages={6753--6757}, - year={2021}, - organization={IEEE} -} - -@inproceedings{kashiwagi2021gaussian, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Gaussian Kernelized Self-Attention for Long Sequence Data and its Application to CTC-Based Speech Recognition}, - author={Kashiwagi, Yosuke and Tsunoo, Emiru and Watanabe, Shinji}, - booktitle=ICASSP, - pages={6214--6218}, - year={2021}, - organization={IEEE} -} - -@inproceedings{shi2021improving, - abbr={SE&ASR}, - abbr_publisher={ICASSP}, - title={Improving RNN Transducer with Target Speaker Extraction and Neural Uncertainty Estimation}, - author={Shi, Jiatong and Zhang, Chunlei and Weng, Chao and Watanabe, Shinji and Yu, Meng and Yu, Dong}, - booktitle=ICASSP, - pages={6908--6912}, - year={2021}, - organization={IEEE} -} - -@inproceedings{maciejewski2021training, - abbr={SE}, - abbr_publisher={ICASSP}, - title={Training Noisy Single-Channel Speech Separation with Noisy Oracle Sources: A Large Gap and a Small Step}, - author={Maciejewski, Matthew and Shi, Jing and Watanabe, Shinji and Khudanpur, Sanjeev}, - booktitle=ICASSP, - pages={5774--5778}, - year={2021}, - organization={IEEE} -} - -@inproceedings{shi2021sequence, - abbr={Music}, - abbr_publisher={ICASSP}, - title={Sequence-To-Sequence Singing Voice Synthesis With Perceptual Entropy Loss}, - author={Shi, Jiatong and Guo, Shuai and Huo, Nan and Zhang, Yuekai and Jin, Qin}, - booktitle=ICASSP, - pages={76--80}, - year={2021}, - organization={IEEE} -} - - -@inproceedings{hayashi2020espnet, - abbr={TTS}, - abbr_publisher={ICASSP}, - title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit}, - author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu}, - booktitle=ICASSP, - pages={7654--7658}, - year={2020}, - organization={IEEE}, - code={https://github.com/espnet/espnet}, - selected={true}, - pdf={https://arxiv.org/pdf/1910.10909.pdf}, - html={https://ieeexplore.ieee.org/abstract/document/9053512/}, - arxiv={1910.10909} -} - -@inproceedings{inaguma-etal-2020-espnet, - abbr={ST}, - abbr_publisher={ACL}, - title = "{ESP}net-{ST}: All-in-One Speech Translation Toolkit", - author = "Inaguma, Hirofumi and - Kiyono, Shun and - Duh, Kevin and - Karita, Shigeki and - Yalta, Nelson and - Hayashi, Tomoki and - Watanabe, Shinji", - booktitle = ACL, - month = jul, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/2020.acl-demos.34", - pages = "302--311", - code={https://github.com/espnet/espnet}, - 
selected={true}, -} -@inproceedings{li2020espnet, - abbr={SE}, - abbr_publisher={SLT}, - title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration}, - author={Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph Boeddeker and Zhuo Chen and Shinji Watanabe}, - booktitle=SLT, - pages={785--792}, - year={2021}, - organization={IEEE}, - code={https://github.com/espnet/espnet}, -} - -@article{huh2020augmentation, - abbr={SR&SSL}, - abbr_publisher={NeurIPS}, - title={Augmentation adversarial training for self-supervised speaker recognition}, - author={Huh, Jaesung and Heo, Hee Soo and Kang, Jingu and Watanabe, Shinji and Chung, Joon Son}, - arxiv={2007.12085}, - year={2020} -} -@article{miyazaki2020conformer, - abbr={SED}, - abbr_publisher={DCASE}, - title={Conformer-based sound event detection with semi-supervised learning and data augmentation}, - author={Miyazaki, Koichi and Komatsu, Tatsuya and Hayashi, Tomoki and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya}, - html={http://dcase.community/documents/workshop2020/proceedings/DCASE2020Workshop_Miyazaki_92.pdf}, - volume={1}, - pages={4}, - year={2020} -} - -@article{arora2020jhu, - abbr={ASR}, - abbr_publisher={CHiME}, - title={The JHU multi-microphone multi-speaker ASR system for the CHiME-6 challenge}, - author={Arora, Ashish and Raj, Desh and Subramanian, Aswin Shanmugam and Li, Ke and Ben-Yair, Bar and Maciejewski, Matthew and {\.Z}elasko, Piotr and Garcia, Paola and Watanabe, Shinji and Khudanpur, Sanjeev}, - arxiv={2006.07898}, - year={2020} -} -@inproceedings{chang2020end, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={End-to-end multi-speaker speech recognition with transformer}, - author={Chang, Xuankai and Zhang, Wangyou and Qian, Yanmin and Le Roux, Jonathan and Watanabe, Shinji}, - html={https://ieeexplore.ieee.org/abstract/document/9054029}, - pages={6134--6138}, - year={2020} -} -@inproceedings{inoue2020semi, - abbr={TTS}, - abbr_publisher={ICASSP}, - title={Semi-supervised speaker adaptation for end-to-end speech synthesis with pretrained models}, - author={Inoue, Katsuki and Hara, Sunao and Abe, Masanobu and Hayashi, Tomoki and Yamamoto, Ryuichi and Watanabe, Shinji}, - html={https://ieeexplore.ieee.org/abstract/document/9053371}, - pages={7634--7638}, - year={2020} -} -@inproceedings{yoshimura2020end, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={End-to-end automatic speech recognition integrated with ctc-based voice activity detection}, - author={Yoshimura, Takenori and Hayashi, Tomoki and Takeda, Kazuya and Watanabe, Shinji}, - html={https://ieeexplore.ieee.org/abstract/document/9054358}, - pages={6999--7003}, - year={2020} -} -@inproceedings{fujita2020attention, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Attention-based asr with lightweight and dynamic convolutions}, - author={Fujita, Yuya and Subramanian, Aswin Shanmugam and Omachi, Motoi and Watanabe, Shinji}, - html={https://ieeexplore.ieee.org/abstract/document/9053887}, - pages={7034--7038}, - year={2020}, - code={https://github.com/espnet/espnet} -} -@inproceedings{li2020practical, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={A practical two-stage training strategy for multi-stream end-to-end speech recognition}, - author={Li, Ruizhi and Sell, Gregory and Wang, Xiaofei and Watanabe, Shinji and Hermansky, Hynek}, - arxiv={1910.10671}, - pages={7014--7018}, - year={2020} -} 
-@inproceedings{huang2020speaker, - abbr={SD}, - abbr_publisher={ICASSP}, - title={Speaker diarization with region proposal network}, - author={Huang, Zili and Watanabe, Shinji and Fujita, Yusuke and Garc{\'\i}a, Paola and Shao, Yiwen and Povey, Daniel and Khudanpur, Sanjeev}, - html={https://ieeexplore.ieee.org/abstract/document/9053760}, - arxiv={2002.06220}, - pages={6514--6518}, - year={2020} -} -@inproceedings{miyazaki2020weakly, - abbr={SED}, - abbr_publisher={ICASSP}, - title={Weakly-supervised sound event detection with self-attention}, - author={Miyazaki, Koichi and Komatsu, Tatsuya and Hayashi, Tomoki and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya}, - html={https://ieeexplore.ieee.org/abstract/document/9053609}, - pages={66--70}, - year={2020}, - code={https://github.com/espnet/espnet} -} -@inproceedings{subramanian2020far, - abbr={SE}, - abbr_publisher={ICASSP}, - title={Far-field location guided target speech extraction using end-to-end speech recognition objectives}, - html={https://ieeexplore.ieee.org/document/9053692}, - author={Subramanian, Aswin Shanmugam and Weng, Chao and Yu, Meng and Zhang, Shi-Xiong and Xu, Yong and Watanabe, Shinji and Yu, Dong}, - pages={7299--7303}, - year={2020} -} -@incollection{shinozaki2020automated, - abbr={ASR}, - abbr_publisher={Deep Neural Evolution}, - title={Automated Development of DNN Based Spoken Language Systems Using Evolutionary Algorithms}, - author={Shinozaki, Takahiro and Watanabe, Shinji and Duh, Kevin}, - html={https://link.springer.com/chapter/10.1007/978-981-15-3685-4_4}, - pages={97--129}, - year={2020} -} -@article{huang2020sequence, - abbr={ASR&TTS}, - abbr_publisher={VCC}, - title={The sequence-to-sequence baseline for the voice conversion challenge 2020: Cascading asr and tts}, - author={Huang, Wen-Chin and Hayashi, Tomoki and Watanabe, Shinji and Toda, Tomoki}, - arxiv={2010.02434}, - year={2020}, - code={https://github.com/espnet/espnet/tree/master/egs/vcc20} -} -@article{shi2020sequence, - abbr={SE&ASR}, - abbr_publisher={NeurIPS}, - title={Sequence to multi-sequence learning via conditional chain mapping for mixture signals}, - author={Shi, Jing and Chang, Xuankai and Guo, Pengcheng and Watanabe, Shinji and Fujita, Yusuke and Xu, Jiaming and Xu, Bo and Xie, Lei}, - arxiv={2006.14150}, - year={2020}, - code={https://demotoshow.github.io/} -} -@inproceedings{chang2020adaptive, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={End-to-End ASR with Adaptive Span Self-Attention.}, - author={Chang, Xuankai and Subramanian, Aswin Shanmugam and Guo, Pengcheng and Watanabe, Shinji and Fujita, Yuya and Omachi, Motoi}, - pdf={http://www.interspeech2020.org/uploadfile/pdf/Thu-1-2-4.pdf}, - pages={3595--3599}, - year={2020} -} -@article{cho2020learning, - abbr={TTS}, - abbr_publisher={Interspeech}, - title={Learning speaker embedding from text-to-speech}, - author={Cho, Jaejin and Zelasko, Piotr and Villalba, Jes{\'u}s and Watanabe, Shinji and Dehak, Najim}, - arxiv={2010.11221}, - year={2020}, - code={https://github.com/JaejinCho/espnet_spkidtts.git} -} -@article{shi2020speaker, - abbr={SE}, - abbr_publisher={Interspeech}, - title={Speaker-conditional chain model for speech separation and extraction}, - author={Shi, Jing and Xu, Jiaming and Fujita, Yusuke and Watanabe, Shinji and Xu, Bo}, - arxiv={2006.14149}, - year={2020} -} -@article{fujita2020insertion, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Insertion-based modeling for end-to-end automatic speech recognition}, - author={Fujita, Yuya and Watanabe, 
Shinji and Omachi, Motoi and Chang, Xuankai}, - arxiv={2005.13211}, - year={2020} -} -@article{horiguchi2020end, - abbr={SD}, - abbr_publisher={Interspeech}, - title={End-to-end speaker diarization for an unknown number of speakers with encoder-decoder based attractors}, - author={Horiguchi, Shota and Fujita, Yusuke and Watanabe, Shinji and Xue, Yawen and Nagamatsu, Kenji}, - arxiv={2005.09921}, - year={2020}, - code={https://github.com/hitachi-speech/EEND}, - selected={true}, - -} -@article{zhang2020end, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={End-to-end far-field speech recognition with unified dereverberation and beamforming}, - author={Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Watanabe, Shinji and Qian, Yanmin}, - arxiv={2005.10479}, - year={2020}, - code={https://github.com/Emrys365/espnet/blob/wsj1_mix_spatialized/egs/wsj1_mix_spatialized/asr1/} -} -@article{higuchi2020mask, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Mask CTC: Non-autoregressive end-to-end ASR with CTC and mask predict}, - author={Higuchi, Yosuke and Watanabe, Shinji and Chen, Nanxin and Ogawa, Tetsuji and Kobayashi, Tetsunori}, - arxiv={2005.08700}, - year={2020}, - code={https://github.com/espnet/espnet} -} - -@inproceedings{tsunoo2019transformer, - abbr={ASR}, - abbr_publisher={ASRU}, - title={Transformer ASR with contextual block processing}, - author={Tsunoo, Emiru and Kashiwagi, Yosuke and Kumakura, Toshiyuki and Watanabe, Shinji}, - booktitle=ASRU, - pages={427--433}, - year={2019}, - organization={IEEE} -} - -@inproceedings{chang2019mimo, - abbr={ASR}, - abbr_publisher={ASRU}, - title={MIMO-Speech: End-to-end multi-channel multi-speaker speech recognition}, - author={Chang, Xuankai and Zhang, Wangyou and Qian, Yanmin and Le Roux, Jonathan and Watanabe, Shinji}, - booktitle=ASRU, - pages={237--244}, - year={2019}, - organization={IEEE} -} - -@inproceedings{inaguma2019multilingual, - abbr={ST}, - abbr_publisher={ASRU}, - title={Multilingual end-to-end speech translation}, - author={Inaguma, Hirofumi and Duh, Kevin and Kawahara, Tatsuya and Watanabe, Shinji}, - booktitle=ASRU, - pages={570--577}, - year={2019}, - organization={IEEE} -} - -@inproceedings{kanda2019simultaneous, - abbr={ASR+SD}, - abbr_publisher={ASRU}, - title={Simultaneous speech recognition and speaker diarization for monaural dialogue recordings with target-speaker acoustic models}, - author={Kanda, Naoyuki and Horiguchi, Shota and Fujita, Yusuke and Xue, Yawen and Nagamatsu, Kenji and Watanabe, Shinji}, - booktitle=ASRU, - pages={31--38}, - year={2019}, - organization={IEEE} -} - -@inproceedings{wang2019espresso, - abbr={ASR}, - abbr_publisher={ASRU}, - title={Espresso: A fast end-to-end neural speech recognition toolkit}, - author={Wang, Yiming and Chen, Tongfei and Xu, Hainan and Ding, Shuoyang and Lv, Hang and Shao, Yiwen and Peng, Nanyun and Xie, Lei and Watanabe, Shinji and Khudanpur, Sanjeev}, - booktitle=ASRU, - pages={136--143}, - year={2019}, - organization={IEEE} -} - -@inproceedings{karita2019comparative, - abbr={ASR}, - abbr_publisher={ASRU}, - title={A comparative study on transformer vs rnn in speech applications}, - author={Karita, Shigeki and Chen, Nanxin and Hayashi, Tomoki and Hori, Takaaki and Inaguma, Hirofumi and Jiang, Ziyan and Someki, Masao and Soplin, Nelson Enrique Yalta and Yamamoto, Ryuichi and Wang, Xiaofei and others}, - booktitle=ASRU, - pages={449--456}, - year={2019}, - organization={IEEE}, - selected={True}, - 
html={https://ieeexplore.ieee.org/abstract/document/9003750}, - arxiv={1909.06317}, -} - -@inproceedings{fujita2019end, - abbr={SD}, - abbr_publisher={ASRU}, - title={End-to-end neural speaker diarization with self-attention}, - author={Fujita, Yusuke and Kanda, Naoyuki and Horiguchi, Shota and Xue, Yawen and Nagamatsu, Kenji and Watanabe, Shinji}, - booktitle=ASRU, - pages={296--303}, - year={2019}, - organization={IEEE}, - selected={true}, - pdf={https://arxiv.org/pdf/1909.06247.pdf}, - html={https://ieeexplore.ieee.org/abstract/document/9003959}, - arxiv={1909.06247} -} - -@article{li2019multi, - abbr={ASR}, - abbr_publisher={ASRU}, - title={Multi-stream end-to-end speech recognition}, - author={Li, Ruizhi and Wang, Xiaofei and Mallidi, Sri Harish and Watanabe, Shinji and Hori, Takaaki and Hermansky, Hynek}, - journal=ASRU, - volume={28}, - pages={646--655}, - year={2019}, - publisher={IEEE} -} - - -@inproceedings{maciejewski2019analysis, - abbr={SS}, - abbr_publisher={WASPAA}, - title={Analysis of robustness of deep single-channel speech separation using corpora constructed from multiple domains}, - author={Maciejewski, Matthew and Sell, Gregory and Fujita, Yusuke and Garcia-Perera, Leibny Paola and Watanabe, Shinji and Khudanpur, Sanjeev}, - booktitle=WASPAA, - pages={165--169}, - year={2019}, - organization={IEEE} -} - -@inproceedings{taniguchi2019generalized, - abbr={ASR}, - abbr_publisher={WASPAA}, - title={Generalized weighted-prediction-error dereverberation with varying source priors for reverberant speech recognition}, - author={Taniguchi, Toru and Subramanian, Aswin Shanmugam and Wang, Xiaofei and Tran, Dung and Fujita, Yuya and Watanabe, Shinji}, - booktitle=WASPAA, - pages={293--297}, - year={2019}, - organization={IEEE} -} - -@inproceedings{subramanian2019speech, - abbr={ASR}, - abbr_publisher={WASPAA}, - title={Speech enhancement using end-to-end speech recognition objectives}, - author={Subramanian, Aswin Shanmugam and Wang, Xiaofei and Baskar, Murali Karthick and Watanabe, Shinji and Taniguchi, Toru and Tran, Dung and Fujita, Yuya}, - booktitle=WASPAA, - pages={234--238}, - year={2019}, - organization={IEEE} -} - -@inproceedings{seki19inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={End-to-End Multilingual Multi-Speaker Speech Recognition}, - author={Hiroshi Seki and Takaaki Hori and Shinji Watanabe and Jonathan Le Roux and John Hershey}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{wiesner19inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Pretraining by Backtranslation for End-to-end ASR in Low-Resource Settings}, - author={Matthew Wiesner and Adithya Renduchintala and Shinji Watanabe and Chunxi Liu and Najim Dehak and Sanjeev Khudanpur}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{hayashi19inter, - abbr={TTS}, - abbr_publisher={Interspeech}, - title={Pre-trained Text Embeddings for Enhanced Text-to-Speech Synthesis}, - author={Tomoki Hayashi and Shinji Watanabe and Tomoki Toda and Kazuya Takeda and Shubham Toshniwal and Karen Livescu}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{fujita19inter, - abbr={SD}, - abbr_publisher={Interspeech}, - title={End-to-End Neural Speaker Diarization with Permutation-Free Objectives}, - author={Yusuke Fujita and Naoyuki Kanda and Shota Horiguchi and Kenji Nagamatsu and Shinji Watanabe}, - booktitle=Interspeech, - year={2019}, - selected={true}, - pdf={https://www.isca-speech.org/archive_v0/Interspeech_2019/pdfs/2899.pdf}, - 
html={https://www.isca-speech.org/archive_v0/Interspeech_2019/abstracts/2899.html}, - -} - -@inproceedings{kerafiat19inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Analysis of Multilingual Sequence-to-Sequence speech recognition systems}, - author={Martin Karafiat and Murali Karthick Baskar and Shinji Watanabe and Takaaki Hori and Matthew Wiesner and Jan Černocký}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{delcroix10inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={End-to-end SpeakerBeam for single channel target speech recognition}, - author={Marc Delcroix and Shinji Watanabe and Tsubasa Ochiai and Keisuke Kinoshita and Shigeki Karita and Atsunori Ogawa and Tomohiro Nakatani}, - booktitle=Interspeech, - year={2019} -} - - -@inproceedings{baskar19inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Semi-supervised Sequence-to-sequence ASR using Unpaired Speech and Text}, - author={Murali Karthick Baskar and Shinji Watanabe and Ramón Astudillo and Takaaki Hori and Lukas Burget and Jan Černocký}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{velazquez19inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Study of the performance of automatic speech recognition systems in speakers with Parkinson's Disease}, - author={Laureano Moro Velazquez and Jaejin Cho and Shinji Watanabe and Mark Hasegawa-Johnson and Odette Scharenborg and Heejin Kim and Najim Dehak}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{seki19inter_vectorized, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Vectorized Beam Search for CTC-Attention-based Speech Recognition}, - author={Hiroshi Seki and Takaaki Hori and Shinji Watanabe and Niko Moritz and Jonathan Le Roux}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{garcia19inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Speaker recognition benchmark using the CHiME-5 corpus}, - author={Daniel Garcia-Romero and David Snyder and Shinji Watanabe and Gregory Sell and Alan McCree and Dan Povey and Sanjeev Khudanpur}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{karita19inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Improving Transformer Based End-to-End Speech Recognition with Connectionist Temporal Classification and Language Model Integration}, - author={Shigeki Karita and Nelson Yalta and Shinji Watanabe and Marc Delcroix and Atsunori Ogawa and Tomohiro Nakatani}, - booktitle=Interspeech, - year={2019}, - selected={true}, - pdf={https://www.isca-speech.org/archive_v0/Interspeech_2019/pdfs/1938.pdf}, - html={https://www.isca-speech.org/archive_v0/Interspeech_2019/abstracts/1938.html}, - -} - -@inproceedings{naoyuki19inter, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Interference Speaker Loss for Target-Speaker Speech Recognition}, - author={Naoyuki Kanda and Shota Horiguchi and Ryoichi Takashima and Yusuke Fujita and Kenji Nagamatsu and Shinji Watanabe}, - booktitle=Interspeech, - year={2019} -} - -@inproceedings{yalta2019cnn, - abbr={ASR}, - abbr_publisher={EUSIPCO}, - title={CNN-based multichannel end-to-end speech recognition for everyday home environments}, - author={Yalta, Nelson and Watanabe, Shinji and Hori, Takaaki and Nakadai, Kazuhiro and Ogata, Tetsuya}, - booktitle={2019 27th European Signal Processing Conference (EUSIPCO)}, - pages={1--5}, - year={2019}, - organization={IEEE} -} - -@inproceedings{arora2019using, - abbr={OCR}, - abbr_publisher={ICDAR}, - title={Using ASR methods for OCR}, - author={Arora, Ashish and 
Chang, Chun Chieh and Rekabdar, Babak and BabaAli, Bagher and Povey, Daniel and Etter, David and Raj, Desh and Hadian, Hossein and Trmal, Jan and Garcia, Paola and others}, - booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, - pages={663--668}, - year={2019}, - organization={IEEE} -} - -@inproceedings{yalta2019weakly, - abbr={Music}, - abbr_publisher={IJCNN}, - title={Weakly-supervised deep recurrent neural networks for basic dance step generation}, - author={Yalta, Nelson and Watanabe, Shinji and Nakadai, Kazuhiro and Ogata, Tetsuya}, - booktitle={2019 International Joint Conference on Neural Networks (IJCNN)}, - pages={1--8}, - year={2019}, - organization={IEEE} -} - -@inproceedings{adams2019massively, - abbr={ASR}, - abbr_publisher={NAACL}, - title={Massively Multilingual Adversarial Speech Recognition}, - author={Adams, Oliver and Wiesner, Matthew and Watanabe, Shinji and Yarowsky, David}, - booktitle=NAACL, - pages={96--108}, - year={2019} -} - -@inproceedings{baskar2019promising, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Promising accurate prefix boosting for sequence-to-sequence ASR}, - author={Baskar, Murali Karthick and Burget, Luk{\'a}{\v{s}} and Watanabe, Shinji and Karafi{\'a}t, Martin and Hori, Takaaki and {\v{C}}ernock{\`y}, Jan Honza}, - booktitle=ICASSP, - pages={5646--5650}, - year={2019}, - organization={IEEE} -} - -@inproceedings{inaguma2019transfer, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Transfer learning of language-independent end-to-end asr with language model fusion}, - author={Inaguma, Hirofumi and Cho, Jaejin and Baskar, Murali Karthick and Kawahara, Tatsuya and Watanabe, Shinji}, - booktitle=ICASSP, - pages={6096--6100}, - year={2019}, - organization={IEEE} -} - -@inproceedings{xu2019improving, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Improving end-to-end speech recognition with pronunciation-assisted sub-word modeling}, - author={Xu, Hainan and Ding, Shuoyang and Watanabe, Shinji}, - booktitle=ICASSP, - pages={7110--7114}, - year={2019}, - organization={IEEE} -} - -@inproceedings{cho2019language, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Language model integration based on memory control for sequence to sequence speech recognition}, - author={Cho, Jaejin and Watanabe, Shinji and Hori, Takaaki and Baskar, Murali Karthick and Inaguma, Hirofumi and Villalba, Jesus and Dehak, Najim}, - booktitle=ICASSP, - pages={6191--6195}, - year={2019}, - organization={IEEE} -} - -@inproceedings{wang2019stream, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Stream attention-based multi-array end-to-end speech recognition}, - author={Wang, Xiaofei and Li, Ruizhi and Mallidi, Sri Harish and Hori, Takaaki and Watanabe, Shinji and Hermansky, Hynek}, - booktitle=ICASSP, - pages={7105--7109}, - year={2019}, - organization={IEEE} -} - -@inproceedings{manohar2019acoustic, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Acoustic modeling for overlapping speech recognition: JHU CHiME-5 challenge system}, - author={Manohar, Vimal and Chen, Szu-Jui and Wang, Zhiqi and Fujita, Yusuke and Watanabe, Shinji and Khudanpur, Sanjeev}, - booktitle=ICASSP, - pages={6665--6669}, - year={2019}, - organization={IEEE} -} - -@inproceedings{hori2019cycle, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Cycle-consistency training for end-to-end speech recognition}, - author={Hori, Takaaki and Astudillo, Ramon and Hayashi, Tomoki and Zhang, Yu and Watanabe, Shinji and Le Roux, Jonathan}, - booktitle=ICASSP, - pages={6271--6275}, 
- year={2019}, - organization={IEEE} -} - -@inproceedings{kothinti2019joint, - abbr={AED}, - abbr_publisher={ICASSP}, - title={Joint acoustic and class inference for weakly supervised sound event detection}, - author={Kothinti, Sandeep and Imoto, Keisuke and Chakrabarty, Debmalya and Sell, Gregory and Watanabe, Shinji and Elhilali, Mounya}, - booktitle=ICASSP, - pages={36--40}, - year={2019}, - organization={IEEE} -} - -@inproceedings{le2019phasebook, - abbr={SE}, - abbr_publisher={ICASSP}, - title={The phasebook: Building complex masks via discrete representations for source separation}, - author={Le Roux, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John R}, - booktitle=ICASSP, - pages={66--70}, - year={2019}, - organization={IEEE} -} - -@inproceedings{chang2019end, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={End-to-end monaural multi-speaker ASR system without pretraining}, - author={Chang, Xuankai and Qian, Yanmin and Yu, Kai and Watanabe, Shinji}, - booktitle=ICASSP, - pages={6256--6260}, - year={2019}, - organization={IEEE} -} - -@inproceedings{karita2019semi, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Semi-supervised end-to-end speech recognition using text-to-speech and autoencoders}, - author={Karita, Shigeki and Watanabe, Shinji and Iwata, Tomoharu and Delcroix, Marc and Ogawa, Atsunori and Nakatani, Tomohiro}, - booktitle=ICASSP, - pages={6166--6170}, - year={2019}, - organization={IEEE} -} - -@inproceedings{kanda2019acoustic, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Acoustic modeling for distant multi-talker speech recognition with single-and multi-channel branches}, - author={Kanda, Naoyuki and Fujita, Yusuke and Horiguchi, Shota and Ikeshita, Rintaro and Nagamatsu, Kenji and Watanabe, Shinji}, - booktitle=ICASSP, - pages={6630--6634}, - year={2019}, - organization={IEEE} -} - -@article{lin2018model, - abbr={ML}, - abbr_publisher={Physica}, - title={Model parameter learning using Kullback--Leibler divergence}, - author={Lin, Chungwei and Marks, Tim K and Pajovic, Milutin and Watanabe, Shinji and Tung, Chih-kuan}, - journal={Physica A: Statistical Mechanics and its Applications}, - volume={491}, - pages={549--559}, - year={2018}, - publisher={Elsevier} -} - -@inproceedings{hori2018end, - abbr={ASR}, - abbr_publisher={SLT}, - title={End-to-end speech recognition with word-based RNN language models}, - author={Hori, Takaaki and Cho, Jaejin and Watanabe, Shinji}, - booktitle=SLT, - pages={389--396}, - year={2018}, - organization={IEEE} -} - -@inproceedings{liu2018low, - abbr={ASR}, - abbr_publisher={SLT}, - title={Low-resource contextual topic identification on speech}, - author={Liu, Chunxi and Wiesner, Matthew and Watanabe, Shinji and Harman, Craig and Trmal, Jan and Dehak, Najim and Khudanpur, Sanjeev}, - booktitle=SLT, - pages={656--663}, - year={2018}, - organization={IEEE} -} - -@inproceedings{hayashi2018back, - abbr={ASR}, - abbr_publisher={SLT}, - title={Back-translation-style data augmentation for end-to-end ASR}, - author={Hayashi, Tomoki and Watanabe, Shinji and Zhang, Yu and Toda, Tomoki and Hori, Takaaki and Astudillo, Ramon and Takeda, Kazuya}, - booktitle=SLT, - pages={426--433}, - year={2018}, - organization={IEEE} -} - -@inproceedings{cho2018multilingual, - abbr={ASR}, - abbr_publisher={SLT}, - title={Multilingual sequence-to-sequence speech recognition: architecture, transfer learning, and language modeling}, - author={Cho, Jaejin and Baskar, Murali Karthick and Li, Ruizhi and Wiesner, Matthew and Mallidi, 
Sri Harish and Yalta, Nelson and Karafiat, Martin and Watanabe, Shinji and Hori, Takaaki}, - booktitle=SLT, - pages={521--527}, - year={2018}, - organization={IEEE} -} - -@article{watanabe2018espnet, - abbr={ASR}, - abbr_publisher={Interspeech}, - author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, - abstract={This paper introduces a new open source platform for end-to-end speech processing named ESPnet. ESPnet mainly focuses on end-to-end automatic speech recognition (ASR), and adopts widely-used dynamic neural network toolkits, Chainer and PyTorch, as a main deep learning engine. ESPnet also follows the Kaldi ASR toolkit style for data processing, feature extraction/format, and recipes to provide a complete setup for speech recognition and other speech processing experiments. This paper explains a major architecture of this software platform, several important functionalities, which differentiate ESPnet from other open source ASR toolkits, and experimental results with major ASR benchmarks.}, - title={{ESPnet}: End-to-End Speech Processing Toolkit}, - year={2018}, - journal=interspeech, - pages={2207--2211}, - doi={10.21437/Interspeech.2018-1456}, - html={https://www.isca-speech.org/archive/interspeech_2018/watanabe18_interspeech.html}, - pdf={https://www.isca-speech.org/archive/pdfs/interspeech_2018/watanabe18_interspeech.pdf}, - code={https://github.com/espnet/espnet}, - arxiv={1804.00015}, - selected={true} -} - -@article{chen2018building, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Building State-of-the-art Distant Speech Recognition Using the CHiME-4 Challenge with a Setup of Speech Enhancement Baseline}, - author={Chen, Szu-Jui and Subramanian, Aswin Shanmugam and Xu, Hainan and Watanabe, Shinji}, - journal=interspeech, - pages={1571--1575}, - year={2018} -} - -@article{hayashi2018multi, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Multi-Head Decoder for End-to-End Speech Recognition}, - author={Hayashi, Tomoki and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya}, - journal=interspeech, - pages={801--805}, - year={2018} -} - -@article{karita2018semi, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Semi-Supervised End-to-End Speech Recognition}, - author={Karita, Shigeki and Watanabe, Shinji and Iwata, Tomoharu and Ogawa, Atsunori and Delcroix, Marc}, - journal=interspeech, - pages={2--6}, - year={2018} -} - - -@article{barker2018fifth, - abbr={SE&ASR}, - abbr_publisher={Interspeech}, - title={The Fifth 'CHiME' Speech Separation and Recognition Challenge: Dataset, Task and Baselines}, - author={Barker, Jon and Watanabe, Shinji and Vincent, Emmanuel and Trmal, Jan}, - journal=interspeech, - pages={1561--1565}, - year={2018}, - selected={true}, - pdf={https://www.isca-speech.org/archive/pdfs/interspeech_2018/barker18_interspeech.pdf}, - arxiv={1803.10609}, - html={https://www.isca-speech.org/archive/interspeech_2018/barker18_interspeech.html}, -} - -@article{subramanian2018student, - abbr={SE}, - abbr_publisher={Interspeech}, - title={Student-Teacher Learning for BLSTM Mask-based Speech Enhancement}, - author={Subramanian, Aswin Shanmugam and Chen, Szu-Jui and Watanabe, Shinji}, - journal=interspeech, - pages={3249--3253}, - year={2018} -} - -@article{renduchintala2018multi, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Multi-Modal Data Augmentation for 
End-to-end ASR}, - author={Renduchintala, Adithya and Ding, Shuoyang and Wiesner, Matthew and Watanabe, Shinji}, - journal=interspeech, - pages={2394--2398}, - year={2018} -} - -@inproceedings{frederiksen2018effectiveness, - abbr={LID}, - abbr_publisher={Interspeech}, - title={Effectiveness of single-channel blstm enhancement for language identification}, - author={Frederiksen, Peter Sibbern and Villalba, Jes{\'u}s and Watanabe, Shinji and Tan, Zheng-Hua and Dehak, Najim}, - booktitle={Interspeech 2018}, - pages={1823--1827}, - year={2018}, - organization={ISCA} -} - -@inproceedings{sell2018diarization, - abbr={SD}, - abbr_publisher={Interspeech}, - title={Diarization is Hard: Some Experiences and Lessons Learned for the JHU Team in the Inaugural DIHARD Challenge.}, - author={Sell, Gregory and Snyder, David and McCree, Alan and Garcia-Romero, Daniel and Villalba, Jes{\'u}s and Maciejewski, Matthew and Manohar, Vimal and Dehak, Najim and Povey, Daniel and Watanabe, Shinji and others}, - abstract={We describe in this paper the experiences of the Johns Hopkins University team during the inaugural DIHARD diarization evaluation. This new task provided microphone recordings in a variety of difficult conditions and challenged researchers to fully consider all speaker activity, without the currently typical practices of unscored collars or ignored overlapping speaker segments. This paper explores several key aspects of currently state-of-the-art diarization methods, such as training data selection, signal bandwidth for feature extraction, representations of speech segments (i-vector versus x-vector) and domain-adaptive processing. In the end, our best system clustered x-vector embeddings trained on wideband microphone data followed by Variational-Bayesian refinement and a speech activity detector specifically trained for this task with in-domain data was found to be the best performing. 
After presenting these decisions and their final result, we discuss lessons learned and remaining challenges within the lens of this new approach to diarization performance measurement.}, - booktitle={Interspeech}, - pages={2808--2812}, - year={2018}, - selected={true}, - html={https://www.isca-speech.org/archive/interspeech_2018/sell18_interspeech.html}, - pdf={https://www.isca-speech.org/archive/pdfs/interspeech_2018/sell18_interspeech.pdf}, -} - -@article{delcroix2018auxiliary, - abbr={ASR}, - abbr_publisher={Interspeech}, - title={Auxiliary Feature Based Adaptation of End-to-end ASR Systems}, - author={Delcroix, Marc and Watanabe, Shinji and Ogawa, Atsunori and Karita, Shigeki and Nakatani, Tomohiro}, - journal=interspeech, - pages={2444--2448}, - year={2018} -} - -@inproceedings{seki2018purely, - abbr={ASR}, - abbr_publisher={ACL}, - title={A Purely End-to-End System for Multi-speaker Speech Recognition}, - author={Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan and Hershey, John R}, - booktitle=ACL, - pages={2620--2630}, - year={2018} -} - -@inproceedings{ochiai2018speaker, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={Speaker adaptation for multichannel end-to-end speech recognition}, - author={Ochiai, Tsubasa and Watanabe, Shinji and Katagiri, Shigeru and Hori, Takaaki and Hershey, John}, - booktitle=ICASSP, - pages={6707--6711}, - year={2018}, - organization={IEEE} -} - -@inproceedings{seki2018end, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={An end-to-end language-tracking speech recognizer for mixed-language speech}, - author={Seki, Hiroshi and Watanabe, Shinji and Hori, Takaaki and Le Roux, Jonathan and Hershey, John R}, - booktitle=ICASSP, - pages={4919--4923}, - year={2018}, - organization={IEEE} -} - -@inproceedings{settle2018end, - abbr={ASR}, - abbr_publisher={ICASSP}, - title={End-to-end multi-speaker speech recognition}, - author={Settle, Shane and Le Roux, Jonathan and Hori, Takaaki and Watanabe, Shinji and Hershey, John R}, - booktitle=ICASSP, - pages={4819--4823}, - year={2018}, - organization={IEEE} -} - - -@inproceedings{lu2022towards, - abbr={SE}, - abbr_publisher={ICASSP}, - title={Towards Low-distortion Multi-channel Speech Enhancement: The ESPNet-SE Submission to The L3DAS22 Challenge}, - author={Lu, Yen-Ju and Cornell, Samuele and Chang, Xuankai and Zhang, Wangyou and Li, Chenda and Ni, Zhaoheng and Wang, Zhong-Qiu and Watanabe, Shinji}, - booktitle=ICASSP, - year={2022}, - organization={IEEE} -} diff --git a/_config.yml b/_config.yml deleted file mode 100644 index 7cb8ac6c..00000000 --- a/_config.yml +++ /dev/null @@ -1,212 +0,0 @@ -# ----------------------------------------------------------------------------- -# Site settings -# ----------------------------------------------------------------------------- - -title: WAVLab # the website title (if blank, full name will be used instead) -first_name: -middle_name: -last_name: WAV Lab -email: -description: > # the ">" symbol means to ignore newlines until "footer_text:" - Webpage of Watanabe's Audio and Voice (WAV) Lab -footer_text: > - Powered by Jekyll with al-folio theme. - Hosted by GitHub Pages. - -icon: 🗣️ # the emoji used as the favicon -url: # the base hostname & protocol for your site -baseurl: # the subpath of your site, e.g. 
/blog/ -last_updated: false # set to true if you want to display last updated in the footer -impressum_path: # set to path to include impressum link in the footer, use the same path as permalink in a page, helps to conform with EU GDPR - -# ----------------------------------------------------------------------------- -# Layout -# ----------------------------------------------------------------------------- - -navbar_fixed: true -footer_fixed: true - -# Dimensions -max_width: 1000px - -# TODO: add layout settings (single page vs. multi-page) - -# ----------------------------------------------------------------------------- -# Open Graph -# ----------------------------------------------------------------------------- -# Display links to the page with a preview object on social media. -serve_og_meta: false # Include Open Graph meta tags in the HTML head -og_image: # The site-wide (default for all links) Open Graph preview image - -# ----------------------------------------------------------------------------- -# Social integration -# ----------------------------------------------------------------------------- - -github_username: shinjiwlab # your GitHub user name -gitlab_username: # your GitLab user name -twitter_username: WavLab # your Twitter handle -linkedin_username: wavlab-cmu # your LinkedIn user name -scholar_userid: # your Google Scholar ID -orcid_id: # your ORCID ID -medium_username: # your Medium username -quora_username: # your Quora username -publons_id: # your ID on Publons -research_gate_profile: # your profile on ResearchGate -blogger_url: # your blogger URL -work_url: # work page URL -keybase_username: # your keybase user name -wikidata_id: # your wikidata id - -contact_note: > - - -google_analytics: UA-XXXXXXXXX # out your google-analytics code -panelbear_analytics: XXXXXXXXX # panelbear analytics site ID - - -# ----------------------------------------------------------------------------- -# Blog -# ----------------------------------------------------------------------------- - -blog_name: Research Activities # your blog must have a name for it to show up in the nav bar -blog_description: -permalink: /activities/:year/:title/ - -# Pagination -pagination: - enabled: true - -# Comments -disqus_shortname: https-shinjiwlab-github-io # put your disqus shortname - - -# ----------------------------------------------------------------------------- -# Collections -# ----------------------------------------------------------------------------- - -collections: - news: - defaults: - layout: post - output: true - permalink: /news/:path/ - projects: - output: true - permalink: /projects/:path/ - -news_limit: 5 - -# ----------------------------------------------------------------------------- -# Jekyll settings -# ----------------------------------------------------------------------------- - -# Markdown and syntax highlight -markdown: kramdown -highlighter: rouge -highlight_theme: github # https://github.com/jwarby/jekyll-pygments-themes -kramdown: - input: GFM - syntax_highlighter_opts: - css_class: 'highlight' - span: - line_numbers: false - block: - line_numbers: false - start_line: 1 - -# Includes & excludes -include: ['_pages'] -exclude: - - bin - - Gemfile - - Gemfile.lock - - vendor -keep_files: - - CNAME - - .nojekyll - - .git - -# Plug-ins -plugins: - - jekyll-email-protect - - jekyll-github-metadata - - jekyll-paginate-v2 - - jekyll/scholar - - jekyll-twitter-plugin - - jemoji - -# Extras -github: [metadata] - -# 
----------------------------------------------------------------------------- -# Jekyll Scholar -# ----------------------------------------------------------------------------- - -scholar: - - last_name: W - first_name: [S] - - style: apa - locale: en - - source: /_bibliography/ - bibliography: papers.bib - bibliography_template: bib - - replace_strings: true - join_strings: true - - details_dir: bibliography - details_layout: bibtex.html - details_link: Details - - query: "@*" - -# ----------------------------------------------------------------------------- -# Optional Features -# ----------------------------------------------------------------------------- - -enable_google_analytics: false # enables google analytics -enable_panelbear_analytics: false # enables panelbear analytics -enable_mansory: true # enables automatic project cards arangement -enable_math: true # enables math typesetting (uses MathJax) -enable_tooltips: false # enables automatic tooltip links generated - # for each section titles on pages and posts -enable_darkmode: true # enables switching between light/dark modes -enable_navbar_social: false # enables displaying social links in the - # navbar on the about page -enable_project_categories: true # enables categorization of projects into - # multiple categories - -# ----------------------------------------------------------------------------- -# Library versions -# ----------------------------------------------------------------------------- - -academicons: - version: "1.9.0" - integrity: "sha512-W4yqoT1+8NLkinBLBZko+dFB2ZbHsYLDdr50VElllRcNt2Q4/GSs6u71UHKxB7S6JEMCp5Ve4xjh3eGQl/HRvg==" -bootstrap: - version: "4.5.2" - integrity: - css: "sha512-MoRNloxbStBcD8z3M/2BmnT+rg4IsMxPkXaGh2zD6LGNNFE80W3onsAhRcMAMrSoyWL9xD7Ert0men7vR8LUZg==" - js: "sha512-M5KW3ztuIICmVIhjSqXe01oV2bpe248gOxqmlcYrEzAvws7Pw3z6BK0iGbrwvdrUQUhi3eXgtxp5I8PDo9YfjQ==" -fontawesome: - version: "5.14.0" - integrity: "sha512-1PKOgIY59xJ8Co8+NE6FZ+LOAZKjy+KY8iq0G4B3CyeY6wYHN3yt9PW0XpSriVlkMXe40PTKnXrLnZ9+fkDaog==" -jquery: - version: "3.5.1" - integrity: "sha512-bLT0Qm9VnAYZDflyKcBaQ2gg0hSYNQrJ8RilYldYQ1FxQYoCLtUjuuRuZo+fjqhx/qtq/1itJ0C2ejDxltZVFg==" -mathjax: - version: "3.1.2" -mansory: - version: "4.2.2" - integrity: "sha256-Nn1q/fx0H7SNLZMQ5Hw5JLaTRZp0yILA/FRexe19VdI=" -mdb: - version: "4.19.1" - integrity: - css: "sha512-RO38pBRxYH3SoOprtPTD86JFOclM51/XTIdEPh5j8sj4tp8jmQIx26twG52UaLi//hQldfrh7e51WzP9wuP32Q==" - js: "sha512-Mug9KHKmroQFMLm93zGrjhibM2z2Obg9l6qFG2qKjXEXkMp/VDkI4uju9m4QKPjWSwQ6O2qzZEnJDEeCw0Blcw==" -popper: - version: "2.4.4" - integrity: "sha512-eUQ9hGdLjBjY3F41CScH3UX+4JDSI9zXeroz7hJ+RteoCaY+GP/LDoM8AO+Pt+DRFw3nXqsjh9Zsts8hnYv8/A==" diff --git a/_data/coauthors.yml b/_data/coauthors.yml deleted file mode 100644 index 1542bed7..00000000 --- a/_data/coauthors.yml +++ /dev/null @@ -1,39 +0,0 @@ -"Adams": - - firstname: ["Edwin", "E.", "E. P.", "Edwin Plimpton"] - url: https://en.wikipedia.org/wiki/Edwin_Plimpton_Adams - -"Podolsky": - - firstname: ["Boris", "B.", "B. 
Y.", "Boris Yakovlevich"] - url: https://en.wikipedia.org/wiki/Boris_Podolsky - -"Rosen": - - firstname: ["Nathan", "N."] - url: https://en.wikipedia.org/wiki/Nathan_Rosen - -"Shi": - - firstname: ["Jiatong", "J.", "J"] - url: http://shijt.site - -"Watanabe": - - firstname: ["Shinji"] - url: https://sites.google.com/view/shinjiwatanabe - -"Chang": - - firstname: ["Xuankai"] - url: http://simpleoier.github.io - -"Yan": - - firstname: ["Brian"] - url: http://www.cs.cmu.edu/~byan - -"Dalmia": - - firstname: ["Siddharth"] - url: http://www.cs.cmu.edu/~sdalmia - -"Peng": - - firstname: ["Yifan"] - url: https://pyf98.github.io/ - -"Jung": - - firstname: ["Jee-weon", "Jee-Weon", "J."] - url: https://jungjee.github.io diff --git a/_includes/figure.html b/_includes/figure.html deleted file mode 100644 index eb3dc28f..00000000 --- a/_includes/figure.html +++ /dev/null @@ -1,35 +0,0 @@ -{%- assign img_path = include.path | remove: ".jpg" | remove: ".jpeg" | remove: ".png" | remove: ".tiff" -%} - -
- - - {% if site.imagemagick.enabled %} - {% for i in site.imagemagick.widths -%} - - {% endfor -%} - {% endif %} - - - - - - {%- if include.caption -%}
{{ include.caption }}
{%- endif %} - -
diff --git a/_includes/footer.html b/_includes/footer.html deleted file mode 100644 index 2e345218..00000000 --- a/_includes/footer.html +++ /dev/null @@ -1,27 +0,0 @@ -{% if site.footer_fixed %} - -{% else %} - -{% endif %} diff --git a/_includes/head.html b/_includes/head.html deleted file mode 100644 index 055c0dde..00000000 --- a/_includes/head.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - -{% if site.title == "blank" %}{{ site.first_name }} {{ site.middle_name }} {{ site.last_name }}{% else %}{{ site.title }}{% endif %}{% if page.title and page.url != "/" %} | {{ page.title }}{% endif %} - - - -{% if site.serve_og_meta %} - - - - - - -{% endif %} - - - - - - - - - - - - - - -{% if site.icon != empty %} - -{% endif %} - - - - - -{% include scripts/jquery.html %} - - -{% if site.enable_darkmode %} - - - -{% endif %} - -{% if site.enable_google_analytics %} - - - -{% endif %} - -{% if site.enable_panelbear_analytics %} - - - -{% endif %} diff --git a/_includes/header.html b/_includes/header.html deleted file mode 100644 index 7a7fb353..00000000 --- a/_includes/header.html +++ /dev/null @@ -1,73 +0,0 @@ -
- - - - -
diff --git a/_includes/news.html b/_includes/news.html deleted file mode 100644 index e1fc80bc..00000000 --- a/_includes/news.html +++ /dev/null @@ -1,24 +0,0 @@ -
-

news

- {% if site.news %} -
- - {% assign news = site.news | reverse %} - {% for item in news limit: site.news_limit %} - - - - - {% endfor %} -
{{ item.date | date: "%b %-d, %Y" }} - {% if item.inline %} - {{ item.content | remove: '

' | remove: '

' | emojify }} - {% else %} - {{ item.title }} - {% endif %} -
-
- {% else %} -

No news so far...

- {% endif %} -
diff --git a/_includes/pagination.html b/_includes/pagination.html deleted file mode 100644 index 67610175..00000000 --- a/_includes/pagination.html +++ /dev/null @@ -1,17 +0,0 @@ -{% if paginator.total_pages > 1 %} - -{% endif %} diff --git a/_includes/projects.html b/_includes/projects.html deleted file mode 100644 index 55a14610..00000000 --- a/_includes/projects.html +++ /dev/null @@ -1,32 +0,0 @@ -
- {% if project.redirect %} - - {% else %} - - {% endif %} -
- {% if project.img %} - project thumbnail - {% endif %} -
-

{{ project.title }}

-

{{ project.description }}

-
- {% if project.github %} -
-
- -
- {% if project.github_stars %} - - - - - {% endif %} -
- {% endif %} -
-
-
- -
diff --git a/_includes/projects_horizontal.html b/_includes/projects_horizontal.html deleted file mode 100644 index edecb9bf..00000000 --- a/_includes/projects_horizontal.html +++ /dev/null @@ -1,40 +0,0 @@ -
- {% if project.redirect %} - - {% else %} - - {% endif %} -
- - -
diff --git a/_includes/scripts/analytics.html b/_includes/scripts/analytics.html deleted file mode 100644 index 4a345d3e..00000000 --- a/_includes/scripts/analytics.html +++ /dev/null @@ -1,18 +0,0 @@ -{%- if site.enable_google_analytics -%} - - - -{%- endif -%} -{%- if site.enable_cronitor_analytics -%} - - - -{%- endif -%} \ No newline at end of file diff --git a/_includes/scripts/bootstrap.html b/_includes/scripts/bootstrap.html deleted file mode 100644 index 2c5d4ee0..00000000 --- a/_includes/scripts/bootstrap.html +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/_includes/scripts/jquery.html b/_includes/scripts/jquery.html deleted file mode 100644 index 8de7788d..00000000 --- a/_includes/scripts/jquery.html +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/_includes/scripts/mansory.html b/_includes/scripts/mansory.html deleted file mode 100644 index 5a93a9ee..00000000 --- a/_includes/scripts/mansory.html +++ /dev/null @@ -1,6 +0,0 @@ -{% if site.enable_mansory %} - - - - -{% endif %} diff --git a/_includes/scripts/mathjax.html b/_includes/scripts/mathjax.html deleted file mode 100644 index 53db9e01..00000000 --- a/_includes/scripts/mathjax.html +++ /dev/null @@ -1,12 +0,0 @@ -{% if site.enable_math %} - - - - -{% endif %} diff --git a/_includes/scripts/misc.html b/_includes/scripts/misc.html deleted file mode 100644 index 08dece9a..00000000 --- a/_includes/scripts/misc.html +++ /dev/null @@ -1,10 +0,0 @@ -{% if site.enable_tooltips %} - - -{% endif %} - - - - diff --git a/_includes/scripts/progressBar.html b/_includes/scripts/progressBar.html deleted file mode 100644 index 1a3aa450..00000000 --- a/_includes/scripts/progressBar.html +++ /dev/null @@ -1,80 +0,0 @@ -{% if site.enable_progressbar %} - - - - -{%- endif %} \ No newline at end of file diff --git a/_includes/selected_papers.html b/_includes/selected_papers.html deleted file mode 100644 index 0093c87a..00000000 --- a/_includes/selected_papers.html +++ /dev/null @@ -1,4 +0,0 @@ -
-

selected publications

- {% bibliography -f papers -q @*[selected=true]* %} -
diff --git a/_includes/social.html b/_includes/social.html deleted file mode 100644 index 0843a5a1..00000000 --- a/_includes/social.html +++ /dev/null @@ -1,16 +0,0 @@ -{% if site.email %}{% endif %} -{% if site.orcid_id %}{% endif %} -{% if site.scholar_userid %}{% endif %} -{% if site.publons_id %}{% endif %} -{% if site.research_gate_profile %}{% endif %} -{% if site.github_username %}{% endif %} -{% if site.linkedin_username %}{% endif %} -{% if site.twitter_username %}{% endif %} -{% if site.medium_username %}{% endif %} -{% if site.quora_username %}{% endif %} -{% if site.blogger_url %}{% endif %} -{% if site.work_url %}{% endif %} -{% if site.wikidata_id %}{% endif %} -{% if site.strava_userid %}{% endif %} -{% if site.keybase_username %}{% endif %} -{% if site.gitlab_username %}{% endif %} diff --git a/_layouts/about.html b/_layouts/about.html deleted file mode 100644 index 0b7cf5fe..00000000 --- a/_layouts/about.html +++ /dev/null @@ -1,51 +0,0 @@ ---- -layout: default ---- - -
- -
-

- {% if site.title == "blank" %}{{ site.first_name }} {{ site.middle_name }} {{ site.last_name }}{% else %}{{ site.title }}{% endif %} -

-

{{ page.description }}

-
- -
- {% if page.profile %} -
- {% if page.profile.image %} - - {% endif %} - {% if page.profile.address %} -
- {{ page.profile.address }} -
- {% endif %} -
- {% endif %} - -
- {{ content }} -
- - - - {% if page.selected_papers %} - {% include selected_papers.html %} - {% endif %} - - {% if page.social %} - - {% endif %} -
- - -
diff --git a/_layouts/bib.html b/_layouts/bib.html deleted file mode 100644 index f5c92f02..00000000 --- a/_layouts/bib.html +++ /dev/null @@ -1,145 +0,0 @@ ---- ---- - -
-
- {% if entry.abbr %} - {% if site.data.venues[entry.abbr] %} - {{entry.abbr}} - {% else %} - {{entry.abbr}} - {% endif %} - {% endif %} - {% if entry.abbr_publisher %} - {% if site.data.venues[entry.abbr_publisher] %} - {{entry.abbr_publisher}} - {% else %} - {{entry.abbr_publisher}} - {% endif %} - {% endif %} -
- -
- {% if entry.type == "thesis" %} - {{reference}} - {% else %} -
{{entry.title}}
-
- {% for author in entry.author_array %} - {% assign author_is_self = false %} - {% if author.last == site.scholar.last_name%} - {% if site.scholar.first_name contains author.first%} - {% assign author_is_self = true %} - {% endif %} - {% endif %} - {% assign coauthor_url = nil %} - {% if site.data.coauthors[author.last] %} - {% for coauthor in site.data.coauthors[author.last] %} - {% if coauthor.firstname contains author.first %} - {% assign coauthor_url = coauthor.url %} - {% break %} - {% endif %} - {% endfor %} - {% endif %} - - {% if forloop.length == 1 %} - {% if author_is_self %} - {{author.first}} {{author.last}} - {% else %} - {{author.first}} {{author.last}} - {% endif %} - {% else %} - {% unless forloop.last %} - {% if author_is_self %} - {{author.first}} {{author.last}}, - {% else %} - {% if coauthor_url %} - {{author.first}} {{author.last}}, - {% else %} - {{author.first}} {{author.last}}, - {% endif %} - {% endif %} - {% else %} - {% if author_is_self %} - and {{author.first}} {{author.last}} - {% else %} - {% if coauthor_url %} - and {{author.first}} {{author.last}} - {% else %} - and {{author.first}} {{author.last}} - {% endif %} - {% endif %} - {% endunless %} - {% endif %} - {% endfor %} -
- -
- {% if entry.type == "article" %} - {{entry.journal}} - {% elsif entry.type == "inproceedings" %} - In {{entry.booktitle}} - {% endif %} - {% if entry.year %} - {{entry.year}} - {% endif %} -
- {% endif %} - - - - - {% if entry.abstract %} - - {% endif %} -
-
diff --git a/_layouts/default.html b/_layouts/default.html deleted file mode 100644 index b0aa49b6..00000000 --- a/_layouts/default.html +++ /dev/null @@ -1,31 +0,0 @@ - - - - - {% include head.html %} - {% include scripts/mathjax.html %} - - - - - - - {% include header.html %} - - - -
- {{ content }} -
- - - - {% include footer.html %} - - - - {% include scripts/bootstrap.html %} - {% include scripts/mansory.html %} - {% include scripts/misc.html %} - - diff --git a/_layouts/distill.html b/_layouts/distill.html deleted file mode 100644 index bc323ce8..00000000 --- a/_layouts/distill.html +++ /dev/null @@ -1,117 +0,0 @@ - - - - - {%- include head.html %} - - {% include scripts/jquery.html %} - {% include scripts/mathjax.html %} - - - - - {% if page._styles %} - - - {%- endif %} - - - - - - - - - - {%- include header.html %} - - -
- - -

{{ page.title }}

-

{{ page.description }}

-
- - - - - {% if page.toc -%} - - - - {%- endif %} - - {{ content }} - - - - - - - - - - {%- if site.disqus_shortname and page.disqus_comments -%} - {% include disqus.html %} - {%- endif %} - {%- if site.giscus.repo and page.giscus_comments -%} - {% include giscus.html %} - {%- endif -%} - -
- - - {%- include footer.html %} - - {% include scripts/bootstrap.html %} - {% include scripts/analytics.html %} - {% include scripts/progressBar.html %} - - diff --git a/_layouts/none.html b/_layouts/none.html deleted file mode 100644 index b92f6522..00000000 --- a/_layouts/none.html +++ /dev/null @@ -1 +0,0 @@ -{{content}} diff --git a/_layouts/page.html b/_layouts/page.html deleted file mode 100644 index 10b9ab4f..00000000 --- a/_layouts/page.html +++ /dev/null @@ -1,15 +0,0 @@ ---- -layout: default ---- -
- -
-

{{ page.title }}

-

{{ page.description }}

-
- -
- {{ content }} -
- -
diff --git a/_layouts/post.html b/_layouts/post.html deleted file mode 100644 index 70d595ed..00000000 --- a/_layouts/post.html +++ /dev/null @@ -1,37 +0,0 @@ ---- -layout: default ---- - -{% if page._styles %} - -{% endif %} - -
- -
-

{{ page.title }}

- -
- -
- {{ content }} -
- - {% if site.disqus_shortname and page.comments %} -
- - - {% endif %} - -
diff --git a/_news/announcement_1.md deleted file mode 100644 index 5e5b87e6..00000000 --- a/_news/announcement_1.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: post -date: 2021-05-20 13:59:00-0400 -inline: true ---- - -Our lab has 14 papers accepted at the upcoming ICASSP 2021. Please check here for details. diff --git a/_news/announcement_2.md deleted file mode 100644 index 334ccc67..00000000 --- a/_news/announcement_2.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -layout: post -date: 2021-06-07 13:30:00-0400 -inline: true ---- - - -Shinji, with Keisuke, Yusuke, and Naoyuki, delivered a tutorial on "Distant Conversational Speech Recognition And Analysis: Recent Advances, And Trends Towards End-To-End Optimization" at ICASSP 2021. Detailed slides can be found here. diff --git a/_news/announcement_3.md deleted file mode 100644 index 3413ebd5..00000000 --- a/_news/announcement_3.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: post -date: 2021-06-03 0:00:00-0400 -inline: true ---- - -Our lab has 20 papers accepted at Interspeech 2021. A detailed list will be available soon on the publications page. diff --git a/_news/announcement_4.md deleted file mode 100644 index 44a79758..00000000 --- a/_news/announcement_4.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: post -date: 2021-09-13 0:00:00-0400 -inline: true ---- - -Our lab has 9 papers accepted at ASRU 2021. A detailed list is already available on the publications page. diff --git a/_news/announcement_5.md deleted file mode 100644 index f3d3d478..00000000 --- a/_news/announcement_5.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: post -date: 2022-03-01 0:00:00-0400 -inline: true ---- - -Our lab has 18 papers accepted at ICASSP 2022. A detailed list is already available on the publications page. diff --git a/_news/announcement_6.md deleted file mode 100644 index 04614a2a..00000000 --- a/_news/announcement_6.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: post -date: 2022-06-15 0:00:00-0400 -inline: true ---- - -Our lab has 23 papers accepted at Interspeech 2022. A detailed list will be available on the publications page. diff --git a/_pages/about.md deleted file mode 100644 index 7d165eb9..00000000 --- a/_pages/about.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -layout: about -title: About -permalink: / -description: 'affiliated with @ LTI/CMU.' - -news: true # includes a list of news items -selected_papers: true # includes a list of papers marked as "selected={true}" -social: true # includes social icons at the bottom of the page ---- - -This is Watanabe's Audio and Voice (WAV) Lab at the Language Technologies Institute of Carnegie Mellon University. Our research interests include automatic speech recognition, speech enhancement, spoken language understanding, and machine learning for speech and language processing. - -
- -
- The end-of-semester presentation, 05.06.2024 -
-
- - - - - diff --git a/_pages/courses.md deleted file mode 100644 index 62568c9a..00000000 --- a/_pages/courses.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -layout: page -permalink: /courses/ -title: Courses -description: This page holds the courses related to our lab. -nav: true -order: 6 ---- - -### 2023 Fall - -* [Speech Recognition and Understanding (11-751)]({% post_url 2023-08-27-11751-2023f %}) - -### 2023 Spring - -* [Speech Processing (11-692)]({% post_url 2022-01-16-11692-2023s %}) - -### 2022 Fall - -* [Speech Recognition and Understanding (11-751)]({% post_url 2022-08-29-11751-2022f %}) - - diff --git a/_pages/info.md deleted file mode 100644 index bbc0e888..00000000 --- a/_pages/info.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -layout: page -permalink: /info/ -title: Info -nav: true -order: 8 ---- - -This page has some information guidelines for members of WAVLab. - -* [TIR cluster use instructions (for ESPnet users)]({% post_url 2022-01-01-tir-usage %}) -* [AWS use instructions]({% post_url 2022-01-01-aws-usage %}) -* [PSC cluster use instructions]({% post_url 2022-01-01-psc-usage %}) -* [Delta cluster use instructions]({% post_url 2023-04-02-delta-usage %}) -* [Babel cluster use instructions]({% post_url 2024-08-20-babel-usage %}) -* [ESPnet2 recipes]({% post_url 2022-01-01-espnet2-recipe %}) -* [Lab logos and slides template](https://github.com/shinjiwlab/lab_logo) (Need to request access) - - -Our galleries - -* [2023 Gallery]({% post_url 2023-09-24-2023-record %}) -* [2022 Gallery]({% post_url 2022-12-31-2022-record %}) -* [2021 Gallery]({% post_url 2021-12-13-2021-record %}) diff --git a/_pages/positions.md deleted file mode 100644 index a8ab0f9e..00000000 --- a/_pages/positions.md +++ /dev/null @@ -1,85 +0,0 @@ ---- -layout: page -permalink: /positions/ -title: Positions -nav: true -order: 9 ---- - -Thank you for considering working with us! - -Our lab has an open and collaborative mindset and often has various openings, including postdocs, visitors, and Ph.D. students. -We want applicants to have solid fundamentals and interests in one or more of the following topics: - -- Automatic speech recognition -- Speech enhancement and separation -- Spoken language understanding -- Machine learning for speech and language processing - -Solid programming skills and open-source experience are also preferred. - -However, we also value research diversity: we are interested in expertise outside the above topics, or other unique experiences that can be applied to speech and audio problems. - -Please see the following for details on each application category. - -### Postdoc -- Currently, we don't have an open position, but if you really want to work with us, please contact us. -- If you're interested in the position, we suggest taking the following steps. **Note that we will not be able to respond to all applications.** -- [ ]   Please email your CV to shinjiw@ieee.org with the subject **"WAVLab postdoc applications"**. -- [ ]   If you do not put it in the subject, we will assume that you have not thoroughly investigated our lab's activities on this webpage, and unfortunately, we may not respond to your email. -- [ ]   Note that your CV will be shared with other lab members (but we do not distribute it outside). -- [ ]   Please check our publications and find matches with your interests in advance. We really care about this. -- [ ]   Please clarify your available term. This is very important for the postdoc application. 
- -### PhD - -- [ ]   Please submit your application through [LTI](https://www.lti.cs.cmu.edu/apply-lti). Unfortunately, we may not respond to a direct email about the Ph.D. application. - -### CMU undergraduate/master students - -- [ ]   Please email your CV to shinjiw@ieee.org with the subject **"WAVLab undergraduate/master applications"**. -- [ ]   If you do not put it in the subject, we will assume that you have not thoroughly investigated our lab's activities on this webpage, and unfortunately, we may not respond to your email. -- [ ]   Note that your CV will be shared with other lab members (but we do not distribute it outside). -- [ ]   We care about the educational perspective for undergraduate/master students: becoming familiar with the speech research background, the required programming skills, knowledge of the cluster, and the use of speech and audio toolkits. This period would be at least the first couple of months, or longer. We believe that this is an essential process for you to start solid research activity. If you want to publish a paper as soon as possible, our lab is unfortunately not the best option for you. I recommend contacting other faculty members. -- [ ]   Please refer to the FAQ to learn more about the interview and admission process. - -### Visiting positions - -- [ ]   Please email your CV to shinjiw@ieee.org with the subject **"WAVLab visitor applications"**. -- [ ]   If you do not put it in the subject, we will assume that you have not thoroughly investigated our lab's activities on this webpage, and unfortunately, we may not respond to your email. -- [ ]   Note that your CV will be shared with other lab members (but we do not distribute it outside). -- [ ]   We generally do not have funding support for visitor positions, so we can only accept self-funded researchers. -- [ ]   Please refer to the FAQ to learn more about the interview and admission process. - -### FAQ - -- Do you have a Ph.D. opening? - - Yes, we'll always have at least one Ph.D. position per year. - -- Who will I meet with during my interview? - - The interview has two stages: - - A preliminary interview with Ph.D. students - - An interview with Prof. Shinji - -- What is the structure of the preliminary interview? - - The structure of the interview is as follows: - - Basic introductions of yourself and the interviewers (5 minutes) - - Discussion of the interviewee's research experience (20 minutes) - - General questions (10 minutes) - - Quiz (10 minutes): either about neural network or language model fundamentals - - Interviewee asks questions about the lab (5 minutes) - -- What are interviews with Shinji like? - - The structure of the interview is as follows: - - Introduction (5 minutes) - - General questions (20 minutes) - - Interviewee asks questions about the lab (5 minutes) - -- What can I expect following my interview? What are my chances of being accepted? - - After we finish the interviews, the committee combines the information from the interviews to form a complete picture of the applicants. We consider the overall quality, as well as fit with our lab and the research diversity of the lab. We also consider applications from students who have not previously worked in speech processing. This process takes a few weeks, after which we notify students whether they have been accepted, waitlisted, or rejected. Additionally, not all offers are sent at the same time. If someone else receives an offer and you haven’t heard yet, don’t be discouraged! - -- What should I do following my interviews? 
- - Keep us informed of your status. If you are considering offers from other labs or have any deadlines specific to your program, please let us know. We try to move quickly, but we don’t want to lose a candidate who thinks that they won’t get in because they haven’t heard back yet. - -- What are potential projects? - - We work on almost every aspect of speech processing, from frontends like speech enhancement to speech recognition and text to speech, as well as downstream tasks like speech translation and spoken language understanding. The research projects assigned to you depend on your interest and are decided after you are accepted into our lab. diff --git a/_pages/publications.md b/_pages/publications.md deleted file mode 100644 index 1600c56c..00000000 --- a/_pages/publications.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -layout: page -permalink: /publications/ -title: Publications -years: [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017] -nav: true -order: 2 ---- - - -- [2024 Papers]({% post_url 2024-01-30-paper-list %}) -- [2023 Papers]({% post_url 2023-03-14-paper-list %}) -- [2022 Papers]({% post_url 2022-12-31-paper-list %}) -- [2021 Papers]({% post_url 2021-12-31-paper-list %}) -- [2020 Papers]({% post_url 2020-12-31-paper-list %}) -- [2019 Papers]({% post_url 2019-12-31-paper-list %}) -- [2018 Papers]({% post_url 2018-12-31-paper-list %}) - - -
- -{% for y in page.years %} -

{{ y }}

- {% bibliography -f papers -q @*[year={{y}}]* %} -{% endfor %} - -
- - diff --git a/_pages/speech-lunch.md b/_pages/speech-lunch.md deleted file mode 100644 index 5da0a79e..00000000 --- a/_pages/speech-lunch.md +++ /dev/null @@ -1,223 +0,0 @@ ---- -layout: page -permalink: /speech_lunch -title: Speech Lunch -nav: true -order: 10 ---- - -Welcome to the Speech Lunch (formerly Sphinx Lunch) at Carnegie Mellon University! -This lunch meeting is designed to discuss any speech-related research items regularly. -The meeting consists of presentations by CMU faculties, CMU students, and guest speakers. -We welcome any reserach topics, including an ordinary presentation, conference presentation rehearsals, preliminary research ideas, research discussions, and so on. -We also welcome any CMU researchers and external researchers to join the meeting. - -During the semester, we will regularly have meetings in the following slots: - -- Date: Thursday 12:30 pm - 1:30 pm -- Room: GHC 6501 - -The time and room may change, especially if we have a guest speaker. -We will announce the talk information through our [mailing list](https://mailman.srv.cs.cmu.edu/mailman/listinfo/sphinxmail). Approval by the admin is required. -So, please subscribe to it if you're interested in the CMU speech! - -Please contact Yifan Peng (yifanpen@andrew.cmu.edu) and Shinji Watanabe (shinjiw@ieee.org) if you would like to participate in our Speech Lunch. - -## Future Talks (tentative schedule) - - -## Previous Talks -- October 10, 2024 - - Title: Improving Multilingual Speech Recognition in the Wild - - Speaker: Brian Yan (CMU) - - Abstract: Multilingual Automatic Speech Recognition (ASR) models are typically evaluated in a setting where the ground-truth language identity of the speech utterance is known, however, this is often not the case for most practical settings. The first part of this talk examines the impact that imperfect Automatic Spoken Language Identification (SLID) has on downstream ASR quality. I present a simple and effective N-best re-ranking approach to improve multilingual ASR accuracy for several prominent acoustic models by employing external features such as language models and text-based language identification models. Our results on FLEURS using the MMS and Whisper models show spoken language identification accuracy improvements of 8.7% and 6.1%, respectively and word error rates which are 3.3% and 2.0% lower on these benchmarks. Then the second part of this talk delves into the tricky case of code-switched speech which contains segments from multiple languages. I describe an on-going effort to create Code-Switched FLEURS: a super hard code-switched ASR and ST benchmark. - -- October 3, 2024 - - Title: Toward Real-Time Simultaneous Translation with Large Language Models - - Speaker: Xi Xu and Siqi Ouyang (CMU) - - Abstract: An ideal real-time simultaneous translation system should deliver high-quality translations at sub-second latency. In this talk, we first discuss how our approach achieved first place in the IWSLT English-German task based on human ratings, using a standard speech LLM model and a Hold-N policy. However, while IWSLT allows for up to 2 seconds of algorithmic latency and overlooks computational delays, real-world applications demand far lower latency. To address this, we introduce FASST, a technique designed to minimize computational latency during inference by avoiding redundant recomputation, thereby maintaining translation quality for trainable policies like wait-k. 
Finally, we present a novel method leveraging LLMs to anticipate upcoming source content, allowing for enhanced translation quality while achieving ultra-low algorithmic latency, moving closer to the goal of real-time simultaneous translation. - -- September 26, 2024 - - Title: Foundations of Blind Source Separation and Its Advances in Spatial Self-Supervised Learning - - Speaker: Yoshiaki Bando - - Abstract: A key technology in speech and audio analysis is self-supervised learning, which can efficiently train neural models on large-scale unlabeled training data. While existing frameworks such as HuBERT and BEATs have achieved great success for this purpose, they primarily focus on obtaining embeddings for isolated/mixture inputs and would be less suitable for analyzing individual sound events or speech utterances in a mixture recording. In this talk, we introduce our series of studies called spatial self-supervised learning based on blind source separation. This framework trains a neural model to predict embeddings of latent individual sources from a multichannel mixture recording without any manual supervision. We first present the foundations of blind source separation and then describe its neural extension for self-supervised learning, followed by a discussion of future directions for large-scale training using real-world data. - - Bio: Yoshiaki Bando received his Ph.D. degree in informatics from Kyoto University in 2018 and is currently a Senior Researcher at Artificial Intelligence Research Center in National Institute of Advanced Industrial Science and Technology (AIST), Tokyo, Japan. He is also a Visiting Researcher at the RIKEN Center for Advanced Intelligence Project (AIP). His research interests include microphone array signal processing, deep Bayesian learning, robot audition, and field robotics. -- September 12, 2024 - - Title: Continual learning in speech recognition - - Speaker: Ngoc Quan Pham - - Abstract: The current speech recognition models are always trained with closed and stationary datasets, and only a few studies have been conducted for expanding currently trained models with new non-stationary data. In such case, a neural model can suffer from catastrophic forgetting - the weights of the models are overwritten in the subsequent training steps and lose the abilities on the previously learned tasks or domains. In our personal view of anticipating how we might train speech recognition models in the future, in which the models are updated as fast as data is generated, we investigate two different scenarios: expanding a multilingual speech recognition models with more languages and training a speech recognition model with online continual learning. - - Bio: Quan Pham is currently a postdoc at the Interact lab - Karlsruhe Institute of Technology, Germany with professor Alex Waibel (professor at both KIT and CMU). In the last 5 years he made some tiny contributions to speech recognition research, such as stochastic layers to facilitate training deep models, expanding/finetuning networks with low-rank additional weights (concurrent with LoRA) and learning new languages with continual learning. - -- March 21, 2024 - - Title: Online Speech Enhancement and Separation: From Discriminative Methods to Generative Methods - - Speaker: Chenda Li (Shanghai Jiao Tong University) - - Abstract: Online speech enhancement/separation has many applications in daily life. Many scenarios require speech processing systems to be low-latency, such as teleconferencing, hearing aids, etc. 
In this talk, I’ll present my recent research on online speech enhancement/separation. I will first introduce Skipping Memory LSTM (SkiM), a very efficient model for low-latency online processing. Then, I will share some techniques to better balance performance and ideal latency for online speech separation. Finally, I will show some of our recent research on streamable diffusion-based speech enhancement. - - Bio: I’m a Ph.D. student from Shanghai Jiao Tong University, and I am currently visiting Watanabe’s Audio and Voice (WAV) Lab at LTI, CMU. My research interests include speech separation, speech enhancement, and multi-talker speech recognition. - -- February 29, 2024 - - Title: Towards robust speech generation - - Speaker: Soumi Maiti (CMU) - - Abstract: In the last decade, the field of speech generation has witnessed remarkable advancements, particularly in the improvement of speech quality and naturalness. Despite these strides, challenges persist, such as noise in speech, the limited availability of high-quality data, and the lack of robustness of speech generation systems. Additionally, evaluating speech remains an obstacle for large-scale assessment of speech generation models. Simultaneously, recent breakthroughs in Large Language Models (LLMs) have transformed text processing and natural language applications. However, spoken language modeling introduces unique challenges due to the intricate nature of speech components, including speaker characteristics and emotional cues. In this presentation, I will delve into recent advances in speech synthesis, spoken language modeling, and speech evaluation for generative systems, shedding light on the ongoing efforts to address these challenges. - -- February 8, 2024 - - Title: Learning from Unlabeled Speech through Pre-training - - Speaker: Alexander Haojan Liu - - Abstract: In the first part of the talk, I will present DinoSR, a recent self-supervised learning method that can be viewed as a completion of recent speech representation models. Key results on recognition and acoustic unit discovery will be covered. The second part of the talk will cover generative pre-training for speech through flow matching. Similar to the spirit of self-supervised learning, I will show that a general-purpose generative model can be pre-trained from unlabeled speech and later applied to different tasks in speech such as synthesis, denoising, and separation. - - Bio: Alexander Haojan Liu is a 4th-year Ph.D. student in Computer Science at the MIT Computer Science and Artificial Intelligence Laboratory (CSAIL). He is a member of the Spoken Language Systems (SLS) Group led by Dr. James Glass. His research interests are in the field of machine learning, natural language processing (speech processing in particular), and computer vision. Currently, his work focuses on self-supervised learning of audio and its applications. - -- December 7, 2023 - - Title: Unifying Speech Processing Applications with Speech Foundation Models - - Speaker: Shinji Watanabe - - Abstract: After the success of large language models in natural language processing, the field of speech processing is currently exploring the possibility of combining speech and language modalities to create a foundation model. This single unified model could perform multiple speech processing applications, such as speech recognition, synthesis, translation, and spoken language processing. 
Our group is dedicated to achieving this goal through the development of speech foundation models, including speech/text decoder-only models, Whisper-style multi-tasking, universal spoken language understanding, and multilingual SUPERB projects. In addition to showcasing the above research outcomes during this talk, we will describe the engineering efforts involved in building such a large foundation model from scratch on an academic computing scale for reproducibility. - -- November 9, 2023 - - Title: Universal Speech Enhancement: What Can We Do With Real Data? - - Speaker: Wangyou Zhang - - Abstract: Speech enhancement (SE) methods based on deep learning have shown impressive performance on many simulation conditions (TIMIT/WSJ/Librispeech/...+Noise), whereas the generalization to a wider range of real conditions has not been addressed. In fact, many high-performing SE methods tend to overfit the simulation condition in training, whose inductive bias may be easily violated in real conditions. In the era of large-scale pre-training, it is natural to ask whether we can make use of large-scale real recording data to train a truly universal SE model that can be used for all speech-as-input tasks in real-world conditions. In this talk, I try to answer the following two questions by summarizing existing works in these directions: 1) what can we do to utilize real data for SE training? 2) what models can be used to achieve universal SE? Finally, I will finish the talk by proposing new problems in related topics. - -- November 2, 2023 - - Title: Music generation with precise control - - Speakers: Chris Donahue and Shih-Lun Wu - - Abstract: In the first half of the session, Chris will discuss some recent work on generating music with precise control and composable outputs. Music audio generation has seen an explosion of activity - we now have the ability to generate music in broad styles with natural language control. However, despite the impressive breadth of these models, they have not yet had a salient impact on music in the real world. Instead, music AI models with more narrow capabilities have had disproportionate impact (e.g., source separation, voice cloning). In this talk, Chris will argue that current narrow models are more appealing to creators because they offer more creative potential for two reasons: (i) they offer precise and familiar forms of control, and (ii) their outputs are composable and integrate with conventional workflows. Chris will discuss two of his recent papers, SingSong (Donahue+ 23) and the Anticipatory Music Transformer (Thickstun+ 23), which seek to bring more creative potential to broadly-capable music generative models. In the second half of the session, Shih-Lun will introduce his recent work, Music ControlNet (Wu+ 23, unpublished), which imbues diffusion-based text-to-music generation models with precise melody, dynamics, and rhythm controls. Music ControlNet builds upon the ControlNet line of research in image generation, and adapts their framework to accept time-varying controls in the audio domain. Shih-Lun will demonstrate that Music ControlNet can respond precisely to any composition of the controls it has been trained on, and can also generalize to out-of-distribution control signals that creators may realistically provide. 
- -- October 12, 2023 - - Title: Computational Audition through Imprecise labels - - Speaker: Ankit Shah - - Abstract: In this talk, we delve into computational auditory processing to mimic how humans and animals interpret sounds to interact with their surroundings effectively. The journey begins with the machine's challenge to recognize a vast array of sounds limited by the known sounds in our datasets. This limitation becomes glaring as current models require large labeled datasets for accuracy, which often isn't feasible in real-world settings due to data scarcity. We then spotlight core issues: the strength of sound labels within available datasets. The quandary is that even with a fraction of known sounds and limited data, inaccuracies in sound labeling lead to suboptimal models. Our focus shifts to devising strategies for sound modeling amidst inaccurate, weak or incomplete labels, termed as working with imprecise labeled data. Our exploration includes enhancing the existing annotations, understanding the effects of label noise and corruption, and innovating a co-training approach for learning sound events from web data without human intervention. We venture into exploiting additional cues like event counts and durations with negligible extra effort, introducing the concept of semi-weak labels. Lastly, the talk describes a unified framework encapsulating all our approaches, making a robust model capable of handling various labeling scenarios, paving a solid foundation for future endeavors in understanding and modeling the world of images (transferrable to sounds), irrespective of label availability. Through this, we aspire to bridge the gap between the human brain's natural sound-processing ability and machines, opening doors to a more harmonious interaction with the acoustic world around us. - - Bio: Ankit Shah is a Ph.D. student in the Language Technologies Institute in the School of Computer Science at Carnegie Mellon University. Ankit earned his master's in Language technologies at Carnegie Mellon University in 2019 and his bachelor's in electronics and communication engineering from the National Institute of Technology Karnataka Surathkal. He has worked in the industry for over 4 years as a verification engineer and project lead at ARM and as a Deep learning research Scientist at ReviveMed before joining the Ph.D. program. His areas of interest are audio understanding, machine learning, and deep learning. His thesis focuses on learning in the presence of weak, uncertain, and incomplete labels, where he has made several key contributions, including the setting up DCASE challenges on the topic. He has won the Gandhian Young Technological Innovator (GYTI) award in India for his contribution to building a never-ending learner of sound systems. His team recently emerged as a winning team in the NYC AI Hackathon challenge on LLM (Large Language Model and generative AI. He enjoys reading several books during the year, listens to music, and loves to travel. Further, he is keenly interested in Economics, Startups, Entrepreneurship, etc. Website: https://ankitshah009.github.io - -- October 5, 2023 - - Title: Adaptive Non-Causality for Speech Recognition - - Speaker: Grant Strimel (Amazon) - - Abstract: Streaming speech recognition architectures are employed for low-latency, real-time applications. Such architectures are often characterized by their causality – how much forward context is consumed before making a prediction on an individual frame. 
In this talk we will review prior approaches to balance competing objectives of low latency and the accuracy benefit derived from “look ahead” information. We then will discuss an approach we proposed called the Adaptive Non-Causal Attention Transducer (ANCAT). The architecture is non-causal in the traditional sense, but executes in a low-latency, streaming manner by dynamically choosing when to rely on future context and to what degree within the audio stream. The resulting mechanism, when coupled with novel regularization algorithms (which we will dive into) , delivers comparable accuracy to non-causal configurations while improving significantly upon latency, closing the gap with their fully-causal model counterparts. - - Bio: Grant Strimel is a Principal Scientist at Amazon AGI and part of the Alexa Speech Recognition and Deep Learning groups. He joined Alexa Pittsburgh in 2018 where the organization has now grown to over fifty scientists and engineers working on natural language processing experiences through both edge-first and cloud-centric solutions. His primary focus for Amazon has been on low-latency, real-time ML design for speech applications. - -- September 28, 2023 - - Title: Towards robust speech generation - - Speaker: Soumi Maiti - -- August 31, 2023 - - Title: Solving problems of a single-modal task with multi-modality - - Speaker: Minsu Kim (KAIST) - - Abstract: Speech processing technologies include diverse tasks with diverse modalities such as Audio-based Speech Recognition (ASR), Visual Speech Recognition (VSR), Text-to- Speech, Lip-to-Speech, Speech-driven talking face generation, etc. People usually utilize the modalities corresponding to the task at hand when developing a technology. For example, if we develop a VSR model, we usually utilize video and text modalities without considering other modalities (e.g., audio). In this talk, I will show some examples which try to solve a challenge of a single-modal task with multimodal data. They include the cases of employing 1) other modality containing rich task knowledge, 2) correspondence of multimodality, and 3) useful characteristics of different modalities. Finally, we can realize that numerous research opportunities emerge upon exploring various multimodal options pertinent to our current task. - - Bio: Minsu Kim is a Ph.D. student in the school of electrical engineering at Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea. He received the B.S. degree in electrical & electronic engineering from Yonsei University, Seoul, South Korea, in 2019. His research interest is multi-modal language processing including audio-visual speech processing, image/video analysis & generation, and speech translation. - -- June 22, 2023 - - Title: How to best leverage unlabeled data for speech recognition? - - Speaker: Dan Berrebbi - - Abstract: For speech recognition (ASR) as for most machine learning fields, labeled data is very expensive to get and so is far less abundant than unlabeled data. Leveraging untranscribed audio data is thus critical to build strong ASR models. Self-supervised models (SSL) such as wav2vec2.0 or HuBERT, pre-trained on large amounts of unlabeled data with pretext tasks, seem to perfectly respond to that need. Indeed, even when fine-tuned on very small amounts of transcribed speech data, they outperform previous supervised work as well as human performances on LibriSpeech dataset (which is the reference ASR benchmark) by a large margin. 
However those models can perform badly in case of domain shifts. Moreover we do not really know how to efficiently fine-tune them, or use them for inference. We also do not know well how to benchmark, evaluate or compare them. Finally their pre-training is very hard to reproduce and so finding alternative ways to leverage unlabeled data is of broad interest. In this talk we will tackle the following questions : 1) How to efficiently use speech SSL models for ASR? 2) How to adapt them in domain-shift scenarios? 3) How to efficiently evaluate and benchmark such models? 4) Can some alternative unsupervised training methods such as semi-supervised learning outperform SSL models? This talk is gathering our recent publications at Interspeech 2022 & 2023, ICASSP 2023 and ICLR 2023. - - Bio: Dan Berrebbi is a MLT Master’s student at Language Technologies Institute, Carnegie Mellon University. He obtained his Bachelor and Master’s in Mathematics at Ecole Polytechnique in Paris and then joined Carnegie Mellon University in 2021 to complete his Master’s. He is working with Professor Shinji Watanabe on speech processing. His main research areas are multilingual and low-resource speech recognition as well as self and semi-supervised learning for speech recognition. - -- Apr 27, 2023 - - Title: Audio-Visual Learning for Social Telepresence - - Speaker: Alexander Richard (Meta) - - Abstract: These days, physical distance between people is one of the biggest obstacles to maintaining meaningful social relationships with family, friends, and co-workers. Even with today’s technology, remote communication is limited to a two-dimensional audio-visual experience and lacks the availability of a shared, three-dimensional space in which people can interact with each other over the distance. Our mission at Reality Labs Research (RLR) in Pittsburgh is to develop a telepresence system that is indistinguishable from reality, i.e., a system that provides photo- and phono-realistic social interactions in VR. Building such a system requires a leap forward in audio modeling: 3D virtual spaces call for highly realistic 3D spatial audio rendering and immersive experiences demand to strip a user’s input audio from all environmental influences such as noise and reverb. Addressing the first problem, I will talk about building neural renderers for spatial audio from capture stage design to model development. For the latter problem, I will present an approach to speech enhancement that is based on a codify-and-resynthesize paradigm. In the future, these technologies will help build a realistic virtual environment with lifelike avatars that allow for authentic social interactions, connecting people all over the world, anywhere and at any time. - - Bio: Alexander Richard is a Research Scientist at Reality Labs Research (RLR) in Pittsburgh leading the audio-visual research team. With his team, he concentrates on audio-visual learning to build photo- and phono-realistic immersive experiences in Virtual Reality that enable remote communication indistinguishable from reality. Combining computer vision, machine learning, and audio processing, he develops key technologies for audio-visual lifelike avatars and novel 3D rendering approaches for spatial and binaural audio. Before joining RLR, Alexander was a Speech Scientist at Amazon Alexa in Aachen, Germany. He received his PhD from the University of Bonn for his work on temporal segmentation of human actions in videos. 
- -- Apr 20, 2023 - - Title: End-to-End Speech Summarization: Global Acoustic Context - - Speaker: Roshan Sharma - - Abstract: Speech in the real world is verbose, and humans possess the special ability to recognize what is being said, understand it, and consequently summarize speech. Automatic methods for speech summarization are crucial to imbue such capabilities in the artificial intelligence of tomorrow. Current methods for speech summarization involve a cascade of speech recognition and transcript summarization, which suffer from error propagation and larger model sizes. We proposed training speech summarization models end-to-end and demonstrated that such models outperform cascade summarization approaches. Speech summarization becomes more complex as the input length increases to a few minutes or half an hour. In this talk, we address the challenge of efficient and performant training for long audio summarization. - - Bio: Roshan Sharma is a Ph.D. candidate in the Electrical and Computer Engineering Department at Carnegie Mellon University. He earned his B.Tech. with distinction in Electronics and Communication Engineering at Amrita Vishwa Vidyapeetham, India in 2018 and his M.S. in Electrical and Computer Engineering at Carnegie Mellon University in 2020. His research interests lie in speech recognition, spoken language understanding, and multimodal machine learning. - -- Apr 6, 2023 - - Title: Continual Learning in Speech and Audio Applications - - Speaker: Muqiao Yang - - Abstract: The outstanding performance of deep neural networks typically relies on training on a large, fixed set of data. However, in practical applications, the properties of data streams may vary over time and the model may have limited access to past data, where the model performance will be affected due to the catastrophic forgetting effect. In this talk, we will focus on multiple Speech and Audio tasks, including Automatic Speech Recognition, Acoustic Scene Classification and Spoken Language Understanding, and investigate how different continual learning scenarios and methods work under such settings. - - Bio: Muqiao Yang is a 3rd-year PhD student at Carnegie Mellon University, working with Prof. Bhiksha Raj and Prof. Shinji Watanabe. His research interest is mainly in machine learning for speech processing, including speech recognition and speech enhancement. He received his B.E. degree from Hong Kong Polytechnic University and M.S. degree from Carnegie Mellon University. - -- Mar 30, 2023 - - Title: Reference Free Learning for Speech Enhancement and Speech Assessment - - Speaker: Anurag Kumar (Meta) - - Abstract: Improving perceptual quality and intelligibility of speech signals is critical for improving communications in the real and virtual worlds. This is needed for people with normal hearing as well as those who have some form of hearing impairment. In this talk, I will present an outline of some of my recent research on both methods to enhance degraded speech as well as methods for speech assessment. I will go in depth on our recent work on unsupervised and self-supervised approaches for speech enhancement and how speech signals from the wild – for which target signals are not available - might be used for enhancement. These approaches enable easier adaptation to out of domain conditions as well as open up the possibility of on-the-fly adaptation of enhancement systems.
I will also present reference-less approaches for speech quality and intelligibility assessment - in particular the NORESQA framework which introduced a new way of non-intrusive speech assessment by leveraging non-matching references. - - Bio: Anurag Kumar is currently a Research Scientist and Technical Research Lead at Reality Labs Research, Meta. Anurag's primary research interests are in machine learning for audio and speech processing and audio-visual learning. Before joining Meta, Anurag obtained his PhD from Language Technologies Institute (LTI) in School of Computer Science, Carnegie Mellon University in 2018. Anurag obtained his undergraduate degree in Electrical Engineering from IIT Kanpur in 2013. Anurag’s PhD dissertation "Acoustic Intelligence in Machines" pioneered weak label learning for sounds which has since become a key area of research in the field of audio AI. Anurag has been recipient of several awards and recognition including Best Paper Finalist at CVPR 2022 and NCC 2014, Finalist in Qualcomm Innovation Fellowship 2017, Winner of Samsung Innovation Awards 2012, travel grants from IEEE SPS and EURASIP. - -- Mar 23, 2023 - - Title: Adversarial robustness of modern speech recognition models: evaluation and implications - - Speaker: Raphael Olivier - - Abstract: Adversarial attacks on Machine Learning models are small perturbations of inputs which fool models into predicting the wrong outputs. In numerous real-world settings they are known to be the source of potential security liabilities. In this work we study the implications of these adversarial attacks when applied to Automatic Speech Recognition (ASR) models. Our main finding is that the recent progress in ASR performance has led to an increase in adversarial vulnerability, rather than an improvement in robustness . We illustrate two aspects of this phenomenon. First, even models like Whisper with state-of-the-art robustness to random noise and domain change show no significant adversarial robustness improvement, while their increased adoption makes threat models more realistic. Second, we show that popular ASR training paradigms like Self-Supervised Learning (SSL) have opened the way to new threat models like transferred adversarial attacks from a proxy model. We draw conclusions from those results, emphasizing the importance to include adversarial robustness in speech modeling pipelines from a security perspective, but also the interest of adversarial evaluation in better understanding those new learning paradigms. - - Bio: Raphaël is a PhD candidate at Carnegie Mellon University working with professor Bhiksha Raj. His research is at the intersection of speech technologies and AI safety, with a focus on adversarially robust speech recognition. - -- Mar 16, 2023 - - Title: Everyday Conversation Recognition with End-to-End Neural Networks - - Speaker: Xuankai Chang - - Abstract: Over the past decade, remarkable advancements have been made in automatic speech recognition (ASR) largely due to the rapid development of deep neural networks. However, ASR systems often encounter challenges in complex settings caused by background noise, reverberations, overlapping speech, etc. In this talk, we present our efforts towards recognizing everyday conversations using End-to-End neural networks. Our approach tackles the issue of noise and overlapping speech by leveraging single- and multi-channel signals. 
- - Bio: Xuankai Chang is a 4-th year PhD student at Carnegie Mellon University's Language Technology Institute at the school of Computer Science working with Prof. Shinji Watanabe. His research interests are in the field of speech processing such as speech recognition / enhancement / separation. He received his B.S. and M.S degrees from Shanghai Jiao Tong University, China. - -- Feb 16, 2023 - - Title: Multi-blank Transducers for Speech Recognition - - Speaker: Hainan Xu (NVIDIA) - - Abstract: We propose a modification to RNN-Transducer (RNN-T) models for automatic speech recognition (ASR). In standard RNN-T, the emission of a blank symbol consumes exactly one input frame; in our proposed method, we introduce additional blank symbols, which consume two or more input frames when emitted. We refer to the added symbols as big blanks, and the method multi-blank RNN-T. For training multi-blank RNN-Ts, we propose a novel logit under-normalization method in order to prioritize emissions of big blanks. With experiments on multiple languages and datasets, we show that multi-blank RNN-T methods could bring relative speedups of over +90%/+139% to model inference for English Librispeech and German Multilingual Librispeech datasets, respectively. The multi-blank RNN-T method also improves ASR accuracy consistently. We will release our implementation of the method in the [NeMo](https://github.com/NVIDIA/NeMo/) toolkit. - - Bio: I am currently working in NVIDIA's NeMo Team, supervised by Boris Ginsburg. Before joining NVIDIA, I worked in Google's Speech Team under Bhuvana Ramabhadran from September 2019 to October 2021, after getting my Ph.D. degree in Computer Science from the Johns Hopkins University, working in the Center for Language and Speech Processing (CLSP) under former JHU Prof. Daniel Povey and Prof. Sanjeev Khudanpur. I received my B.S. in Software Engineering in 2012 from School of Software Engineering at Shanghai Jiaotong University in Shanghai, China. From 2012 to 2013, I worked with Professor Kai Yu in SJTU Speech Lab. - -- Feb 2, 2023 - - Title: Towards robust audio-visual learning - - Speaker: Billy Li - - Abstract: Audio Visual Event Detection has benefited greatly from the advancement of deep learning in the past few years. Various model architectures have been applied to the task in multiple modalities, pushing the performance benchmark and enabling the deployment of such models in many critical tasks such as surveillance and malicious content filtering. However, the research community still lacks: 1) a systematic understanding of the different machine learning models' behavior given the unique nature of audio signals compared to the image or text counterparts. 2) The robustness of different models used for audio-visual learning also remains to be an under-studied area. To address the first point, we investigate best practices for building an audio-only and audio-visual learning system that performs well. Specifically, we analyze the features, compare different architectures, mainly focusing on convolutional family and Transformer family models, and understand the difference in training techniques to provide a comprehensive and thorough understanding. To address the second goal, we study the robustness of each model by gauging their behavior under noise and adversarial perturbation. We first demonstrate the existence of real-world threats in both the visual and audio domains. 
We then expand our scope of robustness analysis from unimodal audio input to multiple modalities including audio, video, image and text. - - Bio: Juncheng (Billy) Li is a final-year PhD student at Carnegie Mellon University's Language Technologies Institute at the School of Computer Science working with Prof. Florian Metze. Juncheng (Billy) Li worked as a research scientist at the Bosch Center for Artificial Intelligence from 2015 to 2019, where he worked with Prof. Zico Kolter. Juncheng (Billy) has a background in deep learning on acoustic signals and multimodal data, and he is currently working on exploring the adversarial robustness of multimodal machine learning systems. Juncheng also acquired extensive experience in applying AI to industrial problems when he worked at Bosch; specifically, he has worked on projects including fault detection, machine simulation, and sensor fusion. Juncheng has published at IEEE ICASSP, Interspeech, ICML and NeurIPS, and won the best student paper award at ICASSP 2022 and the best paper award at ICMR 2018. - -- January 19, 2023 - - Title: Self-supervised speech restoration for historical audio - - Speaker: Takaaki Saeki - - Abstract: Existing historical audio materials are precious resources that contain various linguistic and cultural information. However, restoring or analyzing such audio data is challenging because paired high-quality and degraded speech data is not available. In this talk, we present our recent work on self-supervised speech restoration for historical audio. Our model is based on an autoencoder, which disentangles distortionless speech features from acoustic distortions and is trained only with degraded speech data by simulating the recording process. We evaluated our method with real historical audio data and demonstrated its effectiveness. I will also discuss the ongoing work and future directions including larger-scale self-supervised learning by collecting various historical audio materials. - - Bio: Takaaki Saeki is a Ph.D. student advised by Prof. Hiroshi Saruwatari, at the Graduate School of Information Science and Technology, the University of Tokyo, Japan. He is also a visiting scholar at Watanabe's Audio and Voice (WAV) Lab, Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA. He received his B.S. and M.S. degrees from the University of Tokyo, Japan. He has been working on speech and audio processing, including text-to-speech (TTS) synthesis, voice conversion, automatic speech quality assessment, speech restoration, speech representation learning, multilingual speech processing, etc. - -- December 8, 2022 - - Title: Recent Progresses in All-Neural Contextual Speech Recognition Technologies at Amazon Alexa - - Speaker: Jing Liu (Amazon) - - Abstract: Speech recognition technology is completing a dramatic shift from the conventional disjointly trained neural and non-neural sub-systems to an all-neural end-to-end (E2E) transducer architecture. This unified all-neural E2E architecture improves the state-of-the-art accuracy, while also achieving superior compute and memory compression, enabling low-latency streaming ASR on the edge where resources are constrained. One of the major challenges for E2E ASR systems is that they often have difficulty recognizing uncommon words that appear infrequently in the training data (e.g. personalized contact names, device names).
In this talk, I will give an overview of the challenges and our recent progress in the all-neural contextual ASR technology that powers Alexa, the virtual voice assistant that serves millions of customers daily. - - Bio: Jing Liu is a Sr. Applied Scientist at Amazon Alexa Speech. He earned his PhD degree in mathematics from Carnegie Mellon University and worked in quantitative research on Wall Street for about 3 years prior to joining Alexa Hybrid Science in Pittsburgh in 2017. As a senior member of the team, he made key contributions to the launch of the first Edge ASR for Echo, Auto, and FireTV, and to the transition to all-neural ASR technologies. Jing’s main focus has been low-latency far-field ASR systems for both edge and cloud. - -- November 17, 2022 - - Title: [Compositional End-to-End SLU]({{ site.baseurl }}/assets/pdf/2022-11-17_siddhant.pdf) - - Speaker: Siddhant Arora - - Abstract: End-to-end spoken language understanding (SLU) systems are gaining popularity over cascaded approaches due to their simplicity and ability to avoid error propagation. However, these systems model sequence labeling as a sequence prediction task, causing a divergence from its well-established token-level tagging formulation. In work accepted at EMNLP 2022, we build compositional end-to-end SLU systems that explicitly separate the added complexity of recognizing spoken mentions in SLU from the NLU task of sequence labeling. We show that this composition of ASR and NLU formulations in our end-to-end SLU system outperforms both cascaded and direct end-to-end models, offers direct compatibility with pre-trained ASR and NLU systems, allows performance monitoring of individual components, and enables the use of globally normalized losses like CRF, making them attractive in practical scenarios. - - Bio: Siddhant Arora is a Ph.D. student at Carnegie Mellon University's Language Technologies Institute, advised by Prof. Shinji Watanabe. His research interests are in the field of Natural Language Processing (NLP) and Speech Processing, particularly in Spoken Language Understanding and Spoken Dialog Systems. His prior research experience includes building compositional models, integrating pretrained models into the SLU framework, as well as interpretability and robust testing of ML systems.
He is a co-author of the Asteroid Source Separation, SpeechBrain and ESPNet++ toolkits and contributes to other open-source toolkits in the speech processing area. In 2021, he interned for three months at Amazon Alexa AI with the wakeword team, and in summer 2022 he was with the Language Technologies Institute at Carnegie Mellon University. - -- October 13, 2022 - - Title: Robot Concept Learning in Situated Dialogue - - Speaker: Matthew Marge - - Abstract: Intelligent agents that refer to and make use of the physical world, like robots, will be more able to adapt to new situations if they can learn concepts in real time from humans. This process forms an interactive dialogue loop between robots asking questions to learn more about the physical world and humans using natural language to teach them. In this talk, I will present findings from the Human-Robot Dialogue Learning project that explored these topics. Key accomplishments include (1) an improved understanding of how humans teach robots compared to other humans, (2) a first-of-its-kind corpus of questions that robots can use to learn from human teachers, and (3) real-time algorithms that enable robots to generate questions that maximize learning in a cognitive robotic architecture. The end result is the novel capability for intelligent agents to use situated dialogue and one-shot learning to acquire more information about their surroundings with human teammates. - - Bio: Matthew Marge is a Senior Computer Scientist at DEVCOM Army Research Laboratory (ARL). He received the Ph.D. and M.S. degrees in Language and Information Technologies from the School of Computer Science at Carnegie Mellon University, the M.S. degree in Artificial Intelligence from the University of Edinburgh, and the B.S. degrees in Computer Science and Applied Mathematics and Statistics from Stony Brook University. Dr. Marge's research focuses on how robots and other artificial agents can establish common ground with people through dialogue. His current interests lie at the intersection of computational linguistics, human-machine interaction, and integrative AI systems, specializing in conversational AI. Dr. Marge is a recipient of the 2018 Office of the Secretary of Defense's Laboratory University Collaboration Initiative award, supporting his research on dialogue with robots. In addition to his position at ARL, he is an Adjunct Professor in the Computer Science and Linguistics Departments at Georgetown University. - -- September 29, 2022 - - Title: [Audio Visual Recognition and Understanding]({{ site.baseurl }}/assets/pdf/2022-09-29_Karthik.pdf) - - Speaker: Karthik Ganesan - - Abstract: Streaming audio-visual speech recognition (SAVSR) introduces an online setting to audio-visual speech recognition (AVSR), which removes the full-utterance requirement prior to decoding that traditional speech recognition models are limited to. Streaming audio-visual speech recognition further challenges the model, leaving it to decide how long it should wait to have retrieved enough information to start decoding. While transformer based models such as AvHuBERT have been successful in AVSR tasks through pretraining and cross-modal interactions, these models struggle to achieve a reasonable Real-Time Factor (RTF), which is necessary for communication agents. We propose ESPnet Multimodal, a multimodal framework integrated into ESPnet, and provide baseline results for the SAVSR task. We also propose a streaming transformer and multimodal fusion based model for SAVSR.
Through ESPnet Multimodal, we expect to facilitate research in the field of audio-visual tasks including SAVSR. - - Bio: Karthik Ganesan is a Master's student, advised by Dr. Shinji Watanabe, at Watanabe's Audio and Voice (WAV) Lab, Language Technologies Institute (LTI), Carnegie Mellon University (CMU), Pittsburgh, PA. He received his B.E. in computer science from MSRIT, Bangalore, India. He has been conducting research on various aspects of conversational AI systems, including audio-visual streaming ASR, 2-pass streaming speech recognition, E2E SLU, and parameter-efficient multilingual ASR. - -- September 15, 2022 - - Title: [End-to-End Unsupervised ASR and Its Application]({{ site.baseurl }}/assets/pdf/2022-09-15-jiatong.pdf) - - Speaker: Jiatong Shi - - Abstract: Unsupervised ASR learns an ASR model without parallel speech/text. Recently, with the help of self-supervised learning, end-to-end unsupervised ASR has become possible and has shown impressive performance. This talk goes through some of our efforts from the pre-training team, [JSALT2022](https://jsalt-2022-ssl.github.io/), including some experiences in end-to-end unsupervised ASR and its extended uses in self-supervised augmentation, acoustic segmentation, and connection between modalities. We will also discuss some ongoing open-source work. - - Bio: Jiatong Shi is a Ph.D. student, advised by Dr. Shinji Watanabe, at Watanabe’s Audio and Voice (WAV) Lab, Language Technologies Institute (LTI), Carnegie Mellon University (CMU), Pittsburgh, PA. He received his B.S. in computer science from Renmin University of China (RUC), advised by Dr. Qin Jin, and his M.S. in computer science from Johns Hopkins University (JHU), advised by Dr. Shinji Watanabe. He has been conducting research on various aspects of speech/audio processing, including speech recognition, speech translation, speech synthesis, speaker diarization, and singing voice synthesis. His recent focus is speech-to-speech translation. - -- September 1, 2022 - - Title: [Is everything end-to-end?]({{ site.baseurl }}/assets/pdf/2022-09-01-shinji.pdf) - - Speaker: Shinji Watanabe (CMU) - - Abstract: This presentation introduces some of our group's recent attempts at making an end-to-end network that integrates various speech processing modules as a single neural network. I'll talk about ASR (feature extraction, acoustic modeling, lexicons, language modeling), far-field conversation recognition (ASR + denoising/dereverberation/separation (+ diarization)), and cycle consistency training (ASR + SID + TTS). I will introduce some random thoughts about these attempts and also discuss future integration ideas. - - Bio: Shinji Watanabe is an Associate Professor at Carnegie Mellon University, Pittsburgh, PA. He received his B.S., M.S., and Ph.D. (Dr. Eng.) degrees from Waseda University, Tokyo, Japan. He was a research scientist at NTT Communication Science Laboratories, Kyoto, Japan, from 2001 to 2011, a visiting scholar at the Georgia Institute of Technology, Atlanta, GA, in 2009, and a senior principal research scientist at Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA, from 2012 to 2017. Prior to moving to Carnegie Mellon University, he was an associate research professor at Johns Hopkins University, Baltimore, MD, USA, from 2017 to 2020. His research interests include automatic speech recognition, speech enhancement, spoken language understanding, and machine learning for speech and language processing.
He has published more than 300 papers in peer-reviewed journals and conferences and received several awards, including the best paper award from IEEE ASRU in 2019. He served as an Associate Editor of the IEEE Transactions on Audio, Speech, and Language Processing. He was/has been a member of several technical committees, including the APSIPA Speech, Language, and Audio Technical Committee (SLA), IEEE Signal Processing Society Speech and Language Technical Committee (SLTC), and Machine Learning for Signal Processing Technical Committee (MLSP). - -- August 8, 2022 - - Title: [A Unified Understanding of Voice Conversion and its Medical Application]({{ site.baseurl }}/assets/pdf/2022-08-08-voice_conversion.pdf) - - Speaker: Wen-Chin Huang (Nagoya University) - - Abstract: Voice conversion (VC) is the task of converting one kind of speech to another without changing the linguistic contents, and is the second most popular research field in speech synthesis. With the rise of deep neural networks, there are more and more VC methods being proposed each year, and it might be hard to understand the differences between these methods at first sight. In this talk, I will provide my own, unified understanding of VC, and show how most successful VC methods implement the same underlying framework. I will also introduce my recent work on dysarthric VC, as a showcase of an application of VC to medicine. - - Bio: Wen-Chin Huang is currently a Ph.D. candidate at Nagoya University, Nagoya, Japan. He received the B.S. degree from National Taiwan University, Taipei, Taiwan, in 2018 and the M.S. degree from Nagoya University, Nagoya, Japan, in 2021. He was the recipient of the Best Student Paper Award in ISCSLP2018, the Best Paper Award in APSIPA ASC 2021, and the research fellowship for young scientists (DC1) from the Japan Society for the Promotion of Science in 2021. He was a co-organizer of the Voice Conversion Challenge 2020 and VoiceMOS Challenge 2022. His research focuses on deep learning applications to speech processing, with a main focus on voice conversion and speech quality assessment. - -- June 16, 2022 - - Title: Language Technology for Medical Scribing - - Speaker: Thomas Schaaf (3M \| M\*Modal) - - Abstract: For many reasons, physicians document what they are doing. In the past, they have used handwritten or dictated notes. With the introduction of EHR systems, the complexity of the documentation workflow has increased, leading to frustration and burnout. Medical asynchronous scribes can do the data entry and note-taking for physicians from an audio recording of the conversation between the physician and the patient. Scribes can be supported with Language Technology using a pipeline of speaker diarization, speech recognition, and natural language understanding. This enables them to asynchronously navigate the audio and review extracted dictated sections or abstractive summaries of the conversation. - - Bio: Thomas Schaaf is a Principal Research Scientist at 3M \| M\*Modal. He received his Dr. Ing. from the Universität Karlsruhe in 2004 and has been working on Automatic Speech Recognition, Speech Translation, and Natural Language Understanding at Sony Europe, Carnegie Mellon University, Toshiba Europe, Amazon, and M\*Modal. He has worked on nearly all aspects of speech recognition systems, and his research has contributed, among other things, to the prediction of word confidences, detection and learning of out-of-vocabulary words, and speaker normalization.
He joined 3M in 2019 through the acquisition of M\*Modal. There, his research focuses on understanding doctor-patient conversations to reduce the burden of the documentation process for doctors and create more time to care. He is also Adjunct Faculty at the Language Technology Institute of Carnegie Mellon University and a reviewer for numerous conferences and journals. - -- May 13, 2022 - - Title: Directions of Dialog Research in the Era of Big Pre-training Models - - Speaker: Zhou Yu (Columbia University) - - Abstract: Big pre-training models (such as BERT and GPT3) have demonstrated excellent performances on various NLP tasks. Instruction tuning and prompting have enabled these models to shine in low-resource settings. The natural question is “Will big models solve dialog tasks?” This talk will first go through big models’ impact on several sub-topics within dialog systems (e.g. social chatbots, task-oriented dialog systems, negotiation/persuasion dialog systems, continue learning in dialog systems, multilingual dialog systems, multimodal dialog systems, deployable dialog systems, etc) and then follow up with the speaker's own interpretations of the challenges remaining and possible future directions. - - Bio: Zhou Yu joined the CS department at Columbia University in Jan 2021 as an Assistant Professor ([http://www.cs.columbia.edu/~zhouyu/](http://www.cs.columbia.edu/~zhouyu/)). Before that, she was an Assistant Professor at UC Davis. She obtained her Ph.D. from Carnegie Mellon University in 2017. Zhou has built various dialog systems that have a real impact, such as a job interview training system, a depression screening system, and a second language learning system. Her research interests include dialog systems, language understanding and generation, vision and language, human-computer interaction, and social robots. Zhou received an ACL 2019 best paper nomination, featured in Forbes 2018 30 under 30 in Science, and won the 2018 Amazon Alexa Prize. diff --git a/_pages/sponsors.md b/_pages/sponsors.md deleted file mode 100644 index d8fb6eea..00000000 --- a/_pages/sponsors.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -layout: page -permalink: /sponsors/ -title: Sponsors -description: We appreciate the sponsorship from various partners. The list is sorted in alphabet order. -nav: true -order: 7 ---- - - -- ASAPP -- Facebook -- Google -- Hitachi -- Honda -- Hyundai -- HLTCOE, Johns Hopkins University -- Line -- MIT Lincoln Laboratory -- Mitsubishi Electric Research Laboratories (MERL) -- Naver -- NSF -- NTT Communication Science Laboratories -- Sony -- Tencent AI Lab -- Yahoo! Japan diff --git a/_plugins/details.rb b/_plugins/details.rb deleted file mode 100644 index fa12bf26..00000000 --- a/_plugins/details.rb +++ /dev/null @@ -1,24 +0,0 @@ -# Code from http://movb.de/jekyll-details-support.html - -module Jekyll - module Tags - class DetailsTag < Liquid::Block - - def initialize(tag_name, markup, tokens) - super - @caption = markup - end - - def render(context) - site = context.registers[:site] - converter = site.find_converter_instance(::Jekyll::Converters::Markdown) - caption = converter.convert(@caption).gsub(/<\/?p[^>]*>/, '').chomp - body = converter.convert(super(context)) - "
<details><summary>#{caption}</summary>#{body}</details>
" - end - - end - end - end - - Liquid::Template.register_tag('details', Jekyll::Tags::DetailsTag) \ No newline at end of file diff --git a/_posts/2018-12-31-paper-list.md b/_posts/2018-12-31-paper-list.md deleted file mode 100644 index 31c728d8..00000000 --- a/_posts/2018-12-31-paper-list.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: post -title: 2018 Papers -date: 2018-12-31 11:00:00-0800 -description: 2018 Paper List -comments: false ---- - -
- {{y}}
- {% bibliography -f papers -q @*[year=2018]* %}
- diff --git a/_posts/2019-12-31-paper-list.md b/_posts/2019-12-31-paper-list.md deleted file mode 100644 index b80903ed..00000000 --- a/_posts/2019-12-31-paper-list.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: post -title: 2019 Papers -date: 2019-12-31 11:00:00-0800 -description: 2019 Paper List -comments: false ---- - -
- {{y}}
- {% bibliography -f papers -q @*[year=2019]* %}
- diff --git a/_posts/2020-12-31-paper-list.md b/_posts/2020-12-31-paper-list.md deleted file mode 100644 index 03aca96c..00000000 --- a/_posts/2020-12-31-paper-list.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: post -title: 2020 Papers -date: 2020-12-31 11:00:00-0800 -description: 2020 Paper List -comments: false ---- - -
- {{y}}
- {% bibliography -f papers -q @*[year=2020]* %}
- diff --git a/_posts/2021-07-13-reading-group.md b/_posts/2021-07-13-reading-group.md deleted file mode 100644 index ddd06b7a..00000000 --- a/_posts/2021-07-13-reading-group.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -layout: post -title: 2021 Reading Group -date: 2021-09-09 09:00:00-0800 -description: Paper list and comments for reading group -comments: false ---- - -### 2021.09.07 Interspeech 2021 Paper list -- [Speech SimCLR: Combining Contrastive and Reconstruction Objective for Self-Supervised Speech Representation Learning](https://www.isca-speech.org/archive/interspeech_2021/jiang21_interspeech.html) -- [Stochastic Attention Head Removal: A Simple and Effective Method for Improving Transformer Based ASR Models](https://www.isca-speech.org/archive/interspeech_2021/zhang21p_interspeech.html) -- [Graph-PIT: Generalized Permutation Invariant Training for Continuous Separation of Arbitrary Numbers of Speakers](https://www.isca-speech.org/archive/interspeech_2021/neumann21_interspeech.html) - - - -### 2021.10.12 Interspeech 2021 Paper list -- [LT-LM: A Novel Non-Autoregressive Language Model for Single-Shot Lattice Rescoring](https://www.isca-speech.org/archive/interspeech_2021/mitrofanov21_interspeech.html) -- [Variable Frame Rate Acoustic Models Using Minimum Error Reinforcement Learning](https://www.isca-speech.org/archive/interspeech_2021/jiang21b_interspeech.html) -- [Human Listening and Live Captioning: Multi-Task Training for Speech Enhancement](https://www.isca-speech.org/archive/interspeech_2021/eskimez21b_interspeech.html) -- [Speech Denoising Without Clean Training Data: A Noise2Noise Approach](https://www.isca-speech.org/archive/interspeech_2021/kashyap21_interspeech.html) -- [Efficient and Stable Adversarial Learning Using Unpaired Data for Unsupervised Multichannel Speech Separation](https://www.isca-speech.org/archive/interspeech_2021/nakagome21_interspeech.html) -- [Sparse Mixture of Local Experts for Efficient Speech Enhancement](http://www.interspeech2020.org/index.php?m=content&c=index&a=show&catid=412&id=1229) -- [Manifold-Aware Deep Clustering: Maximizing Angles Between Embedding Vectors Based on Regular Simplex](https://www.isca-speech.org/archive/interspeech_2021/tanaka21_interspeech.html) - - -### 2021.10.19 Multimodal Speech Summarization (Speaker: Shruti) -- [Multimodal Speech Summarization Through Semantic Concept Learning](https://www.isca-speech.org/archive/interspeech_2021/palaskar21_interspeech.html) - -### 2021.10.26 Code Switching introduction (Speaker: Dan) -Papers: -- [TOWARDS END-TO-END CODE-SWITCHING SPEECH RECOGNITION](https://arxiv.org/pdf/1810.13091.pdf) -- [Towards Context-Aware End-to-End Code-Switching Speech Recognition](http://www.interspeech2020.org/uploadfile/pdf/Thu-3-5-10.pdf) -- [Detection of language boundary in code-switching utterances by bi-phone probabilities](https://ieeexplore.ieee.org/document/1409644) -- [Bilingual Language Mixing: Why Do Bilinguals Code-Switch?](https://journals.sagepub.com/doi/pdf/10.1111/1467-8721.00140?casa_token=xRgWLP3mKxkAAAAA:qrie3I-QwiF5vSGpCId0h7nZNDxb4L5K5nUZjJxq5UblA7HCohpvV9digt03dpz72XEsnltut640) -- [An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech](https://ieeexplore.ieee.org/document/8462180) -- [Data Augmentation for end-to-end Code-Switching Speech Recognition](https://ieeexplore.ieee.org/document/9383620) -- [Constrained Output Embeddings for End-to-End Code-Switching Speech Recognition with Only Monolingual Data](https://arxiv.org/pdf/1904.03802.pdf) - -Slides: - -- 
[Code Switching ASR](https://github.com/shinjiwlab/shinjiwlab.github.io/tree/source/assets/pdf/reading_group_code_switching.pdf) - - -### 2021.11.09 WASPAA 2021 Paper list (Selector: Zhong-Qiu) -- [DF-Conformer: Integrated architecture of Conv-TasNet and Conformer using linear complexity self-attention for speech enhancement](https://arxiv.org/abs/2106.15813) -- [Self-Supervised Learning from Automatically Separated Sound Scenes](https://arxiv.org/abs/2105.02132) -- [HARP-Net: Hyper-Autoencoded Reconstruction Propagation for Scalable Neural Audio Coding](https://arxiv.org/abs/2107.10843) - -### 2021.11.16 WASPAA 2021 Paper list (Selector: Zhong-Qiu) - -- [Point Cloud Audio Processing](https://arxiv.org/abs/2105.02469) -- [Filtered Noise Shaping for Time Domain Room Impulse Response Estimation From Reverberant Speech](https://arxiv.org/abs/2107.07503) -- [HiFi-GAN-2: Studio-quality Speech Enhancement via Generative Adversarial Networks Conditioned on Acoustic Features](https://arxiv.org/pdf/2006.05694.pdf) - - - diff --git a/_posts/2021-12-13-2021-record.md b/_posts/2021-12-13-2021-record.md deleted file mode 100644 index df34c5a8..00000000 --- a/_posts/2021-12-13-2021-record.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -layout: post -title: 2021 Activities Gallery -date: 2022-01-10 11:00:00-0800 -description: This is a memory gallery of 2021 activities. -comments: false ---- -
- ScreenShot from our End-of-semester Presentation, 05.20.2021
- Group party at The Church Brew Works, 10.29.2021
- Farewell for Yen-Ju and Welcome for Yosuke Kashiwagi at Cafe Carnegie, 12.19.2021
\ No newline at end of file diff --git a/_posts/2021-12-31-paper-list.md b/_posts/2021-12-31-paper-list.md deleted file mode 100644 index b3f46bbf..00000000 --- a/_posts/2021-12-31-paper-list.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: post -title: 2021 Papers -date: 2021-09-10 11:00:00-0800 -description: 2021 Paper List -comments: false ---- - -
- {{y}}
- {% bibliography -f papers -q @*[year=2021]* %}
- diff --git a/_posts/2022-01-01-aws-usage.md b/_posts/2022-01-01-aws-usage.md deleted file mode 100644 index 62023ddf..00000000 --- a/_posts/2022-01-01-aws-usage.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -layout: post -title: AWS Usage -date: 2022-01-01 09:00:00-0800 -description: AWS usage. -comments: false ---- - -# How to make an instance -1. basically follow https://aws.amazon.com/getting-started/hands-on/get-started-dlami/?nc1=h_ls -1. select an instance (e.g., p3.8xlarge or p3.16xlarge) -1. change the storage size to what we want (e.g., 1 TB or a couple of TB) -1. save the pem files - -# How to give access to other people? -1. send the pem file to them -1. ask them to log in with the following command -```bash -chmod 0400 xxx.pem -ssh -i xxx.pem ubuntu@yyy - -e.g. -chmod 0400 aws_aswin1.pem -ssh -i aws_aswin1.pem ubuntu@18.191.179.225 -``` - -# How to install espnet and set up the environments? -1. follow https://espnet.github.io/espnet/installation.html and install the required libraries. -1. the default installation seems to work -1. I recommend downloading the data and setting up the environments in advance. It takes a couple of hours (or more). We could put the corpora in ~/corpora and make a working directory in ~//work - diff --git a/_posts/2022-01-01-espnet2-recipe.md b/_posts/2022-01-01-espnet2-recipe.md deleted file mode 100644 index 6911e106..00000000 --- a/_posts/2022-01-01-espnet2-recipe.md +++ /dev/null @@ -1,216 +0,0 @@ ---- -layout: post -title: ESPnet Recipe Instructions -date: 2022-01-01 09:00:00-0800 -description: How to make an ESPnet2 recipe from scratch -comments: false ---- - -## 0. Installation - -For PSC usage and kaldi/espnet installation, please refer to [this wiki]({% post_url 2022-01-01-psc-usage %}). - -## 1. Introduction - -In this section, we will provide an overview of one of the core parts of ESPnet, the recipes, and introduce their format. - -### 1.1. What is a recipe? -First, you need to define the speech task that you want to perform and the corpus that you want to use for this task. Let's call our task "__*task*__" and our corpus "__*corpus*__". As an example, we can have task = asr (Automatic Speech Recognition) and corpus = librispeech. - ->> __A recipe is a folder containing all the scripts to download and prepare the data of *corpus*, train a *task* model on this prepared data, and evaluate its performance.__ - -The different stages of the recipe should be easily executable with bash instructions shared across all recipes, detailed later in this wiki. -In ESPnet2, recipes that train models on the same task share most of their code, using calls to shared scripts. - -### 1.2. What is Kaldi-style? - -ESPnet2 recipes follow the Kaldi style. -Kaldi is a toolkit for speech recognition written in C++. Kaldi is intended for use by speech recognition researchers. - -To create a recipe, we only have to focus on one of Kaldi's top-level directories: __egs__. -egs stands for ‘examples’ and contains the recipes for a lot of corpora. - -ESPnet follows the same architecture as Kaldi, so you will find the same directories in ESPnet. The folder for ESPnet2 examples is ```egs2/```. - -### 1.3. ESPnet2 - -ESPnet2 is a newer version of ESPnet. Contrary to Kaldi, it provides shared bash files for recipes to enable generic stages and states during the process.
For instance, if there are 2 asr recipes, one on the Librispeech corpus and the other on the aishell corpus, the directories ```egs2/Librispeech/asr/``` and ```egs2/aishell/asr/``` will call the generic ```asr.sh``` file. - -## 2. Main steps - -### 2.1. Shared files and folders - -Most of the files and folders are shared with all ESPnet2 recipes. You can just copy them into your recipe's folder and use symbolic links (command: ```ln -s {source-filename} {symbolic-filename}```). In the following, we will point out the specific files that you need to modify for your recipe. - -### 2.2. Important files to write - -#### 2.2.1. Call the generic asr.sh: *run.sh* - -You have to write the ```corpus/task/run.sh``` file, for instance [```aishell/asr/run.sh```](https://github.com/espnet/espnet/blob/master/egs2/aishell/asr1/run.sh). -The role of this file is to call the generic *task* file, for instance the generic *asr.sh* file, with the specific arguments of your recipe. -After a few lines defining variables, your file should look like this: -``` -./asr.sh \ - --lang zh \ - --audio_format wav \ - --feats_type raw \ - --token_type char \ - --use_lm ${use_lm} \ - --use_word_lm ${use_wordlm} \ - --lm_config "${lm_config}" \ - --asr_config "${asr_config}" \ - --inference_config "${inference_config}" \ - --train_set "${train_set}" \ - --valid_set "${valid_set}" \ - --test_sets "${test_sets}" -``` - -As all preparation and training stages are performed through the generic file (here ```asr.sh```), the ```run.sh``` file is a short file. -For more details you can have a look at any recipe's ```run.sh``` file ([aishell](https://github.com/espnet/espnet/blob/master/egs2/aishell/asr1/run.sh), [commonvoice](https://github.com/espnet/espnet/blob/master/egs2/commonvoice/asr1/run.sh) ...). - -#### 2.2.2. Prepare the data: *local/data.sh* - -This will probably be your first and most complicated task. As each recipe comes with its own data, there is no generic file for this part. -The file should handle data download and preparation. Starting from no data, you should get a folder like this after executing the ```local/data.sh``` file. We used the [template](https://github.com/DanBerrebbi/espnet/tree/dan_aishell4_branch/egs2/TEMPLATE) of the ESPnet repo in this section. -``` -data/ - train/ - - text # The transcription - - wav.scp # Wave file path - - utt2spk # A file mapping utterance-id to speaker-id - - spk2utt # A file mapping speaker-id to utterance-id - - segments # [Option] Specifying start and end time of each utterance - dev/ - ... - test/ - ... -``` - -- `text` format - ``` - uttidA <transcription> - uttidB <transcription> - ... - ``` - -- `wav.scp` format - ``` - uttidA /path/to/uttidA.wav - uttidB /path/to/uttidB.wav - ... - ``` - -- `utt2spk` format - ``` - uttidA speakerA - uttidB speakerB - uttidC speakerA - uttidD speakerB - ... - ``` - -- `spk2utt` format - ``` - speakerA uttidA uttidC ... - speakerB uttidB uttidD ... - ... - ``` - - Note that the `spk2utt` file can be generated from `utt2spk`, and `utt2spk` can be generated from `spk2utt`, so it's enough to create either one of them.
- ```bash - utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt - utils/spk2utt_to_utt2spk.pl data/train/spk2utt > data/train/utt2spk - ``` - - If your corpus doesn't include speaker information, you can give the same speaker id as the utterance id to satisfy the directory format, or give the same speaker id for all utterances (actually, we don't use speaker information for the asr recipe now). - - ```bash - uttidA uttidA - uttidB uttidB - ... - ``` - -- [Option] `segments` format - - If the audio data is originally a long recording, longer than about 1 hour, and each audio file includes multiple utterances in each section, you need to create a `segments` file to specify the start time and end time of each utterance. The format is `<utterance_id> <wav_id> <start_time> <end_time>`. - - ``` - sw02001-A_000098-001156 sw02001-A 0.98 11.56 - ... - ``` - - Note that if using `segments`, `wav.scp` has `<wav_id>`, which corresponds to the `segments`, instead of `<utterance_id>`. - - ``` - sw02001-A /path/to/sw02001-A.wav - ... - ``` - -## 3. Shared files description - -As the shared task files ([```asr.sh```](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh), [```tts.sh```](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/tts1/tts.sh) ...) handle most of the important steps in ESPnet2, it is important to know how they are built. The shared files are built with stages. - -### 3.1. ```asr.sh``` - -[```asr.sh```](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh) contains 15 stages. -Overview: -- [stage 1 to stage 5](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L417-L688) : data preparation stages - * [stage 1](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L418-L422) : call to your own data.sh file - * [stage 2](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L424-L444) : speed perturbation modification of inputs - * [stage 3](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L446-L548) : create a dump folder, segment audio files, and change the audio format and sampling rate if needed. This step gives the files a common format, which enables combining different corpora at training or inference time.
- * [stage 4](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L549-L612) : remove short and long utterances - * [stage 5](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L613-L683) : generate a token list (can be word level, character level or bpe level) - -- [stage 6 to stage 8](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L693-L868) : Language Model stages - * [stage 6](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L693-L770) : preparing LM training - * [stage 7](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L771-L845) : train the LM (needs GPU) - * [stage 8](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L847-L863) : calculates [perplexity](https://en.wikipedia.org/wiki/Perplexity) -- [stage 9 to stage 11](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L870-L1093) : ASR training steps - * [stage 9](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L873-L881) : training an ngram model to compare it to our asr model - * [stage 10](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L884-L983) : preparing asr training - * [stage 11](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L985-L1093) : asr training (needs GPU) -- [stage 12 to stage 13](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L1131-L1338) : Evaluation stages : decoding (stage 12) and scoring (stage 13) -- [stage 14 to stage 15](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/asr.sh#L1345-L1434) : model uploading steps, upload your trained model through those two final steps - - -### 3.2. ```diar.sh``` - - - -### 3.3. ```enh.sh``` - - - - -### 3.4. ```tts.sh``` - - -### 3.5. ```st.sh``` - - - -## 4. Log files / TIPS (WIP) \ No newline at end of file diff --git a/_posts/2022-01-01-psc-usage.md b/_posts/2022-01-01-psc-usage.md deleted file mode 100644 index 4c67de31..00000000 --- a/_posts/2022-01-01-psc-usage.md +++ /dev/null @@ -1,399 +0,0 @@ ---- -layout: post -title: PSC Usage -date: 2022-01-01 09:00:00-0800 -description: PSC cluster usage. -comments: false ---- - -## Step by step starting towards successful ssh login -- Look for guide with some screenshots above - -1. Create an account for [ACCESS](https://identity.access-ci.org/new-user) - - This account is used for both PSC and Delta - - When you create, [Register with an existing identity](https://identity.access-ci.org/new-user-federated). Don't do Register without an existing identity. - -2. Send the username to allocation managers (e.g. Xuankai) to add the user in our group. - - After this step is done, you should be able to see `list of resources` at https://allocations.access-ci.org - - To see the `list of resources`, log-in with identity provider `Carnegie Mellon University` - -3. Initialise your PSC password (used for ssh login) - - Go to https://www.psc.edu/resources/bridges-2/user-guide-2-2/ and click `PSC Password Change Utility` - - **It may take few hours** for your `username` and `email` to be recognised, even if they're correct. - -4. Access via ssh - - ssh [username]@[resource_dir] - - E.g., `jjung1@bridges2.psc.edu` - - Use the password you initialised in step 3. - -# Important -1. `Home` directory is of limited space. Please do most of your work in ocean storage (`$ cd ${PROJECT}`) -2. When you publish a paper, please **acknowledge the PSC and ACCESS**. 
We will benefit when we apply for PSC credits next time. - * [Acknowledgement webpage](https://access-ci.org/about/acknowledging-access/) - * Example: Experiments of this work used the Bridges2 system at PSC through allocations CIS210014 and IRI120008P from the Advanced Cyberinfrastructure Coordination Ecosystem: Services \& Support (ACCESS) program, supported by National Science Foundation grants \#2138259, \#2138286, \#2138307, \#2137603, and \#2138296.
- Add the following references - - ``` - @ARTICLE{xsede, - author = {J. Towns and T. Cockerill and M. Dahan and I. Foster and K. Gaither and A. Grimshaw and V. Hazlewood and S. Lathrop and D. Lifka and G. D. Peterson and R. Roskies and J. R. Scott and N. Wilkins-Diehr}, - journal = {Computing in Science \& Engineering}, - title = {XSEDE: Accelerating Scientific Discovery}, - year = {2014}, - volume = {16}, - number = {5}, - pages = {62-74}, - keywords={Knowledge discovery;Scientific computing;Digital systems;Materials engineering;Supercomputers}, - doi = {10.1109/MCSE.2014.80}, - url = {doi.ieeecomputersociety.org/10.1109/MCSE.2014.80}, - ISSN = {1521-9615}, - month={Sept.-Oct.} - } - @inproceedings{nystrom2015bridges, - title={Bridges: a uniquely flexible HPC resource for new communities and data analytics}, - author={Nystrom, Nicholas A and Levine, Michael J and Roskies, Ralph Z and Scott, J Ray}, - booktitle={Proceedings of the 2015 XSEDE Conference: Scientific Advancements Enabled by Enhanced Cyberinfrastructure}, - pages={1--8}, - year={2015} - } - ``` -
- - -# Summary of PSC usage and the partitions -* Both PSC have limited service units (SUs) for resource availability. -* `sinfo` lists all the available partitions in PSC and their status. - - - - - - - - - - - - - - - - - - - - - - - - - -
| | PSC(Bridges-2) |
| --- | --- |
| Partitions | GPU, GPU-shared, RM-small, RM, RM-512, RM-shared |
| GPU resources | GPU, GPU-shared (Recommended) |
| CPU resources | RM-small, RM, RM-512, RM-shared (Recommended) |
| Default | `RM`, which requests to allocate a 128-cpu node. (**Be careful of this case**) |

## Misc. resources
| | PSC(Bridges-2) |
| --- | --- |
| User Guide | https://www.psc.edu/resources/bridges-2/user-guide-2/ |
| Connect from browser | https://ondemand.bridges2.psc.edu/ |
| ESPnet installation guide | https://espnet.github.io/espnet/installation.html |
| Step-by-step guide with pictures | https://granite-echidna-ff2.notion.site/Access-for-PSC-07c3d4c05b54426895e3ddc87276e4b5 |
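As a quick, non-authoritative illustration of how the partition names above are typically used, the sketch below shows a minimal Slurm batch script for a single-GPU job on the recommended `GPU-shared` partition. The job name, log file, wall time, and training command are placeholders, and the `-p GPU-shared --gpus=type:n` form simply mirrors the flags explained in the sections that follow.

```bash
#!/bin/bash
#SBATCH -p GPU-shared            # shared GPU partition (recommended over the full GPU partition)
#SBATCH --gpus=v100-16:1         # one 16GB V100; use v100-32:1 for a 32GB GPU node
#SBATCH -t 24:00:00              # wall time (placeholder); PSC limits each job to 2 days
#SBATCH -J my-training-job       # job name (placeholder)
#SBATCH -o slurm-%j.out          # log file (placeholder)

# Placeholder command: replace with your actual workload,
# e.g. an ESPnet recipe's ./run.sh launched from its egs2 recipe directory.
bash ./run.sh
```

The script can be submitted with `sbatch`; the detailed good-practice guidance for GPU and RM jobs is given below.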
- - -## GPU Partitions -* In `GPU / GPU-shared` partitions, each node consists of 8 v100 GPU devices -* There are two types of GPU nodes: `v100-16` and `v100-32` having GPU units with 16GB and 32GB memory respectively. -* Submit jobs to `GPU-shared` partition. (**Recommended**) - * Using `-p GPU-shared --gpus=type:n` in `sbatch` or `srun`. Here `type` can be `v100-16` or `v100-32` and `n` can range from 1 to 4 (1 is recommended). -* Submit jobs to `GPU` partition. - * **Please use it only when necessary.** - * Using `-p GPU` in `sbatch` or `srun`. It request to allocate a whole GPU node, 8 GPUs, for each job. - * In this case, it deducts 8 SUs from our team's GPU allocation every hour the job runs. -* **Good Practice: Usually, users are strongly recommended to use `GPU-shared` partition and allocate 1 GPU only for each job.** - * When allocating multiple GPUs, you usually need to wait much longer time. - * Single-GPU jobs are more efficient than multi-GPU ones: the latter will have communication overhead. - * Multi-GPU jobs will be allowed only when (1) the users have been familiar with the cluster and (2) the project really needs that resource. Beginners are strongly discouraged from trying this option. - * Please avoid using `GPU` partition by mistake: it makes the allocated GPUs idle and causes much waste to our resources. - * We will periodically check each user's usage and send a reminder when necessary. - -## RM Partitions (For CPU Jobs) -* In the `RM and RM-512` partitions, each node consists of 128 cores. -* Nodes in RM, RM-shared partitions have a memory of 128GB, while nodes in RM-512 partitions have 512GB memory. -* using an entire node for an hour will deduct 128 SUs from our team's Regular Memory allocation. -* Submit jobs to `RM-shared` partition. (**suggested**) - * Using `-p RM-shared --ntasks-per-node=n --mem=2000M` in `sbatch` or `srun`. Here `n` can range from 1 to 64. - * Usually, jobs only require a few cpu cores. -* Submit jobs to `RM` partition. - * **Please use it only when necessary.** - * It request to allocate all of the 128 cpu cores. - * Using `-p RM --ntasks-per-node=n` in `sbatch` or `srun`. Here `n` can range from 1 to 64. -* **Good Practice: Usually, users are strongly recommended to use `RM-shared` partition.** - * You can adjust the memory allocation of each CPU job, but please bear in mind that CPU is also charged by every 2000M memory allocation. E.g., a CPU core with 4000M memory running 1 hour will also be charged 2SUs. - * Usage of `RM` or `RM-512` partitions will be allowed only when (1) the users have been familiar with the cluster and (2) the project really needs that resource. Beginners are strongly discouraged from trying this option. - * Please avoid using `RM` or `RM-512` partitions by mistake: it makes the allocated CPUs idle and causes much waste to our resources. - * We will periodically check each user's usage and send a reminder when necessary. - -## Other usage -* Data copying / file transfer - * Suggest to use `data.bridges2.psc.edu` as the machine name. Doing file transfer with `rsync`, `sftp`, `scp`, etc. [Transferring Files](https://www.psc.edu/resources/bridges-2/user-guide-2-2/). Following is an example of `scp`. - ``` - # This requires the enrollment of Two-Factor Authentication (TFA) - scp -P 2222 myfile XSEDE-username@data.bridges2.psc.edu:/path/to/file - ``` - -* Submitting jobs with dependency - * This can be used to submit a job which is expected to start run after some specific jobs finish. 
In many cases, training a model can take a few days. However, PSC has the restriction that each job can run for 2 days at most. In this case, we can start a job with dependency for long jobs. For example, you already start a job ID is 000001 and you want a following job right after it. You can submit jobs like: - ``` - sbatch --time 2-0:00:00 --dependency=afterany:000001 run.sh - ``` - -* Common arguments in sbatch / srun - ``` - -p, --partition=partition partition requested - -J, --job-name=jobname name of job - -t, --time=time time limit - --gres=rsrc_name[:rsrc_type]:rsrc_num required generic resources - -c, --cpus-per-task=ncpus number of cpus required per task - -d, --dependency=type:jobid[:time] defer job until condition on jobid is satisfied - -e, --error=err file for batch script's standard error - -o, --output=out file for batch script's standard output - --mem-per-cpu=MB maximum amount of real memory per allocated - --ntasks-per-node=n number of tasks to invoke on each node - --reservation=name allocate resources from named reservation, e.g. `GPUcis210027` - ``` - -* View tools - * slurm commands - ``` - # view jobs in the queue - squeue -u ${username} - - # view detailed job info - scontrol show jobid -d ${jobid} - - # view job history and billing info, e.g. since time 04/22/2022 12am. - sacct -u ${username} -S 2022-04-22T00:00:00 --format=JobID,jobname,user,elapsed,nnodes,alloccpus,state,partition,nodelist,AllocTRES%50,CPUTime - ``` - * PSC provides `slurm-tool` - ``` - # Show or watch job queue: - slurm-tool [watch] queue show own jobs - slurm-tool [watch] q show user's jobs - slurm-tool [watch] quick show quick overview of own jobs - slurm-tool [watch] shorter sort and compact entire queue by job size - slurm-tool [watch] short sort and compact entire queue by priority - slurm-tool [watch] full show everything - slurm-tool [w] [q|qq|ss|s|f] shorthands for above! - - slurm-tool qos show job service classes - slurm-tool top [queue|all] show summary of active users - - # Show detailed information about jobs: - slurm-tool prio [all|short] show priority components - slurm-tool j|job show everything else - slurm-tool steps show memory usage of running srun job steps - - # Show usage and fair-share values from accounting database: - slurm-tool h|history