diff --git a/joss.05403/10.21105.joss.05403.crossref.xml b/joss.05403/10.21105.joss.05403.crossref.xml new file mode 100644 index 0000000000..d63d5846f3 --- /dev/null +++ b/joss.05403/10.21105.joss.05403.crossref.xml @@ -0,0 +1,572 @@ + + + + 20231120T173622-f5cd4f87feb07d150e64a08e4dfe057a8a799847 + 20231120173622 + + JOSS Admin + admin@theoj.org + + The Open Journal + + + + + Journal of Open Source Software + JOSS + 2475-9066 + + 10.21105/joss + https://joss.theoj.org + + + + + 11 + 2023 + + + 8 + + 91 + + + + Software Design and User Interface of ESPnet-SE++: +Speech Enhancement for Robust Speech Processing + + + + Yen-Ju + Lu + https://orcid.org/0000-0001-8400-4188 + + + Xuankai + Chang + https://orcid.org/0000-0002-5221-5412 + + + Chenda + Li + https://orcid.org/0000-0003-0299-9914 + + + Wangyou + Zhang + https://orcid.org/0000-0003-4500-3515 + + + Samuele + Cornell + https://orcid.org/0000-0002-5358-1844 + + + Zhaoheng + Ni + + + Yoshiki + Masuyama + + + Brian + Yan + + + Robin + Scheibler + https://orcid.org/0000-0002-5205-8365 + + + Zhong-Qiu + Wang + https://orcid.org/0000-0002-4204-9430 + + + Yu + Tsao + https://orcid.org/0000-0001-6956-0418 + + + Yanmin + Qian + https://orcid.org/0000-0002-0314-3790 + + + Shinji + Watanabe + https://orcid.org/0000-0002-5970-8631 + + + + 11 + 20 + 2023 + + + 5403 + + + 10.21105/joss.05403 + + + http://creativecommons.org/licenses/by/4.0/ + http://creativecommons.org/licenses/by/4.0/ + http://creativecommons.org/licenses/by/4.0/ + + + + Software archive + 10.5281/zenodo.10048174 + + + GitHub review issue + https://github.com/openjournals/joss-reviews/issues/5403 + + + + 10.21105/joss.05403 + https://joss.theoj.org/papers/10.21105/joss.05403 + + + https://joss.theoj.org/papers/10.21105/joss.05403.pdf + + + + + + ESPnet-SE: End-to-end speech enhancement and +separation toolkit designed for ASR integration + Li + 2021 IEEE spoken language technology workshop +(SLT) + 10.1109/slt48900.2021.9383615 + 2021 + Li, C., Shi, J., Zhang, W., +Subramanian, A. S., Chang, X., Kamo, N., Hira, M., Hayashi, T., +Boeddeker, C., & Chen, S., Z. Watanabe. (2021). ESPnet-SE: +End-to-end speech enhancement and separation toolkit designed for ASR +integration. 2021 IEEE Spoken Language Technology Workshop (SLT), +785–792. +https://doi.org/10.1109/slt48900.2021.9383615 + + + Deep clustering: Discriminative embeddings +for segmentation and separation + Hershey + 2016 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp.2016.7471631 + 2016 + Hershey, J. R., Chen, Z., Le Roux, +J., & Watanabe, S. (2016). Deep clustering: Discriminative +embeddings for segmentation and separation. 2016 IEEE International +Conference on Acoustics, Speech and Signal Processing (ICASSP), 31–35. +https://doi.org/10.1109/icassp.2016.7471631 + + + Deep attractor network for single-microphone +speaker separation + Chen + 2017 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp.2017.7952155 + 2017 + Chen, Z., Luo, Y., & Mesgarani, +N. (2017). Deep attractor network for single-microphone speaker +separation. 2017 IEEE International Conference on Acoustics, Speech and +Signal Processing (ICASSP), 246–250. +https://doi.org/10.1109/icassp.2017.7952155 + + + DCCRN: Deep complex convolution recurrent +network for phase-aware speech enhancement + Hu + Proceedings of interspeech + 10.21437/interspeech.2020-2537 + 2020 + Hu, Y., Liu, Y., Lv, S., Xing, M., +Zhang, S., Fu, Y., Wu, J., Zhang, B., & Xie, L. (2020). 
DCCRN: Deep +complex convolution recurrent network for phase-aware speech +enhancement. Proceedings of Interspeech, 2472–2476. +https://doi.org/10.21437/interspeech.2020-2537 + + + Deep learning based real-time speech +enhancement for dual-microphone mobile phones + Tan + IEEE/ACM Transactions on Audio, Speech, and +Language Processing + 29 + 10.1109/taslp.2021.3082318 + 2021 + Tan, K., Zhang, X., & Wang, D. +(2021). Deep learning based real-time speech enhancement for +dual-microphone mobile phones. IEEE/ACM Transactions on Audio, Speech, +and Language Processing, 29, 1853–1863. +https://doi.org/10.1109/taslp.2021.3082318 + + + SkiM: Skipping memory lstm for low-latency +real-time continuous speech separation + Li + 2022 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp43922.2022.9746372 + 2022 + Li, C., Yang, L., Wang, W., & +Qian, Y. (2022). SkiM: Skipping memory lstm for low-latency real-time +continuous speech separation. 2022 IEEE International Conference on +Acoustics, Speech and Signal Processing (ICASSP), 681–685. +https://doi.org/10.1109/icassp43922.2022.9746372 + + + DPT-FSNet: Dual-path transformer based +full-band and sub-band fusion network for speech +enhancement + Dang + 2022 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp43922.2022.9746171 + 2022 + Dang, F., Chen, H., & Zhang, P. +(2022). DPT-FSNet: Dual-path transformer based full-band and sub-band +fusion network for speech enhancement. 2022 IEEE International +Conference on Acoustics, Speech and Signal Processing (ICASSP), +6857–6861. +https://doi.org/10.1109/icassp43922.2022.9746171 + + + Recursive speech separation for unknown +number of speakers + Takahashi + Interspeech 2019 + 10.21437/interspeech.2019-1550 + 2019 + Takahashi, N., Parthasaarathy, S., +Goswami, N., & Mitsufuji, Y. (2019). Recursive speech separation for +unknown number of speakers. Interspeech 2019, 1348–1352. +https://doi.org/10.21437/interspeech.2019-1550 + + + FaSNet: Low-latency adaptive beamforming for +multi-microphone audio processing + Luo + 2019 IEEE automatic speech recognition and +understanding workshop (ASRU) + 10.1109/asru46091.2019.9003849 + 2019 + Luo, Y., Han, C., Mesgarani, N., +Ceolini, E., & Liu, S. (2019). FaSNet: Low-latency adaptive +beamforming for multi-microphone audio processing. 2019 IEEE Automatic +Speech Recognition and Understanding Workshop (ASRU), 260–267. +https://doi.org/10.1109/asru46091.2019.9003849 + + + Towards low-distortion multi-channel speech +enhancement: The ESPNET-se submission to the L3DAS22 +challenge + Lu + 2022 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp43922.2022.9747146 + 2022 + Lu, Y. J., Cornell, S., Chang, X., +Zhang, W., Li, C., Ni, Z., Wang, Z., & Watanabe, S. (2022). Towards +low-distortion multi-channel speech enhancement: The ESPNET-se +submission to the L3DAS22 challenge. 2022 IEEE International Conference +on Acoustics, Speech and Signal Processing (ICASSP), 9201–9205. +https://doi.org/10.1109/icassp43922.2022.9747146 + + + TaSNet: Time-domain audio separation network +for real-time, single-channel speech separation + Luo + 2018 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp.2018.8462116 + 2018 + Luo, Y., & Mesgarani, N. (2018). +TaSNet: Time-domain audio separation network for real-time, +single-channel speech separation. 
2018 IEEE International Conference on +Acoustics, Speech and Signal Processing (ICASSP), 696–700. +https://doi.org/10.1109/icassp.2018.8462116 + + + SDR half-baked or well done? + Le Roux + 2019 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp.2019.8683855 + 2019 + Le Roux, J., Wisdom, S., Erdogan, H., +& Hershey, J. R. (2019). SDR half-baked or well done? 2019 IEEE +International Conference on Acoustics, Speech and Signal Processing +(ICASSP), 626–630. +https://doi.org/10.1109/icassp.2019.8683855 + + + Convolutive transfer function invariant SDR +training criteria for multi-channel reverberant speech +separation + Boeddeker + 2021 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp39728.2021.9414661 + 2021 + Boeddeker, C., Zhang, W., Nakatani, +T., Kinoshita, K., Ochiai, T., Delcroix, M., Kamo, N., Qian, Y., & +Haeb-Umbach, R. (2021). Convolutive transfer function invariant SDR +training criteria for multi-channel reverberant speech separation. 2021 +IEEE International Conference on Acoustics, Speech and Signal Processing +(ICASSP), 8428–8432. +https://doi.org/10.1109/icassp39728.2021.9414661 + + + SDR medium rare with fast +computations + Scheibler + 2022 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp43922.2022.9747473 + 2022 + Scheibler, R. (2022). SDR medium rare +with fast computations. 2022 IEEE International Conference on Acoustics, +Speech and Signal Processing (ICASSP), 701–705. +https://doi.org/10.1109/icassp43922.2022.9747473 + + + ESPnet-SE++: Speech enhancement for robust +speech recognition, translation, and understanding + Lu + Proceedings of interspeech + 10.21437/interspeech.2022-10727 + 2022 + Lu, Y. J., Chang, X., Li, C., Zhang, +W., Cornell, S., Ni, Z., Masuyama, Y., Yan, B., Scheibler, R., Wang, Z. +Q., Tsao, Y., & Qian Y. Watanabe, S. (2022). ESPnet-SE++: Speech +enhancement for robust speech recognition, translation, and +understanding. Proceedings of Interspeech, 5458–5462. +https://doi.org/10.21437/interspeech.2022-10727 + + + ESPnet-TTS: Unified, reproducible, and +integratable open source end-to-end text-to-speech +toolkit + Hayashi + 2020 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp40776.2020.9053512 + 2020 + Hayashi, T., Yamamoto, R., Inoue, K., +Yoshimura, T., Watanabe, S., Toda, T., Takeda, K., & Zhang, X., Y. +Tan. (2020). ESPnet-TTS: Unified, reproducible, and integratable open +source end-to-end text-to-speech toolkit. 2020 IEEE International +Conference on Acoustics, Speech and Signal Processing (ICASSP), +7654–7658. +https://doi.org/10.1109/icassp40776.2020.9053512 + + + ESPnet-ST: All-in-one speech translation +toolkit + Inaguma + Proceedings of the 58th annual meeting of the +association for computational linguistics: System +demonstrations + 10.18653/v1/2020.acl-demos.34 + 2020 + Inaguma, H., Kiyono, S., Duh, K., +Karita, S., Soplin, N. E. Y., Hayashi, T., & Watanabe, S. (2020). +ESPnet-ST: All-in-one speech translation toolkit. Proceedings of the +58th Annual Meeting of the Association for Computational Linguistics: +System Demonstrations, 302–311. 
+https://doi.org/10.18653/v1/2020.acl-demos.34 + + + ESPnet-SLU: Advancing spoken language +understanding through ESPnet + Arora + 2022 IEEE international conference on +acoustics, speech and signal processing (ICASSP) + 10.1109/icassp43922.2022.9747674 + 2022 + Arora, S., Dalmia, S., Denisov, P., +Chang, X., Ueda, Y., Peng, Y., Zhang, Y., Kumar, S., Ganesan, K., & +Yan, W., B. (2022). ESPnet-SLU: Advancing spoken language understanding +through ESPnet. 2022 IEEE International Conference on Acoustics, Speech +and Signal Processing (ICASSP), 7167–7171. +https://doi.org/10.1109/icassp43922.2022.9747674 + + + ESPnet: End-to-end speech processing +toolkit + Watanabe + Proceedings of interspeech + 10.21437/interspeech.2018-1456 + 2018 + Watanabe, S., Hori, T., Karita, S., +Hayashi, T., Nishitoba, J., Unno, Y., Soplin, N. E. Y., Heymann, J., +Wiesner, M., Chen, N., Renduchintala, A., & Ochiai, T. (2018). +ESPnet: End-to-end speech processing toolkit. Proceedings of +Interspeech, 2207–2211. +https://doi.org/10.21437/interspeech.2018-1456 + + + The northwestern university source separation +library. + Manilow + International society for music information +retrieval (ISMIR) + 10.1163/1872-9037_afco_asc_1322 + 2018 + Manilow, E., Seetharaman, P., & +Pardo, B. (2018). The northwestern university source separation library. +International Society for Music Information Retrieval (ISMIR), 297–305. +https://doi.org/10.1163/1872-9037_afco_asc_1322 + + + ONSSEN: An open-source speech separation and +enhancement library + Ni + arXiv preprint +arXiv:1911.00982 + 2019 + Ni, M. I., Zhaoheng Mandel. (2019). +ONSSEN: An open-source speech separation and enhancement library. arXiv +Preprint arXiv:1911.00982. + + + Asteroid: The PyTorch-based audio source +separation toolkit for researchers + Pariente + Proceedings of interspeech + 10.21437/interspeech.2020-1673 + 2020 + Pariente, M., Cornell, S., Cosentino, +J., Sivasankaran, S., Tzinis, E., Heitkaemper, J., Olvera, M., Stöter, +F. R., Hu, M., Martı́n-Doñas, J. M., Ditter, D., Frank, A., Deleforge, +A., & Vincent, E. (2020). Asteroid: The PyTorch-based audio source +separation toolkit for researchers. Proceedings of Interspeech, +2637–2641. +https://doi.org/10.21437/interspeech.2020-1673 + + + SpeechBrain: A general-purpose speech +toolkit + Ravanelli + arXiv preprint +arXiv:2106.04624 + 2021 + Ravanelli, M., Parcollet, T., +Plantinga, P., Rouhe, A., Cornell, S., Lugosch, L., Subakan, C., +Dawalatabad, N., Heba, A., Zhong, J., Chou, J. C., Yeh, S. L., Fu, S. +W., Liao, C. F., Rastorgueva, E., Grondin, F., Aris, W., Na, H., Gao, +Y., & Mori R. D. Bengio, Y. (2021). SpeechBrain: A general-purpose +speech toolkit. arXiv Preprint arXiv:2106.04624. + + + The Kaldi speech recognition +toolkit + Povey + IEEE 2011 workshop on automatic speech +recognition and understanding + 10.15199/48.2016.11.70 + 2011 + Povey, D., Ghoshal, A., Boulianne, +G., Burget, L., Glembek, O., Goel, N., Hannemann, M., Motlicek, P., +Qian, Y., Schwarz, P., Silovsky´, J., & Stemmer, K., G. Vesely. +(2011). The Kaldi speech recognition toolkit. IEEE 2011 Workshop on +Automatic Speech Recognition and Understanding. +https://doi.org/10.15199/48.2016.11.70 + + + An algorithm for intelligibility prediction +of time–frequency weighted noisy speech + Taal + IEEE Transactions on Audio, Speech, and +Language Processing + 7 + 19 + 10.1109/tasl.2011.2114881 + 2011 + Taal, C. H., Hendriks, R. C., +Heusdens, R., & Jensen, J. (2011). 
An algorithm for intelligibility +prediction of time–frequency weighted noisy speech. IEEE Transactions on +Audio, Speech, and Language Processing, 19(7), 2125–2136. +https://doi.org/10.1109/tasl.2011.2114881 + + + Perceptual evaluation of speech quality +(PESQ)-a new method for speech quality assessment of telephone networks +and codecs + Rix + 2001 IEEE international conference on +acoustics, speech, and signal processing. Proceedings (cat. No. +01CH37221) + 2 + 10.1109/icassp.2001.941023 + 2001 + Rix, A. W., Beerends, J. G., Hollier, +M. P., & Hekstra, A. P. (2001). Perceptual evaluation of speech +quality (PESQ)-a new method for speech quality assessment of telephone +networks and codecs. 2001 IEEE International Conference on Acoustics, +Speech, and Signal Processing. Proceedings (Cat. No. 01CH37221), 2, +749–752. +https://doi.org/10.1109/icassp.2001.941023 + + + XSEDE: Accelerating scientific +discovery + Towns + Computing in Science & +Engineering + 5 + 16 + 10.1109/mcse.2014.80 + 2014 + Towns, J., Cockerill, T., Dahan, M., +Foster, I., Gaither, K., Grimshaw, A., Hazlewood, V., Lathrop, S., +Lifka, D., Peterson, G. D., Roskies, R., Scott, J. R., & +Wilkins-Diehr, N. (2014). XSEDE: Accelerating scientific discovery. +Computing in Science & Engineering, 16(5), 62–74. +https://doi.org/10.1109/mcse.2014.80 + + + Bridges: A uniquely flexible HPC resource for +new communities and data analytics + Nystrom + Proceedings of the 2015 XSEDE conference: +Scientific advancements enabled by enhanced +cyberinfrastructure + 10.1145/2792745.2792775 + 2015 + Nystrom, N. A., Levine, M. J., +Roskies, R. Z., & Scott, J. R. (2015). Bridges: A uniquely flexible +HPC resource for new communities and data analytics. Proceedings of the +2015 XSEDE Conference: Scientific Advancements Enabled by Enhanced +Cyberinfrastructure, 1–8. +https://doi.org/10.1145/2792745.2792775 + + + + + + diff --git a/joss.05403/10.21105.joss.05403.jats b/joss.05403/10.21105.joss.05403.jats new file mode 100644 index 0000000000..264d646e2f --- /dev/null +++ b/joss.05403/10.21105.joss.05403.jats @@ -0,0 +1,1139 @@ + + +
+ + + + +Journal of Open Source Software +JOSS + +2475-9066 + +Open Journals + + + +5403 +10.21105/joss.05403 + +Software Design and User Interface of ESPnet-SE++: Speech +Enhancement for Robust Speech Processing + + + +https://orcid.org/0000-0001-8400-4188 + +Lu +Yen-Ju + + + + +https://orcid.org/0000-0002-5221-5412 + +Chang +Xuankai + + + + +https://orcid.org/0000-0003-0299-9914 + +Li +Chenda + + + + +https://orcid.org/0000-0003-4500-3515 + +Zhang +Wangyou + + + + +https://orcid.org/0000-0002-5358-1844 + +Cornell +Samuele + + + + + + +Ni +Zhaoheng + + + + + +Masuyama +Yoshiki + + + + + + +Yan +Brian + + + + +https://orcid.org/0000-0002-5205-8365 + +Scheibler +Robin + + + + +https://orcid.org/0000-0002-4204-9430 + +Wang +Zhong-Qiu + + + + +https://orcid.org/0000-0001-6956-0418 + +Tsao +Yu + + + + +https://orcid.org/0000-0002-0314-3790 + +Qian +Yanmin + + + + +https://orcid.org/0000-0002-5970-8631 + +Watanabe +Shinji + + +* + + + +Johns Hopkins University, USA + + + + +Carnegie Mellon University, USA + + + + +Shanghai Jiao Tong University, Shanghai + + + + +Universita` Politecnica delle Marche, Italy + + + + +Meta AI, USA + + + + +Tokyo Metropolitan University, Japan + + + + +LINE Corporation, Japan + + + + +Academia Sinica, Taipei + + + + +* E-mail: + + +22 +8 +2022 + +8 +91 +5403 + +Authors of papers retain copyright and release the +work under a Creative Commons Attribution 4.0 International License (CC +BY 4.0) +2022 +The article authors + +Authors of papers retain copyright and release the work under +a Creative Commons Attribution 4.0 International License (CC BY +4.0) + + + +Python +ESPnet +speech processing +speech enhancement + + + + + +

The Joint-task Systems of SSE with ASR, ST, and SLU in + ESPnet-SE++.

+ +
+ + Summary +

This paper presents the software design and user interface of ESPnet-SE++, a new speech separation and enhancement (SSE) module of the ESPnet toolkit. ESPnet-SE++ significantly expands the functionality of ESPnet-SE (Li et al., 2021) with several new models (Chen et al., 2017; Dang et al., 2022; Hershey et al., 2016; Hu et al., 2020; Li et al., 2022; Lu, Cornell, et al., 2022; Luo et al., 2019; Takahashi et al., 2019; Tan et al., 2021), loss functions (Boeddeker et al., 2021; Le Roux et al., 2019; Luo & Mesgarani, 2018; Scheibler, 2022), and training recipes, as described in Lu, Chang, et al. (2022). Crucially, it features a new, redesigned interface that allows for a flexible combination of SSE front-ends with many downstream tasks, including automatic speech recognition (ASR), speaker diarization (SD), speech translation (ST), and spoken language understanding (SLU).

+
+ + Statement of need +

ESPnet (Watanabe et al., 2018) is an open-source toolkit for speech processing that includes recipes for several tasks: ASR, text-to-speech (TTS) (Hayashi et al., 2020), ST (Inaguma et al., 2020), machine translation (MT), SLU (Arora et al., 2022), and SSE. Compared with other open-source SSE toolkits, such as Nussl (Manilow et al., 2018), Onssen (Ni, 2019), Asteroid (Pariente et al., 2020), and SpeechBrain (Ravanelli et al., 2021), the modularized design of ESPnet-SE++ allows for the joint training of SSE modules with other tasks. Currently, ESPnet-SE++ supports 20 SSE recipes with 24 different enhancement/separation models.

+
ESPnet-SE++ Recipes and Software Structure

ESPnet-SE++ Recipes for SSE and Joint-Task

For each task, ESPnet-SE++ follows the ESPnet2 style and provides common scripts that are carefully designed to work out-of-the-box with a wide variety of corpora. The recipes for different corpora are under the egs2/ folder. Under the egs2/TEMPLATE folder, the common scripts enh1/enh.sh and enh_asr1/enh_asr.sh are shared by all the SSE and joint-task recipes. The directory structure can be found in TEMPLATE/enh_asr1/README.md.

+ + Common Scripts +

enh.sh contains 13 stages, and the + details for the scripts can be found in + TEMPLATE/enh1/README.md.

+ +

enh_asr.sh contains 17 stages, and the details for the scripts can be found in TEMPLATE/enh_asr1/README.md. The enh_diar.sh and enh_st.sh scripts follow a similar structure.

+ +
+ + Training Configuration + + SSE Task Training Configuration +

An example configuration for the enhancement task in the CHiME-4 enh1 recipe is conf/tuning/train_enh_dprnn_tasnet.yaml. This file specifies the types of the encoder, decoder, and separator, along with their respective settings. It also defines the training setup and the loss criterions.
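A hypothetical excerpt in the style of such a file (field names follow the ESPnet2 configuration schema, but the values here are illustrative rather than the shipped recipe settings):

encoder: conv
encoder_conf:
    channel: 64
    kernel_size: 2
    stride: 1
separator: dprnn
separator_conf:
    num_spk: 1
decoder: conv
decoder_conf:
    channel: 64
    kernel_size: 2
    stride: 1
criterions:
  # each criterion is paired with a wrapper that handles permutation
  - name: si_snr
    conf: {}
    wrapper: fixed_order
    wrapper_conf: {}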

+
+ + Joint-Task Training Configuration +

An example of a joint-task training configuration is the CHiME-4 enh_asr1 recipe, configured in conf/tuning/train_enh_asr_convtasnet.yaml. This joint task comprises a front-end SSE model and a back-end ASR model. The configuration file includes specifications for the encoder, decoder, separator, and criterions of both the SSE and ASR models, using the prefixes enh_ and asr_.
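For illustration, the prefixed layout might look like the following hypothetical excerpt (not the shipped file; only the key naming convention is the point here):

enh_encoder: conv
enh_separator: tcn
enh_decoder: conv
asr_encoder: transformer
asr_decoder: transformer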

+
+
+
ESPnet-SE++ Software Structure for SSE Task

The directory structure for the SSE Python files can be found in TEMPLATE/enh1/README.md. Additionally, the UML diagram for the enhancement-only task in ESPnet-SE++ is provided below.

+ +

UML Diagram for Speech Separation and Enhancement in + ESPnet-SE++

+ +
SSE Executable Code bin/*

bin/enh_train.py

As the main interface for the SSE training stage of + enh.sh, + enh_train.py takes the training + parameters and model configurations from the arguments and + calls

+ EnhancementTask.main(...) +

to build an SSE object + ESPnetEnhancementModel for training the + SSE model according to the model configuration.
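In fact, the executable is essentially a thin wrapper around this call; a minimal sketch (following the standard ESPnet2 AbsTask entry-point convention):

# sketch of bin/enh_train.py: delegate everything to the task class
from espnet2.tasks.enh import EnhancementTask

def main(cmd=None):
    # parses command-line arguments, builds ESPnetEnhancementModel,
    # and runs the ESPnet2 Trainer loop
    EnhancementTask.main(cmd=cmd)

if __name__ == "__main__":
    main()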

+
+ + bin/enh_inference.py +

The inference function in + enh_inference.py creates a

+ class SeparateSpeech +

object with the data-iterator for testing and validation. During its initialization, this class instantiates an SSE object ESPnetEnhancementModel based on a configuration file and a pre-trained SSE model.
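For example (a sketch; the constructor argument names vary slightly across ESPnet versions, and the paths are placeholders):

import soundfile as sf
from espnet2.bin.enh_inference import SeparateSpeech

separate_speech = SeparateSpeech(
    train_config="exp/enh_train/config.yaml",
    model_file="exp/enh_train/valid.loss.best.pth",
)
mixwav, fs = sf.read("mixture.wav")
# input shape (batch, n_samples); returns one waveform per separated source
waves = separate_speech(mixwav[None, :], fs=fs)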

+
+ + bin/enh_scoring.py + def scoring(..., ref_scp, inf_scp, ...) +

The SSE scoring function calculates several popular objective measures, such as SI-SDR (Le Roux et al., 2019), STOI (Taal et al., 2011), SDR, and PESQ (Rix et al., 2001), from pairs of reference signals and processed speech.
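As an illustration of one of these measures, SI-SDR can be computed from a reference/estimate pair as follows (a self-contained NumPy sketch, not the toolkit's implementation):

import numpy as np

def si_sdr(reference: np.ndarray, estimate: np.ndarray) -> float:
    # zero-mean both signals, then project the estimate onto the reference
    reference = reference - reference.mean()
    estimate = estimate - estimate.mean()
    alpha = np.dot(estimate, reference) / np.dot(reference, reference)
    target = alpha * reference   # scaled target component
    noise = estimate - target    # residual distortion
    return 10 * np.log10(np.sum(target**2) / np.sum(noise**2))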

+
+
SSE Control Class

tasks/enh.py
class EnhancementTask(AbsTask)

EnhancementTask is a control class designed for SSE tasks. It contains class methods for building and training an SSE model. The class method build_model creates and returns an SSE object ESPnetEnhancementModel.
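Schematically (a sketch using the generic ESPnet2 AbsTask interface; the --config path is a placeholder and further arguments may be required in practice):

from espnet2.tasks.enh import EnhancementTask

# the task parser loads the YAML configuration into an argparse namespace
parser = EnhancementTask.get_parser()
args = parser.parse_args(["--config", "conf/tuning/train_enh_dprnn_tasnet.yaml"])
model = EnhancementTask.build_model(args)  # -> ESPnetEnhancementModel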

+
SSE Modules

enh/espnet_model.py
class ESPnetEnhancementModel(AbsESPnetModel)

ESPnetEnhancementModel is the base class for any ESPnet-SE++ SSE model. Since it inherits from the same abstract base class, AbsESPnetModel, it is well-aligned with other tasks such as ASR, TTS, ST, and SLU, bringing the benefits of cross-task combination.

+ def forward(self, speech_mix, speech_ref, ...) +

The forward function of ESPnetEnhancementModel follows the general design of the ESPnet single-task modules: it processes speech and returns only the losses used by Trainer to update the model.

+ def forward_enhance(self, speech_mix, ...) + def forward_loss(self, speech_pre, speech_ref, ...) +

For more flexible combinations, the forward_enhance function returns the enhanced speech, and the forward_loss function returns the loss. The joint-training methods take the enhanced speech as the input to the downstream task and use the SSE loss as a part of the joint-training loss.
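Schematically, a joint model can therefore combine the two methods as follows (a sketch with simplified signatures; the actual methods also take and return sequence lengths, statistics, and loss weights):

import torch

def joint_forward(enh_model, s2t_model, speech_mix, speech_ref, text_ref,
                  enh_weight: float = 0.3) -> torch.Tensor:
    # front-end: enhance the mixture
    speech_pre = enh_model.forward_enhance(speech_mix)
    # SSE loss against the reference signal
    loss_enh = enh_model.forward_loss(speech_pre, speech_ref)
    # downstream loss (e.g., ASR) computed on the enhanced speech
    loss_s2t = s2t_model(speech_pre, text_ref)
    return enh_weight * loss_enh + loss_s2t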

+
+
ESPnet-SE++ Software Structure for Joint-Task

The directory structure for the joint-task Python files can be found in TEMPLATE/enh_asr1/README.md. Furthermore, the UML diagram for the joint task in ESPnet-SE++ is displayed below.

+ +

UML Diagram for Joint-Task in + ESPnet-SE++

+ +
Joint-Task Executable Code bin/*

bin/enh_s2t_train.py

Similar to the SSE training interface enh_train.py, enh_s2t_train.py takes the training parameters and model configurations from the arguments and calls

+ tasks.enh_s2t.EnhS2TTask.main(...) +

to build a joint-task object for training the joint model, based on a configuration that specifies both the SSE and S2T models, with or without pre-trained checkpoints.

+
+ + bin/asr_inference.py, bin/diar_inference.py, and + bin/st_inference.py +

The inference functions in asr_inference.py, diar_inference.py, and st_inference.py build and call a

+ class Speech2Text + class DiarizeSpeech +

object with the data-iterator for testing and validation. During their initialization, these classes build a joint-task object ESPnetEnhS2TModel from pre-trained joint-task models and their configurations.

+
+
Joint-task Control Class

tasks/enh_s2t.py
class EnhS2TTask(AbsTask)

EnhS2TTask is a control class designed for joint-task models. The subtask models are created and passed into ESPnetEnhS2TModel to create a joint-task object.

+
Joint-Task Modules

enh/espnet_enh_s2t_model.py
class ESPnetEnhS2TModel(AbsESPnetModel)

The ESPnetEnhS2TModel takes a front-end enh_model and a back-end s2t_model (such as an ASR, SLU, ST, or SD model) as inputs to build a joint model.
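Conceptually, building the joint model amounts to the following (a sketch; the real constructor accepts additional options):

from espnet2.enh.espnet_enh_s2t_model import ESPnetEnhS2TModel

joint_model = ESPnetEnhS2TModel(
    enh_model=enh_model,  # front-end ESPnetEnhancementModel
    s2t_model=asr_model,  # back-end model, e.g., an ESPnet ASR model
)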

+ +

The forward function of the class + follows the general design in ESPnet2:

+ def forward(self, speech_mix, speech_ref, ...) +

which processes speech and returns only the losses used by Trainer to update the model.

+
+
+
+ + ESPnet-SE++ User Interface + + Building a New Recipe from Scratch +

Since ESPnet2 provides common scripts such as + enh.sh and enh_asr.sh + for each task, users only need to create + local/data.sh for the data preparation of a + new corpus. The generated data follows the Kaldi-style structure + (Povey + et al., 2011):
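For instance, an SSE corpus prepared by local/data.sh typically contains entries like the following (hypothetical paths):

data/train/
├── wav.scp   # utt_id  /path/to/mixture.wav
├── spk1.scp  # utt_id  /path/to/reference_speaker1.wav
├── utt2spk   # utt_id  speaker_id
└── text      # utt_id  transcription (needed for joint tasks)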

+ +

The detailed instructions for data preparation and building new + recipes in espnet2 are described in the + link.

+
+ + Inference with Pre-trained Models +

Pretrained models from ESPnet are provided on HuggingFace and Zenodo, and users can download them and run inference directly. The model_name in the following section should be a huggingface_id or one of the tags in the table.csv in espnet_model_zoo. Users can also directly provide a Zenodo URL or a HuggingFace URL.
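For example, a model can be fetched programmatically with the espnet_model_zoo downloader (the tag below is a placeholder):

from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
# accepts a huggingface_id, a tag from table.csv, or a direct URL
paths = d.download_and_unpack("model_name")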

+ + Inference API +

The inference functions come from enh_inference and enh_asr_inference in the executable code under bin/:

+ +

Calling SeparateSpeech and Speech2Text with unprocessed audio returns the separated speech and its recognition results.
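A minimal end-to-end sketch (argument names may differ across ESPnet versions; all paths are placeholders):

import soundfile as sf
from espnet2.bin.enh_inference import SeparateSpeech
from espnet2.bin.asr_inference import Speech2Text

separate_speech = SeparateSpeech(
    train_config="enh/config.yaml", model_file="enh/model.pth")
speech2text = Speech2Text(
    asr_train_config="asr/config.yaml", asr_model_file="asr/model.pth")

mixwav, fs = sf.read("mixture.wav")
waves = separate_speech(mixwav[None, :], fs=fs)
for wav in waves:  # one stream per separated speaker
    text, *_ = speech2text(wav.squeeze(0))[0]
    print(text)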

+ + SSE + + + + Joint-Task + +

The details for downloading models and inference are + described in + espnet_model_zoo.

+
+
+
+
+ + Demonstrations +

The demonstrations of ESPnet-SE++ can be found in the following Google Colab notebooks:

+ + +

ESPnet + SSE Demonstration: CHiME-4 and WSJ0-2mix

+
+ +

ESPnet-SE++ + Joint-Task Demonstration: L3DAS22 Challenge and + SLURP-Spatialized

+
+
+
+ + Development plan +

The development plan of ESPnet-SE++ can be found in the Development plan for ESPnet2 speech enhancement. In addition, we will explore combinations with other front-end tasks, such as using ASR as a front-end model and TTS as a back-end model for speech-to-speech conversion.

+
+ + Conclusions +

In this paper, we introduce the software structure and the user interface of ESPnet-SE++, including the SSE task and joint-task models. ESPnet-SE++ provides general recipes for training models on different corpora and a simple way to add new recipes. The joint-task implementation further shows that the modularized design improves the flexibility of ESPnet.

+
+ + Acknowledgement +

This work used the Extreme Science and Engineering Discovery + Environment (XSEDE) + (Towns + et al., 2014), which is supported by NSF grant number + ACI-1548562. Specifically, it used the Bridges system + (Nystrom + et al., 2015), which is supported by NSF award number + ACI-1445606, at the Pittsburgh Supercomputing Center (PSC).

+
+ + + + + + + LiC. + ShiJ. + ZhangW. + SubramanianA. S. + ChangX. + KamoN. + HiraM. + HayashiT. + BoeddekerC. + ChenS.Z. Watanabe + + ESPnet-SE: End-to-end speech enhancement and separation toolkit designed for ASR integration + 2021 IEEE spoken language technology workshop (SLT) + IEEE + 2021 + 10.1109/slt48900.2021.9383615 + 785 + 792 + + + + + + HersheyJ. R. + ChenZ. + Le RouxJ. + WatanabeS. + + Deep clustering: Discriminative embeddings for segmentation and separation + 2016 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 2016 + 10.1109/icassp.2016.7471631 + 31 + 35 + + + + + + ChenZ. + LuoY. + MesgaraniN. + + Deep attractor network for single-microphone speaker separation + 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 2017 + 10.1109/icassp.2017.7952155 + 246 + 250 + + + + + + HuY. + LiuY. + LvS. + XingM. + ZhangS. + FuY. + WuJ. + ZhangB. + XieL. + + DCCRN: Deep complex convolution recurrent network for phase-aware speech enhancement + Proceedings of interspeech + 2020 + 10.21437/interspeech.2020-2537 + 2472 + 2476 + + + + + + TanK. + ZhangX. + WangD. + + Deep learning based real-time speech enhancement for dual-microphone mobile phones + IEEE/ACM Transactions on Audio, Speech, and Language Processing + IEEE + 2021 + 29 + 10.1109/taslp.2021.3082318 + 1853 + 1863 + + + + + + LiC. + YangL. + WangW. + QianY. + + SkiM: Skipping memory lstm for low-latency real-time continuous speech separation + 2022 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 202205 + https://doi.org/10.1109%2Ficassp43922.2022.9746372 + 10.1109/icassp43922.2022.9746372 + 681 + 685 + + + + + + DangF. + ChenH. + ZhangP. + + DPT-FSNet: Dual-path transformer based full-band and sub-band fusion network for speech enhancement + 2022 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 202205 + https://doi.org/10.1109%2Ficassp43922.2022.9746171 + 10.1109/icassp43922.2022.9746171 + 6857 + 6861 + + + + + + TakahashiN. + ParthasaarathyS. + GoswamiN. + MitsufujiY. + + Recursive speech separation for unknown number of speakers + Interspeech 2019 + ISCA + 201909 + https://doi.org/10.21437%2Finterspeech.2019-1550 + 10.21437/interspeech.2019-1550 + 1348 + 1352 + + + + + + LuoY. + HanC. + MesgaraniN. + CeoliniE. + LiuS. + + FaSNet: Low-latency adaptive beamforming for multi-microphone audio processing + 2019 IEEE automatic speech recognition and understanding workshop (ASRU) + IEEE + 201912 + https://doi.org/10.1109%2Fasru46091.2019.9003849 + 10.1109/asru46091.2019.9003849 + 260 + 267 + + + + + + LuY. J. + CornellS. + ChangX. + ZhangW. + LiC. + NiZ. + WangZ. + WatanabeS. + + Towards low-distortion multi-channel speech enhancement: The ESPNET-se submission to the L3DAS22 challenge + 2022 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 202205 + https://doi.org/10.1109%2Ficassp43922.2022.9747146 + 10.1109/icassp43922.2022.9747146 + 9201 + 9205 + + + + + + LuoY. + MesgaraniN. + + TaSNet: Time-domain audio separation network for real-time, single-channel speech separation + 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 201804 + https://doi.org/10.1109%2Ficassp.2018.8462116 + 10.1109/icassp.2018.8462116 + 696 + 700 + + + + + + Le RouxJ. + WisdomS. + ErdoganH. + HersheyJ. R. + + SDR half-baked or well done? 
+ 2019 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 201905 + https://doi.org/10.1109%2Ficassp.2019.8683855 + 10.1109/icassp.2019.8683855 + 626 + 630 + + + + + + BoeddekerC. + ZhangW. + NakataniT. + KinoshitaK. + OchiaiT. + DelcroixM. + KamoN. + QianY. + Haeb-UmbachR. + + Convolutive transfer function invariant SDR training criteria for multi-channel reverberant speech separation + 2021 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 202106 + https://doi.org/10.1109%2Ficassp39728.2021.9414661 + 10.1109/icassp39728.2021.9414661 + 8428 + 8432 + + + + + + ScheiblerR. + + SDR medium rare with fast computations + 2022 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 202205 + https://doi.org/10.1109%2Ficassp43922.2022.9747473 + 10.1109/icassp43922.2022.9747473 + 701 + 705 + + + + + + LuY. J. + ChangX. + LiC. + ZhangW. + CornellS. + NiZ. + MasuyamaY. + YanB. + ScheiblerR. + WangZ. Q. + TsaoY. + Qian Y. WatanabeS. + + ESPnet-SE++: Speech enhancement for robust speech recognition, translation, and understanding + Proceedings of interspeech + 2022 + 10.21437/interspeech.2022-10727 + 5458 + 5462 + + + + + + HayashiT. + YamamotoR. + InoueK. + YoshimuraT. + WatanabeS. + TodaT. + TakedaK. + ZhangX.Y. Tan + + ESPnet-TTS: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit + 2020 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 2020 + 10.1109/icassp40776.2020.9053512 + 7654 + 7658 + + + + + + InagumaH. + KiyonoS. + DuhK. + KaritaS. + SoplinN. E. Y. + HayashiT. + WatanabeS. + + ESPnet-ST: All-in-one speech translation toolkit + Proceedings of the 58th annual meeting of the association for computational linguistics: System demonstrations + Association for Computational Linguistics + 2020 + 10.18653/v1/2020.acl-demos.34 + 302 + 311 + + + + + + AroraS. + DalmiaS. + DenisovP. + ChangX. + UedaY. + PengY. + ZhangY. + KumarS. + GanesanK. + YanWatanabeB + + ESPnet-SLU: Advancing spoken language understanding through ESPnet + 2022 IEEE international conference on acoustics, speech and signal processing (ICASSP) + IEEE + 2022 + 10.1109/icassp43922.2022.9747674 + 7167 + 7171 + + + + + + WatanabeS. + HoriT. + KaritaS. + HayashiT. + NishitobaJ. + UnnoY. + SoplinN. E. Y. + HeymannJ. + WiesnerM. + ChenN. + RenduchintalaA. + OchiaiT. + + ESPnet: End-to-end speech processing toolkit + Proceedings of interspeech + 2018 + 10.21437/interspeech.2018-1456 + 2207 + 2211 + + + + + + ManilowE. + SeetharamanP. + PardoB. + + The northwestern university source separation library. + International society for music information retrieval (ISMIR) + 2018 + 10.1163/1872-9037_afco_asc_1322 + 297 + 305 + + + + + + NiMichael IZhaoheng Mandel + + ONSSEN: An open-source speech separation and enhancement library + arXiv preprint arXiv:1911.00982 + 2019 + + + + + + ParienteM. + CornellS. + CosentinoJ. + SivasankaranS. + TzinisE. + HeitkaemperJ. + OlveraM. + StöterF. R. + HuM. + Martı́n-DoñasJ. M. + DitterD. + FrankA. + DeleforgeA. + VincentE. + + Asteroid: The PyTorch-based audio source separation toolkit for researchers + Proceedings of interspeech + 2020 + 10.21437/interspeech.2020-1673 + 2637 + 2641 + + + + + + RavanelliM. + ParcolletT. + PlantingaP. + RouheA. + CornellS. + LugoschL. + SubakanC. + DawalatabadN. + HebaA. + ZhongJ. + ChouJ. C. + YehS. L. + FuS. W. + LiaoC. F. + RastorguevaE. + GrondinF. + ArisW. + NaH. + GaoY. + Mori R. D. 
BengioY. + + SpeechBrain: A general-purpose speech toolkit + arXiv preprint arXiv:2106.04624 + 2021 + + + + + + PoveyD. + GhoshalA. + BoulianneG. + BurgetL. + GlembekO. + GoelN. + HannemannM. + MotlicekP. + QianY. + SchwarzP. + Silovsky´J. + StemmerK.G. Vesely + + The Kaldi speech recognition toolkit + IEEE 2011 workshop on automatic speech recognition and understanding + IEEE Signal Processing Society + 2011 + 10.15199/48.2016.11.70 + + + + + + TaalC. H. + HendriksR. C. + HeusdensR. + JensenJ. + + An algorithm for intelligibility prediction of time–frequency weighted noisy speech + IEEE Transactions on Audio, Speech, and Language Processing + IEEE + 2011 + 19 + 7 + 10.1109/tasl.2011.2114881 + 2125 + 2136 + + + + + + RixA. W. + BeerendsJ. G. + HollierM. P. + HekstraA. P. + + Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs + 2001 IEEE international conference on acoustics, speech, and signal processing. Proceedings (cat. No. 01CH37221) + IEEE + 2001 + 2 + 10.1109/icassp.2001.941023 + 749 + 752 + + + + + + TownsJ. + CockerillT. + DahanM. + FosterI. + GaitherK. + GrimshawA. + HazlewoodV. + LathropS. + LifkaD. + PetersonG. D. + RoskiesR. + ScottJ. R. + Wilkins-DiehrN. + + XSEDE: Accelerating scientific discovery + Computing in Science & Engineering + IEEE + 2014 + 16 + 5 + 10.1109/mcse.2014.80 + 62 + 74 + + + + + + NystromN. A. + LevineM. J. + RoskiesR. Z. + ScottJ. R. + + Bridges: A uniquely flexible HPC resource for new communities and data analytics + Proceedings of the 2015 XSEDE conference: Scientific advancements enabled by enhanced cyberinfrastructure + 2015 + 10.1145/2792745.2792775 + 1 + 8 + + + + +
diff --git a/joss.05403/10.21105.joss.05403.pdf b/joss.05403/10.21105.joss.05403.pdf new file mode 100644 index 0000000000..68cd0f086e Binary files /dev/null and b/joss.05403/10.21105.joss.05403.pdf differ diff --git a/joss.05403/media/graphics/UML_Joint.png b/joss.05403/media/graphics/UML_Joint.png new file mode 100644 index 0000000000..7ff8f1761d Binary files /dev/null and b/joss.05403/media/graphics/UML_Joint.png differ diff --git a/joss.05403/media/graphics/UML_SSE.png b/joss.05403/media/graphics/UML_SSE.png new file mode 100644 index 0000000000..d6c281aebb Binary files /dev/null and b/joss.05403/media/graphics/UML_SSE.png differ diff --git a/joss.05403/media/graphics/data_structure.png b/joss.05403/media/graphics/data_structure.png new file mode 100644 index 0000000000..7f0a8e8d0c Binary files /dev/null and b/joss.05403/media/graphics/data_structure.png differ diff --git a/joss.05403/media/graphics/enh_asr_script.png b/joss.05403/media/graphics/enh_asr_script.png new file mode 100644 index 0000000000..27a07f91ed Binary files /dev/null and b/joss.05403/media/graphics/enh_asr_script.png differ diff --git a/joss.05403/media/graphics/enh_script.png b/joss.05403/media/graphics/enh_script.png new file mode 100644 index 0000000000..08c18246d7 Binary files /dev/null and b/joss.05403/media/graphics/enh_script.png differ diff --git a/joss.05403/media/graphics/espnet-SE++.png b/joss.05403/media/graphics/espnet-SE++.png new file mode 100644 index 0000000000..a261bf9577 Binary files /dev/null and b/joss.05403/media/graphics/espnet-SE++.png differ diff --git a/joss.05403/media/graphics/inference.png b/joss.05403/media/graphics/inference.png new file mode 100644 index 0000000000..c7ba59cf88 Binary files /dev/null and b/joss.05403/media/graphics/inference.png differ diff --git a/joss.05403/media/graphics/inference_SSE.png b/joss.05403/media/graphics/inference_SSE.png new file mode 100644 index 0000000000..33cd7a96ba Binary files /dev/null and b/joss.05403/media/graphics/inference_SSE.png differ diff --git a/joss.05403/media/graphics/inference_joint.png b/joss.05403/media/graphics/inference_joint.png new file mode 100644 index 0000000000..7f28d45f2b Binary files /dev/null and b/joss.05403/media/graphics/inference_joint.png differ diff --git a/joss.05403/media/graphics/joint_init.png b/joss.05403/media/graphics/joint_init.png new file mode 100644 index 0000000000..ede56be1dd Binary files /dev/null and b/joss.05403/media/graphics/joint_init.png differ