From 8ab989dc35af2bada66ca7f7058cdd1d9479acd2 Mon Sep 17 00:00:00 2001 From: Ziyang Ma Date: Fri, 8 Nov 2024 21:30:35 +0800 Subject: [PATCH] Revert "update README" This reverts commit 41d61856d40d2e918a281ff4ab86798eb36bbd39. --- README.md | 49 +--- .../s2s/scripts/utils/data_cleaning_cn.py | 257 ------------------ examples/s2s/scripts/utils/turns_counting.py | 70 ----- 3 files changed, 4 insertions(+), 372 deletions(-) delete mode 100644 examples/s2s/scripts/utils/data_cleaning_cn.py delete mode 100644 examples/s2s/scripts/utils/turns_counting.py diff --git a/README.md b/README.md index c3c7b45d..0fd38d0a 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,7 @@ developers to train custom multimodal large language model (MLLM), focusing on < 6. [Citation](#citation) # News -- [Update Nov. 5, 2024] Recipes for [speech emotion captioning (SEC)](examples/sec_emotioncaps/README.md) with [emotion2vec](https://github.com/ddlBoJack/emotion2vec) as the encoder has been supported. -- [Update Oct. 12, 2024] Recipes for [SLAM-AAC](examples/slam_aac/README.md) with [EAT](https://github.com/cwx-worst-one/EAT) as the encoder have been supported. +- [Update Oct. 12, 2024] Recipes for [SLAM-AAC](examples/slam_aac/README.md) have been supported. - [Update Sep. 28, 2024] Recipes for [CoT-ST](examples/st_covost2/README.md) have been supported. - [Update Sep. 25, 2024] Recipes for [DRCap](examples/drcap_zeroshot_aac/README.md) have been supported. - [Update Jun. 12, 2024] Recipes for [MaLa-ASR](examples/mala_asr_slidespeech/README.md) have been supported. @@ -91,7 +90,6 @@ We provide reference implementations of various LLM-based speech, audio, and mus - Text-to-Speech (TTS) - [VALL-E-X](examples/vallex/README.md) - - [Speech Emotion Captioning (SEC)](examples/sec_emotioncaps/README.md) - **Audio Task** - [Automated Audio Captioning (AAC)](examples/aac_audiocaps/README.md) @@ -120,10 +118,7 @@ command-line (shell file) > Hydra configuration (yaml file) > dataclass configur - We borrow code from [Fairseq](https://github.com/facebookresearch/fairseq) for deepspeed configuration. - We thank the contributors for providing diverse recipes. -# Citation - -## Speech Task - +## Citation SLAM-ASR: ``` @article{ma2024embarrassingly, @@ -133,27 +128,7 @@ SLAM-ASR: year={2024} } ``` -Mala-ASR: -``` -@article{yang2024mala, - title={MaLa-ASR: Multimedia-Assisted LLM-Based ASR}, - author={Yang, Guanrou and Ma, Ziyang and Yu, Fan and Gao, Zhifu and Zhang, Shiliang and Chen, Xie}, - journal={Proc. INTERSPEECH}, - year={2024} -} -``` -CoT-ST: -``` -@article{du2024cot, - title={CoT-ST: Enhancing LLM-based Speech Translation with Multimodal Chain-of-Thought}, - author={Du, Yexing and Ma, Ziyang and Yang, Yifan and Deng, Keqi and Chen, Xie and Yang, Bo and Xiang, Yang and Liu, Ming and Qin, Bing}, - journal={arXiv preprint arXiv:2409.19510}, - year={2024} -} -``` - -## Audio Task SLAM-AAC: ``` @article{chen2024slam, @@ -163,21 +138,5 @@ SLAM-AAC: year={2024} } ``` -DRCap: -``` -@article{li2024drcap, - title={DRCap: Decoding CLAP Latents with Retrieval-augmented Generation for Zero-shot Audio Captioning}, - author={Li, Xiquan and Chen, Wenxi and Ma, Ziyang and Xu, Xuenan and Liang, Yuzhe and Zheng, Zhisheng and Kong, Qiuqiang and Chen, Xie}, - journal={arXiv preprint arXiv:2410.09472}, - year={2024} -} -``` -BAT: -``` -@article{zheng2024bat, - title={BAT: Learning to Reason about Spatial Sounds with Large Language Models}, - author={Zheng, Zhisheng and Peng, Puyuan and Ma, Ziyang and Chen, Xie and Choi, Eunsol and Harwath, David}, - journal={Proc. ICML}, - year={2024} -} -``` \ No newline at end of file + + diff --git a/examples/s2s/scripts/utils/data_cleaning_cn.py b/examples/s2s/scripts/utils/data_cleaning_cn.py deleted file mode 100644 index 0c56e90d..00000000 --- a/examples/s2s/scripts/utils/data_cleaning_cn.py +++ /dev/null @@ -1,257 +0,0 @@ -import re -import json -from zhon.hanzi import punctuation as zh_punctuation -from opencc import OpenCC -import string -from tqdm import tqdm - -# 初始化OpenCC进行繁体到简体转换 -cc = OpenCC('t2s') - -# 定义合理的标点符号(中英文) -all_punctuation = string.punctuation + zh_punctuation + "×÷=" + "°℃" - -# 定义正则表达式模式 -url_pattern = re.compile(r'https?://\S+|www\.\S+') -email_pattern = re.compile(r'\S+@\S+\.\S+') -code_pattern = re.compile(r'`{1,3}.*?`{1,3}', re.DOTALL) -emoji_pattern = re.compile( - '[' - u'\U0001F600-\U0001F64F' # 表情符号 - u'\U0001F300-\U0001F5FF' # 符号和象形文字 - u'\U0001F680-\U0001F6FF' # 交通和地图符号 - u'\U0001F1E0-\U0001F1FF' # 国旗 - u'\U0001F700-\U0001F77F' # 其他象形符号 - u'\U0001F780-\U0001F7FF' # 更多符号 - u'\U0001F800-\U0001F8FF' # 更多符号 - u'\U0001F900-\U0001F9FF' # 更多表情符号 - u'\U0001FA00-\U0001FA6F' # 角色面部表情、面具等 - u'\U0001FA70-\U0001FAFF' # 高级符号,常见于现代设备 - u'\U00002702-\U000027B0' # 符号(例如勾选、交叉) - ']', - re.UNICODE -) - -table_pattern = re.compile(r'(\|.*\|(\n|\r\n)*)+') - -file_patten = re.compile(r'\S+\.(txt|pdf|docx|doc|xlsx|xls|csv|pptx|ppt|png|jpg|jpeg|gif|mp3|mp4|wav|avi|mov|mkv|flv|wmv|zip|rar|tar|gz|7z|iso|dmg|pkg)') - -phonetic_pattern = re.compile(r'/[a-zA-Zɪɛæʌʊɔəʌɜɑɔr]+/') # 匹配 /i/、/ɪ/ 等音素符号 - -repeating_pattern = re.compile(r'[_\-*]{6,}') # 匹配重复符号,如 ______ - -# 定义缩写词典 -abbreviation_dict = { - "Dr.": "Doctor", - "e.g.": "for example", - "i.e.": "that is", # 常见的缩写,用于解释说明 - "Mr.": "Mister", - "Mrs.": "Misses", - "St.": "Saint", # 用于地名或圣人的称谓 - "vs.": "versus", # 用于对比,通常见于比赛或法庭用语 - "Prof.": "Professor", - "Ave.": "Avenue", - "Dept.": "Department", - "etc.": "and so on", # 表示等等 - "Inc.": "Incorporated", # 公司名中常见 - "Ltd.": "Limited", # 有限公司 - "Jr.": "Junior", - "Sr.": "Senior", - "vs.": "versus", # 对抗 - "approx.": "approximately", # 大约 - "min.": "minute", # 分钟 - "sec.": "second", # 秒 - "Fri.": "Friday", - "Sat.": "Saturday", - "Sun.": "Sunday", - "Mon.": "Monday", - "Tue.": "Tuesday", - "Wed.": "Wednesday", - "Thu.": "Thursday", - "no.": "number", # 常用于序号 - "No.": "number", - "Jan.": "January", - "Feb.": "February", - "Mar.": "March", - "Apr.": "April", - "Jun.": "June", - "Jul.": "July", - "Aug.": "August", - "Sept.": "September", - "Oct.": "October", - "Nov.": "November", - "Dec.": "December", - "est.": "established", # 成立于(例如用于公司成立时间) - "max.": "maximum", # 最大值 - "min.": "minimum", # 最小值 -} - -def normalize_abbreviations(text): - for abbr, full in abbreviation_dict.items(): - text = re.sub(r'\b' + re.escape(abbr) + r'\b', full, text) - return text - -# 定义要过滤的敏感词列表(不包含具体词汇) -sensitive_words = set() # 可以在这里添加敏感词汇 - -def filter_inappropriate_content(text): - # 可以使用外部库或自定义方法过滤不当内容 - # 这里只是示例,不包含具体实现 - for word in sensitive_words: - text = text.replace(word, '') - return text - -def process_text(text): - # 将繁体中文转换为简体中文 - text = cc.convert(text) - - # 规范缩写 - text = normalize_abbreviations(text) - - # 过滤不当内容 - text = filter_inappropriate_content(text) - - return text - -def is_valid_char(text): - return all( - (u'\u4e00' <= char <= u'\u9fff') # 保留汉字 - or char.isalpha() # 保留字母 - or char in all_punctuation # 保留标点符号 - or char.isspace() # 允许空格 - or char == '\n' # 允许换行符 - or char.isdigit() # 允许数字 - for char in text - ) - -def is_valid_sentence(sentence): - # 检查URL和链接 - if url_pattern.search(sentence): - return False, "URL" - - # 检查电子邮件地址 - if email_pattern.search(sentence): - return False, "Email" - - # 检查代码片段 - if code_pattern.search(sentence): - return False, "Code" - - # 检查表情符号 - if emoji_pattern.search(sentence): - return False, "Emoji" - - # 检查表格 - if table_pattern.search(sentence): - return False, "Table" - - # 检查文件 - if file_patten.search(sentence): - return False, "File" - - # 检查音素符号 - if phonetic_pattern.search(sentence): - return False, "Phonetic" - - # 检查重复符号 - if repeating_pattern.search(sentence): - return False, "Repeating" - - return True, "Valid" - -def suitable_for_tts(data, max_turn_num=10, max_text_length=200): - # 检查对话轮数 - if len(data["conversations"]) > max_turn_num: - return False - - # 检查文本长度 - for conversation in data["conversations"]: - if len(conversation["value"]) > max_text_length: - return False - - return True - -def process_conversations(input_file, output_file, invalid_file, tts_file): - - with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile, open(invalid_file, 'w', encoding='utf-8') as invalidfile, open(tts_file, 'w', encoding='utf-8') as ttsfile: - - valid_num = 0 - invalid_num = 0 - tts_num = 0 - - # 流式处理,逐行读取和处理 - for line in tqdm(infile): - data = json.loads(line) - - # 处理对话数据 - processed_data = { - "conversations": [], - "id": data.get("id", "") - } - valid = True # 标记是否有有效的对话 - - for conversation in data.get("conversations", []): - text = conversation.get("value", "").strip() - - if not is_valid_char(text): - # print(f"Invalid Character error in: {text}") - valid = False - invalid_num += 1 - data["invalid_reason"] = "Character" - json.dump(data, invalidfile, ensure_ascii=False) - invalidfile.write('\n') - break - - valid_sentence, reason = is_valid_sentence(text) - if not valid_sentence: - # print(f"{reason} error in: {text}") - valid = False - invalid_num += 1 - data["invalid_reason"] = reason - json.dump(data, invalidfile, ensure_ascii=False) - invalidfile.write('\n') - break - - - # 处理文本 - processed_text = process_text(text) - if not is_valid_char(processed_text) or not is_valid_sentence(processed_text): - valid = False - break - - # 构建新的对话结构 - processed_conversation = { - "from": conversation.get("from", ""), - "value": processed_text - } - processed_data["conversations"].append(processed_conversation) - - if valid: - valid_num += 1 - json.dump(processed_data, outfile, ensure_ascii=False) - outfile.write('\n') # 添加换行符,以便下一行开始 - - if suitable_for_tts(processed_data, 10, 200): - tts_num += 1 - json.dump(processed_data, ttsfile, ensure_ascii=False) - ttsfile.write('\n') # 添加换行符,以便下一行开始 - - - print(f"Total number of valid conversations: {valid_num}") - print(f"Total number of invalid conversations: {invalid_num}") - print(f"Total number of conversations suitable for TTS: {tts_num}") - - -if __name__ == "__main__": - # 输入和输出文件路径 - input_file = '/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN.json' # 原始对话数据文件 - output_file = '/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN_cleaned.json' # 处理后的数据文件 - invalid_file = '/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN_invalid.json' # 无效对话数据文件 - tts_file = '/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN_ready4cosy.json' # 准备好的tts数据文件 - - process_conversations(input_file, output_file, invalid_file, tts_file) - # test = {"conversations": [{"from": "human", "value": "生成一个调查问卷,调查大学生对校园环境改善的看法和建议。"}, {"from": "assistant", "value": "调查问卷:大学生对校园环境改善的看法和建议\n你好,我们正在进行一项调查,旨在了解大学生们对于校园环境改善的看法和建议。希望您能抽出几分钟时间填写这份问卷,谢谢!\n1. 你是哪所大学的学生?\nA. xx大学\nB. xx大学\nC. 其他(请注明:________)\n2. 你认为你所在的学校的校园环境如何?\nA. 很好 B. 较好 C. 一般 D. 较差 E. 很差\n3. 对于学校的校园环境,你最满意的是什么方面?\nA. 绿化和景观\nB. 卫生和清洁\nC. 建筑和设施\nD. 安全和保障\nE. 其他(请注明:________)\n4. 对于学校的校园环境,你最不满意的是什么方面?\nA. 绿化和景观\nB. 卫生和清洁\nC. 建筑和设施\nD. 安全和保障\nE. 其他(请注明:________)\n5. 你认为有哪些方面可以改善你所在学校的校园环境?(请勾选)\nA. 增加绿化和植树\nB. 加大垃圾分类投放力度\nC. 加强设施和建筑维护\nD. 提高校园安全保障\nE. 加大环保宣传和教育力度\nF. 其他(请注明:________)\n6. 你觉得学校和学生可以共同做些什么来改善校园环境?\nA. 学生可以更爱护校园环境\nB. 学校可以加强环境管理和维护力度\nC. 设置环保奖励机制鼓励环保行为\nD. 加强师生环保教育\nE. 其他(请注明:________)\n7. 你认为学生在校内生态保护方面需要加强哪些方面的意识和行为?\nA. 节约用水和用电\nB. 垃圾分类和投放\nC. 自觉保护校园环境\nD. 拒绝乱扔垃圾和破坏设施\nE. 其他(请注明:________)\n8. 如果你可以对你所在学校的校园环境进行改善,你会怎么做?\nA. 增加植物和花草,美化校园\nB. 加强清洁和卫生管理\nC. 提高设施和建筑维护水平\nD. 加强安保力度保障学生个人安全\nE. 向学生提供更多与环保有关的活动和项目\nF. 其他(请注明:________)\n9. 最后,你认为校园环境的改善对于学生学习和生活的影响是什么?\nA. 可以提高学生的学习积极性和创造力\nB. 可以改善学生的生活和健康状况\nC. 可以提高学生的环保意识和社会责任感\nD. 可以促进学生个人成长和发展\nE. 其他(请注明:________)\n感谢您的参与!"}], "id": "18054991"} - # for conversations in test["conversations"]: - # print(conversations["value"]) - # print(is_valid_char(conversations["value"])) - # print(is_valid_sentence(conversations["value"])) diff --git a/examples/s2s/scripts/utils/turns_counting.py b/examples/s2s/scripts/utils/turns_counting.py deleted file mode 100644 index 0fe5be7e..00000000 --- a/examples/s2s/scripts/utils/turns_counting.py +++ /dev/null @@ -1,70 +0,0 @@ -import json -import matplotlib.pyplot as plt -from collections import Counter -import numpy as np -from tqdm import tqdm - -def remove_outliers(data): - """ - 使用 IQR (Interquartile Range) 方法来去除离群点。 - """ - # 计算Q1和Q3 - Q1 = np.percentile(data, 25) - Q3 = np.percentile(data, 75) - - # 计算IQR - IQR = Q3 - Q1 - - # 计算离群点的上下限 - lower_bound = Q1 - 1.5 * IQR - upper_bound = Q3 + 1.5 * IQR - - # 过滤离群点 - filtered_data = [x for x in data if lower_bound <= x <= upper_bound] - - return filtered_data - -def main(input_file): - turn_counts = [] - - with open(input_file, 'r', encoding='utf-8') as infile: - for line in tqdm(infile): - line = line.strip() - if not line: - continue - data = json.loads(line) - conversations = data.get("conversations", []) - # 计算当前对话的轮数 - num_turns = len(conversations)/2 - if num_turns <= 10: - turn_counts.append(num_turns) - - # 移除离群点 - # filtered_turn_counts = remove_outliers(turn_counts) - - # 统计每个轮数出现的频率 - count_dict = Counter(turn_counts) - sorted_counts = sorted(count_dict.items()) - - # 分离轮数和对应的对话数量 - turns, counts = zip(*sorted_counts) - - # 打印每个轮数的对话数量 - for turn, count in sorted_counts: - print(f"turn: {turn}, count: {count}") - - # 可视化对话轮数的分布 - plt.figure(figsize=(10, 6)) - plt.bar(turns, counts, color='skyblue', edgecolor='black') - plt.xlabel('turns') - plt.ylabel('counts') - plt.title('Distribution of turns') - plt.xticks(turns) - plt.grid(axis='y', linestyle='--', alpha=0.7) - plt.tight_layout() - plt.savefig('train_3.5M_CN_turns_distribution.png') - - -if __name__ == "__main__": - input_file = '/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN.json' - main(input_file)