diff --git a/.flake8 b/.flake8 index b66c41a..ebbac9f 100644 --- a/.flake8 +++ b/.flake8 @@ -1,26 +1,26 @@ -[flake8] -max-line-length = 88 - -exclude = - .git, - __pycache__, - best_model, - logs, - prediction, - results, - saved, - wandb, - .gitignore - -ignore = - E203, - E266, - E501, - W503, - F401, - F403, - F405, - F821, - F841 - +[flake8] +max-line-length = 88 + +exclude = + .git, + __pycache__, + best_model, + logs, + prediction, + results, + saved, + wandb, + .gitignore + +ignore = + E203, + E266, + E501, + W503, + F401, + F403, + F405, + F821, + F841 + count = True \ No newline at end of file diff --git "a/.github/ISSUE_TEMPLATE/\354\203\210\353\241\234\354\232\264 \352\270\260\353\212\245 \354\266\224\352\260\200.md" "b/.github/ISSUE_TEMPLATE/\354\203\210\353\241\234\354\232\264 \352\270\260\353\212\245 \354\266\224\352\260\200.md" index 40e55f8..4d5d644 100644 --- "a/.github/ISSUE_TEMPLATE/\354\203\210\353\241\234\354\232\264 \352\270\260\353\212\245 \354\266\224\352\260\200.md" +++ "b/.github/ISSUE_TEMPLATE/\354\203\210\353\241\234\354\232\264 \352\270\260\353\212\245 \354\266\224\352\260\200.md" @@ -1,19 +1,19 @@ ---- -name: new function added! -about: when you propose new function~~! -title: "[FEAT]" -labels: '' -assignees: '' - ---- - -## Background -- -- - -## To Do -- [ ] -- [ ] - -## See Also -- +--- +name: new function added! +about: when you propose new function~~! +title: "[FEAT]" +labels: '' +assignees: '' + +--- + +## Background +- +- + +## To Do +- [ ] +- [ ] + +## See Also +- diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ab2be1a..46ae9df 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,12 +1,12 @@ -## Overview -- - -## Change Log -- - -## To Reviewer -- - -## Issue Tags -- Closed | Fixed: # -- See also: # +## Overview +- + +## Change Log +- + +## To Reviewer +- + +## Issue Tags +- Closed | Fixed: # +- See also: # diff --git a/.gitignore b/.gitignore index 577450b..4c9ff32 100755 --- a/.gitignore +++ b/.gitignore @@ -1,25 +1,24 @@ -.DS_Store -.venv - -__pycache__/ - -best_model/ -logs/ -prediction/ -results/ -saved/ -wandb/ -configs/ -EDA/ -!eda/ - -EDA.ipynb -config*.yaml -!config.yaml -!config_full.yaml -output*.log -nohup*.out -run_exps.sh -TAPT/ -*.ipynb +.DS_Store +.venv + +__pycache__/ + +best_model/ +logs/ +prediction/ +results/ +saved/ +wandb/ +configs/ +EDA/ +!eda/ + +EDA.ipynb +config*.yaml +!config.yaml +!config_full.yaml +output*.log +nohup*.out +run_exps.sh +*.ipynb inference*.py \ No newline at end of file diff --git a/README.md b/README.md index ae55c79..ef0abf2 100644 --- a/README.md +++ b/README.md @@ -1,95 +1,95 @@ -# 문장 내 개체간 관계 추출 Relation Extraction -boostcamp AI Tech 5 NLP 트랙 레벨2 프로젝트 -문장 내의 두 단어(Entity)에 대한 속성과 관계를 예측하는 인공지능 만들기 - - -## 일정 Schedule -프로젝트 전체 기간 : 5월 3일 (화) ~ 5월 18일 (목) 19:00 - - -## 팀원 Team Members -|문지혜|박경택|박지은|송인서|윤지환| -|:---:|:---:|:---:|:---:|:---:| -|||||| -|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:munjh1121@gmail.com)](mailto:afterthougt@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:afterthougt@gmail.com)](mailto:afterthougt@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:imhappyhill@gmail.com)](mailto:imhappyhill@gmail.com)|[![Gmail 
Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:songinseo0910@gmail.com)](mailto:songinseo0910@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:yjh091500@naver.com)](mailto:yjh091500@naver.com)| -|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/jihye-moon)](https://github.com/jihye-moon)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/afterthougt)](https://github.com/afterthougt)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/iamzieun)](https://github.com/iamzieun)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/fortunetiger)](https://github.com/fortunetiger)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/ohilikeit)](https://github.com/ohilikeit)| - - -## 프로젝트 보고서 및 발표 시각 자료 Report and Presentation -- [Wrap-Up Report](https://github.com/ohilikeit/level2_klue-nlp-12/blob/main/documents/wrap%20up%20report.pdf) -- [Presentation](https://github.com/ohilikeit/level2_klue-nlp-12/blob/main/documents/presentation.pdf) - - -## 프로젝트 개요 Project Overview - - -## 레포지토리 구조 Repository Structure -```bash -level2_klue-nlp-12/ -├── eda // eda 및 사후 분석용 함수 -│ └── post_eda.py -│ -├── load_data // 데이터 불러오기 관련 폴더 -│ ├── dict_label_to_num.pkl // 레이블을 숫자로 변환하기 위한 사전 파일 -│ ├── dict_num_to_label.pkl // 숫자를 레이블로 변환하기 위한 사전 파일 -│ └── load_data.py // 데이터 불러오기 및 전처리 관련 함수 -│ -├── model // 모델, 손실 함수, 평가 지표 -│ ├── loss.py // 손실 함수 -│ ├── metric.py // 평가 지표 -│ └── model.py // 모델 아키텍쳐 -│ -├── trainer // 학습 관련 폴더 -│ └──trainer.py -│ -├── utils // 유틸리티 함수 -│ ├── args.py // 프로그램 실행 시 전달되는 인자들을 처리하기 위한 파일 -│ └── utils.py -│ -├── documents // 보고서 및 발표자료 -│ └── wrap up report.pdf -│ └── presentation.pdf -│ -├── requirements.txt // 프로젝트에 필요한 라이브러리들을 명시 -│ -├── train.py // 모델 학습 시작을 위한 메인 스크립트 -├── full_train.py // 전체 데이터로의 모델 학습 시작을 위한 메인 스크립트 -├── inference.py // 학습된 모델의 평가 및 추론을 위한 스크립트 -├── sweep.py // sweep 동작을 위한 스크립트 -│ -├── config.yaml // 모델 학습 설정 관리를 위한 YAML -├── config_full.yaml // 전체 데이터로의 모델 학습 설정 관리를 위한 YAML -│ -├── run.sh // 실험 자동화를 위한 쉘 스크립트 -├── pyproject.toml // Black 설정 파일 -│ -└── README.md -``` - -## 데이터 Data -- train.csv: 총 32470개 -- test_data.csv: 총 7765개 - - -## 사용법 Usage -- 모델 학습 및 추론 위해서는, [Huggingface Datasets](https://huggingface.co/datasets/Smoked-Salmon-s/RE_Competition) 로그인 및 access token 인증이 선행되어야 합니다. -- 환경 설치 -```bash -pip install -r requirement.txt -``` -- 학습 -```bash -python train.py config.yaml -``` -- 전체 데이터셋에 대한 학습 -```bash -python full_train.py config_full.yaml -``` -- 추론 -```bash -python inference.py config.yaml -``` -- 린팅 -```bash -black . 
-``` +# 문장 내 개체간 관계 추출 Relation Extraction +boostcamp AI Tech 5 NLP 트랙 레벨2 프로젝트 +문장 내의 두 단어(Entity)에 대한 속성과 관계를 예측하는 인공지능 만들기 + + +## 일정 Schedule +프로젝트 전체 기간 : 5월 3일 (화) ~ 5월 18일 (목) 19:00 + + +## 팀원 Team Members +|문지혜|박경택|박지은|송인서|윤지환| +|:---:|:---:|:---:|:---:|:---:| +|||||| +|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:munjh1121@gmail.com)](mailto:afterthougt@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:afterthougt@gmail.com)](mailto:afterthougt@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:imhappyhill@gmail.com)](mailto:imhappyhill@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:songinseo0910@gmail.com)](mailto:songinseo0910@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:yjh091500@naver.com)](mailto:yjh091500@naver.com)| +|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/jihye-moon)](https://github.com/jihye-moon)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/afterthougt)](https://github.com/afterthougt)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/iamzieun)](https://github.com/iamzieun)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/fortunetiger)](https://github.com/fortunetiger)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/ohilikeit)](https://github.com/ohilikeit)| + + +## 프로젝트 보고서 및 발표 시각 자료 Report and Presentation +- [Wrap-Up Report](https://github.com/ohilikeit/level2_klue-nlp-12/blob/main/documents/wrap%20up%20report.pdf) +- [Presentation](https://github.com/ohilikeit/level2_klue-nlp-12/blob/main/documents/presentation.pdf) + + +## 프로젝트 개요 Project Overview + + +## 레포지토리 구조 Repository Structure +```bash +level2_klue-nlp-12/ +├── eda // eda 및 사후 분석용 함수 +│ └── post_eda.py +│ +├── load_data // 데이터 불러오기 관련 폴더 +│ ├── dict_label_to_num.pkl // 레이블을 숫자로 변환하기 위한 사전 파일 +│ ├── dict_num_to_label.pkl // 숫자를 레이블로 변환하기 위한 사전 파일 +│ └── load_data.py // 데이터 불러오기 및 전처리 관련 함수 +│ +├── model // 모델, 손실 함수, 평가 지표 +│ ├── loss.py // 손실 함수 +│ ├── metric.py // 평가 지표 +│ └── model.py // 모델 아키텍쳐 +│ +├── trainer // 학습 관련 폴더 +│ └──trainer.py +│ +├── utils // 유틸리티 함수 +│ ├── args.py // 프로그램 실행 시 전달되는 인자들을 처리하기 위한 파일 +│ └── utils.py +│ +├── documents // 보고서 및 발표자료 +│ └── wrap up report.pdf +│ └── presentation.pdf +│ +├── requirements.txt // 프로젝트에 필요한 라이브러리들을 명시 +│ +├── train.py // 모델 학습 시작을 위한 메인 스크립트 +├── full_train.py // 전체 데이터로의 모델 학습 시작을 위한 메인 스크립트 +├── inference.py // 학습된 모델의 평가 및 추론을 위한 스크립트 +├── sweep.py // sweep 동작을 위한 스크립트 +│ +├── config.yaml // 모델 학습 설정 관리를 위한 YAML +├── config_full.yaml // 전체 데이터로의 모델 학습 설정 관리를 위한 YAML +│ +├── run.sh // 실험 자동화를 위한 쉘 스크립트 +├── pyproject.toml // Black 설정 파일 +│ +└── README.md +``` + +## 데이터 Data +- train.csv: 총 32470개 +- test_data.csv: 총 7765개 + + +## 사용법 Usage +- 모델 학습 및 추론 위해서는, [Huggingface Datasets](https://huggingface.co/datasets/Smoked-Salmon-s/RE_Competition) 로그인 및 access token 인증이 선행되어야 합니다. 
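As a usage reference for the Hugging Face login requirement noted in the README's Usage section above, authentication can be done once with `huggingface-cli login` or programmatically. A minimal sketch, assuming a personal access token with read access to the `Smoked-Salmon-s/RE_Competition` dataset (the token value below is a placeholder, not something from the repository):

```python
# Minimal sketch of the Hugging Face authentication step mentioned in the README.
# The token string is a placeholder; alternatively run `huggingface-cli login`
# once in the shell before `python train.py config.yaml`.
from huggingface_hub import login

login(token="hf_...")  # placeholder token with read access to the private dataset
```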
+- 환경 설치 +```bash +pip install -r requirement.txt +``` +- 학습 +```bash +python train.py config.yaml +``` +- 전체 데이터셋에 대한 학습 +```bash +python full_train.py config_full.yaml +``` +- 추론 +```bash +python inference.py config.yaml +``` +- 린팅 +```bash +black . +``` diff --git a/config.yaml b/config.yaml index abd12c1..fd261b8 100755 --- a/config.yaml +++ b/config.yaml @@ -1,102 +1,102 @@ ---- -run_name: exp-name -n_gpu: 1 -seed: 42 -use_wandb: true -num_labels: 30 -model: - name: klue/roberta-large - variant: BiLSTMREModel -dataloader: - type: REDataModule - input_format: typed_entity_marker_punct - prompt: s_and_o - type_transform: false - train_split: train - valid_split: validation - batch_size: 64 - shuffle: true - revision: a6a27f7e03c79cee2ee1171b7001d7a23cfd4495 - num_workers: 0 -optimizer: - type: AdamW - lr: 2.0e-5 - weight_decay: 0.01 - adam_beta2: 0.98 -loss: - type: CrossEntropyLoss - focal_alpha: 0.25 - focal_gamma: 2.0 - dice_smooth: 1.5 -lr_scheduler: - type: StepLR - warmup_steps: 500 - warmup_ratio: 0.06 - step_size: 50 - gamma: 0.1 - is_schedule: false -trainer: - epochs: 5 - output_dir: saved/models/ - model_dir: ./best_model - pred_dir: ./prediction/submission.csv - val_pred_dir: ./prediction/validation_output.csv - logging_dir: ./logs - logging_steps: 100 - save_total_limit: 5 - save_steps: 500 - save_freq: 1 - use_early_stop: true - early_stop: 3 - evaluation_strategy: steps - evaluation_steps: 500 - save_strategy: steps -wandb: - entity: salmons - project_name: klue-re - sweep_project_name: sweep - sweep_count: 10 -sweep_config: - method: bayes - metric: - name: eval/micro f1 score - goal: maximize - parameters: - input_format: - values: - - default - - entity_marker - - entity_marker_punct - - typed_entity_marker - - typed_entity_marker_punct - prompt: - values: - - default - - s_sep_o - - s_and_o - - quiz - - problem - type_transform: - values: - - true - - false - lr: - values: - - 1.0e-05 - - 2.0e-05 - - 3.0e-05 - - 5.0e-05 - epochs: - values: - - 3 - - 4 - - 5 - adam_beta2: - values: - - 0.98 - - 0.999 - warmup_ratio: - values: - - 0.06 - - 0.1 +--- +run_name: exp-name +n_gpu: 1 +seed: 42 +use_wandb: true +num_labels: 30 +model: + name: klue/roberta-large + variant: BiLSTMREModel +dataloader: + type: REDataModule + input_format: typed_entity_marker_punct + prompt: s_and_o + type_transform: false + train_split: train + valid_split: validation + batch_size: 64 + shuffle: true + revision: a6a27f7e03c79cee2ee1171b7001d7a23cfd4495 + num_workers: 0 +optimizer: + type: AdamW + lr: 2.0e-5 + weight_decay: 0.01 + adam_beta2: 0.98 +loss: + type: CrossEntropyLoss + focal_alpha: 0.25 + focal_gamma: 2.0 + dice_smooth: 1.5 +lr_scheduler: + type: StepLR + warmup_steps: 500 + warmup_ratio: 0.06 + step_size: 50 + gamma: 0.1 + is_schedule: false +trainer: + epochs: 5 + output_dir: saved/models/ + model_dir: ./best_model + pred_dir: ./prediction/submission.csv + val_pred_dir: ./prediction/validation_output.csv + logging_dir: ./logs + logging_steps: 100 + save_total_limit: 5 + save_steps: 500 + save_freq: 1 + use_early_stop: true + early_stop: 3 + evaluation_strategy: steps + evaluation_steps: 500 + save_strategy: steps +wandb: + entity: salmons + project_name: klue-re + sweep_project_name: sweep + sweep_count: 10 +sweep_config: + method: bayes + metric: + name: eval/micro f1 score + goal: maximize + parameters: + input_format: + values: + - default + - entity_marker + - entity_marker_punct + - typed_entity_marker + - typed_entity_marker_punct + prompt: + values: + - default + - s_sep_o + - 
s_and_o + - quiz + - problem + type_transform: + values: + - true + - false + lr: + values: + - 1.0e-05 + - 2.0e-05 + - 3.0e-05 + - 5.0e-05 + epochs: + values: + - 3 + - 4 + - 5 + adam_beta2: + values: + - 0.98 + - 0.999 + warmup_ratio: + values: + - 0.06 + - 0.1 ... \ No newline at end of file diff --git a/config_full.yaml b/config_full.yaml index ebdde7b..2f330a0 100644 --- a/config_full.yaml +++ b/config_full.yaml @@ -1,102 +1,102 @@ ---- -run_name: exp-name -n_gpu: 1 -seed: 42 -use_wandb: true -num_labels: 30 -model: - name: klue/roberta-large - variant: BiLSTMREModel -dataloader: - type: REDataModule - input_format: typed_entity_marker_punct - prompt: s_and_o - type_transform: false - train_split: train - valid_split: validation - batch_size: 64 - shuffle: true - revision: a6a27f7e03c79cee2ee1171b7001d7a23cfd4495 - num_workers: 0 -optimizer: - type: AdamW - lr: 2.0e-5 - weight_decay: 0.01 - adam_beta2: 0.98 -loss: - type: CrossEntropyLoss - focal_alpha: 0.25 - focal_gamma: 2.0 - dice_smooth: 1.5 -lr_scheduler: - type: StepLR - warmup_steps: 500 - warmup_ratio: 0.06 - step_size: 50 - gamma: 0.1 - is_schedule: false -trainer: - epochs: 5 - output_dir: saved/models/ - model_dir: ./best_model - pred_dir: ./prediction/submission.csv - val_pred_dir: ./prediction/validation_output.csv - logging_dir: ./logs - logging_steps: 100 - save_total_limit: 5 - save_steps: 500 - save_freq: 1 - use_early_stop: false - early_stop: 3 - evaluation_strategy: steps - evaluation_steps: 500 - save_strategy: steps -wandb: - entity: salmons - project_name: klue-re - sweep_project_name: sweep - sweep_count: 10 -sweep_config: - method: bayes - metric: - name: eval/micro f1 score - goal: maximize - parameters: - input_format: - values: - - default - - entity_marker - - entity_marker_punct - - typed_entity_marker - - typed_entity_marker_punct - prompt: - values: - - default - - s_sep_o - - s_and_o - - quiz - - problem - type_transform: - values: - - true - - false - lr: - values: - - 1.0e-05 - - 2.0e-05 - - 3.0e-05 - - 5.0e-05 - epochs: - values: - - 3 - - 4 - - 5 - adam_beta2: - values: - - 0.98 - - 0.999 - warmup_ratio: - values: - - 0.06 - - 0.1 +--- +run_name: exp-name +n_gpu: 1 +seed: 42 +use_wandb: true +num_labels: 30 +model: + name: klue/roberta-large + variant: BiLSTMREModel +dataloader: + type: REDataModule + input_format: typed_entity_marker_punct + prompt: s_and_o + type_transform: false + train_split: train + valid_split: validation + batch_size: 64 + shuffle: true + revision: a6a27f7e03c79cee2ee1171b7001d7a23cfd4495 + num_workers: 0 +optimizer: + type: AdamW + lr: 2.0e-5 + weight_decay: 0.01 + adam_beta2: 0.98 +loss: + type: CrossEntropyLoss + focal_alpha: 0.25 + focal_gamma: 2.0 + dice_smooth: 1.5 +lr_scheduler: + type: StepLR + warmup_steps: 500 + warmup_ratio: 0.06 + step_size: 50 + gamma: 0.1 + is_schedule: false +trainer: + epochs: 5 + output_dir: saved/models/ + model_dir: ./best_model + pred_dir: ./prediction/submission.csv + val_pred_dir: ./prediction/validation_output.csv + logging_dir: ./logs + logging_steps: 100 + save_total_limit: 5 + save_steps: 500 + save_freq: 1 + use_early_stop: false + early_stop: 3 + evaluation_strategy: steps + evaluation_steps: 500 + save_strategy: steps +wandb: + entity: salmons + project_name: klue-re + sweep_project_name: sweep + sweep_count: 10 +sweep_config: + method: bayes + metric: + name: eval/micro f1 score + goal: maximize + parameters: + input_format: + values: + - default + - entity_marker + - entity_marker_punct + - typed_entity_marker + - 
typed_entity_marker_punct + prompt: + values: + - default + - s_sep_o + - s_and_o + - quiz + - problem + type_transform: + values: + - true + - false + lr: + values: + - 1.0e-05 + - 2.0e-05 + - 3.0e-05 + - 5.0e-05 + epochs: + values: + - 3 + - 4 + - 5 + adam_beta2: + values: + - 0.98 + - 0.999 + warmup_ratio: + values: + - 0.06 + - 0.1 ... \ No newline at end of file diff --git a/eda/post_eda.py b/eda/post_eda.py index 92310c1..322d80b 100644 --- a/eda/post_eda.py +++ b/eda/post_eda.py @@ -1,227 +1,227 @@ -import pandas as pd -import seaborn as sns -import plotly.express as px -import matplotlib.pyplot as plt -import matplotlib.colors as mcolors -from datasets import load_dataset -from sklearn.metrics import confusion_matrix - - -def make_dataframe(PATH: str, split: str, revision: int) -> pd.DataFrame: - """ - 주어진 경로(PATH)로부터의 데이터프레임과 Huggingface Datasets로부터 불러온 데이터프레임을 합친 - 하나의 데이터프레임을 반환합니다. - - arguments: - PATH (str): 'label'과 'pred_label' 열이 포함된 데이터프레임의 경로 - split (str): 데이터셋의 분할 유형 (train, validation, test). - revision (str): 데이터셋의 버전 (commit hash). - - return: - df (pd.DataFrame): 주어진 경로(PATH)로부터의 데이터프레임과 Huggingface Datasets으로부터 불러온 데이터프레임을 합친 - 하나의 데이터프레임 - """ - # 원본 validation set 불러오기 - valid = load_dataset("Smoked-Salmon-s/RE_Competition", - split = split, - column_names = ['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'], - revision = revision) - valid_df = valid.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'}) - - # inference한 validation set 불러오기 - valid_inferred_df = pd.read_csv(PATH) - - # 두 dataframe 합치기 - df = pd.merge(valid_df, - valid_inferred_df[['id', 'pred_label', 'probs']], - on='id', - how='inner') - df = df[['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'pred_label', 'probs', 'source']] - - return df - - -def confusion_matrix_graph(df: pd.DataFrame): - """ - 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 confusion matrix을 계산하고 - heatmap 형태로 시각화합니다. - - arguments: - df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 - - return: - None. 함수는 confusion matrix heatmap을 출력합니다. - """ - # confusion matrix 계산 - cm = confusion_matrix(df['label'], df['pred_label']) - - # 커스텀 컬러맵 생성 - cmap = mcolors.ListedColormap(['white', 'pink', 'tomato']) - - # 정규화를 위한 경계값 설정 - bounds = [0.5, 1.0, 10.0, cm.max() + 0.5] # 1.0, 10.0을 경계값으로 설정 - - # 컬러맵을 적용할 값의 범위 설정 - norm = mcolors.BoundaryNorm(bounds, cmap.N) - - # 라벨 설정 - labels = sorted(list(df['label'].unique())) - - # heatmap 그리기 - plt.figure(figsize=(10, 7)) - sns.heatmap(cm, annot=True, annot_kws={"size":8}, cmap=cmap, norm=norm, fmt='g', xticklabels=labels, yticklabels=labels) - - # 축 이름 및 제목 설정 - plt.xlabel('Predicted Labels') - plt.ylabel('True Labels') - plt.title('Confusion Matrix') - - # 그래프 표시 - plt.show() - - -def all_label_matrix(df: pd.DataFrame, sort_column: str = 'label') -> pd.DataFrame: - """ - 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 - 각 label에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다. 
- - arguments: - df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 - sort_column (str): confusion matrix dataframe을 정렬하는 기준 열 - - return: - metric_df (pd.DataFrame): confusion matrix dataframe - """ - label_list = list(sorted(df['label'].unique())) - - label = [len(df[df['label'] == label]) for label in label_list] - pred_label = [len(df[df['pred_label'] == label]) for label in label_list] - TP = [len(df[(df['pred_label'] == label) & (df['label'] == label)]) for label in label_list] - FP = [len(df[(df['pred_label'] == label) & (df['label'] != label)]) for label in label_list] - FN = [len(df[(df['pred_label'] != label) & (df['label'] == label)]) for label in label_list] - - precision = [] - for tp, fp in zip(TP, FP): - if tp + fp > 0: - p = round(tp / (tp + fp), 4) - else: - p = 0 - precision.append(p) - - recall = [] - for tp, fn in zip(TP, FN): - if tp + fn > 0: - r = round(tp / (tp + fn), 4) - else: - r = 0 - recall.append(r) - - metric_df = pd.DataFrame(zip(label_list, label, pred_label, TP, FP, FN, precision, recall)) - metric_df.columns = ['label', 'label_#', 'pred_label_#', 'TP', 'FP', 'FN', 'precision', 'recall'] - metric_df = metric_df.sort_values(sort_column) - - return metric_df - - -def specific_label_matrix(df: pd.DataFrame, label: str ='no_relation') -> pd.DataFrame: - """ - 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 - 주어진 label에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다. - - arguments: - df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 - label (str): confusion matrix을 계산할 label - - return: - metric_df (pd.DataFrame): 주어진 label에 대한 confusion matrix dataframe - """ - TP = len(df[(df['pred_label'] == label) & (df['label'] == label)]) - FP = len(df[(df['pred_label'] == label) & (df['label'] != label)]) - FN = len(df[(df['pred_label'] != label) & (df['label'] == label)]) - - precision = round(TP / (TP + FP), 4) - recall = round(TP / (TP + FN), 4) - - metric_dict = {"TP": TP, "FP": FP, "FN": FN, "precision": precision, "recall": recall} - metric_df = pd.DataFrame.from_dict(data = metric_dict, - orient='index', - columns=['value']) - - return metric_df - - -def total_metric(df: pd.DataFrame) -> pd.DataFrame: - """ - 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 - 전체 데이터에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다. - - arguments: - df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 - - return: - metric_df (pd.DataFrame): 주어진 데이터에 대한 confusion matrix dataframe - """ - df = all_label_matrix(df) - cleared_df = df[df['label'] != 'no_relation'].copy() - - TP = sum(cleared_df['TP']) - FP = sum(cleared_df['FP']) - FN = sum(cleared_df['FN']) - - precision = TP / (TP + FP) - recall = TP / (TP + FN) - - F1 = 2 * precision * recall / (precision + recall) - - metric_dict = {"TP": TP, "FP": FP, "FN": FN, "micro precision": precision, "micro recall": recall, " mircro F1 score": F1} - metric_df = pd.DataFrame.from_dict(data = metric_dict, - orient='index', - columns=['value']) - - return metric_df - - -def precision_recall_graph(df: pd.DataFrame): - """ - 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 - 각 label에 대한 precision과 recall을 계산하고 scatterplot 형태로 시각화합니다. - - arguments: - df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 - - return: - None. 함수는 precision과 recall에 대한 scatterplot을 출력합니다. 
- """ - label_list = list(sorted(df['label'].unique())) - - label = [len(df[df['label'] == label]) for label in label_list] - pred_label = [len(df[df['pred_label'] == label]) for label in label_list] - TP = [len(df[(df['pred_label'] == label) & (df['label'] == label)]) for label in label_list] - FP = [len(df[(df['pred_label'] == label) & (df['label'] != label)]) for label in label_list] - FN = [len(df[(df['pred_label'] != label) & (df['label'] == label)]) for label in label_list] - - precision = [] - for tp, fp in zip(TP, FP): - if tp + fp > 0: - p = round(tp / (tp + fp), 4) - else: - p = 0 - precision.append(p) - - recall = [] - for tp, fn in zip(TP, FN): - if tp + fn > 0: - r = round(tp / (tp + fn), 4) - else: - r = 0 - recall.append(r) - - plt.scatter(recall, precision) - - # 그래프 제목과 축 레이블 설정 - plt.title('relation between recall and precision') - plt.xlabel('recall') - plt.ylabel('precision') - - # 그래프 보이기 +import pandas as pd +import seaborn as sns +import plotly.express as px +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors +from datasets import load_dataset +from sklearn.metrics import confusion_matrix + + +def make_dataframe(PATH: str, split: str, revision: int) -> pd.DataFrame: + """ + 주어진 경로(PATH)로부터의 데이터프레임과 Huggingface Datasets로부터 불러온 데이터프레임을 합친 + 하나의 데이터프레임을 반환합니다. + + arguments: + PATH (str): 'label'과 'pred_label' 열이 포함된 데이터프레임의 경로 + split (str): 데이터셋의 분할 유형 (train, validation, test). + revision (str): 데이터셋의 버전 (commit hash). + + return: + df (pd.DataFrame): 주어진 경로(PATH)로부터의 데이터프레임과 Huggingface Datasets으로부터 불러온 데이터프레임을 합친 + 하나의 데이터프레임 + """ + # 원본 validation set 불러오기 + valid = load_dataset("Smoked-Salmon-s/RE_Competition", + split = split, + column_names = ['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'], + revision = revision) + valid_df = valid.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'}) + + # inference한 validation set 불러오기 + valid_inferred_df = pd.read_csv(PATH) + + # 두 dataframe 합치기 + df = pd.merge(valid_df, + valid_inferred_df[['id', 'pred_label', 'probs']], + on='id', + how='inner') + df = df[['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'pred_label', 'probs', 'source']] + + return df + + +def confusion_matrix_graph(df: pd.DataFrame): + """ + 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 confusion matrix을 계산하고 + heatmap 형태로 시각화합니다. + + arguments: + df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 + + return: + None. 함수는 confusion matrix heatmap을 출력합니다. + """ + # confusion matrix 계산 + cm = confusion_matrix(df['label'], df['pred_label']) + + # 커스텀 컬러맵 생성 + cmap = mcolors.ListedColormap(['white', 'pink', 'tomato']) + + # 정규화를 위한 경계값 설정 + bounds = [0.5, 1.0, 10.0, cm.max() + 0.5] # 1.0, 10.0을 경계값으로 설정 + + # 컬러맵을 적용할 값의 범위 설정 + norm = mcolors.BoundaryNorm(bounds, cmap.N) + + # 라벨 설정 + labels = sorted(list(df['label'].unique())) + + # heatmap 그리기 + plt.figure(figsize=(10, 7)) + sns.heatmap(cm, annot=True, annot_kws={"size":8}, cmap=cmap, norm=norm, fmt='g', xticklabels=labels, yticklabels=labels) + + # 축 이름 및 제목 설정 + plt.xlabel('Predicted Labels') + plt.ylabel('True Labels') + plt.title('Confusion Matrix') + + # 그래프 표시 + plt.show() + + +def all_label_matrix(df: pd.DataFrame, sort_column: str = 'label') -> pd.DataFrame: + """ + 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 + 각 label에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다. 
+ + arguments: + df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 + sort_column (str): confusion matrix dataframe을 정렬하는 기준 열 + + return: + metric_df (pd.DataFrame): confusion matrix dataframe + """ + label_list = list(sorted(df['label'].unique())) + + label = [len(df[df['label'] == label]) for label in label_list] + pred_label = [len(df[df['pred_label'] == label]) for label in label_list] + TP = [len(df[(df['pred_label'] == label) & (df['label'] == label)]) for label in label_list] + FP = [len(df[(df['pred_label'] == label) & (df['label'] != label)]) for label in label_list] + FN = [len(df[(df['pred_label'] != label) & (df['label'] == label)]) for label in label_list] + + precision = [] + for tp, fp in zip(TP, FP): + if tp + fp > 0: + p = round(tp / (tp + fp), 4) + else: + p = 0 + precision.append(p) + + recall = [] + for tp, fn in zip(TP, FN): + if tp + fn > 0: + r = round(tp / (tp + fn), 4) + else: + r = 0 + recall.append(r) + + metric_df = pd.DataFrame(zip(label_list, label, pred_label, TP, FP, FN, precision, recall)) + metric_df.columns = ['label', 'label_#', 'pred_label_#', 'TP', 'FP', 'FN', 'precision', 'recall'] + metric_df = metric_df.sort_values(sort_column) + + return metric_df + + +def specific_label_matrix(df: pd.DataFrame, label: str ='no_relation') -> pd.DataFrame: + """ + 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 + 주어진 label에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다. + + arguments: + df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 + label (str): confusion matrix을 계산할 label + + return: + metric_df (pd.DataFrame): 주어진 label에 대한 confusion matrix dataframe + """ + TP = len(df[(df['pred_label'] == label) & (df['label'] == label)]) + FP = len(df[(df['pred_label'] == label) & (df['label'] != label)]) + FN = len(df[(df['pred_label'] != label) & (df['label'] == label)]) + + precision = round(TP / (TP + FP), 4) + recall = round(TP / (TP + FN), 4) + + metric_dict = {"TP": TP, "FP": FP, "FN": FN, "precision": precision, "recall": recall} + metric_df = pd.DataFrame.from_dict(data = metric_dict, + orient='index', + columns=['value']) + + return metric_df + + +def total_metric(df: pd.DataFrame) -> pd.DataFrame: + """ + 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 + 전체 데이터에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다. + + arguments: + df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 + + return: + metric_df (pd.DataFrame): 주어진 데이터에 대한 confusion matrix dataframe + """ + df = all_label_matrix(df) + cleared_df = df[df['label'] != 'no_relation'].copy() + + TP = sum(cleared_df['TP']) + FP = sum(cleared_df['FP']) + FN = sum(cleared_df['FN']) + + precision = TP / (TP + FP) + recall = TP / (TP + FN) + + F1 = 2 * precision * recall / (precision + recall) + + metric_dict = {"TP": TP, "FP": FP, "FN": FN, "micro precision": precision, "micro recall": recall, " mircro F1 score": F1} + metric_df = pd.DataFrame.from_dict(data = metric_dict, + orient='index', + columns=['value']) + + return metric_df + + +def precision_recall_graph(df: pd.DataFrame): + """ + 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 + 각 label에 대한 precision과 recall을 계산하고 scatterplot 형태로 시각화합니다. + + arguments: + df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임 + + return: + None. 함수는 precision과 recall에 대한 scatterplot을 출력합니다. 
+ """ + label_list = list(sorted(df['label'].unique())) + + label = [len(df[df['label'] == label]) for label in label_list] + pred_label = [len(df[df['pred_label'] == label]) for label in label_list] + TP = [len(df[(df['pred_label'] == label) & (df['label'] == label)]) for label in label_list] + FP = [len(df[(df['pred_label'] == label) & (df['label'] != label)]) for label in label_list] + FN = [len(df[(df['pred_label'] != label) & (df['label'] == label)]) for label in label_list] + + precision = [] + for tp, fp in zip(TP, FP): + if tp + fp > 0: + p = round(tp / (tp + fp), 4) + else: + p = 0 + precision.append(p) + + recall = [] + for tp, fn in zip(TP, FN): + if tp + fn > 0: + r = round(tp / (tp + fn), 4) + else: + r = 0 + recall.append(r) + + plt.scatter(recall, precision) + + # 그래프 제목과 축 레이블 설정 + plt.title('relation between recall and precision') + plt.xlabel('recall') + plt.ylabel('precision') + + # 그래프 보이기 plt.show() \ No newline at end of file diff --git a/full_train.py b/full_train.py index 1a66daa..b5bfc79 100644 --- a/full_train.py +++ b/full_train.py @@ -1,158 +1,158 @@ -import sys -import pickle as pickle -import pytz -from datetime import datetime -from typing import Dict, Any - -import torch -from transformers import ( - AutoTokenizer, - EarlyStoppingCallback, - TrainingArguments, -) - -from utils.args import * -from load_data.load_data import * -from model.model import * -from model.metric import * -from trainer.trainer import * -from utils.utils import * - - -def train(config: Any) -> None: - """ - 모델을 학습하는 함수, train.py와 다른 점은 평가 데이터셋 사용하지 않고 모든 데이터를 훈련에 활용함 - - 다음 프로세스를 수행: - 1. 데이터셋을 불러오고 전처리 및 토큰화 - 2. 레이블을 숫자 형태로 변환 - 3. 학습 및 개발 데이터셋에 대한 Dataset 객체를 생성 - 4. 지정된 모델을 불러와 훈련 인자 설정 - 5. 모델 학습 후 저장 - - Args: - config (dict): 모델 학습에 필요한 모든 구성 매개변수를 포함하는 딕셔너리 - dataloader, model, optimizer, trainer 구성 포함 - - Returns: - None - """ - seed_everything(config.seed) - - # load model and tokenizer - model_name = config.model['name'] - tokenizer = AutoTokenizer.from_pretrained(model_name) - - # 1. load dataset - # 2. preprocess dataset - # 3. tokenize dataset - revision = "69b6010fe9681567b98f9d3d3c70487079183d4b" - input_format = config.dataloader.get('input_format') - prompt = config.dataloader.get('prompt') - type_transform = config.dataloader.get('type_transform') - - train_dataset, train_raw_label = load_train_dataset( - split=config.dataloader['train_split'], - revision=revision, - tokenizer=tokenizer, - input_format=input_format, - prompt=prompt, - type_transform=type_transform, - ) - - train_label = label_to_num(train_raw_label) - - # 4. make Dataset object - re_train_dataset = REDataset(train_dataset, train_label) - - device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - print(device) - - # 5. import model - # setting model hyperparameter - model_module = __import__('model.model', fromlist=[config.model['variant']]) - model_class = getattr(model_module, config.model['variant']) - # Available customized classes: - # BaseREModel, BiLSTMREModel, BiGRUREModel - model = model_class(config, len(tokenizer)) - - print(model.model_config) - - model.parameters - model.to(device) - - # 6. training arguments 설정 - ## 사용한 option 외에도 다양한 option들이 있습니다. - ## https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요. 
- training_args = TrainingArguments( - # 기본 설정 - output_dir=config.trainer['output_dir'], # 모델 저장 디렉토리 - report_to=('wandb' if config.use_wandb else 'none'), # wandb 사용 여부 - fp16=True, # 16-bit floating point precision - - # 학습 설정 - num_train_epochs=config.trainer['epochs'], # 전체 훈련 epoch 수 - learning_rate=config.optimizer['lr'], # learning rate - weight_decay=config.optimizer['weight_decay'], # weight decay - adam_beta2=config.optimizer['adam_beta2'], # AdamW 옵티마이저의 beta2 하이퍼파라미터 - - # 배치 사이즈 설정 - per_device_train_batch_size=config.dataloader['batch_size'], # 훈련 중 장치 당 batch size - - # 스케줄링 설정 - warmup_ratio=config.lr_scheduler['warmup_ratio'], # learning rate scheduler의 warmup 비율 - - # 로깅 설정 - logging_dir=config.trainer['logging_dir'], # 로그 저장 디렉토리 - logging_steps=config.trainer['logging_steps'], # 로그 저장 스텝 - load_best_model_at_end=config.trainer['use_early_stop'], - ) - - # 7. trainer 설정 - # 8. evaluate 함수 설정 - trainer = RETrainer( - model=model, # the instantiated 🤗 Transformers model to be trained - args=training_args, # training arguments, defined above - train_dataset=re_train_dataset, # training dataset - compute_metrics=compute_metrics, # define metrics function - loss_cfg=config.loss, - ) - - # 9. train model - trainer.train() - # 10. save model - trainer.save_model(config.trainer['model_dir']) - - -def main() -> None: - """ - config를 불러오고 학습 프로세스를 시작하는 메인 함수입니다. - - 다음 프로세스를 수행: - 1. 제공된 YAML 파일에서 구성을 파싱하거나 기본 파일을 사용 - 2. 제공된 구성으로 Weights & Biases (wandb) 실행 초기화 - 3. train 함수를 호출하여 모델 훈련 프로세스를 시작 - 4. 학습 완료 후 wandb에 완료 메세지 송출 - - Args: - None - - Returns: - None - """ - try: - config_path = sys.argv[1] - except IndexError: - config_path = './config.yaml' - config = parse_arguments(config_path) - - now = datetime.now(pytz.timezone('Asia/Seoul')) - run_name = f'{config.run_name}_{now.strftime("%d-%H-%M")}' - - init_wandb(config, run_name) - train(config) - alert_wandb(config, run_name, 'finished') - - -if __name__ == '__main__': +import sys +import pickle as pickle +import pytz +from datetime import datetime +from typing import Dict, Any + +import torch +from transformers import ( + AutoTokenizer, + EarlyStoppingCallback, + TrainingArguments, +) + +from utils.args import * +from load_data.load_data import * +from model.model import * +from model.metric import * +from trainer.trainer import * +from utils.utils import * + + +def train(config: Any) -> None: + """ + 모델을 학습하는 함수, train.py와 다른 점은 평가 데이터셋 사용하지 않고 모든 데이터를 훈련에 활용함 + + 다음 프로세스를 수행: + 1. 데이터셋을 불러오고 전처리 및 토큰화 + 2. 레이블을 숫자 형태로 변환 + 3. 학습 및 개발 데이터셋에 대한 Dataset 객체를 생성 + 4. 지정된 모델을 불러와 훈련 인자 설정 + 5. 모델 학습 후 저장 + + Args: + config (dict): 모델 학습에 필요한 모든 구성 매개변수를 포함하는 딕셔너리 + dataloader, model, optimizer, trainer 구성 포함 + + Returns: + None + """ + seed_everything(config.seed) + + # load model and tokenizer + model_name = config.model['name'] + tokenizer = AutoTokenizer.from_pretrained(model_name) + + # 1. load dataset + # 2. preprocess dataset + # 3. tokenize dataset + revision = "69b6010fe9681567b98f9d3d3c70487079183d4b" + input_format = config.dataloader.get('input_format') + prompt = config.dataloader.get('prompt') + type_transform = config.dataloader.get('type_transform') + + train_dataset, train_raw_label = load_train_dataset( + split=config.dataloader['train_split'], + revision=revision, + tokenizer=tokenizer, + input_format=input_format, + prompt=prompt, + type_transform=type_transform, + ) + + train_label = label_to_num(train_raw_label) + + # 4. 
make Dataset object + re_train_dataset = REDataset(train_dataset, train_label) + + device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + print(device) + + # 5. import model + # setting model hyperparameter + model_module = __import__('model.model', fromlist=[config.model['variant']]) + model_class = getattr(model_module, config.model['variant']) + # Available customized classes: + # BaseREModel, BiLSTMREModel, BiGRUREModel + model = model_class(config, len(tokenizer)) + + print(model.model_config) + + model.parameters + model.to(device) + + # 6. training arguments 설정 + ## 사용한 option 외에도 다양한 option들이 있습니다. + ## https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요. + training_args = TrainingArguments( + # 기본 설정 + output_dir=config.trainer['output_dir'], # 모델 저장 디렉토리 + report_to=('wandb' if config.use_wandb else 'none'), # wandb 사용 여부 + fp16=True, # 16-bit floating point precision + + # 학습 설정 + num_train_epochs=config.trainer['epochs'], # 전체 훈련 epoch 수 + learning_rate=config.optimizer['lr'], # learning rate + weight_decay=config.optimizer['weight_decay'], # weight decay + adam_beta2=config.optimizer['adam_beta2'], # AdamW 옵티마이저의 beta2 하이퍼파라미터 + + # 배치 사이즈 설정 + per_device_train_batch_size=config.dataloader['batch_size'], # 훈련 중 장치 당 batch size + + # 스케줄링 설정 + warmup_ratio=config.lr_scheduler['warmup_ratio'], # learning rate scheduler의 warmup 비율 + + # 로깅 설정 + logging_dir=config.trainer['logging_dir'], # 로그 저장 디렉토리 + logging_steps=config.trainer['logging_steps'], # 로그 저장 스텝 + load_best_model_at_end=config.trainer['use_early_stop'], + ) + + # 7. trainer 설정 + # 8. evaluate 함수 설정 + trainer = RETrainer( + model=model, # the instantiated 🤗 Transformers model to be trained + args=training_args, # training arguments, defined above + train_dataset=re_train_dataset, # training dataset + compute_metrics=compute_metrics, # define metrics function + loss_cfg=config.loss, + ) + + # 9. train model + trainer.train() + # 10. save model + trainer.save_model(config.trainer['model_dir']) + + +def main() -> None: + """ + config를 불러오고 학습 프로세스를 시작하는 메인 함수입니다. + + 다음 프로세스를 수행: + 1. 제공된 YAML 파일에서 구성을 파싱하거나 기본 파일을 사용 + 2. 제공된 구성으로 Weights & Biases (wandb) 실행 초기화 + 3. train 함수를 호출하여 모델 훈련 프로세스를 시작 + 4. 
학습 완료 후 wandb에 완료 메세지 송출 + + Args: + None + + Returns: + None + """ + try: + config_path = sys.argv[1] + except IndexError: + config_path = './config.yaml' + config = parse_arguments(config_path) + + now = datetime.now(pytz.timezone('Asia/Seoul')) + run_name = f'{config.run_name}_{now.strftime("%d-%H-%M")}' + + init_wandb(config, run_name) + train(config) + alert_wandb(config, run_name, 'finished') + + +if __name__ == '__main__': main() \ No newline at end of file diff --git a/inference.py b/inference.py index 6ab6433..f1da80d 100755 --- a/inference.py +++ b/inference.py @@ -1,177 +1,177 @@ -import pickle as pickle -import sys - -import numpy as np -import pandas as pd -import torch -import torch.nn.functional as F -from tqdm import tqdm -from torch.utils.data import DataLoader -from transformers import AutoTokenizer -from typing import Tuple, List - -from utils.args import * -from load_data.load_data import * -from model.model import * -from utils.utils import * - - -def inference(model: torch.nn.Module, tokenized_sent: DataLoader, device: torch.device) -> Tuple[List[int], List[List[float]]]: - """ - test dataset을 DataLoader로 만들어 준 후 batch_size로 나눠 model이 예측 - - Args: - model (torch.nn.Module): 예측에 사용할 모델 - tokenized_sent (DataLoader): 토큰화가 완료된 문장 데이터셋 - device (torch.device): 모델을 실행할 디바이스 (예: cuda:0) - - Returns: - Tuple[List[int], List[List[float]]]: 예측된 클래스 인덱스와 각 클래스에 대한 확률이 담긴 리스트를 반환 - """ - - dataloader = DataLoader(tokenized_sent, batch_size=32, shuffle=False) - model.eval() - - output_pred = [] - output_prob = [] - - for i, data in enumerate(tqdm(dataloader)): - with torch.no_grad(): - outputs = model( - input_ids=data['input_ids'].to(device), - attention_mask=data['attention_mask'].to(device), - token_type_ids=data['token_type_ids'].to(device), - ) - - logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0] - prob = F.softmax(logits, dim=-1).detach().cpu().numpy() - logits = logits.detach().cpu().numpy() - result = np.argmax(logits, axis=-1) - - output_pred.append(result) - output_prob.append(prob) - - return ( - np.concatenate(output_pred).tolist(), - np.concatenate(output_prob, axis=0).tolist(), - ) - - -def main() -> None: - """ - 주어진 데이터셋 csv 파일과 같은 형태일 경우 inference를 수행할 수 있는 메인 함수 - - 다음 프로세스를 수행: - 1. config에 따라 시드를 고정하고, 디바이스를 설정 - 2. 토크나이저와 모델을 로드하고, 학습시킨 모델을 로드 - 3. 테스트 데이터셋을 로드하고, 데이터셋 객체 생성 - 4. 모델을 이용하여 예측 수행 - 5. 예측 결과를 csv 파일로 저장 - 6. 
full train이 아닐 경우 검증 데이터셋에 대해서도 같은 과정을 수행 - - Args: - None - - Returns: - None - """ - - try: - config_path = sys.argv[1] - except IndexError: - config_path = './config.yaml' - - config = parse_arguments(config_path) - - seed_everything(config.seed) - - device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - - # load tokenizer - model_name = config.model['name'] - tokenizer = AutoTokenizer.from_pretrained(model_name) - - # load my model - model_module = __import__('model.model', fromlist=[config.model['variant']]) - model_class = getattr(model_module, config.model['variant']) - # Available customized classes: - # BaseREModel, BiLSTMREModel, BiGRUREModel - model = model_class(config, len(tokenizer)) - - load_model_path = './best_model/pytorch_model.bin' - checkpoint = torch.load(load_model_path) - model.load_state_dict(checkpoint) - - model.parameters - model.to(device) - - # load test dataset - revision = config.dataloader['revision'] - input_format = config.dataloader.get('input_format') - prompt = config.dataloader.get('prompt') - type_transform = config.dataloader.get('type_transform') - - test_id, test_dataset, test_label = load_test_dataset( - split='test', - revision=revision, - tokenizer=tokenizer, - input_format=input_format, - prompt=prompt, - type_transform=type_transform, - ) - re_test_dataset = REDataset(test_dataset, test_label) - - # predict answer - pred_answer, output_prob = inference(model, re_test_dataset, device) # model에서 class 추론 - pred_answer = num_to_label(pred_answer) # 숫자로 된 class를 원래 문자열 라벨로 변환 - - # make csv file with predicted answer - output = pd.DataFrame( - { - 'id': test_id, - 'pred_label': pred_answer, - 'probs': output_prob, - } - ) - output_path = config.trainer['pred_dir'] - os.makedirs(os.path.dirname(output_path), exist_ok=True) - output.to_csv(output_path, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장 - - ## 사후분석을 위한 validation data inference - # load validation dataset(full train일 경우 revision에 valid가 없어서 load_test_dataset에서 오류가 생기므로 넘기기) - try: - val_id, val_dataset, val_label = load_test_dataset( - split=config.dataloader['valid_split'], - revision=revision, - tokenizer=tokenizer, - input_format=input_format, - prompt=prompt, - type_transform=type_transform, - ) - re_val_dataset = REDataset(val_dataset, [100] * len(val_id)) - - # predict validation answer - pred_val_answer, val_output_prob = inference(model, re_val_dataset, device) - pred_val_answer = num_to_label(pred_val_answer) - - # make csv file with predicted validation answer - val_output = pd.DataFrame( - { - 'id': val_id, - 'true_label': val_label, - 'pred_label': pred_val_answer, - 'probs': val_output_prob, - } - ) - val_output_path = config.trainer['val_pred_dir'] - os.makedirs(os.path.dirname(val_output_path), exist_ok=True) - val_output.to_csv(val_output_path, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장 - - except ValueError: - print('There is no existing valiation dataset. The inference output is from full dataset model.') - - print('---- Finish! 
----') - - -if __name__ == '__main__': +import pickle as pickle +import sys + +import numpy as np +import pandas as pd +import torch +import torch.nn.functional as F +from tqdm import tqdm +from torch.utils.data import DataLoader +from transformers import AutoTokenizer +from typing import Tuple, List + +from utils.args import * +from load_data.load_data import * +from model.model import * +from utils.utils import * + + +def inference(model: torch.nn.Module, tokenized_sent: DataLoader, device: torch.device) -> Tuple[List[int], List[List[float]]]: + """ + test dataset을 DataLoader로 만들어 준 후 batch_size로 나눠 model이 예측 + + Args: + model (torch.nn.Module): 예측에 사용할 모델 + tokenized_sent (DataLoader): 토큰화가 완료된 문장 데이터셋 + device (torch.device): 모델을 실행할 디바이스 (예: cuda:0) + + Returns: + Tuple[List[int], List[List[float]]]: 예측된 클래스 인덱스와 각 클래스에 대한 확률이 담긴 리스트를 반환 + """ + + dataloader = DataLoader(tokenized_sent, batch_size=32, shuffle=False) + model.eval() + + output_pred = [] + output_prob = [] + + for i, data in enumerate(tqdm(dataloader)): + with torch.no_grad(): + outputs = model( + input_ids=data['input_ids'].to(device), + attention_mask=data['attention_mask'].to(device), + token_type_ids=data['token_type_ids'].to(device), + ) + + logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0] + prob = F.softmax(logits, dim=-1).detach().cpu().numpy() + logits = logits.detach().cpu().numpy() + result = np.argmax(logits, axis=-1) + + output_pred.append(result) + output_prob.append(prob) + + return ( + np.concatenate(output_pred).tolist(), + np.concatenate(output_prob, axis=0).tolist(), + ) + + +def main() -> None: + """ + 주어진 데이터셋 csv 파일과 같은 형태일 경우 inference를 수행할 수 있는 메인 함수 + + 다음 프로세스를 수행: + 1. config에 따라 시드를 고정하고, 디바이스를 설정 + 2. 토크나이저와 모델을 로드하고, 학습시킨 모델을 로드 + 3. 테스트 데이터셋을 로드하고, 데이터셋 객체 생성 + 4. 모델을 이용하여 예측 수행 + 5. 예측 결과를 csv 파일로 저장 + 6. 
full train이 아닐 경우 검증 데이터셋에 대해서도 같은 과정을 수행 + + Args: + None + + Returns: + None + """ + + try: + config_path = sys.argv[1] + except IndexError: + config_path = './config.yaml' + + config = parse_arguments(config_path) + + seed_everything(config.seed) + + device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + + # load tokenizer + model_name = config.model['name'] + tokenizer = AutoTokenizer.from_pretrained(model_name) + + # load my model + model_module = __import__('model.model', fromlist=[config.model['variant']]) + model_class = getattr(model_module, config.model['variant']) + # Available customized classes: + # BaseREModel, BiLSTMREModel, BiGRUREModel + model = model_class(config, len(tokenizer)) + + load_model_path = './best_model/pytorch_model.bin' + checkpoint = torch.load(load_model_path) + model.load_state_dict(checkpoint) + + model.parameters + model.to(device) + + # load test dataset + revision = config.dataloader['revision'] + input_format = config.dataloader.get('input_format') + prompt = config.dataloader.get('prompt') + type_transform = config.dataloader.get('type_transform') + + test_id, test_dataset, test_label = load_test_dataset( + split='test', + revision=revision, + tokenizer=tokenizer, + input_format=input_format, + prompt=prompt, + type_transform=type_transform, + ) + re_test_dataset = REDataset(test_dataset, test_label) + + # predict answer + pred_answer, output_prob = inference(model, re_test_dataset, device) # model에서 class 추론 + pred_answer = num_to_label(pred_answer) # 숫자로 된 class를 원래 문자열 라벨로 변환 + + # make csv file with predicted answer + output = pd.DataFrame( + { + 'id': test_id, + 'pred_label': pred_answer, + 'probs': output_prob, + } + ) + output_path = config.trainer['pred_dir'] + os.makedirs(os.path.dirname(output_path), exist_ok=True) + output.to_csv(output_path, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장 + + ## 사후분석을 위한 validation data inference + # load validation dataset(full train일 경우 revision에 valid가 없어서 load_test_dataset에서 오류가 생기므로 넘기기) + try: + val_id, val_dataset, val_label = load_test_dataset( + split=config.dataloader['valid_split'], + revision=revision, + tokenizer=tokenizer, + input_format=input_format, + prompt=prompt, + type_transform=type_transform, + ) + re_val_dataset = REDataset(val_dataset, [100] * len(val_id)) + + # predict validation answer + pred_val_answer, val_output_prob = inference(model, re_val_dataset, device) + pred_val_answer = num_to_label(pred_val_answer) + + # make csv file with predicted validation answer + val_output = pd.DataFrame( + { + 'id': val_id, + 'true_label': val_label, + 'pred_label': pred_val_answer, + 'probs': val_output_prob, + } + ) + val_output_path = config.trainer['val_pred_dir'] + os.makedirs(os.path.dirname(val_output_path), exist_ok=True) + val_output.to_csv(val_output_path, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장 + + except ValueError: + print('There is no existing valiation dataset. The inference output is from full dataset model.') + + print('---- Finish! 
----') + + +if __name__ == '__main__': main() \ No newline at end of file diff --git a/load_data/load_data.py b/load_data/load_data.py index f2794ad..a0babde 100755 --- a/load_data/load_data.py +++ b/load_data/load_data.py @@ -1,320 +1,320 @@ -import pickle as pickle -import re - -import torch -from datasets import Dataset, load_dataset -from transformers import PreTrainedTokenizer -from tqdm import tqdm -from typing import Dict, List, Tuple, Union - -from utils.utils import * - - -def load_train_dataset( - split: str, - revision: str, - tokenizer: PreTrainedTokenizer, - input_format: str = None, - prompt: str = None, - type_transform: bool = False -) -> Tuple[Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]: - """ - train dataset을 불러온 후, tokenizing 하는 함수입니다. - - Args: - split (str): 데이터셋의 분할 유형 (train, validation, test). - revision (str): 데이터셋의 버전 (commit hash). - tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체. - input_format (str, optional): entity representation 유형. 기본값은 None이며, default로 설정됩니다. - prompt (str, optional): prompt 유형. 기본값은 None이며, default로 설정됩니다. - type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다. - - Returns: - Tuple[Dict[str, Union[List[str], List[int]]], Union[int, List[str]]] - : 토큰화된 train 데이터셋과 레이블. - """ - - if input_format is None: - input_format = 'default' - if prompt is None: - prompt = 'default' - print('input format: ',input_format, '| prompt: ', prompt) - - dataset = load_dataset( - 'Smoked-Salmon-s/RE_Competition', - split=split, - column_names=['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'], - revision=revision, - ) - pd_dataset = dataset.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'}) - train_dataset = preprocessing_dataset(pd_dataset, input_format, type_transform) - tokenized_train = tokenized_dataset(train_dataset, tokenizer, input_format, prompt) - train_label = pd_dataset['label'].values - - return tokenized_train, train_label - - -def load_test_dataset( - split: str, - revision: str, - tokenizer: PreTrainedTokenizer, - input_format: str = None, - prompt: str = None, - type_transform: bool = False -) -> Tuple[Union[int, str], Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]: - """ - test dataset을 불러온 후, tokenizing 하는 함수입니다. - - Args: - split (str): 데이터셋의 분할 유형 (train, validation, test). - revision (str): 데이터셋의 버전 (commit hash). - tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체. - input_format (str, optional): entity representation 유형. 기본값은 None이며, default로 설정됩니다. - prompt (str, optional): prompt 유형. 기본값은 None이며, default로 설정됩니다. - type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다. - - Returns: - Tuple[Union[int, str], Dict[str, Union[List[str], List[int]]], Union[int, List[str]]] - : test 데이터셋의 id, 토큰화된 문장, 레이블. 
- """ - - if input_format is None: - input_format = 'default' - if prompt is None: - prompt = 'default' - print('input format: ',input_format, 'prompt: ', prompt) - - dataset = load_dataset( - 'Smoked-Salmon-s/RE_Competition', - split=split, - column_names=['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'], - revision=revision, - ) - pd_dataset = dataset.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'}) - test_dataset = preprocessing_dataset(pd_dataset, input_format, type_transform) - tokenized_test = tokenized_dataset(test_dataset, tokenizer, input_format, prompt) - - if split == 'test': - test_label = list(map(int, pd_dataset['label'].values)) - else: - test_label = pd_dataset['label'].values - - return test_dataset['id'], tokenized_test, test_label - - -def preprocessing_dataset( - dataset: Dict[str, List[str]], - input_format: str, - type_transform: bool = False -) -> Dict[str, List[str]]: - """ - subject_entity column과 object_entity column을 리스트 형태로 변환하고, - sentence column에 entity representation를 적용하는 함수입니다. - - Args: - dataset (Dict[str, List[str]]): 전처리할 데이터셋. - input_format (str): entity representation 유형. - type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다. - - Returns: - Dict[str, List[str]]: 전처리된 데이터셋. - """ - - subject_entity = [] - object_entity = [] - - for i, j in zip(dataset['subject_entity'], dataset['object_entity']): - i = i[1:-1].split(',')[0].split(':')[1] - j = j[1:-1].split(',')[0].split(':')[1] - subject_entity.append(i) - object_entity.append(j) - - dataset['subj_entity'] = subject_entity - dataset['obj_entity'] = object_entity - - # entity type을 한글로 번역 - if type_transform: - print('entity type을 한글로 번역합니다.') - hanguled = [to_hangul(row_data) for index, row_data in tqdm(dataset.iterrows())] - dataset['subject_entity'] = [x[0] for x in hanguled] - dataset['object_entity'] = [x[1] for x in hanguled] - - # entity representation 적용 - input_format_list = ['entity_mask', 'entity_marker', 'entity_marker_punct', 'typed_entity_marker', 'typed_entity_marker_punct'] - if input_format in input_format_list: - marked_sentences = [marker(row_data, input_format) for index, row_data in tqdm(dataset.iterrows())] - dataset['sentence'] = marked_sentences - elif input_format == 'default': - pass - else: - raise ValueError('잘못된 input_format이 입력되었습니다. ') - - return dataset - - -def tokenized_dataset( - dataset: Dict[str, List[str]], - tokenizer: PreTrainedTokenizer, - input_format: str, - prompt: str -) -> Dict[str, Union[List[str], List[int]]]: - """ - tokenizer에 따라 문장을 토큰화하는 함수입니다. - - Args: - dataset (Dict[str, List[str]]): 토큰화할 데이터셋. - tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체. - input_format (str): entity representation 유형. - prompt (str): prompt 유형. - - Returns: - Dict[str, Union[List[str], List[int]]]: 토큰화된 문장의 딕셔너리. 
- """ - - # 새로운 특수 토큰 추가 - special_tokens = [] - - if input_format == 'entity_mask': - special_tokens = ['[SUBJ-ORG]', '[SUBJ-PER]', '[OBJ-ORG]', '[OBJ-PER]', '[OBJ-LOC]', '[OBJ-DAT]', '[OBJ-POH]', '[OBJ-NOH]'] - - elif input_format == 'entity_marker': - special_tokens = ['[E1]', '[/E1]', '[E2]', '[/E2]'] - - elif input_format == 'typed_entity_marker': - special_tokens = ['', '', '', '', '', '', '', '', - '', '', '', '', '', '', '', ''] - - tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}) - - # check - print("length of tokenizer:", len(tokenizer)) - print("length of special tokens: ", tokenizer.all_special_tokens) - print("special tokens:", tokenizer.special_tokens_map) - - # prompt 추가 - if prompt in ['s_sep_o', 's_and_o', 'quiz']: - prompt_forward = [] - - if prompt == 's_sep_o': - for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']): - temp = '' - temp = e01[2:-1] + '[SEP]' + e02[2:-1] - prompt_forward.append(temp) - - elif prompt == 's_and_o': - for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']): - temp = '' - temp = e01[2:-1] + '와 ' + e02[2:-1] + '의 관계' - prompt_forward.append(temp) - - elif prompt == 'quiz': - for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']): - temp = '' - temp = '다음 문장에서 ' + e01[2:-1] + '와 ' + e02[2:-1] + '사이의 관계를 추출하시오.' - prompt_forward.append(temp) - - tokenized_sentences = tokenizer( - prompt_forward, - list(dataset['sentence']), - return_tensors='pt', - padding=True, - truncation=True, - max_length=180, - add_special_tokens=True, - ) - - elif prompt == 'problem': - prompt_forward = [] - prompt_backward = [] - - for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']): - temp = '' - temp = '다음 문장에서 ' + e01[2:-1] + '와 ' + e02[2:-1] + '사이의 관계를 추출하시오.' - prompt_forward.append(temp) - for e00, e01, e02 in zip(dataset['sentence'], dataset['subj_entity'], dataset['obj_entity']): - temp = '' - temp = e00 + e01[2:-1] + '와 ' + e02[2:-1] + '는 어떤 관계입니까?' - prompt_backward.append(temp) - - tokenized_sentences = tokenizer( - prompt_forward, - prompt_backward, - return_tensors='pt', - padding=True, - truncation=True, - max_length=200, - add_special_tokens=True, - ) - - elif prompt == 'default': - tokenized_sentences = tokenizer( - list(dataset['sentence']), - return_tensors='pt', - padding=True, - truncation=True, - max_length=180, - add_special_tokens=True, - ) - - else: - raise ValueError('잘못된 prompt가 입력되었습니다. ') - - return tokenized_sentences - - -def label_to_num(label: List[str]) -> List[int]: - """ - 원본 문자열 label을 숫자 형식 클래스로 변환하는 함수입니다. - - Args: - label (List[str]): 변환할 원본 문자열 클래스 리스트. - - Returns: - List[int]: 숫자 형식으로 변환된 클래스 리스트. - """ - - num_label = [] - with open('load_data/dict_label_to_num.pkl', 'rb') as f: - dict_label_to_num = pickle.load(f) - for v in label: - num_label.append(dict_label_to_num[v]) - - return num_label - - -def num_to_label(label: List[int]) -> List[str]: - """ - 숫자 형식 클래스를 원본 문자열 label로 변환하는 함수입니다. - - Args: - label (List[int]): 변환할 숫자 형식의 클래스 리스트. - - Returns: - List[str]: 원본 문자열로 변환된 클래스 리스트. 
- """ - - origin_label = [] - with open('load_data/dict_num_to_label.pkl', 'rb') as f: - dict_num_to_label = pickle.load(f) - for v in label: - origin_label.append(dict_num_to_label[v]) - - return origin_label - - -class REDataset(torch.utils.data.Dataset): - """Dataset 구성을 위한 class입니다.""" - - def __init__(self, pair_dataset, labels): - self.pair_dataset = pair_dataset - self.labels = labels - - def __getitem__(self, idx): - item = { - key: val[idx].clone().detach() for key, val in self.pair_dataset.items() - } - item['labels'] = torch.tensor(self.labels[idx]) - return item - - def __len__(self): +import pickle as pickle +import re + +import torch +from datasets import Dataset, load_dataset +from transformers import PreTrainedTokenizer +from tqdm import tqdm +from typing import Dict, List, Tuple, Union + +from utils.utils import * + + +def load_train_dataset( + split: str, + revision: str, + tokenizer: PreTrainedTokenizer, + input_format: str = None, + prompt: str = None, + type_transform: bool = False +) -> Tuple[Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]: + """ + train dataset을 불러온 후, tokenizing 하는 함수입니다. + + Args: + split (str): 데이터셋의 분할 유형 (train, validation, test). + revision (str): 데이터셋의 버전 (commit hash). + tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체. + input_format (str, optional): entity representation 유형. 기본값은 None이며, default로 설정됩니다. + prompt (str, optional): prompt 유형. 기본값은 None이며, default로 설정됩니다. + type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다. + + Returns: + Tuple[Dict[str, Union[List[str], List[int]]], Union[int, List[str]]] + : 토큰화된 train 데이터셋과 레이블. + """ + + if input_format is None: + input_format = 'default' + if prompt is None: + prompt = 'default' + print('input format: ',input_format, '| prompt: ', prompt) + + dataset = load_dataset( + 'Smoked-Salmon-s/RE_Competition', + split=split, + column_names=['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'], + revision=revision, + ) + pd_dataset = dataset.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'}) + train_dataset = preprocessing_dataset(pd_dataset, input_format, type_transform) + tokenized_train = tokenized_dataset(train_dataset, tokenizer, input_format, prompt) + train_label = pd_dataset['label'].values + + return tokenized_train, train_label + + +def load_test_dataset( + split: str, + revision: str, + tokenizer: PreTrainedTokenizer, + input_format: str = None, + prompt: str = None, + type_transform: bool = False +) -> Tuple[Union[int, str], Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]: + """ + test dataset을 불러온 후, tokenizing 하는 함수입니다. + + Args: + split (str): 데이터셋의 분할 유형 (train, validation, test). + revision (str): 데이터셋의 버전 (commit hash). + tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체. + input_format (str, optional): entity representation 유형. 기본값은 None이며, default로 설정됩니다. + prompt (str, optional): prompt 유형. 기본값은 None이며, default로 설정됩니다. + type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다. + + Returns: + Tuple[Union[int, str], Dict[str, Union[List[str], List[int]]], Union[int, List[str]]] + : test 데이터셋의 id, 토큰화된 문장, 레이블. 
+ """ + + if input_format is None: + input_format = 'default' + if prompt is None: + prompt = 'default' + print('input format: ',input_format, 'prompt: ', prompt) + + dataset = load_dataset( + 'Smoked-Salmon-s/RE_Competition', + split=split, + column_names=['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'], + revision=revision, + ) + pd_dataset = dataset.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'}) + test_dataset = preprocessing_dataset(pd_dataset, input_format, type_transform) + tokenized_test = tokenized_dataset(test_dataset, tokenizer, input_format, prompt) + + if split == 'test': + test_label = list(map(int, pd_dataset['label'].values)) + else: + test_label = pd_dataset['label'].values + + return test_dataset['id'], tokenized_test, test_label + + +def preprocessing_dataset( + dataset: Dict[str, List[str]], + input_format: str, + type_transform: bool = False +) -> Dict[str, List[str]]: + """ + subject_entity column과 object_entity column을 리스트 형태로 변환하고, + sentence column에 entity representation를 적용하는 함수입니다. + + Args: + dataset (Dict[str, List[str]]): 전처리할 데이터셋. + input_format (str): entity representation 유형. + type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다. + + Returns: + Dict[str, List[str]]: 전처리된 데이터셋. + """ + + subject_entity = [] + object_entity = [] + + for i, j in zip(dataset['subject_entity'], dataset['object_entity']): + i = i[1:-1].split(',')[0].split(':')[1] + j = j[1:-1].split(',')[0].split(':')[1] + subject_entity.append(i) + object_entity.append(j) + + dataset['subj_entity'] = subject_entity + dataset['obj_entity'] = object_entity + + # entity type을 한글로 번역 + if type_transform: + print('entity type을 한글로 번역합니다.') + hanguled = [to_hangul(row_data) for index, row_data in tqdm(dataset.iterrows())] + dataset['subject_entity'] = [x[0] for x in hanguled] + dataset['object_entity'] = [x[1] for x in hanguled] + + # entity representation 적용 + input_format_list = ['entity_mask', 'entity_marker', 'entity_marker_punct', 'typed_entity_marker', 'typed_entity_marker_punct'] + if input_format in input_format_list: + marked_sentences = [marker(row_data, input_format) for index, row_data in tqdm(dataset.iterrows())] + dataset['sentence'] = marked_sentences + elif input_format == 'default': + pass + else: + raise ValueError('잘못된 input_format이 입력되었습니다. ') + + return dataset + + +def tokenized_dataset( + dataset: Dict[str, List[str]], + tokenizer: PreTrainedTokenizer, + input_format: str, + prompt: str +) -> Dict[str, Union[List[str], List[int]]]: + """ + tokenizer에 따라 문장을 토큰화하는 함수입니다. + + Args: + dataset (Dict[str, List[str]]): 토큰화할 데이터셋. + tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체. + input_format (str): entity representation 유형. + prompt (str): prompt 유형. + + Returns: + Dict[str, Union[List[str], List[int]]]: 토큰화된 문장의 딕셔너리. 
+ """ + + # 새로운 특수 토큰 추가 + special_tokens = [] + + if input_format == 'entity_mask': + special_tokens = ['[SUBJ-ORG]', '[SUBJ-PER]', '[OBJ-ORG]', '[OBJ-PER]', '[OBJ-LOC]', '[OBJ-DAT]', '[OBJ-POH]', '[OBJ-NOH]'] + + elif input_format == 'entity_marker': + special_tokens = ['[E1]', '[/E1]', '[E2]', '[/E2]'] + + elif input_format == 'typed_entity_marker': + special_tokens = ['', '', '', '', '', '', '', '', + '', '', '', '', '', '', '', ''] + + tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}) + + # check + print("length of tokenizer:", len(tokenizer)) + print("length of special tokens: ", tokenizer.all_special_tokens) + print("special tokens:", tokenizer.special_tokens_map) + + # prompt 추가 + if prompt in ['s_sep_o', 's_and_o', 'quiz']: + prompt_forward = [] + + if prompt == 's_sep_o': + for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']): + temp = '' + temp = e01[2:-1] + '[SEP]' + e02[2:-1] + prompt_forward.append(temp) + + elif prompt == 's_and_o': + for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']): + temp = '' + temp = e01[2:-1] + '와 ' + e02[2:-1] + '의 관계' + prompt_forward.append(temp) + + elif prompt == 'quiz': + for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']): + temp = '' + temp = '다음 문장에서 ' + e01[2:-1] + '와 ' + e02[2:-1] + '사이의 관계를 추출하시오.' + prompt_forward.append(temp) + + tokenized_sentences = tokenizer( + prompt_forward, + list(dataset['sentence']), + return_tensors='pt', + padding=True, + truncation=True, + max_length=180, + add_special_tokens=True, + ) + + elif prompt == 'problem': + prompt_forward = [] + prompt_backward = [] + + for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']): + temp = '' + temp = '다음 문장에서 ' + e01[2:-1] + '와 ' + e02[2:-1] + '사이의 관계를 추출하시오.' + prompt_forward.append(temp) + for e00, e01, e02 in zip(dataset['sentence'], dataset['subj_entity'], dataset['obj_entity']): + temp = '' + temp = e00 + e01[2:-1] + '와 ' + e02[2:-1] + '는 어떤 관계입니까?' + prompt_backward.append(temp) + + tokenized_sentences = tokenizer( + prompt_forward, + prompt_backward, + return_tensors='pt', + padding=True, + truncation=True, + max_length=200, + add_special_tokens=True, + ) + + elif prompt == 'default': + tokenized_sentences = tokenizer( + list(dataset['sentence']), + return_tensors='pt', + padding=True, + truncation=True, + max_length=180, + add_special_tokens=True, + ) + + else: + raise ValueError('잘못된 prompt가 입력되었습니다. ') + + return tokenized_sentences + + +def label_to_num(label: List[str]) -> List[int]: + """ + 원본 문자열 label을 숫자 형식 클래스로 변환하는 함수입니다. + + Args: + label (List[str]): 변환할 원본 문자열 클래스 리스트. + + Returns: + List[int]: 숫자 형식으로 변환된 클래스 리스트. + """ + + num_label = [] + with open('load_data/dict_label_to_num.pkl', 'rb') as f: + dict_label_to_num = pickle.load(f) + for v in label: + num_label.append(dict_label_to_num[v]) + + return num_label + + +def num_to_label(label: List[int]) -> List[str]: + """ + 숫자 형식 클래스를 원본 문자열 label로 변환하는 함수입니다. + + Args: + label (List[int]): 변환할 숫자 형식의 클래스 리스트. + + Returns: + List[str]: 원본 문자열로 변환된 클래스 리스트. 
+ """ + + origin_label = [] + with open('load_data/dict_num_to_label.pkl', 'rb') as f: + dict_num_to_label = pickle.load(f) + for v in label: + origin_label.append(dict_num_to_label[v]) + + return origin_label + + +class REDataset(torch.utils.data.Dataset): + """Dataset 구성을 위한 class입니다.""" + + def __init__(self, pair_dataset, labels): + self.pair_dataset = pair_dataset + self.labels = labels + + def __getitem__(self, idx): + item = { + key: val[idx].clone().detach() for key, val in self.pair_dataset.items() + } + item['labels'] = torch.tensor(self.labels[idx]) + return item + + def __len__(self): return len(self.labels) \ No newline at end of file diff --git a/model/loss.py b/model/loss.py index 29931c4..6206e4a 100644 --- a/model/loss.py +++ b/model/loss.py @@ -1,160 +1,160 @@ -from typing import Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor - - -class FocalLoss(nn.Module): - """ - Dense detection을 위한 RetinaNet에서 제안된 loss: https://arxiv.org/abs/1708.02002. - """ - def __init__(self, alpha: float = 1.0, gamma: float = 2.0, reduction: str = 'mean') -> None: - """ - Args: - alpha (float): 개구간 (0, 1) 내의 실수값을 가지는 가중치 factor. - 양성 및 음성 예제간의 균형을 맞추는 역할. 기본값: 0.25. - gamma (float): 쉬운 예제와 어려운 예제 간의 균형을 맞추는 역할을 하는 modulating factor의 지수. - 기본값: 2.0 - reduction (string): ``'none'`` | ``'mean'`` | ``'sum'`` - ``'none'``: No reduction will be applied to the output. - ``'mean'``: 평균 출력 반환. - ``'sum'``: 합계 출력 반환. 기본값: none. - Returns: - Loss Tensor - """ - super(FocalLoss, self).__init__() - self.alpha = alpha # 각 클래스에 대한 가중치 - self.gamma = gamma # "focus" 매개변수로 어려운 예시에 더 많은 주의를 기울이는 역할 - self.reduction = reduction - - def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: - """ - Args: - inputs (Tensor): (bsz, 30) 사이즈의 Float Tensor. - 각 예제에 대한 예측. - targets (Tensor): (30,) 사이즈의 true class 정보. 0부터 29까지의 정수가 담긴 Long Tensor. - 음성 클래스: 0, 양성 클래스: 1. 
- """ - p = torch.sigmoid(inputs) - targets = F.one_hot(targets, num_classes=30).float() - ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - p_t = p * targets + (1 - p) * (1 - targets) - loss = ce_loss * ((1 - p_t) ** self.gamma) - - if self.alpha >= 0: - alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets) - loss = alpha_t * loss - - # Check reduction option and return loss accordingly - if self.reduction == 'none': - pass - elif self.reduction == 'mean': - loss = loss.mean() - elif self.reduction == 'sum': - loss = loss.sum() - else: - raise ValueError( - f'Invalid Value for arg "reduction": {self.reduction} \n Supported reduction modes: "none", "mean", "sum"' - ) - return loss - -class WeightedFocalLoss(nn.Module): - def __init__(self, alpha: torch.Tensor = None, gamma: float = 2.0, reduction: str = 'mean') -> None: - super(WeightedFocalLoss, self).__init__() - self.alpha = alpha # 각 클래스에 대한 가중치 - self.gamma = gamma # "focus" 매개변수로 어려운 예시에 더 많은 주의를 기울이는 역할 - self.reduction = reduction - - # alpha가 None인 경우, 모든 클래스에 동일한 가중치 적용 - # alpha가 텐서인 경우, alpha의 각 요소는 해당 클래스의 가중치로 설정 - self.ce_loss = nn.CrossEntropyLoss(weight=self.alpha, reduction='none') - - def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: - ce_loss = self.ce_loss(inputs, targets) - pt = torch.exp(-ce_loss) - focal_loss = (1 - pt)**self.gamma * ce_loss - - if self.reduction == 'mean': - return torch.mean(focal_loss) - elif self.reduction == 'sum': - return torch.sum(focal_loss) - else: - return focal_loss - - -class LovaszSoftmaxLoss(nn.Module): - def __init__(self, weight=None, reduction: str = 'mean') -> None: - super(LovaszSoftmaxLoss, self).__init__() - self.weight = weight - self.reduction = reduction - - def lovasz_grad(self, true_sorted): - p = len(true_sorted) - gts = true_sorted.sum() - intersection = gts - true_sorted.cumsum(0) - union = gts + (1 - true_sorted).cumsum(0) - jaccard = 1 - intersection / union - if p > 1: # cover 1-pixel case - jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] - return jaccard - - def lovasz_softmax(self, log_probs, labels): - C = log_probs.shape[1] - losses = [] - for c in range(C): - fg = (labels == c).float() # foreground for class c - if fg.sum() == 0: - continue - errors = (fg - log_probs[:, c]).abs() - errors_sorted, perm = torch.sort(errors, 0, descending=True) - fg_sorted = fg[perm] - losses.append(torch.dot(errors_sorted, self.lovasz_grad(fg_sorted))) - return torch.stack(losses) - - def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: - log_probs = F.log_softmax(inputs, dim=1) - lovasz_loss = self.lovasz_softmax(log_probs, targets) - - if self.reduction == 'mean': - return torch.mean(lovasz_loss) - elif self.reduction == 'sum': - return torch.sum(lovasz_loss) - else: - return lovasz_loss - - -class MulticlassDiceLoss(nn.Module): - def __init__(self, smooth: float = 1e-5, reduction: str = 'mean'): - super(MulticlassDiceLoss, self).__init__() - self.smooth = smooth - self.reduction = reduction - - def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: - # Softmax over the inputs - inputs = torch.softmax(inputs, dim=1) - - - # One-hot encode targets - targets_one_hot = torch.nn.functional.one_hot(targets, num_classes=inputs.shape[1]) - - # Move targets_one_hot to device of inputs - targets_one_hot = targets_one_hot.to(inputs.device) - - # Calculate Dice Loss for each class - dice_loss = 0 - for i in range(inputs.shape[1]): - intersection = 2 * (inputs[:, i] * targets_one_hot[:, i]).sum() - union = inputs[:, 
i].sum() + targets_one_hot[:, i].sum()
-            dice_loss += (1 - (intersection + self.smooth) / (union + self.smooth))
-
-        # Average the dice loss for all classes
-        dice_loss /= inputs.shape[1]
-
-        if self.reduction == 'mean':
-            return dice_loss
-        elif self.reduction == 'sum':
-            return dice_loss.sum()
-        else:
-            return dice_loss
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+
+class FocalLoss(nn.Module):
+    """
+    Dense detection을 위한 RetinaNet에서 제안된 loss: https://arxiv.org/abs/1708.02002.
+    """
+    def __init__(self, alpha: float = 1.0, gamma: float = 2.0, reduction: str = 'mean') -> None:
+        """
+        Args:
+            alpha (float): 클래스 간 균형을 맞추는 가중치 factor. 음수이면 적용되지 않음.
+                           기본값: 1.0.
+            gamma (float): 쉬운 예제와 어려운 예제 간의 균형을 맞추는 역할을 하는 modulating factor의 지수.
+                           기본값: 2.0
+            reduction (string): ``'none'`` | ``'mean'`` | ``'sum'``
+                                ``'none'``: No reduction will be applied to the output.
+                                ``'mean'``: 평균 출력 반환.
+                                ``'sum'``: 합계 출력 반환. 기본값: 'mean'.
+        Returns:
+            Loss Tensor
+        """
+        super(FocalLoss, self).__init__()
+        self.alpha = alpha  # 각 클래스에 대한 가중치
+        self.gamma = gamma  # "focus" 매개변수로 어려운 예시에 더 많은 주의를 기울이는 역할
+        self.reduction = reduction
+
+    def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
+        """
+        Args:
+            inputs (Tensor): (bsz, 30) 사이즈의 Float Tensor.
+                             각 예제에 대한 예측.
+            targets (Tensor): (bsz,) 사이즈의 true class 정보. 0부터 29까지의 클래스 인덱스가 담긴 Long Tensor.
+                              (인덱스 0은 no_relation 클래스.)
+        """
+        p = torch.sigmoid(inputs)
+        targets = F.one_hot(targets, num_classes=30).float()
+        ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+        p_t = p * targets + (1 - p) * (1 - targets)
+        loss = ce_loss * ((1 - p_t) ** self.gamma)
+
+        if self.alpha >= 0:
+            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
+            loss = alpha_t * loss
+
+        # Check reduction option and return loss accordingly
+        if self.reduction == 'none':
+            pass
+        elif self.reduction == 'mean':
+            loss = loss.mean()
+        elif self.reduction == 'sum':
+            loss = loss.sum()
+        else:
+            raise ValueError(
+                f'Invalid Value for arg "reduction": {self.reduction} \n Supported reduction modes: "none", "mean", "sum"'
+            )
+        return loss
+
+class WeightedFocalLoss(nn.Module):
+    def __init__(self, alpha: torch.Tensor = None, gamma: float = 2.0, reduction: str = 'mean') -> None:
+        super(WeightedFocalLoss, self).__init__()
+        self.alpha = alpha  # 각 클래스에 대한 가중치
+        self.gamma = gamma  # "focus" 매개변수로 어려운 예시에 더 많은 주의를 기울이는 역할
+        self.reduction = reduction
+
+        # alpha가 None인 경우, 모든 클래스에 동일한 가중치 적용
+        # alpha가 텐서인 경우, alpha의 각 요소는 해당 클래스의 가중치로 설정
+        self.ce_loss = nn.CrossEntropyLoss(weight=self.alpha, reduction='none')
+
+    def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
+        ce_loss = self.ce_loss(inputs, targets)
+        pt = torch.exp(-ce_loss)
+        focal_loss = (1 - pt)**self.gamma * ce_loss
+
+        if self.reduction == 'mean':
+            return torch.mean(focal_loss)
+        elif self.reduction == 'sum':
+            return torch.sum(focal_loss)
+        else:
+            return focal_loss
+
+
+class LovaszSoftmaxLoss(nn.Module):
+    def __init__(self, weight=None, reduction: str = 'mean') -> None:
+        super(LovaszSoftmaxLoss, self).__init__()
+        self.weight = weight
+        self.reduction = reduction
+
+    def lovasz_grad(self, true_sorted):
+        p = len(true_sorted)
+        gts = true_sorted.sum()
+        intersection = gts - true_sorted.cumsum(0)
+        union = gts + (1 - true_sorted).cumsum(0)
+        jaccard = 1 - intersection / union
+        if p > 1:  # cover 1-pixel case
+            jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
+        return jaccard
+
+    def 
lovasz_softmax(self, log_probs, labels): + C = log_probs.shape[1] + losses = [] + for c in range(C): + fg = (labels == c).float() # foreground for class c + if fg.sum() == 0: + continue + errors = (fg - log_probs[:, c]).abs() + errors_sorted, perm = torch.sort(errors, 0, descending=True) + fg_sorted = fg[perm] + losses.append(torch.dot(errors_sorted, self.lovasz_grad(fg_sorted))) + return torch.stack(losses) + + def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: + log_probs = F.log_softmax(inputs, dim=1) + lovasz_loss = self.lovasz_softmax(log_probs, targets) + + if self.reduction == 'mean': + return torch.mean(lovasz_loss) + elif self.reduction == 'sum': + return torch.sum(lovasz_loss) + else: + return lovasz_loss + + +class MulticlassDiceLoss(nn.Module): + def __init__(self, smooth: float = 1e-5, reduction: str = 'mean'): + super(MulticlassDiceLoss, self).__init__() + self.smooth = smooth + self.reduction = reduction + + def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: + # Softmax over the inputs + inputs = torch.softmax(inputs, dim=1) + + + # One-hot encode targets + targets_one_hot = torch.nn.functional.one_hot(targets, num_classes=inputs.shape[1]) + + # Move targets_one_hot to device of inputs + targets_one_hot = targets_one_hot.to(inputs.device) + + # Calculate Dice Loss for each class + dice_loss = 0 + for i in range(inputs.shape[1]): + intersection = 2 * (inputs[:, i] * targets_one_hot[:, i]).sum() + union = inputs[:, i].sum() + targets_one_hot[:, i].sum() + dice_loss += (1 - (intersection + self.smooth) / (union + self.smooth)) + + # Average the dice loss for all classes + dice_loss /= inputs.shape[1] + + if self.reduction == 'mean': + return dice_loss + elif self.reduction == 'sum': + return dice_loss.sum() + else: + return dice_loss diff --git a/model/metric.py b/model/metric.py index 8aca908..2631e25 100755 --- a/model/metric.py +++ b/model/metric.py @@ -1,80 +1,80 @@ -import numpy as np -import sklearn -from sklearn.metrics import accuracy_score - - -def klue_re_micro_f1(preds, labels): - """KLUE-RE micro f1 (except no_relation)""" - label_list = [ - "no_relation", - "org:top_members/employees", - "org:members", - "org:product", - "per:title", - "org:alternate_names", - "per:employee_of", - "org:place_of_headquarters", - "per:product", - "org:number_of_employees/members", - "per:children", - "per:place_of_residence", - "per:alternate_names", - "per:other_family", - "per:colleagues", - "per:origin", - "per:siblings", - "per:spouse", - "org:founded", - "org:political/religious_affiliation", - "org:member_of", - "per:parents", - "org:dissolved", - "per:schools_attended", - "per:date_of_death", - "per:place_of_birth", - "per:place_of_death", - "org:founded_by", - "per:religion", - ] - - no_relation_label_idx = label_list.index("no_relation") - label_indices = list(range(len(label_list))) - label_indices.remove(no_relation_label_idx) - - return ( - sklearn.metrics.f1_score(labels, preds, average="micro", labels=label_indices) - * 100.0 - ) - - -def klue_re_auprc(probs, labels): - """KLUE-RE AUPRC (with no_relation)""" - labels = np.eye(30)[labels] - - score = np.zeros((30,)) - for c in range(30): - targets_c = labels.take([c], axis=1).ravel() - preds_c = probs.take([c], axis=1).ravel() - precision, recall, _ = sklearn.metrics.precision_recall_curve( - targets_c, preds_c - ) - score[c] = sklearn.metrics.auc(recall, precision) - - return np.average(score) * 100.0 - - -def compute_metrics(pred): - """validation을 위한 metrics function""" - labels = pred.label_ids 
- preds = pred.predictions.argmax(-1) - probs = pred.predictions - - f1 = klue_re_micro_f1(preds, labels) - auprc = klue_re_auprc(probs, labels) - acc = accuracy_score(labels, preds) # 리더보드 평가에는 포함되지 않습니다. - - return { - "micro f1 score": f1, - "auprc": auprc, - "accuracy": acc, - } +import numpy as np +import sklearn +from sklearn.metrics import accuracy_score + + +def klue_re_micro_f1(preds, labels): + """KLUE-RE micro f1 (except no_relation)""" + label_list = [ + "no_relation", + "org:top_members/employees", + "org:members", + "org:product", + "per:title", + "org:alternate_names", + "per:employee_of", + "org:place_of_headquarters", + "per:product", + "org:number_of_employees/members", + "per:children", + "per:place_of_residence", + "per:alternate_names", + "per:other_family", + "per:colleagues", + "per:origin", + "per:siblings", + "per:spouse", + "org:founded", + "org:political/religious_affiliation", + "org:member_of", + "per:parents", + "org:dissolved", + "per:schools_attended", + "per:date_of_death", + "per:place_of_birth", + "per:place_of_death", + "org:founded_by", + "per:religion", + ] + + no_relation_label_idx = label_list.index("no_relation") + label_indices = list(range(len(label_list))) + label_indices.remove(no_relation_label_idx) + + return ( + sklearn.metrics.f1_score(labels, preds, average="micro", labels=label_indices) + * 100.0 + ) + + +def klue_re_auprc(probs, labels): + """KLUE-RE AUPRC (with no_relation)""" + labels = np.eye(30)[labels] + + score = np.zeros((30,)) + for c in range(30): + targets_c = labels.take([c], axis=1).ravel() + preds_c = probs.take([c], axis=1).ravel() + precision, recall, _ = sklearn.metrics.precision_recall_curve( + targets_c, preds_c + ) + score[c] = sklearn.metrics.auc(recall, precision) + + return np.average(score) * 100.0 + + +def compute_metrics(pred): + """validation을 위한 metrics function""" + labels = pred.label_ids + preds = pred.predictions.argmax(-1) + probs = pred.predictions + + f1 = klue_re_micro_f1(preds, labels) + auprc = klue_re_auprc(probs, labels) + acc = accuracy_score(labels, preds) # 리더보드 평가에는 포함되지 않습니다. + + return { + "micro f1 score": f1, + "auprc": auprc, + "accuracy": acc, + } diff --git a/model/model.py b/model/model.py index 4e79d58..5983270 100644 --- a/model/model.py +++ b/model/model.py @@ -1,204 +1,204 @@ -import torch.nn as nn -from torch.cuda.amp import autocast -from transformers import ( - AutoConfig, - AutoModel, - AutoModelForSequenceClassification, -) - -from model.loss import * - - -class BaseREModel(nn.Module): - """Pre-trained Language Model로부터 나온 logits를 FC layer에 통과시키는 기본 분류기.""" - def __init__(self, config, new_num_tokens: int): - """ - Args: - config: 사용자 config. - new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함. 
- """ - super().__init__() - - self.model_config = AutoConfig.from_pretrained(config.model['name']) - self.model_config.num_labels = config.num_labels - - self.plm = AutoModelForSequenceClassification.from_pretrained(config.model['name'], - config=self.model_config) - - if self.model_config.vocab_size != new_num_tokens: - self.plm.resize_token_embeddings(new_num_tokens) - - @autocast() - def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None): - outputs = self.plm(input_ids=input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask) - logits = outputs['logits'] - return { - 'logits': logits, - } - -class CustomModel(nn.Module): - def __init__(self, config, new_num_tokens: int): - super().__init__() - - self.model_config = AutoConfig.from_pretrained(config.model['name']) - self.model_config.num_labels = config.num_labels - - self.plm = AutoModelForSequenceClassification.from_pretrained(config.model['name'], - config=self.model_config) - - if self.model_config.vocab_size != new_num_tokens: - self.plm.resize_token_embeddings(new_num_tokens) - - self.hidden_size = self.model_config.hidden_size - - self.entity_embedding = nn.Embedding(3, self.hidden_size) - nn.init.xavier_normal_(self.entity_embedding.weight) - - self.weight = nn.Parameter(torch.Tensor(1)) # Learnable weight parameter - nn.init.uniform_(self.weight) - - # self.reduction_layer = nn.Linear(self.hidden_size * 2, self.hidden_size) - - @autocast() - def forward( - self, - input_ids=None, - token_type_ids=None, - attention_mask=None, - entity_ids=None, - labels=None, - ): - # entity_ids = entity_ids.long() - entity_embeddings = self.entity_embedding(entity_ids) # torch.tensor([64, 180, 1024]) - input_embeddings = self.plm.get_input_embeddings()(input_ids) # torch.tensor([64, 180, 1024]) - - # 단순히 더한 버전 - # combined_embeddings = input_embeddings + entity_embeddings - - # concat 버전 - # combined_embeddings = torch.cat([input_embeddings,entity_embeddings], dim=-1) # torch.tensor([64, 180, 2048]) - # combined_embeddings = self.reduction_layer(combined_embeddings) # torch.tensor([64, 180, 1024]) - - # weighted sum 버전 - combined_embeddings = self.weight * input_embeddings + (1 - self.weight) * entity_embeddings - - outputs = self.plm.roberta( - inputs_embeds=combined_embeddings, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - ) - logits = self.plm.classifier(outputs['last_hidden_state']) - - return { - 'logits': logits, - } - -class BiGRUREModel(nn.Module): - """ - Pre-trained Language Model로부터 나온 logits를 Bi-driectional GRU에 통과시킨 후 - hidden states 정보를 FC layer에 통과시킨 분류기. - """ - def __init__(self, config, new_num_tokens: int): - """ - Args: - config: 사용자 config. - new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함. 
- """ - super().__init__() - - self.model_config = AutoConfig.from_pretrained(config.model['name']) - self.model_config.num_labels = config.num_labels - - self.plm = AutoModel.from_pretrained(config.model['name'], - config=self.model_config) - - if self.model_config.vocab_size != new_num_tokens: - self.plm.resize_token_embeddings(new_num_tokens) - - self.hidden_size = self.model_config.hidden_size # 1024 for roberta-large - self.gru = nn.GRU(input_size=self.hidden_size, - hidden_size=self.hidden_size, - num_layers=1, - batch_first=True, # (bsz, seq, feature) if True else (seq, bsz, feature) - bidirectional=True) - self.init_gru() - self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels) - nn.init.kaiming_normal_(self.classifier.weight, mode='fan_in', nonlinearity='relu') - self.classifier.bias.data.fill_(0) - - def init_gru(self): - for name, param in self.gru.named_parameters(): - if 'weight_ih' in name: - nn.init.xavier_normal_(param.data) - elif 'weight_hh' in name: - nn.init.xavier_normal_(param.data) - elif 'bias' in name: - param.data.fill_(0) - - @autocast() - def forward(self, input_ids: Tensor, token_type_ids: Tensor, attention_mask: Tensor, labels=None): - outputs = self.plm(input_ids=input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask).last_hidden_state - _, next_hidden = self.gru(outputs) - outputs = torch.cat([next_hidden[0], next_hidden[1]], dim=1) - logits = self.classifier(outputs) - return { - 'logits': logits, - } - - -class BiLSTMREModel(nn.Module): - """Pre-trained Language Model로부터 나온 logits를 Bi-driectional LSTM에 통과시킨 후 - hidden states 정보를 FC layer에 통과시킨 분류기. - """ - def __init__(self, config, new_num_tokens: int): - """ - Args: - config: 사용자 config. - new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함. 
- """ - super().__init__() - - self.model_config = AutoConfig.from_pretrained(config.model['name']) - self.model_config.num_labels = config.num_labels - - self.plm = AutoModel.from_pretrained(config.model['name'], - config=self.model_config) - - if self.model_config.vocab_size != new_num_tokens: - self.plm.resize_token_embeddings(new_num_tokens) - - self.hidden_size = self.model_config.hidden_size # 1024 for roberta-large - self.lstm = nn.LSTM(input_size=self.hidden_size, - hidden_size=self.hidden_size, - num_layers=1, - batch_first=True, # (bsz, seq, feature) if True else (seq, bsz, feature) - bidirectional=True) - self.init_lstm() - self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels) - nn.init.kaiming_normal_(self.classifier.weight, mode='fan_in', nonlinearity='relu') - self.classifier.bias.data.fill_(0) - - def init_lstm(self): - for name, param in self.lstm.named_parameters(): - if 'weight_ih' in name: - nn.init.xavier_normal_(param.data) - elif 'weight_hh' in name: - nn.init.xavier_normal_(param.data) - elif 'bias' in name: - param.data.fill_(0) - - @autocast() - def forward(self, input_ids: Tensor, token_type_ids: Tensor, attention_mask: Tensor, labels=None): - outputs = self.plm(input_ids=input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask).last_hidden_state - _, (next_hidden, _) = self.lstm(outputs) - outputs = torch.cat([next_hidden[0], next_hidden[1]], dim=1) - logits = self.classifier(outputs) - return { - 'logits': logits, - } +import torch.nn as nn +from torch.cuda.amp import autocast +from transformers import ( + AutoConfig, + AutoModel, + AutoModelForSequenceClassification, +) + +from model.loss import * + + +class BaseREModel(nn.Module): + """Pre-trained Language Model로부터 나온 logits를 FC layer에 통과시키는 기본 분류기.""" + def __init__(self, config, new_num_tokens: int): + """ + Args: + config: 사용자 config. + new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함. 
+ """ + super().__init__() + + self.model_config = AutoConfig.from_pretrained(config.model['name']) + self.model_config.num_labels = config.num_labels + + self.plm = AutoModelForSequenceClassification.from_pretrained(config.model['name'], + config=self.model_config) + + if self.model_config.vocab_size != new_num_tokens: + self.plm.resize_token_embeddings(new_num_tokens) + + @autocast() + def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None): + outputs = self.plm(input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask) + logits = outputs['logits'] + return { + 'logits': logits, + } + +class CustomModel(nn.Module): + def __init__(self, config, new_num_tokens: int): + super().__init__() + + self.model_config = AutoConfig.from_pretrained(config.model['name']) + self.model_config.num_labels = config.num_labels + + self.plm = AutoModelForSequenceClassification.from_pretrained(config.model['name'], + config=self.model_config) + + if self.model_config.vocab_size != new_num_tokens: + self.plm.resize_token_embeddings(new_num_tokens) + + self.hidden_size = self.model_config.hidden_size + + self.entity_embedding = nn.Embedding(3, self.hidden_size) + nn.init.xavier_normal_(self.entity_embedding.weight) + + self.weight = nn.Parameter(torch.Tensor(1)) # Learnable weight parameter + nn.init.uniform_(self.weight) + + # self.reduction_layer = nn.Linear(self.hidden_size * 2, self.hidden_size) + + @autocast() + def forward( + self, + input_ids=None, + token_type_ids=None, + attention_mask=None, + entity_ids=None, + labels=None, + ): + # entity_ids = entity_ids.long() + entity_embeddings = self.entity_embedding(entity_ids) # torch.tensor([64, 180, 1024]) + input_embeddings = self.plm.get_input_embeddings()(input_ids) # torch.tensor([64, 180, 1024]) + + # 단순히 더한 버전 + # combined_embeddings = input_embeddings + entity_embeddings + + # concat 버전 + # combined_embeddings = torch.cat([input_embeddings,entity_embeddings], dim=-1) # torch.tensor([64, 180, 2048]) + # combined_embeddings = self.reduction_layer(combined_embeddings) # torch.tensor([64, 180, 1024]) + + # weighted sum 버전 + combined_embeddings = self.weight * input_embeddings + (1 - self.weight) * entity_embeddings + + outputs = self.plm.roberta( + inputs_embeds=combined_embeddings, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + ) + logits = self.plm.classifier(outputs['last_hidden_state']) + + return { + 'logits': logits, + } + +class BiGRUREModel(nn.Module): + """ + Pre-trained Language Model로부터 나온 logits를 Bi-driectional GRU에 통과시킨 후 + hidden states 정보를 FC layer에 통과시킨 분류기. + """ + def __init__(self, config, new_num_tokens: int): + """ + Args: + config: 사용자 config. + new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함. 
+ """ + super().__init__() + + self.model_config = AutoConfig.from_pretrained(config.model['name']) + self.model_config.num_labels = config.num_labels + + self.plm = AutoModel.from_pretrained(config.model['name'], + config=self.model_config) + + if self.model_config.vocab_size != new_num_tokens: + self.plm.resize_token_embeddings(new_num_tokens) + + self.hidden_size = self.model_config.hidden_size # 1024 for roberta-large + self.gru = nn.GRU(input_size=self.hidden_size, + hidden_size=self.hidden_size, + num_layers=1, + batch_first=True, # (bsz, seq, feature) if True else (seq, bsz, feature) + bidirectional=True) + self.init_gru() + self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels) + nn.init.kaiming_normal_(self.classifier.weight, mode='fan_in', nonlinearity='relu') + self.classifier.bias.data.fill_(0) + + def init_gru(self): + for name, param in self.gru.named_parameters(): + if 'weight_ih' in name: + nn.init.xavier_normal_(param.data) + elif 'weight_hh' in name: + nn.init.xavier_normal_(param.data) + elif 'bias' in name: + param.data.fill_(0) + + @autocast() + def forward(self, input_ids: Tensor, token_type_ids: Tensor, attention_mask: Tensor, labels=None): + outputs = self.plm(input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask).last_hidden_state + _, next_hidden = self.gru(outputs) + outputs = torch.cat([next_hidden[0], next_hidden[1]], dim=1) + logits = self.classifier(outputs) + return { + 'logits': logits, + } + + +class BiLSTMREModel(nn.Module): + """Pre-trained Language Model로부터 나온 logits를 Bi-driectional LSTM에 통과시킨 후 + hidden states 정보를 FC layer에 통과시킨 분류기. + """ + def __init__(self, config, new_num_tokens: int): + """ + Args: + config: 사용자 config. + new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함. 
+ """ + super().__init__() + + self.model_config = AutoConfig.from_pretrained(config.model['name']) + self.model_config.num_labels = config.num_labels + + self.plm = AutoModel.from_pretrained(config.model['name'], + config=self.model_config) + + if self.model_config.vocab_size != new_num_tokens: + self.plm.resize_token_embeddings(new_num_tokens) + + self.hidden_size = self.model_config.hidden_size # 1024 for roberta-large + self.lstm = nn.LSTM(input_size=self.hidden_size, + hidden_size=self.hidden_size, + num_layers=1, + batch_first=True, # (bsz, seq, feature) if True else (seq, bsz, feature) + bidirectional=True) + self.init_lstm() + self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels) + nn.init.kaiming_normal_(self.classifier.weight, mode='fan_in', nonlinearity='relu') + self.classifier.bias.data.fill_(0) + + def init_lstm(self): + for name, param in self.lstm.named_parameters(): + if 'weight_ih' in name: + nn.init.xavier_normal_(param.data) + elif 'weight_hh' in name: + nn.init.xavier_normal_(param.data) + elif 'bias' in name: + param.data.fill_(0) + + @autocast() + def forward(self, input_ids: Tensor, token_type_ids: Tensor, attention_mask: Tensor, labels=None): + outputs = self.plm(input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask).last_hidden_state + _, (next_hidden, _) = self.lstm(outputs) + outputs = torch.cat([next_hidden[0], next_hidden[1]], dim=1) + logits = self.classifier(outputs) + return { + 'logits': logits, + } diff --git a/pyproject.toml b/pyproject.toml index 6d42880..94ad0f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,16 +1,16 @@ -[tool.black] -line-length = 88 -target-version = ['py37','py38'] -include = '\.pyi?$' -exclude = ''' -/( - \.git - | _build - | best_model - | logs - | prediction - | results - | saved - | wandb -)/ +[tool.black] +line-length = 88 +target-version = ['py37','py38'] +include = '\.pyi?$' +exclude = ''' +/( + \.git + | _build + | best_model + | logs + | prediction + | results + | saved + | wandb +)/ ''' \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 553ac5f..e184671 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -datasets==2.12.0 -pandas==1.1.5 -pytz==2023.3 -scikit-learn~=0.24.1 -torch==2.0.1 -torchvision==0.15.2 -tqdm==4.62.1 -transformers==4.10.0 +datasets==2.12.0 +pandas==1.1.5 +pytz==2023.3 +scikit-learn~=0.24.1 +torch==2.0.1 +torchvision==0.15.2 +tqdm==4.62.1 +transformers==4.10.0 wandb==0.15.1 \ No newline at end of file diff --git a/run.sh b/run.sh index 08a8738..9a4653e 100755 --- a/run.sh +++ b/run.sh @@ -1,24 +1,24 @@ -#!/bin/bash - -configs=("configs/config_1.yaml" "configs/config_2.yaml") -index=0 - -for config in "${configs[@]}"; do - let index++ - log_filename_train="output_logs/output_train_${index}.log" - log_filename_infer="output_logs/output_infer_${index}.log" - - echo "Starting training with ${config}..." - - nohup python3 train.py ${config} > "${log_filename_train}" 2>&1 & - wait $! - - echo "Training with ${config} has completed." - - nohup python3 inference.py ${config} > "${log_filename_infer}" 2>&1 & - wait $! - - echo "Inferencing with ${config} has completed." -done - -echo "All experiments have been completed." 
+#!/bin/bash + +configs=("configs/config_1.yaml" "configs/config_2.yaml") +index=0 + +for config in "${configs[@]}"; do + let index++ + log_filename_train="output_logs/output_train_${index}.log" + log_filename_infer="output_logs/output_infer_${index}.log" + + echo "Starting training with ${config}..." + + nohup python3 train.py ${config} > "${log_filename_train}" 2>&1 & + wait $! + + echo "Training with ${config} has completed." + + nohup python3 inference.py ${config} > "${log_filename_infer}" 2>&1 & + wait $! + + echo "Inferencing with ${config} has completed." +done + +echo "All experiments have been completed." diff --git a/sweep.py b/sweep.py index c79d2f7..9597008 100644 --- a/sweep.py +++ b/sweep.py @@ -1,182 +1,182 @@ -import sys -import pickle as pickle -import pytz -from datetime import datetime -import wandb - -import torch -from transformers import ( - AutoTokenizer, - EarlyStoppingCallback, - TrainingArguments, -) - -from argparse import Namespace - -from utils.args import * -from load_data.load_data import * -from model.model import * -from model.metric import * -from trainer.trainer import * -from utils.utils import * - -from typing import Any - - -def main(config: Namespace) -> None: - """ - Sweep 초기화 및 Wandb sweep agent 선언 - - Args: - config(Namespace): 모델 학습에 필요한 hyperparameter를 포함하는 딕셔너리 - Returns: - None - """ - def sweep_train(config: Namespace = config) -> None: - """ - Sweep agent 선언시 function에 전달되는 함수 - - Args: - config(Namespace): 모델 학습에 필요한 hyperparmeter를 포함하는 딕셔너리 - Returns: - None - """ - wandb.init( - entity=config.wandb['entity'], - project=config.wandb['sweep_project_name'] - ) - - sweep_config = wandb.config - - seed_everything(config.seed) - - # load model and tokenizer - model_name = config.model['name'] - tokenizer = AutoTokenizer.from_pretrained(model_name) - - # 1. load dataset - # 2. preprocess dataset - # 3. tokenize dataset - revision = config.dataloader['revision'] - input_format = sweep_config['input_format'] - prompt = sweep_config['prompt'] - type_transform = sweep_config['type_transform'] - - train_dataset, train_raw_label = load_train_dataset( - split=config.dataloader['train_split'], - revision=revision, - tokenizer=tokenizer, - input_format=input_format, - prompt=prompt, - type_transform=type_transform, - ) - dev_dataset, dev_raw_label = load_train_dataset( - split=config.dataloader['valid_split'], - revision=revision, - tokenizer=tokenizer, - input_format=input_format, - prompt=prompt, - type_transform=type_transform, - ) - - train_label = label_to_num(train_raw_label) - dev_label = label_to_num(dev_raw_label) - - # 4. make Dataset object - re_train_dataset = REDataset(train_dataset, train_label) - re_dev_dataset = REDataset(dev_dataset, dev_label) - - device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - print(device) - - # 5. import model - # setting model hyperparameter - model_module = __import__('model.model', fromlist=[config.model['variant']]) - model_class = getattr(model_module, config.model['variant']) - # Available customized classes: - # BaseREModel, BiLSTMREModel, BiGRUREModel - model = model_class(config, len(tokenizer)) - - print(model.model_config) - - model.parameters - model.to(device) - - # 6. training arguments 설정 - ## 사용한 option 외에도 다양한 option들이 있습니다. - ## https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요. 
- training_args = TrainingArguments( - # 기본 설정 - output_dir=config.trainer['output_dir'], # 모델 저장 디렉토리 - report_to=('wandb' if config.use_wandb else 'none'), # wandb 사용 여부 - fp16=True, # 16-bit floating point precision - - # 학습 설정 - num_train_epochs=sweep_config['epochs'], # 전체 훈련 epoch 수 - learning_rate=sweep_config['lr'], # learning rate - weight_decay=config.optimizer['weight_decay'], # weight decay - adam_beta2=sweep_config['adam_beta2'], # AdamW 옵티마이저의 beta2 하이퍼파라미터 - - # 배치 사이즈 설정 - per_device_train_batch_size=config.dataloader['batch_size'], # 훈련 중 장치 당 batch size - per_device_eval_batch_size=config.dataloader['batch_size'], # 평가 중 장치 당 batch size - - # 스케줄링 설정 - warmup_ratio=sweep_config['warmup_ratio'], # learning rate scheduler의 warmup 비율 - # warmup_steps=config.lr_scheduler['warmup_steps'], # number of warmup steps for learning rate scheduler - - # 로깅 설정 - logging_dir=config.trainer['logging_dir'], # 로그 저장 디렉토리 - logging_steps=config.trainer['logging_steps'], # 로그 저장 스텝 - - # 모델 저장 설정 - save_total_limit=config.trainer['save_total_limit'], # 전체 저장 모델 수 제한 - save_steps=config.trainer['save_steps'], # 모델 저장 스텝 - save_strategy=config.trainer['save_strategy'], - - # 평가 설정 - evaluation_strategy=config.trainer['evaluation_strategy'], # 훈련 중 평가 전략 - eval_steps=config.trainer['evaluation_steps'], # 평가 스텝 - load_best_model_at_end=True, - ) - - # 7. trainer 설정 - # 8. evaluate 함수 설정 - trainer = RETrainer( - model=model, # the instantiated 🤗 Transformers model to be trained - args=training_args, # training arguments, defined above - train_dataset=re_train_dataset, # training dataset - eval_dataset=re_dev_dataset, # evaluation dataset - compute_metrics=compute_metrics, # define metrics function - # callbacks=([WandbCallback()] if config.use_wandb else []), - # callbacks=[EarlyStoppingCallback(early_stopping_patience=config.trainer['early_stop'])], - loss_cfg=config.loss, - ) - - # 9. train model - trainer.train() - # 10. save model - trainer.save_model(config.trainer['model_dir']) - - sweep_id = wandb.sweep( - sweep=config.sweep_config - ) - - wandb.agent( - sweep_id=sweep_id, - function=sweep_train, - count=config.wandb['sweep_count'] - ) - - -if __name__ == '__main__': - try: - config_path = sys.argv[1] - except IndexError: - config_path = './config.yaml' - config = parse_arguments(config_path) - - now = datetime.now(pytz.timezone('Asia/Seoul')) - run_name = f'{config.run_name}_{now.strftime("%d-%H-%M")}' - +import sys +import pickle as pickle +import pytz +from datetime import datetime +import wandb + +import torch +from transformers import ( + AutoTokenizer, + EarlyStoppingCallback, + TrainingArguments, +) + +from argparse import Namespace + +from utils.args import * +from load_data.load_data import * +from model.model import * +from model.metric import * +from trainer.trainer import * +from utils.utils import * + +from typing import Any + + +def main(config: Namespace) -> None: + """ + Sweep 초기화 및 Wandb sweep agent 선언 + + Args: + config(Namespace): 모델 학습에 필요한 hyperparameter를 포함하는 딕셔너리 + Returns: + None + """ + def sweep_train(config: Namespace = config) -> None: + """ + Sweep agent 선언시 function에 전달되는 함수 + + Args: + config(Namespace): 모델 학습에 필요한 hyperparmeter를 포함하는 딕셔너리 + Returns: + None + """ + wandb.init( + entity=config.wandb['entity'], + project=config.wandb['sweep_project_name'] + ) + + sweep_config = wandb.config + + seed_everything(config.seed) + + # load model and tokenizer + model_name = config.model['name'] + tokenizer = AutoTokenizer.from_pretrained(model_name) + + # 1. 
load dataset + # 2. preprocess dataset + # 3. tokenize dataset + revision = config.dataloader['revision'] + input_format = sweep_config['input_format'] + prompt = sweep_config['prompt'] + type_transform = sweep_config['type_transform'] + + train_dataset, train_raw_label = load_train_dataset( + split=config.dataloader['train_split'], + revision=revision, + tokenizer=tokenizer, + input_format=input_format, + prompt=prompt, + type_transform=type_transform, + ) + dev_dataset, dev_raw_label = load_train_dataset( + split=config.dataloader['valid_split'], + revision=revision, + tokenizer=tokenizer, + input_format=input_format, + prompt=prompt, + type_transform=type_transform, + ) + + train_label = label_to_num(train_raw_label) + dev_label = label_to_num(dev_raw_label) + + # 4. make Dataset object + re_train_dataset = REDataset(train_dataset, train_label) + re_dev_dataset = REDataset(dev_dataset, dev_label) + + device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + print(device) + + # 5. import model + # setting model hyperparameter + model_module = __import__('model.model', fromlist=[config.model['variant']]) + model_class = getattr(model_module, config.model['variant']) + # Available customized classes: + # BaseREModel, BiLSTMREModel, BiGRUREModel + model = model_class(config, len(tokenizer)) + + print(model.model_config) + + model.parameters + model.to(device) + + # 6. training arguments 설정 + ## 사용한 option 외에도 다양한 option들이 있습니다. + ## https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요. + training_args = TrainingArguments( + # 기본 설정 + output_dir=config.trainer['output_dir'], # 모델 저장 디렉토리 + report_to=('wandb' if config.use_wandb else 'none'), # wandb 사용 여부 + fp16=True, # 16-bit floating point precision + + # 학습 설정 + num_train_epochs=sweep_config['epochs'], # 전체 훈련 epoch 수 + learning_rate=sweep_config['lr'], # learning rate + weight_decay=config.optimizer['weight_decay'], # weight decay + adam_beta2=sweep_config['adam_beta2'], # AdamW 옵티마이저의 beta2 하이퍼파라미터 + + # 배치 사이즈 설정 + per_device_train_batch_size=config.dataloader['batch_size'], # 훈련 중 장치 당 batch size + per_device_eval_batch_size=config.dataloader['batch_size'], # 평가 중 장치 당 batch size + + # 스케줄링 설정 + warmup_ratio=sweep_config['warmup_ratio'], # learning rate scheduler의 warmup 비율 + # warmup_steps=config.lr_scheduler['warmup_steps'], # number of warmup steps for learning rate scheduler + + # 로깅 설정 + logging_dir=config.trainer['logging_dir'], # 로그 저장 디렉토리 + logging_steps=config.trainer['logging_steps'], # 로그 저장 스텝 + + # 모델 저장 설정 + save_total_limit=config.trainer['save_total_limit'], # 전체 저장 모델 수 제한 + save_steps=config.trainer['save_steps'], # 모델 저장 스텝 + save_strategy=config.trainer['save_strategy'], + + # 평가 설정 + evaluation_strategy=config.trainer['evaluation_strategy'], # 훈련 중 평가 전략 + eval_steps=config.trainer['evaluation_steps'], # 평가 스텝 + load_best_model_at_end=True, + ) + + # 7. trainer 설정 + # 8. evaluate 함수 설정 + trainer = RETrainer( + model=model, # the instantiated 🤗 Transformers model to be trained + args=training_args, # training arguments, defined above + train_dataset=re_train_dataset, # training dataset + eval_dataset=re_dev_dataset, # evaluation dataset + compute_metrics=compute_metrics, # define metrics function + # callbacks=([WandbCallback()] if config.use_wandb else []), + # callbacks=[EarlyStoppingCallback(early_stopping_patience=config.trainer['early_stop'])], + loss_cfg=config.loss, + ) + + # 9. train model + trainer.train() + # 10. 
save model
+        trainer.save_model(config.trainer['model_dir'])
+
+    sweep_id = wandb.sweep(
+        sweep=config.sweep_config
+    )
+
+    wandb.agent(
+        sweep_id=sweep_id,
+        function=sweep_train,
+        count=config.wandb['sweep_count']
+    )
+
+
+if __name__ == '__main__':
+    try:
+        config_path = sys.argv[1]
+    except IndexError:
+        config_path = './config.yaml'
+    config = parse_arguments(config_path)
+
+    now = datetime.now(pytz.timezone('Asia/Seoul'))
+    run_name = f'{config.run_name}_{now.strftime("%d-%H-%M")}'
+
     main(config)
\ No newline at end of file
diff --git a/trainer/trainer.py b/trainer/trainer.py
index cd0b221..0cc63d3 100644
--- a/trainer/trainer.py
+++ b/trainer/trainer.py
@@ -1,41 +1,41 @@
-import torch
-from transformers import Trainer
-
-
-class RETrainer(Trainer):
-    def __init__(self, loss_cfg=None, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.loss_cfg = loss_cfg
-
-    def compute_loss(self, model, inputs, return_outputs=False):
-        device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu:0')
-
-        labels = inputs.pop('labels')
-        outputs = model(**inputs)
-        logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0]
-
-        # 인덱스에 맞춰서 과거 ouput을 다 저장
-        if self.args.past_index >= 0:
-            self._past= outputs[self.args.past_index]
-
-        # 커스텀 loss 정의
-        if self.loss_cfg['type'] == 'CrossEntropyLoss':
-            loss_fct = torch.nn.functional.cross_entropy
-        elif self.loss_cfg['type'] == 'WeightedCrossEntropyLoss':
-            loss_fct = torch.nn.CrossEntropyLoss(weight=torch.Tensor(self.loss_cfg['weights']).to(device))
-        else:
-            loss_module = __import__('model.loss', fromlist=[self.loss_cfg['type']])
-            loss_class = getattr(loss_module, self.loss_cfg['type'])
-            if self.loss_cfg['type'] == 'LovaszSoftmaxLoss':
-                loss_fct = loss_class()
-            elif self.loss_cfg['type'] == 'FocalLoss':
-                loss_fct = loss_class(self.loss_cfg['focal_alpha'], self.loss_cfg['focal_gamma'])
-            elif self.loss_cfg['type'] == 'WeightedFocalLoss':
-                loss_fct = loss_class(alpha = torch.Tensor(self.loss_cfg['weight_focal_alpha']).to(device), gamma= self.loss_cfg['focal_gamma'])
-            elif self.loss_cfg['type'] == 'MulticlassDiceLoss':
-                loss_fct = loss_class(self.loss_cfg['dice_smooth'])
-            else:
-                raise ValueError('Unsupported loss type')
-
-        loss = loss_fct(logits, labels).to(device)
-        return (loss, outputs) if return_outputs else loss
+import torch
+from transformers import Trainer
+
+
+class RETrainer(Trainer):
+    def __init__(self, loss_cfg=None, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.loss_cfg = loss_cfg
+
+    def compute_loss(self, model, inputs, return_outputs=False):
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu:0')
+
+        labels = inputs.pop('labels')
+        outputs = model(**inputs)
+        logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0]
+
+        # 인덱스에 맞춰서 과거 output을 다 저장
+        if self.args.past_index >= 0:
+            self._past= outputs[self.args.past_index]
+
+        # 커스텀 loss 정의
+        if self.loss_cfg['type'] == 'CrossEntropyLoss':
+            loss_fct = torch.nn.functional.cross_entropy
+        elif self.loss_cfg['type'] == 'WeightedCrossEntropyLoss':
+            loss_fct = torch.nn.CrossEntropyLoss(weight=torch.Tensor(self.loss_cfg['weights']).to(device))
+        else:
+            loss_module = __import__('model.loss', fromlist=[self.loss_cfg['type']])
+            loss_class = getattr(loss_module, self.loss_cfg['type'])
+            if self.loss_cfg['type'] == 'LovaszSoftmaxLoss':
+                loss_fct = loss_class()
+            elif self.loss_cfg['type'] == 'FocalLoss':
+                loss_fct = loss_class(self.loss_cfg['focal_alpha'], self.loss_cfg['focal_gamma'])
+            elif 
self.loss_cfg['type'] == 'WeightedFocalLoss': + loss_fct = loss_class(alpha = torch.Tensor(self.loss_cfg['weight_focal_alpha']).to(device), gamma= self.loss_cfg['focal_gamma']) + elif self.loss_cfg['type'] == 'MulticlassDiceLoss': + loss_fct = loss_class(self.loss_cfg['dice_smooth']) + else: + raise ValueError('Unsupported loss type') + + loss = loss_fct(logits, labels).to(device) + return (loss, outputs) if return_outputs else loss diff --git a/utils/args.py b/utils/args.py index b6b6a4f..b35ea72 100755 --- a/utils/args.py +++ b/utils/args.py @@ -1,16 +1,16 @@ -import yaml -from argparse import Namespace - - -def parse_arguments(config_path: str) -> Namespace: - """config.json 파일의 내용을 argparse.Namespace 객체로 변환. - - Returns: - args (argparse.Namespace): config.json 파일의 내용을 포함하는 Namespace 객체. - """ - - with open(config_path, "r") as f: - config = yaml.load(f, Loader=yaml.FullLoader) - - args = Namespace(**config) - return args +import yaml +from argparse import Namespace + + +def parse_arguments(config_path: str) -> Namespace: + """config.json 파일의 내용을 argparse.Namespace 객체로 변환. + + Returns: + args (argparse.Namespace): config.json 파일의 내용을 포함하는 Namespace 객체. + """ + + with open(config_path, "r") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + + args = Namespace(**config) + return args diff --git a/utils/utils.py b/utils/utils.py index 9075d1c..addae29 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -1,154 +1,154 @@ -import logging -import os -import re -import random -from argparse import Namespace -from typing import Tuple - -import numpy as np -import torch -import wandb -from wandb import AlertLevel - -log = logging.getLogger(__name__) - -def seed_everything(seed: int, workers: bool = False) -> int: - log.info(f"Global seed set to {seed}") - os.environ["PL_GLOBAL_SEED"] = str(seed) - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - - os.environ["PL_SEED_WORKERS"] = f"{int(workers)}" - return seed - - -def init_wandb(config: Namespace, run_name: str) -> None: - if not config.use_wandb: - return - - wandb.init( - entity=config.wandb['entity'], - project=config.wandb['project_name'], - name=run_name, - config=config, - ) - wandb.alert(title='start', level=AlertLevel.INFO, text=f'{run_name}') - - -def alert_wandb(config: Namespace, run_name: str, title: str) -> None: - if config.use_wandb: - wandb.alert(title=title, level=AlertLevel.INFO, text=f'{run_name}') - - -def to_hangul(sent) -> Tuple[str, str]: - """ - entity명을 한글로 변경 - """ - dic = { - "ORG" : "조직", - "PER" : "사람", - "DAT" : "시간", - "LOC" : "장소", - "POH" : "기타", - "NOH" : "수량", - } - - sub = eval(sent['subject_entity']) - obj = eval(sent['object_entity']) - - sub['type'] = dic[sub['type']] - obj['type'] = dic[obj['type']] - - sent['subject_entity'] = str(sub) - sent['object_entity'] = str(obj) - - return sent['subject_entity'], sent['object_entity'] - - -def marker(sent, input_format: str) -> str: - """dataframe에서 하나의 row 내의 정보들을 조합해 마킹한 sentence를 만드는 함수""" - - # str 타입에서 dict 뽑아내기 - sub = eval(sent['subject_entity']) - obj = eval(sent['object_entity']) - - # 인덱스 뽑아와서 entity 구분하기 - indices = sorted([sub['start_idx'], sub['end_idx'], obj['start_idx'], obj['end_idx']]) - indices[1] += 1 - indices[3] += 1 - - def split_string_by_index(string, indices): - substrings = [] - start_index = 0 - for index in indices: - substrings.append(string[start_index:index]) - start_index = index - substrings.append(string[start_index:]) - return substrings - - split_sent = 
split_string_by_index(sent['sentence'], indices) - - # entity에 마킹하기 - lst = [] - if input_format == 'entity_mask': - for i in split_sent: - if i == sub['word']: - sub_token = f'[SUBJ-{sub["type"]}]' - lst.append(sub_token) - elif i == obj['word']: - obj_token = f'[OBJ-{obj["type"]}]' - lst.append(obj_token) - else: - lst.append(i) - - elif input_format == 'entity_marker': - for i in split_sent: - if i == sub['word']: - new_sub = ['[E1] '] + [sub['word']] + [' [/E1]'] - lst.append(new_sub) - elif i == obj['word']: - new_obj = ['[E2] '] + [obj['word']] + [' [/E2]'] - lst.append(new_obj) - else: - lst.append(i) - - elif input_format == 'entity_marker_punct': - for i in split_sent: - if i == sub['word']: - new_sub = ['@ '] + [sub['word']] + [' @'] - lst.append(new_sub) - elif i == obj['word']: - new_obj = ['# '] + [obj['word']] + [' #'] - lst.append(new_obj) - else: - lst.append(i) - - elif input_format == 'typed_entity_marker': - for i in split_sent: - if i == sub['word']: - new_sub = [' '] + [sub['word']] + [' '] - lst.append(new_sub) - elif i == obj['word']: - new_obj = [' '] + [obj['word']] + [' '] - lst.append(new_obj) - else: - lst.append(i) - - elif input_format == 'typed_entity_marker_punct': - for i in split_sent: - if i == sub['word']: - new_sub = ['@ '] + [' * '] + [sub['type'].lower()] + [' * '] + [sub['word']] + [' @ '] - lst.append(new_sub) - elif i == obj['word']: - new_obj = [' # '] + [' ^ '] + [obj['type'].lower()] + [' ^ '] + [obj['word']] + [' # '] - lst.append(new_obj) - else: - lst.append(i) - - # 최종 sentence로 만들고 공백 처리하기 - sentence = ''.join(str(item) if isinstance(item, str) else ''.join(item) for item in lst) - sentence = re.sub(r'\s+', ' ', sentence) - - return sentence +import logging +import os +import re +import random +from argparse import Namespace +from typing import Tuple + +import numpy as np +import torch +import wandb +from wandb import AlertLevel + +log = logging.getLogger(__name__) + +def seed_everything(seed: int, workers: bool = False) -> int: + log.info(f"Global seed set to {seed}") + os.environ["PL_GLOBAL_SEED"] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + os.environ["PL_SEED_WORKERS"] = f"{int(workers)}" + return seed + + +def init_wandb(config: Namespace, run_name: str) -> None: + if not config.use_wandb: + return + + wandb.init( + entity=config.wandb['entity'], + project=config.wandb['project_name'], + name=run_name, + config=config, + ) + wandb.alert(title='start', level=AlertLevel.INFO, text=f'{run_name}') + + +def alert_wandb(config: Namespace, run_name: str, title: str) -> None: + if config.use_wandb: + wandb.alert(title=title, level=AlertLevel.INFO, text=f'{run_name}') + + +def to_hangul(sent) -> Tuple[str, str]: + """ + entity명을 한글로 변경 + """ + dic = { + "ORG" : "조직", + "PER" : "사람", + "DAT" : "시간", + "LOC" : "장소", + "POH" : "기타", + "NOH" : "수량", + } + + sub = eval(sent['subject_entity']) + obj = eval(sent['object_entity']) + + sub['type'] = dic[sub['type']] + obj['type'] = dic[obj['type']] + + sent['subject_entity'] = str(sub) + sent['object_entity'] = str(obj) + + return sent['subject_entity'], sent['object_entity'] + + +def marker(sent, input_format: str) -> str: + """dataframe에서 하나의 row 내의 정보들을 조합해 마킹한 sentence를 만드는 함수""" + + # str 타입에서 dict 뽑아내기 + sub = eval(sent['subject_entity']) + obj = eval(sent['object_entity']) + + # 인덱스 뽑아와서 entity 구분하기 + indices = sorted([sub['start_idx'], sub['end_idx'], obj['start_idx'], obj['end_idx']]) + indices[1] += 1 + indices[3] += 1 
+ + def split_string_by_index(string, indices): + substrings = [] + start_index = 0 + for index in indices: + substrings.append(string[start_index:index]) + start_index = index + substrings.append(string[start_index:]) + return substrings + + split_sent = split_string_by_index(sent['sentence'], indices) + + # entity에 마킹하기 + lst = [] + if input_format == 'entity_mask': + for i in split_sent: + if i == sub['word']: + sub_token = f'[SUBJ-{sub["type"]}]' + lst.append(sub_token) + elif i == obj['word']: + obj_token = f'[OBJ-{obj["type"]}]' + lst.append(obj_token) + else: + lst.append(i) + + elif input_format == 'entity_marker': + for i in split_sent: + if i == sub['word']: + new_sub = ['[E1] '] + [sub['word']] + [' [/E1]'] + lst.append(new_sub) + elif i == obj['word']: + new_obj = ['[E2] '] + [obj['word']] + [' [/E2]'] + lst.append(new_obj) + else: + lst.append(i) + + elif input_format == 'entity_marker_punct': + for i in split_sent: + if i == sub['word']: + new_sub = ['@ '] + [sub['word']] + [' @'] + lst.append(new_sub) + elif i == obj['word']: + new_obj = ['# '] + [obj['word']] + [' #'] + lst.append(new_obj) + else: + lst.append(i) + + elif input_format == 'typed_entity_marker': + for i in split_sent: + if i == sub['word']: + new_sub = [' '] + [sub['word']] + [' '] + lst.append(new_sub) + elif i == obj['word']: + new_obj = [' '] + [obj['word']] + [' '] + lst.append(new_obj) + else: + lst.append(i) + + elif input_format == 'typed_entity_marker_punct': + for i in split_sent: + if i == sub['word']: + new_sub = ['@ '] + [' * '] + [sub['type'].lower()] + [' * '] + [sub['word']] + [' @ '] + lst.append(new_sub) + elif i == obj['word']: + new_obj = [' # '] + [' ^ '] + [obj['type'].lower()] + [' ^ '] + [obj['word']] + [' # '] + lst.append(new_obj) + else: + lst.append(i) + + # 최종 sentence로 만들고 공백 처리하기 + sentence = ''.join(str(item) if isinstance(item, str) else ''.join(item) for item in lst) + sentence = re.sub(r'\s+', ' ', sentence) + + return sentence
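
Reference: a minimal usage sketch of the entity-marking utility above, not part of the diff itself. The sample row below is hypothetical and only mirrors the stringified-dict format that marker() reads via eval(); the printed result is approximate.

# Assumes the repository root is on PYTHONPATH.
from utils.utils import marker

# Hypothetical dataset row: subject_entity / object_entity are stringified dicts
# with 'word', 'start_idx', 'end_idx' (inclusive) and 'type', as consumed by marker().
row = {
    'sentence': '이순신은 조선 중기의 무신이다.',
    'subject_entity': "{'word': '이순신', 'start_idx': 0, 'end_idx': 2, 'type': 'PER'}",
    'object_entity': "{'word': '무신', 'start_idx': 12, 'end_idx': 13, 'type': 'POH'}",
}

# 'typed_entity_marker_punct' wraps each entity with punctuation plus its lowercased type,
# yielding roughly: "@ * per * 이순신 @ 은 조선 중기의 # ^ poh ^ 무신 # 이다."
print(marker(row, 'typed_entity_marker_punct'))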