diff --git a/.flake8 b/.flake8
index b66c41a..ebbac9f 100644
--- a/.flake8
+++ b/.flake8
@@ -1,26 +1,26 @@
-[flake8]
-max-line-length = 88
-
-exclude =
- .git,
- __pycache__,
- best_model,
- logs,
- prediction,
- results,
- saved,
- wandb,
- .gitignore
-
-ignore =
- E203,
- E266,
- E501,
- W503,
- F401,
- F403,
- F405,
- F821,
- F841
-
+[flake8]
+max-line-length = 88
+
+exclude =
+ .git,
+ __pycache__,
+ best_model,
+ logs,
+ prediction,
+ results,
+ saved,
+ wandb,
+ .gitignore
+
+ignore =
+ E203,
+ E266,
+ E501,
+ W503,
+ F401,
+ F403,
+ F405,
+ F821,
+ F841
+
count = True
\ No newline at end of file
diff --git "a/.github/ISSUE_TEMPLATE/\354\203\210\353\241\234\354\232\264 \352\270\260\353\212\245 \354\266\224\352\260\200.md" "b/.github/ISSUE_TEMPLATE/\354\203\210\353\241\234\354\232\264 \352\270\260\353\212\245 \354\266\224\352\260\200.md"
index 40e55f8..4d5d644 100644
--- "a/.github/ISSUE_TEMPLATE/\354\203\210\353\241\234\354\232\264 \352\270\260\353\212\245 \354\266\224\352\260\200.md"
+++ "b/.github/ISSUE_TEMPLATE/\354\203\210\353\241\234\354\232\264 \352\270\260\353\212\245 \354\266\224\352\260\200.md"
@@ -1,19 +1,19 @@
----
-name: new function added!
-about: when you propose new function~~!
-title: "[FEAT]"
-labels: ''
-assignees: ''
-
----
-
-## Background
--
--
-
-## To Do
-- [ ]
-- [ ]
-
-## See Also
--
+---
+name: New feature request
+about: Use this template to propose a new feature
+title: "[FEAT]"
+labels: ''
+assignees: ''
+
+---
+
+## Background
+-
+-
+
+## To Do
+- [ ]
+- [ ]
+
+## See Also
+-
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index ab2be1a..46ae9df 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,12 +1,12 @@
-## Overview
--
-
-## Change Log
--
-
-## To Reviewer
--
-
-## Issue Tags
-- Closed | Fixed: #
-- See also: #
+## Overview
+-
+
+## Change Log
+-
+
+## To Reviewer
+-
+
+## Issue Tags
+- Closed | Fixed: #
+- See also: #
diff --git a/.gitignore b/.gitignore
index 577450b..4c9ff32 100755
--- a/.gitignore
+++ b/.gitignore
@@ -1,25 +1,24 @@
-.DS_Store
-.venv
-
-__pycache__/
-
-best_model/
-logs/
-prediction/
-results/
-saved/
-wandb/
-configs/
-EDA/
-!eda/
-
-EDA.ipynb
-config*.yaml
-!config.yaml
-!config_full.yaml
-output*.log
-nohup*.out
-run_exps.sh
-TAPT/
-*.ipynb
+.DS_Store
+.venv
+
+__pycache__/
+
+best_model/
+logs/
+prediction/
+results/
+saved/
+wandb/
+configs/
+EDA/
+!eda/
+
+EDA.ipynb
+config*.yaml
+!config.yaml
+!config_full.yaml
+output*.log
+nohup*.out
+run_exps.sh
+*.ipynb
inference*.py
\ No newline at end of file
diff --git a/README.md b/README.md
index ae55c79..ef0abf2 100644
--- a/README.md
+++ b/README.md
@@ -1,95 +1,95 @@
-# 문장 내 개체간 관계 추출 Relation Extraction
-boostcamp AI Tech 5 NLP 트랙 레벨2 프로젝트
-문장 내의 두 단어(Entity)에 대한 속성과 관계를 예측하는 인공지능 만들기
-
-
-## 일정 Schedule
-프로젝트 전체 기간 : 5월 3일 (화) ~ 5월 18일 (목) 19:00
-
-
-## 팀원 Team Members
-|문지혜|박경택|박지은|송인서|윤지환|
-|:---:|:---:|:---:|:---:|:---:|
-||||||
-|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:munjh1121@gmail.com)](mailto:afterthougt@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:afterthougt@gmail.com)](mailto:afterthougt@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:imhappyhill@gmail.com)](mailto:imhappyhill@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:songinseo0910@gmail.com)](mailto:songinseo0910@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:yjh091500@naver.com)](mailto:yjh091500@naver.com)|
-|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/jihye-moon)](https://github.com/jihye-moon)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/afterthougt)](https://github.com/afterthougt)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/iamzieun)](https://github.com/iamzieun)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/fortunetiger)](https://github.com/fortunetiger)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/ohilikeit)](https://github.com/ohilikeit)|
-
-
-## 프로젝트 보고서 및 발표 시각 자료 Report and Presentation
-- [Wrap-Up Report](https://github.com/ohilikeit/level2_klue-nlp-12/blob/main/documents/wrap%20up%20report.pdf)
-- [Presentation](https://github.com/ohilikeit/level2_klue-nlp-12/blob/main/documents/presentation.pdf)
-
-
-## 프로젝트 개요 Project Overview
-
-
-## 레포지토리 구조 Repository Structure
-```bash
-level2_klue-nlp-12/
-├── eda // eda 및 사후 분석용 함수
-│ └── post_eda.py
-│
-├── load_data // 데이터 불러오기 관련 폴더
-│ ├── dict_label_to_num.pkl // 레이블을 숫자로 변환하기 위한 사전 파일
-│ ├── dict_num_to_label.pkl // 숫자를 레이블로 변환하기 위한 사전 파일
-│ └── load_data.py // 데이터 불러오기 및 전처리 관련 함수
-│
-├── model // 모델, 손실 함수, 평가 지표
-│ ├── loss.py // 손실 함수
-│ ├── metric.py // 평가 지표
-│ └── model.py // 모델 아키텍쳐
-│
-├── trainer // 학습 관련 폴더
-│ └──trainer.py
-│
-├── utils // 유틸리티 함수
-│ ├── args.py // 프로그램 실행 시 전달되는 인자들을 처리하기 위한 파일
-│ └── utils.py
-│
-├── documents // 보고서 및 발표자료
-│ └── wrap up report.pdf
-│ └── presentation.pdf
-│
-├── requirements.txt // 프로젝트에 필요한 라이브러리들을 명시
-│
-├── train.py // 모델 학습 시작을 위한 메인 스크립트
-├── full_train.py // 전체 데이터로의 모델 학습 시작을 위한 메인 스크립트
-├── inference.py // 학습된 모델의 평가 및 추론을 위한 스크립트
-├── sweep.py // sweep 동작을 위한 스크립트
-│
-├── config.yaml // 모델 학습 설정 관리를 위한 YAML
-├── config_full.yaml // 전체 데이터로의 모델 학습 설정 관리를 위한 YAML
-│
-├── run.sh // 실험 자동화를 위한 쉘 스크립트
-├── pyproject.toml // Black 설정 파일
-│
-└── README.md
-```
-
-## 데이터 Data
-- train.csv: 총 32470개
-- test_data.csv: 총 7765개
-
-
-## 사용법 Usage
-- 모델 학습 및 추론 위해서는, [Huggingface Datasets](https://huggingface.co/datasets/Smoked-Salmon-s/RE_Competition) 로그인 및 access token 인증이 선행되어야 합니다.
-- 환경 설치
-```bash
-pip install -r requirement.txt
-```
-- 학습
-```bash
-python train.py config.yaml
-```
-- 전체 데이터셋에 대한 학습
-```bash
-python full_train.py config_full.yaml
-```
-- 추론
-```bash
-python inference.py config.yaml
-```
-- 린팅
-```bash
-black .
-```
+# 문장 내 개체간 관계 추출 Relation Extraction
+boostcamp AI Tech 5 NLP 트랙 레벨2 프로젝트
+문장 내의 두 단어(Entity)에 대한 속성과 관계를 예측하는 인공지능 만들기
+
+
+## 일정 Schedule
+프로젝트 전체 기간 : 5월 3일 (화) ~ 5월 18일 (목) 19:00
+
+
+## 팀원 Team Members
+|문지혜|박경택|박지은|송인서|윤지환|
+|:---:|:---:|:---:|:---:|:---:|
+||||||
+|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:munjh1121@gmail.com)](mailto:munjh1121@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:afterthougt@gmail.com)](mailto:afterthougt@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:imhappyhill@gmail.com)](mailto:imhappyhill@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:songinseo0910@gmail.com)](mailto:songinseo0910@gmail.com)|[![Gmail Badge](https://img.shields.io/badge/Gmail-d14836?style=flat-square&logo=Gmail&logoColor=white&link=mailto:yjh091500@naver.com)](mailto:yjh091500@naver.com)|
+|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/jihye-moon)](https://github.com/jihye-moon)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/afterthougt)](https://github.com/afterthougt)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/iamzieun)](https://github.com/iamzieun)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/fortunetiger)](https://github.com/fortunetiger)|[![GitHub Badge](https://img.shields.io/badge/-GitHub-black?style=flat-square&logo=github&link=https://github.com/ohilikeit)](https://github.com/ohilikeit)|
+
+
+## 프로젝트 보고서 및 발표 시각 자료 Report and Presentation
+- [Wrap-Up Report](https://github.com/ohilikeit/level2_klue-nlp-12/blob/main/documents/wrap%20up%20report.pdf)
+- [Presentation](https://github.com/ohilikeit/level2_klue-nlp-12/blob/main/documents/presentation.pdf)
+
+
+## 프로젝트 개요 Project Overview
+
+
+## 레포지토리 구조 Repository Structure
+```bash
+level2_klue-nlp-12/
+├── eda // eda 및 사후 분석용 함수
+│ └── post_eda.py
+│
+├── load_data // 데이터 불러오기 관련 폴더
+│ ├── dict_label_to_num.pkl // 레이블을 숫자로 변환하기 위한 사전 파일
+│ ├── dict_num_to_label.pkl // 숫자를 레이블로 변환하기 위한 사전 파일
+│ └── load_data.py // 데이터 불러오기 및 전처리 관련 함수
+│
+├── model // 모델, 손실 함수, 평가 지표
+│ ├── loss.py // 손실 함수
+│ ├── metric.py // 평가 지표
+│ └── model.py // 모델 아키텍쳐
+│
+├── trainer // 학습 관련 폴더
+│   └── trainer.py
+│
+├── utils // 유틸리티 함수
+│ ├── args.py // 프로그램 실행 시 전달되는 인자들을 처리하기 위한 파일
+│ └── utils.py
+│
+├── documents // 보고서 및 발표자료
+│   ├── wrap up report.pdf
+│ └── presentation.pdf
+│
+├── requirements.txt // 프로젝트에 필요한 라이브러리들을 명시
+│
+├── train.py // 모델 학습 시작을 위한 메인 스크립트
+├── full_train.py // 전체 데이터로의 모델 학습 시작을 위한 메인 스크립트
+├── inference.py // 학습된 모델의 평가 및 추론을 위한 스크립트
+├── sweep.py // sweep 동작을 위한 스크립트
+│
+├── config.yaml // 모델 학습 설정 관리를 위한 YAML
+├── config_full.yaml // 전체 데이터로의 모델 학습 설정 관리를 위한 YAML
+│
+├── run.sh // 실험 자동화를 위한 쉘 스크립트
+├── pyproject.toml // Black 설정 파일
+│
+└── README.md
+```
+
+## 데이터 Data
+- train.csv: 총 32470개
+- test_data.csv: 총 7765개
+
+
+## 사용법 Usage
+- 모델 학습 및 추론을 위해서는 [Huggingface Datasets](https://huggingface.co/datasets/Smoked-Salmon-s/RE_Competition) 로그인 및 access token 인증이 선행되어야 합니다. (아래 예시 참고)
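+- 예시: 아래와 같이 access token으로 로그인할 수 있습니다 (huggingface_hub CLI 사용을 가정하며, notebook_login 등 다른 인증 방식도 가능합니다).
+```bash
+huggingface-cli login
+```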
+- 환경 설치
+```bash
+pip install -r requirements.txt
+```
+- 학습
+```bash
+python train.py config.yaml
+```
+- 전체 데이터셋에 대한 학습
+```bash
+python full_train.py config_full.yaml
+```
+- 추론
+```bash
+python inference.py config.yaml
+```
+- 린팅
+```bash
+black .
+```
diff --git a/config.yaml b/config.yaml
index abd12c1..fd261b8 100755
--- a/config.yaml
+++ b/config.yaml
@@ -1,102 +1,102 @@
----
-run_name: exp-name
-n_gpu: 1
-seed: 42
-use_wandb: true
-num_labels: 30
-model:
- name: klue/roberta-large
- variant: BiLSTMREModel
-dataloader:
- type: REDataModule
- input_format: typed_entity_marker_punct
- prompt: s_and_o
- type_transform: false
- train_split: train
- valid_split: validation
- batch_size: 64
- shuffle: true
- revision: a6a27f7e03c79cee2ee1171b7001d7a23cfd4495
- num_workers: 0
-optimizer:
- type: AdamW
- lr: 2.0e-5
- weight_decay: 0.01
- adam_beta2: 0.98
-loss:
- type: CrossEntropyLoss
- focal_alpha: 0.25
- focal_gamma: 2.0
- dice_smooth: 1.5
-lr_scheduler:
- type: StepLR
- warmup_steps: 500
- warmup_ratio: 0.06
- step_size: 50
- gamma: 0.1
- is_schedule: false
-trainer:
- epochs: 5
- output_dir: saved/models/
- model_dir: ./best_model
- pred_dir: ./prediction/submission.csv
- val_pred_dir: ./prediction/validation_output.csv
- logging_dir: ./logs
- logging_steps: 100
- save_total_limit: 5
- save_steps: 500
- save_freq: 1
- use_early_stop: true
- early_stop: 3
- evaluation_strategy: steps
- evaluation_steps: 500
- save_strategy: steps
-wandb:
- entity: salmons
- project_name: klue-re
- sweep_project_name: sweep
- sweep_count: 10
-sweep_config:
- method: bayes
- metric:
- name: eval/micro f1 score
- goal: maximize
- parameters:
- input_format:
- values:
- - default
- - entity_marker
- - entity_marker_punct
- - typed_entity_marker
- - typed_entity_marker_punct
- prompt:
- values:
- - default
- - s_sep_o
- - s_and_o
- - quiz
- - problem
- type_transform:
- values:
- - true
- - false
- lr:
- values:
- - 1.0e-05
- - 2.0e-05
- - 3.0e-05
- - 5.0e-05
- epochs:
- values:
- - 3
- - 4
- - 5
- adam_beta2:
- values:
- - 0.98
- - 0.999
- warmup_ratio:
- values:
- - 0.06
- - 0.1
+---
+run_name: exp-name
+n_gpu: 1
+seed: 42
+use_wandb: true
+num_labels: 30
+model:
+ name: klue/roberta-large
+ variant: BiLSTMREModel
+dataloader:
+ type: REDataModule
+ input_format: typed_entity_marker_punct
+ prompt: s_and_o
+ type_transform: false
+ train_split: train
+ valid_split: validation
+ batch_size: 64
+ shuffle: true
+ revision: a6a27f7e03c79cee2ee1171b7001d7a23cfd4495
+ num_workers: 0
+optimizer:
+ type: AdamW
+ lr: 2.0e-5
+ weight_decay: 0.01
+ adam_beta2: 0.98
+loss:
+ type: CrossEntropyLoss
+ focal_alpha: 0.25
+ focal_gamma: 2.0
+ dice_smooth: 1.5
+lr_scheduler:
+ type: StepLR
+ warmup_steps: 500
+ warmup_ratio: 0.06
+ step_size: 50
+ gamma: 0.1
+ is_schedule: false
+trainer:
+ epochs: 5
+ output_dir: saved/models/
+ model_dir: ./best_model
+ pred_dir: ./prediction/submission.csv
+ val_pred_dir: ./prediction/validation_output.csv
+ logging_dir: ./logs
+ logging_steps: 100
+ save_total_limit: 5
+ save_steps: 500
+ save_freq: 1
+ use_early_stop: true
+ early_stop: 3
+ evaluation_strategy: steps
+ evaluation_steps: 500
+ save_strategy: steps
+wandb:
+ entity: salmons
+ project_name: klue-re
+ sweep_project_name: sweep
+ sweep_count: 10
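+# 아래 sweep_config는 W&B sweep의 하이퍼파라미터 탐색 공간입니다 (sweep.py에서 사용한다고 가정, method: bayes로 아래 후보 값들을 탐색).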
+sweep_config:
+ method: bayes
+ metric:
+ name: eval/micro f1 score
+ goal: maximize
+ parameters:
+ input_format:
+ values:
+ - default
+ - entity_marker
+ - entity_marker_punct
+ - typed_entity_marker
+ - typed_entity_marker_punct
+ prompt:
+ values:
+ - default
+ - s_sep_o
+ - s_and_o
+ - quiz
+ - problem
+ type_transform:
+ values:
+ - true
+ - false
+ lr:
+ values:
+ - 1.0e-05
+ - 2.0e-05
+ - 3.0e-05
+ - 5.0e-05
+ epochs:
+ values:
+ - 3
+ - 4
+ - 5
+ adam_beta2:
+ values:
+ - 0.98
+ - 0.999
+ warmup_ratio:
+ values:
+ - 0.06
+ - 0.1
...
\ No newline at end of file
diff --git a/config_full.yaml b/config_full.yaml
index ebdde7b..2f330a0 100644
--- a/config_full.yaml
+++ b/config_full.yaml
@@ -1,102 +1,102 @@
----
-run_name: exp-name
-n_gpu: 1
-seed: 42
-use_wandb: true
-num_labels: 30
-model:
- name: klue/roberta-large
- variant: BiLSTMREModel
-dataloader:
- type: REDataModule
- input_format: typed_entity_marker_punct
- prompt: s_and_o
- type_transform: false
- train_split: train
- valid_split: validation
- batch_size: 64
- shuffle: true
- revision: a6a27f7e03c79cee2ee1171b7001d7a23cfd4495
- num_workers: 0
-optimizer:
- type: AdamW
- lr: 2.0e-5
- weight_decay: 0.01
- adam_beta2: 0.98
-loss:
- type: CrossEntropyLoss
- focal_alpha: 0.25
- focal_gamma: 2.0
- dice_smooth: 1.5
-lr_scheduler:
- type: StepLR
- warmup_steps: 500
- warmup_ratio: 0.06
- step_size: 50
- gamma: 0.1
- is_schedule: false
-trainer:
- epochs: 5
- output_dir: saved/models/
- model_dir: ./best_model
- pred_dir: ./prediction/submission.csv
- val_pred_dir: ./prediction/validation_output.csv
- logging_dir: ./logs
- logging_steps: 100
- save_total_limit: 5
- save_steps: 500
- save_freq: 1
- use_early_stop: false
- early_stop: 3
- evaluation_strategy: steps
- evaluation_steps: 500
- save_strategy: steps
-wandb:
- entity: salmons
- project_name: klue-re
- sweep_project_name: sweep
- sweep_count: 10
-sweep_config:
- method: bayes
- metric:
- name: eval/micro f1 score
- goal: maximize
- parameters:
- input_format:
- values:
- - default
- - entity_marker
- - entity_marker_punct
- - typed_entity_marker
- - typed_entity_marker_punct
- prompt:
- values:
- - default
- - s_sep_o
- - s_and_o
- - quiz
- - problem
- type_transform:
- values:
- - true
- - false
- lr:
- values:
- - 1.0e-05
- - 2.0e-05
- - 3.0e-05
- - 5.0e-05
- epochs:
- values:
- - 3
- - 4
- - 5
- adam_beta2:
- values:
- - 0.98
- - 0.999
- warmup_ratio:
- values:
- - 0.06
- - 0.1
+---
+run_name: exp-name
+n_gpu: 1
+seed: 42
+use_wandb: true
+num_labels: 30
+model:
+ name: klue/roberta-large
+ variant: BiLSTMREModel
+dataloader:
+ type: REDataModule
+ input_format: typed_entity_marker_punct
+ prompt: s_and_o
+ type_transform: false
+ train_split: train
+ valid_split: validation
+ batch_size: 64
+ shuffle: true
+ revision: a6a27f7e03c79cee2ee1171b7001d7a23cfd4495
+ num_workers: 0
+optimizer:
+ type: AdamW
+ lr: 2.0e-5
+ weight_decay: 0.01
+ adam_beta2: 0.98
+loss:
+ type: CrossEntropyLoss
+ focal_alpha: 0.25
+ focal_gamma: 2.0
+ dice_smooth: 1.5
+lr_scheduler:
+ type: StepLR
+ warmup_steps: 500
+ warmup_ratio: 0.06
+ step_size: 50
+ gamma: 0.1
+ is_schedule: false
+trainer:
+ epochs: 5
+ output_dir: saved/models/
+ model_dir: ./best_model
+ pred_dir: ./prediction/submission.csv
+ val_pred_dir: ./prediction/validation_output.csv
+ logging_dir: ./logs
+ logging_steps: 100
+ save_total_limit: 5
+ save_steps: 500
+ save_freq: 1
+ use_early_stop: false
+ early_stop: 3
+ evaluation_strategy: steps
+ evaluation_steps: 500
+ save_strategy: steps
+wandb:
+ entity: salmons
+ project_name: klue-re
+ sweep_project_name: sweep
+ sweep_count: 10
+sweep_config:
+ method: bayes
+ metric:
+ name: eval/micro f1 score
+ goal: maximize
+ parameters:
+ input_format:
+ values:
+ - default
+ - entity_marker
+ - entity_marker_punct
+ - typed_entity_marker
+ - typed_entity_marker_punct
+ prompt:
+ values:
+ - default
+ - s_sep_o
+ - s_and_o
+ - quiz
+ - problem
+ type_transform:
+ values:
+ - true
+ - false
+ lr:
+ values:
+ - 1.0e-05
+ - 2.0e-05
+ - 3.0e-05
+ - 5.0e-05
+ epochs:
+ values:
+ - 3
+ - 4
+ - 5
+ adam_beta2:
+ values:
+ - 0.98
+ - 0.999
+ warmup_ratio:
+ values:
+ - 0.06
+ - 0.1
...
\ No newline at end of file
diff --git a/eda/post_eda.py b/eda/post_eda.py
index 92310c1..322d80b 100644
--- a/eda/post_eda.py
+++ b/eda/post_eda.py
@@ -1,227 +1,227 @@
-import pandas as pd
-import seaborn as sns
-import plotly.express as px
-import matplotlib.pyplot as plt
-import matplotlib.colors as mcolors
-from datasets import load_dataset
-from sklearn.metrics import confusion_matrix
-
-
-def make_dataframe(PATH: str, split: str, revision: int) -> pd.DataFrame:
- """
- 주어진 경로(PATH)로부터의 데이터프레임과 Huggingface Datasets로부터 불러온 데이터프레임을 합친
- 하나의 데이터프레임을 반환합니다.
-
- arguments:
- PATH (str): 'label'과 'pred_label' 열이 포함된 데이터프레임의 경로
- split (str): 데이터셋의 분할 유형 (train, validation, test).
- revision (str): 데이터셋의 버전 (commit hash).
-
- return:
- df (pd.DataFrame): 주어진 경로(PATH)로부터의 데이터프레임과 Huggingface Datasets으로부터 불러온 데이터프레임을 합친
- 하나의 데이터프레임
- """
- # 원본 validation set 불러오기
- valid = load_dataset("Smoked-Salmon-s/RE_Competition",
- split = split,
- column_names = ['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
- revision = revision)
- valid_df = valid.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'})
-
- # inference한 validation set 불러오기
- valid_inferred_df = pd.read_csv(PATH)
-
- # 두 dataframe 합치기
- df = pd.merge(valid_df,
- valid_inferred_df[['id', 'pred_label', 'probs']],
- on='id',
- how='inner')
- df = df[['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'pred_label', 'probs', 'source']]
-
- return df
-
-
-def confusion_matrix_graph(df: pd.DataFrame):
- """
- 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 confusion matrix을 계산하고
- heatmap 형태로 시각화합니다.
-
- arguments:
- df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
-
- return:
- None. 함수는 confusion matrix heatmap을 출력합니다.
- """
- # confusion matrix 계산
- cm = confusion_matrix(df['label'], df['pred_label'])
-
- # 커스텀 컬러맵 생성
- cmap = mcolors.ListedColormap(['white', 'pink', 'tomato'])
-
- # 정규화를 위한 경계값 설정
- bounds = [0.5, 1.0, 10.0, cm.max() + 0.5] # 1.0, 10.0을 경계값으로 설정
-
- # 컬러맵을 적용할 값의 범위 설정
- norm = mcolors.BoundaryNorm(bounds, cmap.N)
-
- # 라벨 설정
- labels = sorted(list(df['label'].unique()))
-
- # heatmap 그리기
- plt.figure(figsize=(10, 7))
- sns.heatmap(cm, annot=True, annot_kws={"size":8}, cmap=cmap, norm=norm, fmt='g', xticklabels=labels, yticklabels=labels)
-
- # 축 이름 및 제목 설정
- plt.xlabel('Predicted Labels')
- plt.ylabel('True Labels')
- plt.title('Confusion Matrix')
-
- # 그래프 표시
- plt.show()
-
-
-def all_label_matrix(df: pd.DataFrame, sort_column: str = 'label') -> pd.DataFrame:
- """
- 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여
- 각 label에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다.
-
- arguments:
- df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
- sort_column (str): confusion matrix dataframe을 정렬하는 기준 열
-
- return:
- metric_df (pd.DataFrame): confusion matrix dataframe
- """
- label_list = list(sorted(df['label'].unique()))
-
- label = [len(df[df['label'] == label]) for label in label_list]
- pred_label = [len(df[df['pred_label'] == label]) for label in label_list]
- TP = [len(df[(df['pred_label'] == label) & (df['label'] == label)]) for label in label_list]
- FP = [len(df[(df['pred_label'] == label) & (df['label'] != label)]) for label in label_list]
- FN = [len(df[(df['pred_label'] != label) & (df['label'] == label)]) for label in label_list]
-
- precision = []
- for tp, fp in zip(TP, FP):
- if tp + fp > 0:
- p = round(tp / (tp + fp), 4)
- else:
- p = 0
- precision.append(p)
-
- recall = []
- for tp, fn in zip(TP, FN):
- if tp + fn > 0:
- r = round(tp / (tp + fn), 4)
- else:
- r = 0
- recall.append(r)
-
- metric_df = pd.DataFrame(zip(label_list, label, pred_label, TP, FP, FN, precision, recall))
- metric_df.columns = ['label', 'label_#', 'pred_label_#', 'TP', 'FP', 'FN', 'precision', 'recall']
- metric_df = metric_df.sort_values(sort_column)
-
- return metric_df
-
-
-def specific_label_matrix(df: pd.DataFrame, label: str ='no_relation') -> pd.DataFrame:
- """
- 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여
- 주어진 label에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다.
-
- arguments:
- df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
- label (str): confusion matrix을 계산할 label
-
- return:
- metric_df (pd.DataFrame): 주어진 label에 대한 confusion matrix dataframe
- """
- TP = len(df[(df['pred_label'] == label) & (df['label'] == label)])
- FP = len(df[(df['pred_label'] == label) & (df['label'] != label)])
- FN = len(df[(df['pred_label'] != label) & (df['label'] == label)])
-
- precision = round(TP / (TP + FP), 4)
- recall = round(TP / (TP + FN), 4)
-
- metric_dict = {"TP": TP, "FP": FP, "FN": FN, "precision": precision, "recall": recall}
- metric_df = pd.DataFrame.from_dict(data = metric_dict,
- orient='index',
- columns=['value'])
-
- return metric_df
-
-
-def total_metric(df: pd.DataFrame) -> pd.DataFrame:
- """
- 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여
- 전체 데이터에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다.
-
- arguments:
- df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
-
- return:
- metric_df (pd.DataFrame): 주어진 데이터에 대한 confusion matrix dataframe
- """
- df = all_label_matrix(df)
- cleared_df = df[df['label'] != 'no_relation'].copy()
-
- TP = sum(cleared_df['TP'])
- FP = sum(cleared_df['FP'])
- FN = sum(cleared_df['FN'])
-
- precision = TP / (TP + FP)
- recall = TP / (TP + FN)
-
- F1 = 2 * precision * recall / (precision + recall)
-
- metric_dict = {"TP": TP, "FP": FP, "FN": FN, "micro precision": precision, "micro recall": recall, " mircro F1 score": F1}
- metric_df = pd.DataFrame.from_dict(data = metric_dict,
- orient='index',
- columns=['value'])
-
- return metric_df
-
-
-def precision_recall_graph(df: pd.DataFrame):
- """
- 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여
- 각 label에 대한 precision과 recall을 계산하고 scatterplot 형태로 시각화합니다.
-
- arguments:
- df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
-
- return:
- None. 함수는 precision과 recall에 대한 scatterplot을 출력합니다.
- """
- label_list = list(sorted(df['label'].unique()))
-
- label = [len(df[df['label'] == label]) for label in label_list]
- pred_label = [len(df[df['pred_label'] == label]) for label in label_list]
- TP = [len(df[(df['pred_label'] == label) & (df['label'] == label)]) for label in label_list]
- FP = [len(df[(df['pred_label'] == label) & (df['label'] != label)]) for label in label_list]
- FN = [len(df[(df['pred_label'] != label) & (df['label'] == label)]) for label in label_list]
-
- precision = []
- for tp, fp in zip(TP, FP):
- if tp + fp > 0:
- p = round(tp / (tp + fp), 4)
- else:
- p = 0
- precision.append(p)
-
- recall = []
- for tp, fn in zip(TP, FN):
- if tp + fn > 0:
- r = round(tp / (tp + fn), 4)
- else:
- r = 0
- recall.append(r)
-
- plt.scatter(recall, precision)
-
- # 그래프 제목과 축 레이블 설정
- plt.title('relation between recall and precision')
- plt.xlabel('recall')
- plt.ylabel('precision')
-
- # 그래프 보이기
+import pandas as pd
+import seaborn as sns
+import plotly.express as px
+import matplotlib.pyplot as plt
+import matplotlib.colors as mcolors
+from datasets import load_dataset
+from sklearn.metrics import confusion_matrix
+
+
+def make_dataframe(PATH: str, split: str, revision: str) -> pd.DataFrame:
+ """
+ 주어진 경로(PATH)로부터의 데이터프레임과 Huggingface Datasets로부터 불러온 데이터프레임을 합친
+ 하나의 데이터프레임을 반환합니다.
+
+ arguments:
+ PATH (str): 'label'과 'pred_label' 열이 포함된 데이터프레임의 경로
+ split (str): 데이터셋의 분할 유형 (train, validation, test).
+ revision (str): 데이터셋의 버전 (commit hash).
+
+ return:
+ df (pd.DataFrame): 주어진 경로(PATH)로부터의 데이터프레임과 Huggingface Datasets으로부터 불러온 데이터프레임을 합친
+ 하나의 데이터프레임
+ """
+ # 원본 validation set 불러오기
+ valid = load_dataset("Smoked-Salmon-s/RE_Competition",
+ split = split,
+ column_names = ['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
+ revision = revision)
+ valid_df = valid.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'})
+
+ # inference한 validation set 불러오기
+ valid_inferred_df = pd.read_csv(PATH)
+
+ # 두 dataframe 합치기
+ df = pd.merge(valid_df,
+ valid_inferred_df[['id', 'pred_label', 'probs']],
+ on='id',
+ how='inner')
+ df = df[['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'pred_label', 'probs', 'source']]
+
+ return df
+
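+# 사용 예시 (경로와 revision은 config.yaml 기준의 예시 값이며, 실제 환경에 맞게 바꿔야 합니다):
+#   df = make_dataframe('./prediction/validation_output.csv', 'validation',
+#                       'a6a27f7e03c79cee2ee1171b7001d7a23cfd4495')
+#   confusion_matrix_graph(df)
+#   print(total_metric(df))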
+
+def confusion_matrix_graph(df: pd.DataFrame):
+ """
+ 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여 confusion matrix을 계산하고
+ heatmap 형태로 시각화합니다.
+
+ arguments:
+ df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
+
+ return:
+ None. 함수는 confusion matrix heatmap을 출력합니다.
+ """
+ # confusion matrix 계산
+ cm = confusion_matrix(df['label'], df['pred_label'])
+
+ # 커스텀 컬러맵 생성
+ cmap = mcolors.ListedColormap(['white', 'pink', 'tomato'])
+
+ # 정규화를 위한 경계값 설정
+ bounds = [0.5, 1.0, 10.0, cm.max() + 0.5] # 1.0, 10.0을 경계값으로 설정
+
+ # 컬러맵을 적용할 값의 범위 설정
+ norm = mcolors.BoundaryNorm(bounds, cmap.N)
+
+ # 라벨 설정
+ labels = sorted(list(df['label'].unique()))
+
+ # heatmap 그리기
+ plt.figure(figsize=(10, 7))
+ sns.heatmap(cm, annot=True, annot_kws={"size":8}, cmap=cmap, norm=norm, fmt='g', xticklabels=labels, yticklabels=labels)
+
+ # 축 이름 및 제목 설정
+ plt.xlabel('Predicted Labels')
+ plt.ylabel('True Labels')
+ plt.title('Confusion Matrix')
+
+ # 그래프 표시
+ plt.show()
+
+
+def all_label_matrix(df: pd.DataFrame, sort_column: str = 'label') -> pd.DataFrame:
+ """
+ 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여
+ 각 label에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다.
+
+ arguments:
+ df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
+ sort_column (str): confusion matrix dataframe을 정렬하는 기준 열
+
+ return:
+ metric_df (pd.DataFrame): confusion matrix dataframe
+ """
+ label_list = list(sorted(df['label'].unique()))
+
+ label = [len(df[df['label'] == label]) for label in label_list]
+ pred_label = [len(df[df['pred_label'] == label]) for label in label_list]
+ TP = [len(df[(df['pred_label'] == label) & (df['label'] == label)]) for label in label_list]
+ FP = [len(df[(df['pred_label'] == label) & (df['label'] != label)]) for label in label_list]
+ FN = [len(df[(df['pred_label'] != label) & (df['label'] == label)]) for label in label_list]
+
+ precision = []
+ for tp, fp in zip(TP, FP):
+ if tp + fp > 0:
+ p = round(tp / (tp + fp), 4)
+ else:
+ p = 0
+ precision.append(p)
+
+ recall = []
+ for tp, fn in zip(TP, FN):
+ if tp + fn > 0:
+ r = round(tp / (tp + fn), 4)
+ else:
+ r = 0
+ recall.append(r)
+
+ metric_df = pd.DataFrame(zip(label_list, label, pred_label, TP, FP, FN, precision, recall))
+ metric_df.columns = ['label', 'label_#', 'pred_label_#', 'TP', 'FP', 'FN', 'precision', 'recall']
+ metric_df = metric_df.sort_values(sort_column)
+
+ return metric_df
+
+
+def specific_label_matrix(df: pd.DataFrame, label: str ='no_relation') -> pd.DataFrame:
+ """
+ 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여
+ 주어진 label에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다.
+
+ arguments:
+ df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
+ label (str): confusion matrix을 계산할 label
+
+ return:
+ metric_df (pd.DataFrame): 주어진 label에 대한 confusion matrix dataframe
+ """
+ TP = len(df[(df['pred_label'] == label) & (df['label'] == label)])
+ FP = len(df[(df['pred_label'] == label) & (df['label'] != label)])
+ FN = len(df[(df['pred_label'] != label) & (df['label'] == label)])
+
+ precision = round(TP / (TP + FP), 4)
+ recall = round(TP / (TP + FN), 4)
+
+ metric_dict = {"TP": TP, "FP": FP, "FN": FN, "precision": precision, "recall": recall}
+ metric_df = pd.DataFrame.from_dict(data = metric_dict,
+ orient='index',
+ columns=['value'])
+
+ return metric_df
+
+
+def total_metric(df: pd.DataFrame) -> pd.DataFrame:
+ """
+ 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여
+ 전체 데이터에 대한 confusion matrix을 계산하고 dataframe 형태로 시각화합니다.
+
+ arguments:
+ df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
+
+ return:
+ metric_df (pd.DataFrame): 주어진 데이터에 대한 confusion matrix dataframe
+ """
+ df = all_label_matrix(df)
+ cleared_df = df[df['label'] != 'no_relation'].copy()
+
+ TP = sum(cleared_df['TP'])
+ FP = sum(cleared_df['FP'])
+ FN = sum(cleared_df['FN'])
+
+ precision = TP / (TP + FP)
+ recall = TP / (TP + FN)
+
+ F1 = 2 * precision * recall / (precision + recall)
+
+ metric_dict = {"TP": TP, "FP": FP, "FN": FN, "micro precision": precision, "micro recall": recall, " mircro F1 score": F1}
+ metric_df = pd.DataFrame.from_dict(data = metric_dict,
+ orient='index',
+ columns=['value'])
+
+ return metric_df
+
+
+def precision_recall_graph(df: pd.DataFrame):
+ """
+ 주어진 데이터프레임(df)의 'label'과 'pred_label' 열을 사용하여
+ 각 label에 대한 precision과 recall을 계산하고 scatterplot 형태로 시각화합니다.
+
+ arguments:
+ df (pd.DataFrame): 'label'과 'pred_label' 열이 포함된 데이터프레임
+
+ return:
+ None. 함수는 precision과 recall에 대한 scatterplot을 출력합니다.
+ """
+ label_list = list(sorted(df['label'].unique()))
+
+ label = [len(df[df['label'] == label]) for label in label_list]
+ pred_label = [len(df[df['pred_label'] == label]) for label in label_list]
+ TP = [len(df[(df['pred_label'] == label) & (df['label'] == label)]) for label in label_list]
+ FP = [len(df[(df['pred_label'] == label) & (df['label'] != label)]) for label in label_list]
+ FN = [len(df[(df['pred_label'] != label) & (df['label'] == label)]) for label in label_list]
+
+ precision = []
+ for tp, fp in zip(TP, FP):
+ if tp + fp > 0:
+ p = round(tp / (tp + fp), 4)
+ else:
+ p = 0
+ precision.append(p)
+
+ recall = []
+ for tp, fn in zip(TP, FN):
+ if tp + fn > 0:
+ r = round(tp / (tp + fn), 4)
+ else:
+ r = 0
+ recall.append(r)
+
+ plt.scatter(recall, precision)
+
+ # 그래프 제목과 축 레이블 설정
+ plt.title('relation between recall and precision')
+ plt.xlabel('recall')
+ plt.ylabel('precision')
+
+ # 그래프 보이기
plt.show()
\ No newline at end of file
diff --git a/full_train.py b/full_train.py
index 1a66daa..b5bfc79 100644
--- a/full_train.py
+++ b/full_train.py
@@ -1,158 +1,158 @@
-import sys
-import pickle as pickle
-import pytz
-from datetime import datetime
-from typing import Dict, Any
-
-import torch
-from transformers import (
- AutoTokenizer,
- EarlyStoppingCallback,
- TrainingArguments,
-)
-
-from utils.args import *
-from load_data.load_data import *
-from model.model import *
-from model.metric import *
-from trainer.trainer import *
-from utils.utils import *
-
-
-def train(config: Any) -> None:
- """
- 모델을 학습하는 함수, train.py와 다른 점은 평가 데이터셋 사용하지 않고 모든 데이터를 훈련에 활용함
-
- 다음 프로세스를 수행:
- 1. 데이터셋을 불러오고 전처리 및 토큰화
- 2. 레이블을 숫자 형태로 변환
- 3. 학습 및 개발 데이터셋에 대한 Dataset 객체를 생성
- 4. 지정된 모델을 불러와 훈련 인자 설정
- 5. 모델 학습 후 저장
-
- Args:
- config (dict): 모델 학습에 필요한 모든 구성 매개변수를 포함하는 딕셔너리
- dataloader, model, optimizer, trainer 구성 포함
-
- Returns:
- None
- """
- seed_everything(config.seed)
-
- # load model and tokenizer
- model_name = config.model['name']
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- # 1. load dataset
- # 2. preprocess dataset
- # 3. tokenize dataset
- revision = "69b6010fe9681567b98f9d3d3c70487079183d4b"
- input_format = config.dataloader.get('input_format')
- prompt = config.dataloader.get('prompt')
- type_transform = config.dataloader.get('type_transform')
-
- train_dataset, train_raw_label = load_train_dataset(
- split=config.dataloader['train_split'],
- revision=revision,
- tokenizer=tokenizer,
- input_format=input_format,
- prompt=prompt,
- type_transform=type_transform,
- )
-
- train_label = label_to_num(train_raw_label)
-
- # 4. make Dataset object
- re_train_dataset = REDataset(train_dataset, train_label)
-
- device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
- print(device)
-
- # 5. import model
- # setting model hyperparameter
- model_module = __import__('model.model', fromlist=[config.model['variant']])
- model_class = getattr(model_module, config.model['variant'])
- # Available customized classes:
- # BaseREModel, BiLSTMREModel, BiGRUREModel
- model = model_class(config, len(tokenizer))
-
- print(model.model_config)
-
- model.parameters
- model.to(device)
-
- # 6. training arguments 설정
- ## 사용한 option 외에도 다양한 option들이 있습니다.
- ## https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요.
- training_args = TrainingArguments(
- # 기본 설정
- output_dir=config.trainer['output_dir'], # 모델 저장 디렉토리
- report_to=('wandb' if config.use_wandb else 'none'), # wandb 사용 여부
- fp16=True, # 16-bit floating point precision
-
- # 학습 설정
- num_train_epochs=config.trainer['epochs'], # 전체 훈련 epoch 수
- learning_rate=config.optimizer['lr'], # learning rate
- weight_decay=config.optimizer['weight_decay'], # weight decay
- adam_beta2=config.optimizer['adam_beta2'], # AdamW 옵티마이저의 beta2 하이퍼파라미터
-
- # 배치 사이즈 설정
- per_device_train_batch_size=config.dataloader['batch_size'], # 훈련 중 장치 당 batch size
-
- # 스케줄링 설정
- warmup_ratio=config.lr_scheduler['warmup_ratio'], # learning rate scheduler의 warmup 비율
-
- # 로깅 설정
- logging_dir=config.trainer['logging_dir'], # 로그 저장 디렉토리
- logging_steps=config.trainer['logging_steps'], # 로그 저장 스텝
- load_best_model_at_end=config.trainer['use_early_stop'],
- )
-
- # 7. trainer 설정
- # 8. evaluate 함수 설정
- trainer = RETrainer(
- model=model, # the instantiated 🤗 Transformers model to be trained
- args=training_args, # training arguments, defined above
- train_dataset=re_train_dataset, # training dataset
- compute_metrics=compute_metrics, # define metrics function
- loss_cfg=config.loss,
- )
-
- # 9. train model
- trainer.train()
- # 10. save model
- trainer.save_model(config.trainer['model_dir'])
-
-
-def main() -> None:
- """
- config를 불러오고 학습 프로세스를 시작하는 메인 함수입니다.
-
- 다음 프로세스를 수행:
- 1. 제공된 YAML 파일에서 구성을 파싱하거나 기본 파일을 사용
- 2. 제공된 구성으로 Weights & Biases (wandb) 실행 초기화
- 3. train 함수를 호출하여 모델 훈련 프로세스를 시작
- 4. 학습 완료 후 wandb에 완료 메세지 송출
-
- Args:
- None
-
- Returns:
- None
- """
- try:
- config_path = sys.argv[1]
- except IndexError:
- config_path = './config.yaml'
- config = parse_arguments(config_path)
-
- now = datetime.now(pytz.timezone('Asia/Seoul'))
- run_name = f'{config.run_name}_{now.strftime("%d-%H-%M")}'
-
- init_wandb(config, run_name)
- train(config)
- alert_wandb(config, run_name, 'finished')
-
-
-if __name__ == '__main__':
+import sys
+import pickle as pickle
+import pytz
+from datetime import datetime
+from typing import Dict, Any
+
+import torch
+from transformers import (
+ AutoTokenizer,
+ EarlyStoppingCallback,
+ TrainingArguments,
+)
+
+from utils.args import *
+from load_data.load_data import *
+from model.model import *
+from model.metric import *
+from trainer.trainer import *
+from utils.utils import *
+
+
+def train(config: Any) -> None:
+ """
+    모델을 학습하는 함수. train.py와 달리 평가 데이터셋을 사용하지 않고 모든 데이터를 훈련에 활용함
+
+ 다음 프로세스를 수행:
+ 1. 데이터셋을 불러오고 전처리 및 토큰화
+ 2. 레이블을 숫자 형태로 변환
+ 3. 학습 및 개발 데이터셋에 대한 Dataset 객체를 생성
+ 4. 지정된 모델을 불러와 훈련 인자 설정
+ 5. 모델 학습 후 저장
+
+ Args:
+ config (dict): 모델 학습에 필요한 모든 구성 매개변수를 포함하는 딕셔너리
+ dataloader, model, optimizer, trainer 구성 포함
+
+ Returns:
+ None
+ """
+ seed_everything(config.seed)
+
+ # load model and tokenizer
+ model_name = config.model['name']
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # 1. load dataset
+ # 2. preprocess dataset
+ # 3. tokenize dataset
+ revision = "69b6010fe9681567b98f9d3d3c70487079183d4b"
+ input_format = config.dataloader.get('input_format')
+ prompt = config.dataloader.get('prompt')
+ type_transform = config.dataloader.get('type_transform')
+
+ train_dataset, train_raw_label = load_train_dataset(
+ split=config.dataloader['train_split'],
+ revision=revision,
+ tokenizer=tokenizer,
+ input_format=input_format,
+ prompt=prompt,
+ type_transform=type_transform,
+ )
+
+ train_label = label_to_num(train_raw_label)
+
+ # 4. make Dataset object
+ re_train_dataset = REDataset(train_dataset, train_label)
+
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+ print(device)
+
+ # 5. import model
+ # setting model hyperparameter
+ model_module = __import__('model.model', fromlist=[config.model['variant']])
+ model_class = getattr(model_module, config.model['variant'])
+ # Available customized classes:
+ # BaseREModel, BiLSTMREModel, BiGRUREModel
+ model = model_class(config, len(tokenizer))
+
+ print(model.model_config)
+
+ model.parameters
+ model.to(device)
+
+ # 6. training arguments 설정
+ ## 사용한 option 외에도 다양한 option들이 있습니다.
+ ## https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요.
+ training_args = TrainingArguments(
+ # 기본 설정
+ output_dir=config.trainer['output_dir'], # 모델 저장 디렉토리
+ report_to=('wandb' if config.use_wandb else 'none'), # wandb 사용 여부
+ fp16=True, # 16-bit floating point precision
+
+ # 학습 설정
+ num_train_epochs=config.trainer['epochs'], # 전체 훈련 epoch 수
+ learning_rate=config.optimizer['lr'], # learning rate
+ weight_decay=config.optimizer['weight_decay'], # weight decay
+ adam_beta2=config.optimizer['adam_beta2'], # AdamW 옵티마이저의 beta2 하이퍼파라미터
+
+ # 배치 사이즈 설정
+ per_device_train_batch_size=config.dataloader['batch_size'], # 훈련 중 장치 당 batch size
+
+ # 스케줄링 설정
+ warmup_ratio=config.lr_scheduler['warmup_ratio'], # learning rate scheduler의 warmup 비율
+
+ # 로깅 설정
+ logging_dir=config.trainer['logging_dir'], # 로그 저장 디렉토리
+ logging_steps=config.trainer['logging_steps'], # 로그 저장 스텝
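+        # 참고: full train에서는 eval_dataset이 없으므로, load_best_model_at_end가 True이면
+        # Trainer가 평가 설정을 요구해 오류가 날 수 있습니다 (config_full.yaml처럼 use_early_stop: false 가정).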
+ load_best_model_at_end=config.trainer['use_early_stop'],
+ )
+
+ # 7. trainer 설정
+ # 8. evaluate 함수 설정
+ trainer = RETrainer(
+ model=model, # the instantiated 🤗 Transformers model to be trained
+ args=training_args, # training arguments, defined above
+ train_dataset=re_train_dataset, # training dataset
+ compute_metrics=compute_metrics, # define metrics function
+ loss_cfg=config.loss,
+ )
+
+ # 9. train model
+ trainer.train()
+ # 10. save model
+ trainer.save_model(config.trainer['model_dir'])
+
+
+def main() -> None:
+ """
+ config를 불러오고 학습 프로세스를 시작하는 메인 함수입니다.
+
+ 다음 프로세스를 수행:
+ 1. 제공된 YAML 파일에서 구성을 파싱하거나 기본 파일을 사용
+ 2. 제공된 구성으로 Weights & Biases (wandb) 실행 초기화
+ 3. train 함수를 호출하여 모델 훈련 프로세스를 시작
+ 4. 학습 완료 후 wandb에 완료 메세지 송출
+
+ Args:
+ None
+
+ Returns:
+ None
+ """
+ try:
+ config_path = sys.argv[1]
+ except IndexError:
+ config_path = './config.yaml'
+ config = parse_arguments(config_path)
+
+ now = datetime.now(pytz.timezone('Asia/Seoul'))
+ run_name = f'{config.run_name}_{now.strftime("%d-%H-%M")}'
+
+ init_wandb(config, run_name)
+ train(config)
+ alert_wandb(config, run_name, 'finished')
+
+
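+# 사용 예시: python full_train.py config_full.yaml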
+if __name__ == '__main__':
main()
\ No newline at end of file
diff --git a/inference.py b/inference.py
index 6ab6433..f1da80d 100755
--- a/inference.py
+++ b/inference.py
@@ -1,177 +1,177 @@
-import pickle as pickle
-import sys
-
-import numpy as np
-import pandas as pd
-import torch
-import torch.nn.functional as F
-from tqdm import tqdm
-from torch.utils.data import DataLoader
-from transformers import AutoTokenizer
-from typing import Tuple, List
-
-from utils.args import *
-from load_data.load_data import *
-from model.model import *
-from utils.utils import *
-
-
-def inference(model: torch.nn.Module, tokenized_sent: DataLoader, device: torch.device) -> Tuple[List[int], List[List[float]]]:
- """
- test dataset을 DataLoader로 만들어 준 후 batch_size로 나눠 model이 예측
-
- Args:
- model (torch.nn.Module): 예측에 사용할 모델
- tokenized_sent (DataLoader): 토큰화가 완료된 문장 데이터셋
- device (torch.device): 모델을 실행할 디바이스 (예: cuda:0)
-
- Returns:
- Tuple[List[int], List[List[float]]]: 예측된 클래스 인덱스와 각 클래스에 대한 확률이 담긴 리스트를 반환
- """
-
- dataloader = DataLoader(tokenized_sent, batch_size=32, shuffle=False)
- model.eval()
-
- output_pred = []
- output_prob = []
-
- for i, data in enumerate(tqdm(dataloader)):
- with torch.no_grad():
- outputs = model(
- input_ids=data['input_ids'].to(device),
- attention_mask=data['attention_mask'].to(device),
- token_type_ids=data['token_type_ids'].to(device),
- )
-
- logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0]
- prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
- logits = logits.detach().cpu().numpy()
- result = np.argmax(logits, axis=-1)
-
- output_pred.append(result)
- output_prob.append(prob)
-
- return (
- np.concatenate(output_pred).tolist(),
- np.concatenate(output_prob, axis=0).tolist(),
- )
-
-
-def main() -> None:
- """
- 주어진 데이터셋 csv 파일과 같은 형태일 경우 inference를 수행할 수 있는 메인 함수
-
- 다음 프로세스를 수행:
- 1. config에 따라 시드를 고정하고, 디바이스를 설정
- 2. 토크나이저와 모델을 로드하고, 학습시킨 모델을 로드
- 3. 테스트 데이터셋을 로드하고, 데이터셋 객체 생성
- 4. 모델을 이용하여 예측 수행
- 5. 예측 결과를 csv 파일로 저장
- 6. full train이 아닐 경우 검증 데이터셋에 대해서도 같은 과정을 수행
-
- Args:
- None
-
- Returns:
- None
- """
-
- try:
- config_path = sys.argv[1]
- except IndexError:
- config_path = './config.yaml'
-
- config = parse_arguments(config_path)
-
- seed_everything(config.seed)
-
- device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-
- # load tokenizer
- model_name = config.model['name']
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- # load my model
- model_module = __import__('model.model', fromlist=[config.model['variant']])
- model_class = getattr(model_module, config.model['variant'])
- # Available customized classes:
- # BaseREModel, BiLSTMREModel, BiGRUREModel
- model = model_class(config, len(tokenizer))
-
- load_model_path = './best_model/pytorch_model.bin'
- checkpoint = torch.load(load_model_path)
- model.load_state_dict(checkpoint)
-
- model.parameters
- model.to(device)
-
- # load test dataset
- revision = config.dataloader['revision']
- input_format = config.dataloader.get('input_format')
- prompt = config.dataloader.get('prompt')
- type_transform = config.dataloader.get('type_transform')
-
- test_id, test_dataset, test_label = load_test_dataset(
- split='test',
- revision=revision,
- tokenizer=tokenizer,
- input_format=input_format,
- prompt=prompt,
- type_transform=type_transform,
- )
- re_test_dataset = REDataset(test_dataset, test_label)
-
- # predict answer
- pred_answer, output_prob = inference(model, re_test_dataset, device) # model에서 class 추론
- pred_answer = num_to_label(pred_answer) # 숫자로 된 class를 원래 문자열 라벨로 변환
-
- # make csv file with predicted answer
- output = pd.DataFrame(
- {
- 'id': test_id,
- 'pred_label': pred_answer,
- 'probs': output_prob,
- }
- )
- output_path = config.trainer['pred_dir']
- os.makedirs(os.path.dirname(output_path), exist_ok=True)
- output.to_csv(output_path, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장
-
- ## 사후분석을 위한 validation data inference
- # load validation dataset(full train일 경우 revision에 valid가 없어서 load_test_dataset에서 오류가 생기므로 넘기기)
- try:
- val_id, val_dataset, val_label = load_test_dataset(
- split=config.dataloader['valid_split'],
- revision=revision,
- tokenizer=tokenizer,
- input_format=input_format,
- prompt=prompt,
- type_transform=type_transform,
- )
- re_val_dataset = REDataset(val_dataset, [100] * len(val_id))
-
- # predict validation answer
- pred_val_answer, val_output_prob = inference(model, re_val_dataset, device)
- pred_val_answer = num_to_label(pred_val_answer)
-
- # make csv file with predicted validation answer
- val_output = pd.DataFrame(
- {
- 'id': val_id,
- 'true_label': val_label,
- 'pred_label': pred_val_answer,
- 'probs': val_output_prob,
- }
- )
- val_output_path = config.trainer['val_pred_dir']
- os.makedirs(os.path.dirname(val_output_path), exist_ok=True)
- val_output.to_csv(val_output_path, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장
-
- except ValueError:
- print('There is no existing valiation dataset. The inference output is from full dataset model.')
-
- print('---- Finish! ----')
-
-
-if __name__ == '__main__':
+import pickle as pickle
+import sys
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+from transformers import AutoTokenizer
+from typing import Tuple, List
+
+from utils.args import *
+from load_data.load_data import *
+from model.model import *
+from utils.utils import *
+
+
+def inference(model: torch.nn.Module, tokenized_sent: torch.utils.data.Dataset, device: torch.device) -> Tuple[List[int], List[List[float]]]:
+ """
+ test dataset을 DataLoader로 만들어 준 후 batch_size로 나눠 model이 예측
+
+ Args:
+ model (torch.nn.Module): 예측에 사용할 모델
+        tokenized_sent (Dataset): 토큰화가 완료된 문장 데이터셋
+ device (torch.device): 모델을 실행할 디바이스 (예: cuda:0)
+
+ Returns:
+ Tuple[List[int], List[List[float]]]: 예측된 클래스 인덱스와 각 클래스에 대한 확률이 담긴 리스트를 반환
+ """
+
+ dataloader = DataLoader(tokenized_sent, batch_size=32, shuffle=False)
+ model.eval()
+
+ output_pred = []
+ output_prob = []
+
+ for i, data in enumerate(tqdm(dataloader)):
+ with torch.no_grad():
+ outputs = model(
+ input_ids=data['input_ids'].to(device),
+ attention_mask=data['attention_mask'].to(device),
+ token_type_ids=data['token_type_ids'].to(device),
+ )
+
+ logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0]
+ prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
+ logits = logits.detach().cpu().numpy()
+ result = np.argmax(logits, axis=-1)
+
+ output_pred.append(result)
+ output_prob.append(prob)
+
+ return (
+ np.concatenate(output_pred).tolist(),
+ np.concatenate(output_prob, axis=0).tolist(),
+ )
+
+
+def main() -> None:
+ """
+ 주어진 데이터셋 csv 파일과 같은 형태일 경우 inference를 수행할 수 있는 메인 함수
+
+ 다음 프로세스를 수행:
+ 1. config에 따라 시드를 고정하고, 디바이스를 설정
+ 2. 토크나이저와 모델을 로드하고, 학습시킨 모델을 로드
+ 3. 테스트 데이터셋을 로드하고, 데이터셋 객체 생성
+ 4. 모델을 이용하여 예측 수행
+ 5. 예측 결과를 csv 파일로 저장
+ 6. full train이 아닐 경우 검증 데이터셋에 대해서도 같은 과정을 수행
+
+ Args:
+ None
+
+ Returns:
+ None
+ """
+
+ try:
+ config_path = sys.argv[1]
+ except IndexError:
+ config_path = './config.yaml'
+
+ config = parse_arguments(config_path)
+
+ seed_everything(config.seed)
+
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+ # load tokenizer
+ model_name = config.model['name']
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # load my model
+ model_module = __import__('model.model', fromlist=[config.model['variant']])
+ model_class = getattr(model_module, config.model['variant'])
+ # Available customized classes:
+ # BaseREModel, BiLSTMREModel, BiGRUREModel
+ model = model_class(config, len(tokenizer))
+
+ load_model_path = './best_model/pytorch_model.bin'
+ checkpoint = torch.load(load_model_path)
+ model.load_state_dict(checkpoint)
+
+ model.parameters
+ model.to(device)
+
+ # load test dataset
+ revision = config.dataloader['revision']
+ input_format = config.dataloader.get('input_format')
+ prompt = config.dataloader.get('prompt')
+ type_transform = config.dataloader.get('type_transform')
+
+ test_id, test_dataset, test_label = load_test_dataset(
+ split='test',
+ revision=revision,
+ tokenizer=tokenizer,
+ input_format=input_format,
+ prompt=prompt,
+ type_transform=type_transform,
+ )
+ re_test_dataset = REDataset(test_dataset, test_label)
+
+ # predict answer
+ pred_answer, output_prob = inference(model, re_test_dataset, device) # model에서 class 추론
+ pred_answer = num_to_label(pred_answer) # 숫자로 된 class를 원래 문자열 라벨로 변환
+
+ # make csv file with predicted answer
+ output = pd.DataFrame(
+ {
+ 'id': test_id,
+ 'pred_label': pred_answer,
+ 'probs': output_prob,
+ }
+ )
+ output_path = config.trainer['pred_dir']
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
+ output.to_csv(output_path, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장
+
+ ## 사후분석을 위한 validation data inference
+ # load validation dataset(full train일 경우 revision에 valid가 없어서 load_test_dataset에서 오류가 생기므로 넘기기)
+ try:
+ val_id, val_dataset, val_label = load_test_dataset(
+ split=config.dataloader['valid_split'],
+ revision=revision,
+ tokenizer=tokenizer,
+ input_format=input_format,
+ prompt=prompt,
+ type_transform=type_transform,
+ )
+ re_val_dataset = REDataset(val_dataset, [100] * len(val_id))
+
+ # predict validation answer
+ pred_val_answer, val_output_prob = inference(model, re_val_dataset, device)
+ pred_val_answer = num_to_label(pred_val_answer)
+
+ # make csv file with predicted validation answer
+ val_output = pd.DataFrame(
+ {
+ 'id': val_id,
+ 'true_label': val_label,
+ 'pred_label': pred_val_answer,
+ 'probs': val_output_prob,
+ }
+ )
+ val_output_path = config.trainer['val_pred_dir']
+ os.makedirs(os.path.dirname(val_output_path), exist_ok=True)
+ val_output.to_csv(val_output_path, index=False) # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장
+
+ except ValueError:
+        print('There is no existing validation dataset. The inference output is from the full-dataset model.')
+
+ print('---- Finish! ----')
+
+
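+# 사용 예시: python inference.py config.yaml (./best_model/pytorch_model.bin 이 있어야 합니다)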
+if __name__ == '__main__':
main()
\ No newline at end of file
diff --git a/load_data/load_data.py b/load_data/load_data.py
index f2794ad..a0babde 100755
--- a/load_data/load_data.py
+++ b/load_data/load_data.py
@@ -1,320 +1,320 @@
-import pickle as pickle
-import re
-
-import torch
-from datasets import Dataset, load_dataset
-from transformers import PreTrainedTokenizer
-from tqdm import tqdm
-from typing import Dict, List, Tuple, Union
-
-from utils.utils import *
-
-
-def load_train_dataset(
- split: str,
- revision: str,
- tokenizer: PreTrainedTokenizer,
- input_format: str = None,
- prompt: str = None,
- type_transform: bool = False
-) -> Tuple[Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]:
- """
- train dataset을 불러온 후, tokenizing 하는 함수입니다.
-
- Args:
- split (str): 데이터셋의 분할 유형 (train, validation, test).
- revision (str): 데이터셋의 버전 (commit hash).
- tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체.
- input_format (str, optional): entity representation 유형. 기본값은 None이며, default로 설정됩니다.
- prompt (str, optional): prompt 유형. 기본값은 None이며, default로 설정됩니다.
- type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다.
-
- Returns:
- Tuple[Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]
- : 토큰화된 train 데이터셋과 레이블.
- """
-
- if input_format is None:
- input_format = 'default'
- if prompt is None:
- prompt = 'default'
- print('input format: ',input_format, '| prompt: ', prompt)
-
- dataset = load_dataset(
- 'Smoked-Salmon-s/RE_Competition',
- split=split,
- column_names=['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
- revision=revision,
- )
- pd_dataset = dataset.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'})
- train_dataset = preprocessing_dataset(pd_dataset, input_format, type_transform)
- tokenized_train = tokenized_dataset(train_dataset, tokenizer, input_format, prompt)
- train_label = pd_dataset['label'].values
-
- return tokenized_train, train_label
-
-
-def load_test_dataset(
- split: str,
- revision: str,
- tokenizer: PreTrainedTokenizer,
- input_format: str = None,
- prompt: str = None,
- type_transform: bool = False
-) -> Tuple[Union[int, str], Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]:
- """
- test dataset을 불러온 후, tokenizing 하는 함수입니다.
-
- Args:
- split (str): 데이터셋의 분할 유형 (train, validation, test).
- revision (str): 데이터셋의 버전 (commit hash).
- tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체.
- input_format (str, optional): entity representation 유형. 기본값은 None이며, default로 설정됩니다.
- prompt (str, optional): prompt 유형. 기본값은 None이며, default로 설정됩니다.
- type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다.
-
- Returns:
- Tuple[Union[int, str], Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]
- : test 데이터셋의 id, 토큰화된 문장, 레이블.
- """
-
- if input_format is None:
- input_format = 'default'
- if prompt is None:
- prompt = 'default'
- print('input format: ',input_format, 'prompt: ', prompt)
-
- dataset = load_dataset(
- 'Smoked-Salmon-s/RE_Competition',
- split=split,
- column_names=['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
- revision=revision,
- )
- pd_dataset = dataset.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'})
- test_dataset = preprocessing_dataset(pd_dataset, input_format, type_transform)
- tokenized_test = tokenized_dataset(test_dataset, tokenizer, input_format, prompt)
-
- if split == 'test':
- test_label = list(map(int, pd_dataset['label'].values))
- else:
- test_label = pd_dataset['label'].values
-
- return test_dataset['id'], tokenized_test, test_label
-
-
-def preprocessing_dataset(
- dataset: Dict[str, List[str]],
- input_format: str,
- type_transform: bool = False
-) -> Dict[str, List[str]]:
- """
- subject_entity column과 object_entity column을 리스트 형태로 변환하고,
- sentence column에 entity representation를 적용하는 함수입니다.
-
- Args:
- dataset (Dict[str, List[str]]): 전처리할 데이터셋.
- input_format (str): entity representation 유형.
- type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다.
-
- Returns:
- Dict[str, List[str]]: 전처리된 데이터셋.
- """
-
- subject_entity = []
- object_entity = []
-
- for i, j in zip(dataset['subject_entity'], dataset['object_entity']):
- i = i[1:-1].split(',')[0].split(':')[1]
- j = j[1:-1].split(',')[0].split(':')[1]
- subject_entity.append(i)
- object_entity.append(j)
-
- dataset['subj_entity'] = subject_entity
- dataset['obj_entity'] = object_entity
-
- # entity type을 한글로 번역
- if type_transform:
- print('entity type을 한글로 번역합니다.')
- hanguled = [to_hangul(row_data) for index, row_data in tqdm(dataset.iterrows())]
- dataset['subject_entity'] = [x[0] for x in hanguled]
- dataset['object_entity'] = [x[1] for x in hanguled]
-
- # entity representation 적용
- input_format_list = ['entity_mask', 'entity_marker', 'entity_marker_punct', 'typed_entity_marker', 'typed_entity_marker_punct']
- if input_format in input_format_list:
- marked_sentences = [marker(row_data, input_format) for index, row_data in tqdm(dataset.iterrows())]
- dataset['sentence'] = marked_sentences
- elif input_format == 'default':
- pass
- else:
- raise ValueError('잘못된 input_format이 입력되었습니다. ')
-
- return dataset
-
-
-def tokenized_dataset(
- dataset: Dict[str, List[str]],
- tokenizer: PreTrainedTokenizer,
- input_format: str,
- prompt: str
-) -> Dict[str, Union[List[str], List[int]]]:
- """
- tokenizer에 따라 문장을 토큰화하는 함수입니다.
-
- Args:
- dataset (Dict[str, List[str]]): 토큰화할 데이터셋.
- tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체.
- input_format (str): entity representation 유형.
- prompt (str): prompt 유형.
-
- Returns:
- Dict[str, Union[List[str], List[int]]]: 토큰화된 문장의 딕셔너리.
- """
-
- # 새로운 특수 토큰 추가
- special_tokens = []
-
- if input_format == 'entity_mask':
- special_tokens = ['[SUBJ-ORG]', '[SUBJ-PER]', '[OBJ-ORG]', '[OBJ-PER]', '[OBJ-LOC]', '[OBJ-DAT]', '[OBJ-POH]', '[OBJ-NOH]']
-
- elif input_format == 'entity_marker':
- special_tokens = ['[E1]', '[/E1]', '[E2]', '[/E2]']
-
- elif input_format == 'typed_entity_marker':
-        special_tokens = ['<S:ORG>', '</S:ORG>', '<S:PER>', '</S:PER>', '<O:ORG>', '</O:ORG>', '<O:PER>', '</O:PER>',
-                          '<O:LOC>', '</O:LOC>', '<O:DAT>', '</O:DAT>', '<O:POH>', '</O:POH>', '<O:NOH>', '</O:NOH>']
-
- tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
-
- # check
- print("length of tokenizer:", len(tokenizer))
- print("length of special tokens: ", tokenizer.all_special_tokens)
- print("special tokens:", tokenizer.special_tokens_map)
-
- # prompt 추가
- if prompt in ['s_sep_o', 's_and_o', 'quiz']:
- prompt_forward = []
-
- if prompt == 's_sep_o':
- for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']):
- temp = ''
- temp = e01[2:-1] + '[SEP]' + e02[2:-1]
- prompt_forward.append(temp)
-
- elif prompt == 's_and_o':
- for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']):
- temp = ''
- temp = e01[2:-1] + '와 ' + e02[2:-1] + '의 관계'
- prompt_forward.append(temp)
-
- elif prompt == 'quiz':
- for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']):
- temp = ''
- temp = '다음 문장에서 ' + e01[2:-1] + '와 ' + e02[2:-1] + '사이의 관계를 추출하시오.'
- prompt_forward.append(temp)
-
- tokenized_sentences = tokenizer(
- prompt_forward,
- list(dataset['sentence']),
- return_tensors='pt',
- padding=True,
- truncation=True,
- max_length=180,
- add_special_tokens=True,
- )
-
- elif prompt == 'problem':
- prompt_forward = []
- prompt_backward = []
-
- for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']):
- temp = ''
- temp = '다음 문장에서 ' + e01[2:-1] + '와 ' + e02[2:-1] + '사이의 관계를 추출하시오.'
- prompt_forward.append(temp)
- for e00, e01, e02 in zip(dataset['sentence'], dataset['subj_entity'], dataset['obj_entity']):
- temp = ''
- temp = e00 + e01[2:-1] + '와 ' + e02[2:-1] + '는 어떤 관계입니까?'
- prompt_backward.append(temp)
-
- tokenized_sentences = tokenizer(
- prompt_forward,
- prompt_backward,
- return_tensors='pt',
- padding=True,
- truncation=True,
- max_length=200,
- add_special_tokens=True,
- )
-
- elif prompt == 'default':
- tokenized_sentences = tokenizer(
- list(dataset['sentence']),
- return_tensors='pt',
- padding=True,
- truncation=True,
- max_length=180,
- add_special_tokens=True,
- )
-
- else:
- raise ValueError('잘못된 prompt가 입력되었습니다. ')
-
- return tokenized_sentences
-
-
-def label_to_num(label: List[str]) -> List[int]:
- """
- 원본 문자열 label을 숫자 형식 클래스로 변환하는 함수입니다.
-
- Args:
- label (List[str]): 변환할 원본 문자열 클래스 리스트.
-
- Returns:
- List[int]: 숫자 형식으로 변환된 클래스 리스트.
- """
-
- num_label = []
- with open('load_data/dict_label_to_num.pkl', 'rb') as f:
- dict_label_to_num = pickle.load(f)
- for v in label:
- num_label.append(dict_label_to_num[v])
-
- return num_label
-
-
-def num_to_label(label: List[int]) -> List[str]:
- """
- 숫자 형식 클래스를 원본 문자열 label로 변환하는 함수입니다.
-
- Args:
- label (List[int]): 변환할 숫자 형식의 클래스 리스트.
-
- Returns:
- List[str]: 원본 문자열로 변환된 클래스 리스트.
- """
-
- origin_label = []
- with open('load_data/dict_num_to_label.pkl', 'rb') as f:
- dict_num_to_label = pickle.load(f)
- for v in label:
- origin_label.append(dict_num_to_label[v])
-
- return origin_label
-
-
-class REDataset(torch.utils.data.Dataset):
- """Dataset 구성을 위한 class입니다."""
-
- def __init__(self, pair_dataset, labels):
- self.pair_dataset = pair_dataset
- self.labels = labels
-
- def __getitem__(self, idx):
- item = {
- key: val[idx].clone().detach() for key, val in self.pair_dataset.items()
- }
- item['labels'] = torch.tensor(self.labels[idx])
- return item
-
- def __len__(self):
+import pickle
+import re
+
+import torch
+from datasets import Dataset, load_dataset
+from transformers import PreTrainedTokenizer
+from tqdm import tqdm
+from typing import Dict, List, Tuple, Union
+
+from utils.utils import *
+
+
+def load_train_dataset(
+ split: str,
+ revision: str,
+ tokenizer: PreTrainedTokenizer,
+ input_format: str = None,
+ prompt: str = None,
+ type_transform: bool = False
+) -> Tuple[Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]:
+ """
+ train dataset을 불러온 후, tokenizing 하는 함수입니다.
+
+ Args:
+ split (str): 데이터셋의 분할 유형 (train, validation, test).
+ revision (str): 데이터셋의 버전 (commit hash).
+ tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체.
+ input_format (str, optional): entity representation 유형. 기본값은 None이며, default로 설정됩니다.
+ prompt (str, optional): prompt 유형. 기본값은 None이며, default로 설정됩니다.
+ type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다.
+
+ Returns:
+ Tuple[Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]
+ : 토큰화된 train 데이터셋과 레이블.
+ """
+
+ if input_format is None:
+ input_format = 'default'
+ if prompt is None:
+ prompt = 'default'
+    print('input format:', input_format, '| prompt:', prompt)
+
+ dataset = load_dataset(
+ 'Smoked-Salmon-s/RE_Competition',
+ split=split,
+ column_names=['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
+ revision=revision,
+ )
+ pd_dataset = dataset.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'})
+ train_dataset = preprocessing_dataset(pd_dataset, input_format, type_transform)
+ tokenized_train = tokenized_dataset(train_dataset, tokenizer, input_format, prompt)
+ train_label = pd_dataset['label'].values
+
+ return tokenized_train, train_label
+
+
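+# Usage sketch (illustrative only): the model name, dataset revision, and option values
+# below are placeholders, not the project's actual settings.
+#
+#   tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')
+#   tokenized_train, train_raw_label = load_train_dataset(
+#       split='train',
+#       revision='<commit-hash>',
+#       tokenizer=tokenizer,
+#       input_format='typed_entity_marker_punct',
+#       prompt='s_and_o',
+#   )
+#   re_train_dataset = REDataset(tokenized_train, label_to_num(train_raw_label))
+
+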
+def load_test_dataset(
+ split: str,
+ revision: str,
+ tokenizer: PreTrainedTokenizer,
+ input_format: str = None,
+ prompt: str = None,
+ type_transform: bool = False
+) -> Tuple[Union[int, str], Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]:
+ """
+ test dataset을 불러온 후, tokenizing 하는 함수입니다.
+
+ Args:
+ split (str): 데이터셋의 분할 유형 (train, validation, test).
+ revision (str): 데이터셋의 버전 (commit hash).
+ tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체.
+ input_format (str, optional): entity representation 유형. 기본값은 None이며, default로 설정됩니다.
+ prompt (str, optional): prompt 유형. 기본값은 None이며, default로 설정됩니다.
+ type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다.
+
+ Returns:
+ Tuple[Union[int, str], Dict[str, Union[List[str], List[int]]], Union[int, List[str]]]
+ : test 데이터셋의 id, 토큰화된 문장, 레이블.
+ """
+
+ if input_format is None:
+ input_format = 'default'
+ if prompt is None:
+ prompt = 'default'
+    print('input format:', input_format, '| prompt:', prompt)
+
+ dataset = load_dataset(
+ 'Smoked-Salmon-s/RE_Competition',
+ split=split,
+ column_names=['id', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
+ revision=revision,
+ )
+ pd_dataset = dataset.to_pandas().iloc[1:].reset_index(drop=True).astype({'id': 'int64'})
+ test_dataset = preprocessing_dataset(pd_dataset, input_format, type_transform)
+ tokenized_test = tokenized_dataset(test_dataset, tokenizer, input_format, prompt)
+
+ if split == 'test':
+ test_label = list(map(int, pd_dataset['label'].values))
+ else:
+ test_label = pd_dataset['label'].values
+
+ return test_dataset['id'], tokenized_test, test_label
+
+
+def preprocessing_dataset(
+ dataset: Dict[str, List[str]],
+ input_format: str,
+ type_transform: bool = False
+) -> Dict[str, List[str]]:
+ """
+ subject_entity column과 object_entity column을 리스트 형태로 변환하고,
+ sentence column에 entity representation를 적용하는 함수입니다.
+
+ Args:
+ dataset (Dict[str, List[str]]): 전처리할 데이터셋.
+ input_format (str): entity representation 유형.
+ type_transform (bool, optional): entity type을 한글로 번역할지 여부. 기본값은 False입니다.
+
+ Returns:
+ Dict[str, List[str]]: 전처리된 데이터셋.
+ """
+
+ subject_entity = []
+ object_entity = []
+
+ for i, j in zip(dataset['subject_entity'], dataset['object_entity']):
+ i = i[1:-1].split(',')[0].split(':')[1]
+ j = j[1:-1].split(',')[0].split(':')[1]
+ subject_entity.append(i)
+ object_entity.append(j)
+
+ dataset['subj_entity'] = subject_entity
+ dataset['obj_entity'] = object_entity
+
+ # entity type을 한글로 번역
+ if type_transform:
+ print('entity type을 한글로 번역합니다.')
+ hanguled = [to_hangul(row_data) for index, row_data in tqdm(dataset.iterrows())]
+ dataset['subject_entity'] = [x[0] for x in hanguled]
+ dataset['object_entity'] = [x[1] for x in hanguled]
+
+ # entity representation 적용
+ input_format_list = ['entity_mask', 'entity_marker', 'entity_marker_punct', 'typed_entity_marker', 'typed_entity_marker_punct']
+ if input_format in input_format_list:
+ marked_sentences = [marker(row_data, input_format) for index, row_data in tqdm(dataset.iterrows())]
+ dataset['sentence'] = marked_sentences
+ elif input_format == 'default':
+ pass
+ else:
+ raise ValueError('잘못된 input_format이 입력되었습니다. ')
+
+ return dataset
+
+
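+# Example of the parsing above, assuming the usual KLUE-RE entity format where each
+# entity column holds a stringified dict (values here are made up):
+#
+#   "{'word': '이순신', 'start_idx': 0, 'end_idx': 2, 'type': 'PER'}"
+#   -> [1:-1].split(',')[0].split(':')[1] -> " '이순신'"
+#
+# so subj_entity / obj_entity keep the word with a leading space and surrounding quotes,
+# which the prompt builders later strip with [2:-1]. Note that this split breaks if the
+# entity word itself contains ',' or ':'.
+
+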
+def tokenized_dataset(
+ dataset: Dict[str, List[str]],
+ tokenizer: PreTrainedTokenizer,
+ input_format: str,
+ prompt: str
+) -> Dict[str, Union[List[str], List[int]]]:
+ """
+ tokenizer에 따라 문장을 토큰화하는 함수입니다.
+
+ Args:
+ dataset (Dict[str, List[str]]): 토큰화할 데이터셋.
+ tokenizer (PreTrainedTokenizer): 사용할 토크나이저 객체.
+ input_format (str): entity representation 유형.
+ prompt (str): prompt 유형.
+
+ Returns:
+ Dict[str, Union[List[str], List[int]]]: 토큰화된 문장의 딕셔너리.
+ """
+
+ # 새로운 특수 토큰 추가
+ special_tokens = []
+
+ if input_format == 'entity_mask':
+ special_tokens = ['[SUBJ-ORG]', '[SUBJ-PER]', '[OBJ-ORG]', '[OBJ-PER]', '[OBJ-LOC]', '[OBJ-DAT]', '[OBJ-POH]', '[OBJ-NOH]']
+
+ elif input_format == 'entity_marker':
+ special_tokens = ['[E1]', '[/E1]', '[E2]', '[/E2]']
+
+ elif input_format == 'typed_entity_marker':
+        # typed entity marker: subject/object를 <S:TYPE> ... </S:TYPE>, <O:TYPE> ... </O:TYPE>
+        # 형태의 특수 토큰으로 감쌉니다 (토큰 표기는 이 형식을 가정).
+        special_tokens = ['<S:ORG>', '</S:ORG>', '<S:PER>', '</S:PER>',
+                          '<O:ORG>', '</O:ORG>', '<O:PER>', '</O:PER>',
+                          '<O:LOC>', '</O:LOC>', '<O:DAT>', '</O:DAT>',
+                          '<O:POH>', '</O:POH>', '<O:NOH>', '</O:NOH>']
+
+ tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
+
+ # check
+ print("length of tokenizer:", len(tokenizer))
+ print("length of special tokens: ", tokenizer.all_special_tokens)
+ print("special tokens:", tokenizer.special_tokens_map)
+
+ # prompt 추가
+ if prompt in ['s_sep_o', 's_and_o', 'quiz']:
+ prompt_forward = []
+
+ if prompt == 's_sep_o':
+ for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']):
+ temp = ''
+ temp = e01[2:-1] + '[SEP]' + e02[2:-1]
+ prompt_forward.append(temp)
+
+ elif prompt == 's_and_o':
+ for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']):
+ temp = ''
+ temp = e01[2:-1] + '와 ' + e02[2:-1] + '의 관계'
+ prompt_forward.append(temp)
+
+ elif prompt == 'quiz':
+ for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']):
+ temp = ''
+ temp = '다음 문장에서 ' + e01[2:-1] + '와 ' + e02[2:-1] + '사이의 관계를 추출하시오.'
+ prompt_forward.append(temp)
+
+ tokenized_sentences = tokenizer(
+ prompt_forward,
+ list(dataset['sentence']),
+ return_tensors='pt',
+ padding=True,
+ truncation=True,
+ max_length=180,
+ add_special_tokens=True,
+ )
+
+ elif prompt == 'problem':
+ prompt_forward = []
+ prompt_backward = []
+
+ for e01, e02 in zip(dataset['subj_entity'], dataset['obj_entity']):
+ temp = ''
+ temp = '다음 문장에서 ' + e01[2:-1] + '와 ' + e02[2:-1] + '사이의 관계를 추출하시오.'
+ prompt_forward.append(temp)
+ for e00, e01, e02 in zip(dataset['sentence'], dataset['subj_entity'], dataset['obj_entity']):
+ temp = ''
+ temp = e00 + e01[2:-1] + '와 ' + e02[2:-1] + '는 어떤 관계입니까?'
+ prompt_backward.append(temp)
+
+ tokenized_sentences = tokenizer(
+ prompt_forward,
+ prompt_backward,
+ return_tensors='pt',
+ padding=True,
+ truncation=True,
+ max_length=200,
+ add_special_tokens=True,
+ )
+
+ elif prompt == 'default':
+ tokenized_sentences = tokenizer(
+ list(dataset['sentence']),
+ return_tensors='pt',
+ padding=True,
+ truncation=True,
+ max_length=180,
+ add_special_tokens=True,
+ )
+
+ else:
+ raise ValueError('잘못된 prompt가 입력되었습니다. ')
+
+ return tokenized_sentences
+
+
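+# For a subject/object pair stored as " '이순신'" / " '조선'" (see preprocessing_dataset),
+# the prompt branches above produce, respectively (illustrative values):
+#
+#   s_sep_o : "이순신[SEP]조선"
+#   s_and_o : "이순신와 조선의 관계"
+#   quiz    : "다음 문장에서 이순신와 조선사이의 관계를 추출하시오."
+#   problem : the 'quiz'-style prompt as segment 1, with "<sentence>이순신와 조선는 어떤 관계입니까?"
+#             as segment 2
+#
+# and the prompt is passed to the tokenizer as the first segment of a sentence pair.
+
+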
+def label_to_num(label: List[str]) -> List[int]:
+ """
+ 원본 문자열 label을 숫자 형식 클래스로 변환하는 함수입니다.
+
+ Args:
+ label (List[str]): 변환할 원본 문자열 클래스 리스트.
+
+ Returns:
+ List[int]: 숫자 형식으로 변환된 클래스 리스트.
+ """
+
+ num_label = []
+ with open('load_data/dict_label_to_num.pkl', 'rb') as f:
+ dict_label_to_num = pickle.load(f)
+ for v in label:
+ num_label.append(dict_label_to_num[v])
+
+ return num_label
+
+
+def num_to_label(label: List[int]) -> List[str]:
+ """
+ 숫자 형식 클래스를 원본 문자열 label로 변환하는 함수입니다.
+
+ Args:
+ label (List[int]): 변환할 숫자 형식의 클래스 리스트.
+
+ Returns:
+ List[str]: 원본 문자열로 변환된 클래스 리스트.
+ """
+
+ origin_label = []
+ with open('load_data/dict_num_to_label.pkl', 'rb') as f:
+ dict_num_to_label = pickle.load(f)
+ for v in label:
+ origin_label.append(dict_num_to_label[v])
+
+ return origin_label
+
+
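+# The two pickle files above are assumed to hold inverse mappings over the 30 KLUE-RE
+# classes (e.g. 'no_relation' <-> 0), so the round trip below is expected to hold:
+#
+#   num_to_label(label_to_num(['no_relation', 'per:title'])) == ['no_relation', 'per:title']
+
+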
+class REDataset(torch.utils.data.Dataset):
+ """Dataset 구성을 위한 class입니다."""
+
+ def __init__(self, pair_dataset, labels):
+ self.pair_dataset = pair_dataset
+ self.labels = labels
+
+ def __getitem__(self, idx):
+ item = {
+ key: val[idx].clone().detach() for key, val in self.pair_dataset.items()
+ }
+ item['labels'] = torch.tensor(self.labels[idx])
+ return item
+
+ def __len__(self):
return len(self.labels)
\ No newline at end of file
diff --git a/model/loss.py b/model/loss.py
index 29931c4..6206e4a 100644
--- a/model/loss.py
+++ b/model/loss.py
@@ -1,160 +1,160 @@
-from typing import Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch import Tensor
-
-
-class FocalLoss(nn.Module):
- """
- Dense detection을 위한 RetinaNet에서 제안된 loss: https://arxiv.org/abs/1708.02002.
- """
- def __init__(self, alpha: float = 1.0, gamma: float = 2.0, reduction: str = 'mean') -> None:
- """
- Args:
- alpha (float): 개구간 (0, 1) 내의 실수값을 가지는 가중치 factor.
- 양성 및 음성 예제간의 균형을 맞추는 역할. 기본값: 0.25.
- gamma (float): 쉬운 예제와 어려운 예제 간의 균형을 맞추는 역할을 하는 modulating factor의 지수.
- 기본값: 2.0
- reduction (string): ``'none'`` | ``'mean'`` | ``'sum'``
- ``'none'``: No reduction will be applied to the output.
- ``'mean'``: 평균 출력 반환.
- ``'sum'``: 합계 출력 반환. 기본값: none.
- Returns:
- Loss Tensor
- """
- super(FocalLoss, self).__init__()
- self.alpha = alpha # 각 클래스에 대한 가중치
- self.gamma = gamma # "focus" 매개변수로 어려운 예시에 더 많은 주의를 기울이는 역할
- self.reduction = reduction
-
- def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
- """
- Args:
- inputs (Tensor): (bsz, 30) 사이즈의 Float Tensor.
- 각 예제에 대한 예측.
- targets (Tensor): (30,) 사이즈의 true class 정보. 0부터 29까지의 정수가 담긴 Long Tensor.
- 음성 클래스: 0, 양성 클래스: 1.
- """
- p = torch.sigmoid(inputs)
- targets = F.one_hot(targets, num_classes=30).float()
- ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
- p_t = p * targets + (1 - p) * (1 - targets)
- loss = ce_loss * ((1 - p_t) ** self.gamma)
-
- if self.alpha >= 0:
- alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
- loss = alpha_t * loss
-
- # Check reduction option and return loss accordingly
- if self.reduction == 'none':
- pass
- elif self.reduction == 'mean':
- loss = loss.mean()
- elif self.reduction == 'sum':
- loss = loss.sum()
- else:
- raise ValueError(
- f'Invalid Value for arg "reduction": {self.reduction} \n Supported reduction modes: "none", "mean", "sum"'
- )
- return loss
-
-class WeightedFocalLoss(nn.Module):
- def __init__(self, alpha: torch.Tensor = None, gamma: float = 2.0, reduction: str = 'mean') -> None:
- super(WeightedFocalLoss, self).__init__()
- self.alpha = alpha # 각 클래스에 대한 가중치
- self.gamma = gamma # "focus" 매개변수로 어려운 예시에 더 많은 주의를 기울이는 역할
- self.reduction = reduction
-
- # alpha가 None인 경우, 모든 클래스에 동일한 가중치 적용
- # alpha가 텐서인 경우, alpha의 각 요소는 해당 클래스의 가중치로 설정
- self.ce_loss = nn.CrossEntropyLoss(weight=self.alpha, reduction='none')
-
- def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
- ce_loss = self.ce_loss(inputs, targets)
- pt = torch.exp(-ce_loss)
- focal_loss = (1 - pt)**self.gamma * ce_loss
-
- if self.reduction == 'mean':
- return torch.mean(focal_loss)
- elif self.reduction == 'sum':
- return torch.sum(focal_loss)
- else:
- return focal_loss
-
-
-class LovaszSoftmaxLoss(nn.Module):
- def __init__(self, weight=None, reduction: str = 'mean') -> None:
- super(LovaszSoftmaxLoss, self).__init__()
- self.weight = weight
- self.reduction = reduction
-
- def lovasz_grad(self, true_sorted):
- p = len(true_sorted)
- gts = true_sorted.sum()
- intersection = gts - true_sorted.cumsum(0)
- union = gts + (1 - true_sorted).cumsum(0)
- jaccard = 1 - intersection / union
- if p > 1: # cover 1-pixel case
- jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
- return jaccard
-
- def lovasz_softmax(self, log_probs, labels):
- C = log_probs.shape[1]
- losses = []
- for c in range(C):
- fg = (labels == c).float() # foreground for class c
- if fg.sum() == 0:
- continue
- errors = (fg - log_probs[:, c]).abs()
- errors_sorted, perm = torch.sort(errors, 0, descending=True)
- fg_sorted = fg[perm]
- losses.append(torch.dot(errors_sorted, self.lovasz_grad(fg_sorted)))
- return torch.stack(losses)
-
- def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
- log_probs = F.log_softmax(inputs, dim=1)
- lovasz_loss = self.lovasz_softmax(log_probs, targets)
-
- if self.reduction == 'mean':
- return torch.mean(lovasz_loss)
- elif self.reduction == 'sum':
- return torch.sum(lovasz_loss)
- else:
- return lovasz_loss
-
-
-class MulticlassDiceLoss(nn.Module):
- def __init__(self, smooth: float = 1e-5, reduction: str = 'mean'):
- super(MulticlassDiceLoss, self).__init__()
- self.smooth = smooth
- self.reduction = reduction
-
- def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
- # Softmax over the inputs
- inputs = torch.softmax(inputs, dim=1)
-
-
- # One-hot encode targets
- targets_one_hot = torch.nn.functional.one_hot(targets, num_classes=inputs.shape[1])
-
- # Move targets_one_hot to device of inputs
- targets_one_hot = targets_one_hot.to(inputs.device)
-
- # Calculate Dice Loss for each class
- dice_loss = 0
- for i in range(inputs.shape[1]):
- intersection = 2 * (inputs[:, i] * targets_one_hot[:, i]).sum()
- union = inputs[:, i].sum() + targets_one_hot[:, i].sum()
- dice_loss += (1 - (intersection + self.smooth) / (union + self.smooth))
-
- # Average the dice loss for all classes
- dice_loss /= inputs.shape[1]
-
- if self.reduction == 'mean':
- return dice_loss
- elif self.reduction == 'sum':
- return dice_loss.sum()
- else:
- return dice_loss
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+
+class FocalLoss(nn.Module):
+ """
+ Dense detection을 위한 RetinaNet에서 제안된 loss: https://arxiv.org/abs/1708.02002.
+ """
+ def __init__(self, alpha: float = 1.0, gamma: float = 2.0, reduction: str = 'mean') -> None:
+ """
+ Args:
+            alpha (float): 양성 및 음성 예제 간의 균형을 맞추는 음수가 아닌 가중치 factor.
+                기본값: 1.0.
+            gamma (float): 쉬운 예제와 어려운 예제 간의 균형을 맞추는 modulating factor의 지수.
+                기본값: 2.0.
+            reduction (str): ``'none'`` | ``'mean'`` | ``'sum'``
+                ``'none'``: 출력에 reduction을 적용하지 않음.
+                ``'mean'``: 평균 출력 반환.
+                ``'sum'``: 합계 출력 반환. 기본값: ``'mean'``.
+        """
+ super(FocalLoss, self).__init__()
+ self.alpha = alpha # 각 클래스에 대한 가중치
+ self.gamma = gamma # "focus" 매개변수로 어려운 예시에 더 많은 주의를 기울이는 역할
+ self.reduction = reduction
+
+ def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
+ """
+ Args:
+            inputs (Tensor): (bsz, 30) 사이즈의 Float Tensor. 각 예제에 대한 클래스별 로짓.
+            targets (Tensor): (bsz,) 사이즈의 true class 정보. 0부터 29까지의 정수가 담긴 Long Tensor
+                (내부에서 (bsz, 30) 크기의 one-hot으로 변환됩니다).
+        Returns:
+            reduction이 적용된 loss Tensor.
+        """
+ p = torch.sigmoid(inputs)
+ targets = F.one_hot(targets, num_classes=30).float()
+ ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+ p_t = p * targets + (1 - p) * (1 - targets)
+ loss = ce_loss * ((1 - p_t) ** self.gamma)
+
+ if self.alpha >= 0:
+ alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
+ loss = alpha_t * loss
+
+ # Check reduction option and return loss accordingly
+ if self.reduction == 'none':
+ pass
+ elif self.reduction == 'mean':
+ loss = loss.mean()
+ elif self.reduction == 'sum':
+ loss = loss.sum()
+ else:
+ raise ValueError(
+ f'Invalid Value for arg "reduction": {self.reduction} \n Supported reduction modes: "none", "mean", "sum"'
+ )
+ return loss
+
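+# Minimal usage sketch (shapes as assumed in forward(); values are illustrative):
+#
+#   criterion = FocalLoss(alpha=1.0, gamma=2.0, reduction='mean')
+#   logits = torch.randn(8, 30)            # (bsz, num_classes)
+#   targets = torch.randint(0, 30, (8,))   # (bsz,) class indices
+#   loss = criterion(logits, targets)      # scalar tensor
+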
+class WeightedFocalLoss(nn.Module):
+ def __init__(self, alpha: torch.Tensor = None, gamma: float = 2.0, reduction: str = 'mean') -> None:
+ super(WeightedFocalLoss, self).__init__()
+ self.alpha = alpha # 각 클래스에 대한 가중치
+ self.gamma = gamma # "focus" 매개변수로 어려운 예시에 더 많은 주의를 기울이는 역할
+ self.reduction = reduction
+
+ # alpha가 None인 경우, 모든 클래스에 동일한 가중치 적용
+ # alpha가 텐서인 경우, alpha의 각 요소는 해당 클래스의 가중치로 설정
+ self.ce_loss = nn.CrossEntropyLoss(weight=self.alpha, reduction='none')
+
+ def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
+ ce_loss = self.ce_loss(inputs, targets)
+ pt = torch.exp(-ce_loss)
+ focal_loss = (1 - pt)**self.gamma * ce_loss
+
+ if self.reduction == 'mean':
+ return torch.mean(focal_loss)
+ elif self.reduction == 'sum':
+ return torch.sum(focal_loss)
+ else:
+ return focal_loss
+
+
+class LovaszSoftmaxLoss(nn.Module):
+ def __init__(self, weight=None, reduction: str = 'mean') -> None:
+ super(LovaszSoftmaxLoss, self).__init__()
+ self.weight = weight
+ self.reduction = reduction
+
+ def lovasz_grad(self, true_sorted):
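+        # true_sorted: 0/1 ground-truth vector sorted by decreasing prediction error.
+        # Returns the gradient of the Lovász extension of the Jaccard loss (1 - IoU)
+        # with respect to the sorted errors (Berman et al., "The Lovász-Softmax loss").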
+ p = len(true_sorted)
+ gts = true_sorted.sum()
+ intersection = gts - true_sorted.cumsum(0)
+ union = gts + (1 - true_sorted).cumsum(0)
+ jaccard = 1 - intersection / union
+ if p > 1: # cover 1-pixel case
+ jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
+ return jaccard
+
+ def lovasz_softmax(self, log_probs, labels):
+ C = log_probs.shape[1]
+ losses = []
+ for c in range(C):
+ fg = (labels == c).float() # foreground for class c
+ if fg.sum() == 0:
+ continue
+ errors = (fg - log_probs[:, c]).abs()
+ errors_sorted, perm = torch.sort(errors, 0, descending=True)
+ fg_sorted = fg[perm]
+ losses.append(torch.dot(errors_sorted, self.lovasz_grad(fg_sorted)))
+ return torch.stack(losses)
+
+ def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
+ log_probs = F.log_softmax(inputs, dim=1)
+ lovasz_loss = self.lovasz_softmax(log_probs, targets)
+
+ if self.reduction == 'mean':
+ return torch.mean(lovasz_loss)
+ elif self.reduction == 'sum':
+ return torch.sum(lovasz_loss)
+ else:
+ return lovasz_loss
+
+
+class MulticlassDiceLoss(nn.Module):
+ def __init__(self, smooth: float = 1e-5, reduction: str = 'mean'):
+ super(MulticlassDiceLoss, self).__init__()
+ self.smooth = smooth
+ self.reduction = reduction
+
+ def forward(self, inputs: Tensor, targets: Tensor) -> Tensor:
+ # Softmax over the inputs
+ inputs = torch.softmax(inputs, dim=1)
+
+
+ # One-hot encode targets
+ targets_one_hot = torch.nn.functional.one_hot(targets, num_classes=inputs.shape[1])
+
+ # Move targets_one_hot to device of inputs
+ targets_one_hot = targets_one_hot.to(inputs.device)
+
+ # Calculate Dice Loss for each class
+ dice_loss = 0
+ for i in range(inputs.shape[1]):
+ intersection = 2 * (inputs[:, i] * targets_one_hot[:, i]).sum()
+ union = inputs[:, i].sum() + targets_one_hot[:, i].sum()
+ dice_loss += (1 - (intersection + self.smooth) / (union + self.smooth))
+
+ # Average the dice loss for all classes
+ dice_loss /= inputs.shape[1]
+
+ if self.reduction == 'mean':
+ return dice_loss
+ elif self.reduction == 'sum':
+ return dice_loss.sum()
+ else:
+ return dice_loss
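+
+
+# Reference for MulticlassDiceLoss (per class c, with p = softmax probability,
+# y = one-hot target):
+#   dice_c = (2 * sum(p_c * y_c) + smooth) / (sum(p_c) + sum(y_c) + smooth)
+#   loss   = mean over classes of (1 - dice_c)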
diff --git a/model/metric.py b/model/metric.py
index 8aca908..2631e25 100755
--- a/model/metric.py
+++ b/model/metric.py
@@ -1,80 +1,80 @@
-import numpy as np
-import sklearn
-from sklearn.metrics import accuracy_score
-
-
-def klue_re_micro_f1(preds, labels):
- """KLUE-RE micro f1 (except no_relation)"""
- label_list = [
- "no_relation",
- "org:top_members/employees",
- "org:members",
- "org:product",
- "per:title",
- "org:alternate_names",
- "per:employee_of",
- "org:place_of_headquarters",
- "per:product",
- "org:number_of_employees/members",
- "per:children",
- "per:place_of_residence",
- "per:alternate_names",
- "per:other_family",
- "per:colleagues",
- "per:origin",
- "per:siblings",
- "per:spouse",
- "org:founded",
- "org:political/religious_affiliation",
- "org:member_of",
- "per:parents",
- "org:dissolved",
- "per:schools_attended",
- "per:date_of_death",
- "per:place_of_birth",
- "per:place_of_death",
- "org:founded_by",
- "per:religion",
- ]
-
- no_relation_label_idx = label_list.index("no_relation")
- label_indices = list(range(len(label_list)))
- label_indices.remove(no_relation_label_idx)
-
- return (
- sklearn.metrics.f1_score(labels, preds, average="micro", labels=label_indices)
- * 100.0
- )
-
-
-def klue_re_auprc(probs, labels):
- """KLUE-RE AUPRC (with no_relation)"""
- labels = np.eye(30)[labels]
-
- score = np.zeros((30,))
- for c in range(30):
- targets_c = labels.take([c], axis=1).ravel()
- preds_c = probs.take([c], axis=1).ravel()
- precision, recall, _ = sklearn.metrics.precision_recall_curve(
- targets_c, preds_c
- )
- score[c] = sklearn.metrics.auc(recall, precision)
-
- return np.average(score) * 100.0
-
-
-def compute_metrics(pred):
- """validation을 위한 metrics function"""
- labels = pred.label_ids
- preds = pred.predictions.argmax(-1)
- probs = pred.predictions
-
- f1 = klue_re_micro_f1(preds, labels)
- auprc = klue_re_auprc(probs, labels)
- acc = accuracy_score(labels, preds) # 리더보드 평가에는 포함되지 않습니다.
-
- return {
- "micro f1 score": f1,
- "auprc": auprc,
- "accuracy": acc,
- }
+import numpy as np
+import sklearn
+from sklearn.metrics import accuracy_score
+
+
+def klue_re_micro_f1(preds, labels):
+ """KLUE-RE micro f1 (except no_relation)"""
+ label_list = [
+ "no_relation",
+ "org:top_members/employees",
+ "org:members",
+ "org:product",
+ "per:title",
+ "org:alternate_names",
+ "per:employee_of",
+ "org:place_of_headquarters",
+ "per:product",
+ "org:number_of_employees/members",
+ "per:children",
+ "per:place_of_residence",
+ "per:alternate_names",
+ "per:other_family",
+ "per:colleagues",
+ "per:origin",
+ "per:siblings",
+ "per:spouse",
+ "org:founded",
+ "org:political/religious_affiliation",
+ "org:member_of",
+ "per:parents",
+ "org:dissolved",
+ "per:schools_attended",
+ "per:date_of_death",
+ "per:place_of_birth",
+ "per:place_of_death",
+ "org:founded_by",
+ "per:religion",
+ ]
+
+ no_relation_label_idx = label_list.index("no_relation")
+ label_indices = list(range(len(label_list)))
+ label_indices.remove(no_relation_label_idx)
+
+ return (
+ sklearn.metrics.f1_score(labels, preds, average="micro", labels=label_indices)
+ * 100.0
+ )
+
+
+def klue_re_auprc(probs, labels):
+ """KLUE-RE AUPRC (with no_relation)"""
+ labels = np.eye(30)[labels]
+
+ score = np.zeros((30,))
+ for c in range(30):
+ targets_c = labels.take([c], axis=1).ravel()
+ preds_c = probs.take([c], axis=1).ravel()
+ precision, recall, _ = sklearn.metrics.precision_recall_curve(
+ targets_c, preds_c
+ )
+ score[c] = sklearn.metrics.auc(recall, precision)
+
+ return np.average(score) * 100.0
+
+
+def compute_metrics(pred):
+ """validation을 위한 metrics function"""
+ labels = pred.label_ids
+ preds = pred.predictions.argmax(-1)
+ probs = pred.predictions
+
+ f1 = klue_re_micro_f1(preds, labels)
+ auprc = klue_re_auprc(probs, labels)
+ acc = accuracy_score(labels, preds) # 리더보드 평가에는 포함되지 않습니다.
+
+ return {
+ "micro f1 score": f1,
+ "auprc": auprc,
+ "accuracy": acc,
+ }
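+
+
+# Usage sketch: `pred` is the EvalPrediction object the HuggingFace Trainer passes to
+# compute_metrics, with pred.predictions of shape (N, 30) (logits) and pred.label_ids
+# of shape (N,); the returned dict feeds the 'micro f1 score', 'auprc' and 'accuracy'
+# entries of the evaluation logs.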
diff --git a/model/model.py b/model/model.py
index 4e79d58..5983270 100644
--- a/model/model.py
+++ b/model/model.py
@@ -1,204 +1,204 @@
-import torch.nn as nn
-from torch.cuda.amp import autocast
-from transformers import (
- AutoConfig,
- AutoModel,
- AutoModelForSequenceClassification,
-)
-
-from model.loss import *
-
-
-class BaseREModel(nn.Module):
- """Pre-trained Language Model로부터 나온 logits를 FC layer에 통과시키는 기본 분류기."""
- def __init__(self, config, new_num_tokens: int):
- """
- Args:
- config: 사용자 config.
- new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함.
- """
- super().__init__()
-
- self.model_config = AutoConfig.from_pretrained(config.model['name'])
- self.model_config.num_labels = config.num_labels
-
- self.plm = AutoModelForSequenceClassification.from_pretrained(config.model['name'],
- config=self.model_config)
-
- if self.model_config.vocab_size != new_num_tokens:
- self.plm.resize_token_embeddings(new_num_tokens)
-
- @autocast()
- def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None):
- outputs = self.plm(input_ids=input_ids,
- token_type_ids=token_type_ids,
- attention_mask=attention_mask)
- logits = outputs['logits']
- return {
- 'logits': logits,
- }
-
-class CustomModel(nn.Module):
- def __init__(self, config, new_num_tokens: int):
- super().__init__()
-
- self.model_config = AutoConfig.from_pretrained(config.model['name'])
- self.model_config.num_labels = config.num_labels
-
- self.plm = AutoModelForSequenceClassification.from_pretrained(config.model['name'],
- config=self.model_config)
-
- if self.model_config.vocab_size != new_num_tokens:
- self.plm.resize_token_embeddings(new_num_tokens)
-
- self.hidden_size = self.model_config.hidden_size
-
- self.entity_embedding = nn.Embedding(3, self.hidden_size)
- nn.init.xavier_normal_(self.entity_embedding.weight)
-
- self.weight = nn.Parameter(torch.Tensor(1)) # Learnable weight parameter
- nn.init.uniform_(self.weight)
-
- # self.reduction_layer = nn.Linear(self.hidden_size * 2, self.hidden_size)
-
- @autocast()
- def forward(
- self,
- input_ids=None,
- token_type_ids=None,
- attention_mask=None,
- entity_ids=None,
- labels=None,
- ):
- # entity_ids = entity_ids.long()
- entity_embeddings = self.entity_embedding(entity_ids) # torch.tensor([64, 180, 1024])
- input_embeddings = self.plm.get_input_embeddings()(input_ids) # torch.tensor([64, 180, 1024])
-
- # 단순히 더한 버전
- # combined_embeddings = input_embeddings + entity_embeddings
-
- # concat 버전
- # combined_embeddings = torch.cat([input_embeddings,entity_embeddings], dim=-1) # torch.tensor([64, 180, 2048])
- # combined_embeddings = self.reduction_layer(combined_embeddings) # torch.tensor([64, 180, 1024])
-
- # weighted sum 버전
- combined_embeddings = self.weight * input_embeddings + (1 - self.weight) * entity_embeddings
-
- outputs = self.plm.roberta(
- inputs_embeds=combined_embeddings,
- attention_mask=attention_mask,
- token_type_ids=token_type_ids,
- )
- logits = self.plm.classifier(outputs['last_hidden_state'])
-
- return {
- 'logits': logits,
- }
-
-class BiGRUREModel(nn.Module):
- """
- Pre-trained Language Model로부터 나온 logits를 Bi-driectional GRU에 통과시킨 후
- hidden states 정보를 FC layer에 통과시킨 분류기.
- """
- def __init__(self, config, new_num_tokens: int):
- """
- Args:
- config: 사용자 config.
- new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함.
- """
- super().__init__()
-
- self.model_config = AutoConfig.from_pretrained(config.model['name'])
- self.model_config.num_labels = config.num_labels
-
- self.plm = AutoModel.from_pretrained(config.model['name'],
- config=self.model_config)
-
- if self.model_config.vocab_size != new_num_tokens:
- self.plm.resize_token_embeddings(new_num_tokens)
-
- self.hidden_size = self.model_config.hidden_size # 1024 for roberta-large
- self.gru = nn.GRU(input_size=self.hidden_size,
- hidden_size=self.hidden_size,
- num_layers=1,
- batch_first=True, # (bsz, seq, feature) if True else (seq, bsz, feature)
- bidirectional=True)
- self.init_gru()
- self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels)
- nn.init.kaiming_normal_(self.classifier.weight, mode='fan_in', nonlinearity='relu')
- self.classifier.bias.data.fill_(0)
-
- def init_gru(self):
- for name, param in self.gru.named_parameters():
- if 'weight_ih' in name:
- nn.init.xavier_normal_(param.data)
- elif 'weight_hh' in name:
- nn.init.xavier_normal_(param.data)
- elif 'bias' in name:
- param.data.fill_(0)
-
- @autocast()
- def forward(self, input_ids: Tensor, token_type_ids: Tensor, attention_mask: Tensor, labels=None):
- outputs = self.plm(input_ids=input_ids,
- token_type_ids=token_type_ids,
- attention_mask=attention_mask).last_hidden_state
- _, next_hidden = self.gru(outputs)
- outputs = torch.cat([next_hidden[0], next_hidden[1]], dim=1)
- logits = self.classifier(outputs)
- return {
- 'logits': logits,
- }
-
-
-class BiLSTMREModel(nn.Module):
- """Pre-trained Language Model로부터 나온 logits를 Bi-driectional LSTM에 통과시킨 후
- hidden states 정보를 FC layer에 통과시킨 분류기.
- """
- def __init__(self, config, new_num_tokens: int):
- """
- Args:
- config: 사용자 config.
- new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함.
- """
- super().__init__()
-
- self.model_config = AutoConfig.from_pretrained(config.model['name'])
- self.model_config.num_labels = config.num_labels
-
- self.plm = AutoModel.from_pretrained(config.model['name'],
- config=self.model_config)
-
- if self.model_config.vocab_size != new_num_tokens:
- self.plm.resize_token_embeddings(new_num_tokens)
-
- self.hidden_size = self.model_config.hidden_size # 1024 for roberta-large
- self.lstm = nn.LSTM(input_size=self.hidden_size,
- hidden_size=self.hidden_size,
- num_layers=1,
- batch_first=True, # (bsz, seq, feature) if True else (seq, bsz, feature)
- bidirectional=True)
- self.init_lstm()
- self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels)
- nn.init.kaiming_normal_(self.classifier.weight, mode='fan_in', nonlinearity='relu')
- self.classifier.bias.data.fill_(0)
-
- def init_lstm(self):
- for name, param in self.lstm.named_parameters():
- if 'weight_ih' in name:
- nn.init.xavier_normal_(param.data)
- elif 'weight_hh' in name:
- nn.init.xavier_normal_(param.data)
- elif 'bias' in name:
- param.data.fill_(0)
-
- @autocast()
- def forward(self, input_ids: Tensor, token_type_ids: Tensor, attention_mask: Tensor, labels=None):
- outputs = self.plm(input_ids=input_ids,
- token_type_ids=token_type_ids,
- attention_mask=attention_mask).last_hidden_state
- _, (next_hidden, _) = self.lstm(outputs)
- outputs = torch.cat([next_hidden[0], next_hidden[1]], dim=1)
- logits = self.classifier(outputs)
- return {
- 'logits': logits,
- }
+import torch.nn as nn
+from torch.cuda.amp import autocast
+from transformers import (
+ AutoConfig,
+ AutoModel,
+ AutoModelForSequenceClassification,
+)
+
+from model.loss import *
+
+
+class BaseREModel(nn.Module):
+ """Pre-trained Language Model로부터 나온 logits를 FC layer에 통과시키는 기본 분류기."""
+ def __init__(self, config, new_num_tokens: int):
+ """
+ Args:
+ config: 사용자 config.
+ new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함.
+ """
+ super().__init__()
+
+ self.model_config = AutoConfig.from_pretrained(config.model['name'])
+ self.model_config.num_labels = config.num_labels
+
+ self.plm = AutoModelForSequenceClassification.from_pretrained(config.model['name'],
+ config=self.model_config)
+
+ if self.model_config.vocab_size != new_num_tokens:
+ self.plm.resize_token_embeddings(new_num_tokens)
+
+ @autocast()
+ def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None):
+ outputs = self.plm(input_ids=input_ids,
+ token_type_ids=token_type_ids,
+ attention_mask=attention_mask)
+ logits = outputs['logits']
+ return {
+ 'logits': logits,
+ }
+
+class CustomModel(nn.Module):
+ def __init__(self, config, new_num_tokens: int):
+ super().__init__()
+
+ self.model_config = AutoConfig.from_pretrained(config.model['name'])
+ self.model_config.num_labels = config.num_labels
+
+ self.plm = AutoModelForSequenceClassification.from_pretrained(config.model['name'],
+ config=self.model_config)
+
+ if self.model_config.vocab_size != new_num_tokens:
+ self.plm.resize_token_embeddings(new_num_tokens)
+
+ self.hidden_size = self.model_config.hidden_size
+
+ self.entity_embedding = nn.Embedding(3, self.hidden_size)
+ nn.init.xavier_normal_(self.entity_embedding.weight)
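+        # entity_ids is assumed to be a (bsz, seq_len) LongTensor with one id per token,
+        # e.g. 0 = non-entity, 1 = subject span, 2 = object span (hence 3 embeddings);
+        # the exact convention depends on how the dataloader builds entity_ids.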
+
+ self.weight = nn.Parameter(torch.Tensor(1)) # Learnable weight parameter
+ nn.init.uniform_(self.weight)
+
+ # self.reduction_layer = nn.Linear(self.hidden_size * 2, self.hidden_size)
+
+ @autocast()
+ def forward(
+ self,
+ input_ids=None,
+ token_type_ids=None,
+ attention_mask=None,
+ entity_ids=None,
+ labels=None,
+ ):
+ # entity_ids = entity_ids.long()
+        entity_embeddings = self.entity_embedding(entity_ids)  # (bsz, seq_len, hidden_size), 예: (64, 180, 1024)
+        input_embeddings = self.plm.get_input_embeddings()(input_ids)  # (bsz, seq_len, hidden_size), 예: (64, 180, 1024)
+
+ # 단순히 더한 버전
+ # combined_embeddings = input_embeddings + entity_embeddings
+
+ # concat 버전
+ # combined_embeddings = torch.cat([input_embeddings,entity_embeddings], dim=-1) # torch.tensor([64, 180, 2048])
+ # combined_embeddings = self.reduction_layer(combined_embeddings) # torch.tensor([64, 180, 1024])
+
+ # weighted sum 버전
+ combined_embeddings = self.weight * input_embeddings + (1 - self.weight) * entity_embeddings
+
+ outputs = self.plm.roberta(
+ inputs_embeds=combined_embeddings,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ )
+ logits = self.plm.classifier(outputs['last_hidden_state'])
+
+ return {
+ 'logits': logits,
+ }
+
+class BiGRUREModel(nn.Module):
+ """
+    Pre-trained Language Model의 last hidden states를 Bi-directional GRU에 통과시킨 후,
+    GRU의 최종 hidden states를 FC layer에 통과시키는 분류기.
+ """
+ def __init__(self, config, new_num_tokens: int):
+ """
+ Args:
+ config: 사용자 config.
+ new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함.
+ """
+ super().__init__()
+
+ self.model_config = AutoConfig.from_pretrained(config.model['name'])
+ self.model_config.num_labels = config.num_labels
+
+ self.plm = AutoModel.from_pretrained(config.model['name'],
+ config=self.model_config)
+
+ if self.model_config.vocab_size != new_num_tokens:
+ self.plm.resize_token_embeddings(new_num_tokens)
+
+ self.hidden_size = self.model_config.hidden_size # 1024 for roberta-large
+ self.gru = nn.GRU(input_size=self.hidden_size,
+ hidden_size=self.hidden_size,
+ num_layers=1,
+ batch_first=True, # (bsz, seq, feature) if True else (seq, bsz, feature)
+ bidirectional=True)
+ self.init_gru()
+ self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels)
+ nn.init.kaiming_normal_(self.classifier.weight, mode='fan_in', nonlinearity='relu')
+ self.classifier.bias.data.fill_(0)
+
+ def init_gru(self):
+ for name, param in self.gru.named_parameters():
+ if 'weight_ih' in name:
+ nn.init.xavier_normal_(param.data)
+ elif 'weight_hh' in name:
+ nn.init.xavier_normal_(param.data)
+ elif 'bias' in name:
+ param.data.fill_(0)
+
+ @autocast()
+ def forward(self, input_ids: Tensor, token_type_ids: Tensor, attention_mask: Tensor, labels=None):
+ outputs = self.plm(input_ids=input_ids,
+ token_type_ids=token_type_ids,
+ attention_mask=attention_mask).last_hidden_state
+ _, next_hidden = self.gru(outputs)
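+        # next_hidden: (num_layers * 2, bsz, hidden_size); next_hidden[0] / next_hidden[1]
+        # are the final forward / backward hidden states, concatenated to (bsz, 2 * hidden_size).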
+ outputs = torch.cat([next_hidden[0], next_hidden[1]], dim=1)
+ logits = self.classifier(outputs)
+ return {
+ 'logits': logits,
+ }
+
+
+class BiLSTMREModel(nn.Module):
+ """Pre-trained Language Model로부터 나온 logits를 Bi-driectional LSTM에 통과시킨 후
+ hidden states 정보를 FC layer에 통과시킨 분류기.
+ """
+ def __init__(self, config, new_num_tokens: int):
+ """
+ Args:
+ config: 사용자 config.
+ new_num_tokens: tokenizer의 길이. Additional special tokens 수를 포함.
+ """
+ super().__init__()
+
+ self.model_config = AutoConfig.from_pretrained(config.model['name'])
+ self.model_config.num_labels = config.num_labels
+
+ self.plm = AutoModel.from_pretrained(config.model['name'],
+ config=self.model_config)
+
+ if self.model_config.vocab_size != new_num_tokens:
+ self.plm.resize_token_embeddings(new_num_tokens)
+
+ self.hidden_size = self.model_config.hidden_size # 1024 for roberta-large
+ self.lstm = nn.LSTM(input_size=self.hidden_size,
+ hidden_size=self.hidden_size,
+ num_layers=1,
+ batch_first=True, # (bsz, seq, feature) if True else (seq, bsz, feature)
+ bidirectional=True)
+ self.init_lstm()
+ self.classifier = nn.Linear(self.hidden_size * 2, config.num_labels)
+ nn.init.kaiming_normal_(self.classifier.weight, mode='fan_in', nonlinearity='relu')
+ self.classifier.bias.data.fill_(0)
+
+ def init_lstm(self):
+ for name, param in self.lstm.named_parameters():
+ if 'weight_ih' in name:
+ nn.init.xavier_normal_(param.data)
+ elif 'weight_hh' in name:
+ nn.init.xavier_normal_(param.data)
+ elif 'bias' in name:
+ param.data.fill_(0)
+
+ @autocast()
+ def forward(self, input_ids: Tensor, token_type_ids: Tensor, attention_mask: Tensor, labels=None):
+ outputs = self.plm(input_ids=input_ids,
+ token_type_ids=token_type_ids,
+ attention_mask=attention_mask).last_hidden_state
+ _, (next_hidden, _) = self.lstm(outputs)
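+        # next_hidden is the final hidden state h_n of shape (num_layers * 2, bsz, hidden_size);
+        # the cell state is discarded and both directions are concatenated as in BiGRUREModel.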
+ outputs = torch.cat([next_hidden[0], next_hidden[1]], dim=1)
+ logits = self.classifier(outputs)
+ return {
+ 'logits': logits,
+ }
diff --git a/pyproject.toml b/pyproject.toml
index 6d42880..94ad0f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,16 +1,16 @@
-[tool.black]
-line-length = 88
-target-version = ['py37','py38']
-include = '\.pyi?$'
-exclude = '''
-/(
- \.git
- | _build
- | best_model
- | logs
- | prediction
- | results
- | saved
- | wandb
-)/
+[tool.black]
+line-length = 88
+target-version = ['py37','py38']
+include = '\.pyi?$'
+exclude = '''
+/(
+ \.git
+ | _build
+ | best_model
+ | logs
+ | prediction
+ | results
+ | saved
+ | wandb
+)/
'''
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 553ac5f..e184671 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-datasets==2.12.0
-pandas==1.1.5
-pytz==2023.3
-scikit-learn~=0.24.1
-torch==2.0.1
-torchvision==0.15.2
-tqdm==4.62.1
-transformers==4.10.0
+datasets==2.12.0
+pandas==1.1.5
+pytz==2023.3
+scikit-learn~=0.24.1
+torch==2.0.1
+torchvision==0.15.2
+tqdm==4.62.1
+transformers==4.10.0
wandb==0.15.1
\ No newline at end of file
diff --git a/run.sh b/run.sh
index 08a8738..9a4653e 100755
--- a/run.sh
+++ b/run.sh
@@ -1,24 +1,24 @@
-#!/bin/bash
-
-configs=("configs/config_1.yaml" "configs/config_2.yaml")
-index=0
-
-for config in "${configs[@]}"; do
- let index++
- log_filename_train="output_logs/output_train_${index}.log"
- log_filename_infer="output_logs/output_infer_${index}.log"
-
- echo "Starting training with ${config}..."
-
- nohup python3 train.py ${config} > "${log_filename_train}" 2>&1 &
- wait $!
-
- echo "Training with ${config} has completed."
-
- nohup python3 inference.py ${config} > "${log_filename_infer}" 2>&1 &
- wait $!
-
- echo "Inferencing with ${config} has completed."
-done
-
-echo "All experiments have been completed."
+#!/bin/bash
+
+configs=("configs/config_1.yaml" "configs/config_2.yaml")
+index=0
+
+for config in "${configs[@]}"; do
+ let index++
+ log_filename_train="output_logs/output_train_${index}.log"
+ log_filename_infer="output_logs/output_infer_${index}.log"
+
+ echo "Starting training with ${config}..."
+
+ nohup python3 train.py ${config} > "${log_filename_train}" 2>&1 &
+ wait $!
+
+ echo "Training with ${config} has completed."
+
+ nohup python3 inference.py ${config} > "${log_filename_infer}" 2>&1 &
+ wait $!
+
+ echo "Inferencing with ${config} has completed."
+done
+
+echo "All experiments have been completed."
diff --git a/sweep.py b/sweep.py
index c79d2f7..9597008 100644
--- a/sweep.py
+++ b/sweep.py
@@ -1,182 +1,182 @@
-import sys
-import pickle as pickle
-import pytz
-from datetime import datetime
-import wandb
-
-import torch
-from transformers import (
- AutoTokenizer,
- EarlyStoppingCallback,
- TrainingArguments,
-)
-
-from argparse import Namespace
-
-from utils.args import *
-from load_data.load_data import *
-from model.model import *
-from model.metric import *
-from trainer.trainer import *
-from utils.utils import *
-
-from typing import Any
-
-
-def main(config: Namespace) -> None:
- """
- Sweep 초기화 및 Wandb sweep agent 선언
-
- Args:
- config(Namespace): 모델 학습에 필요한 hyperparameter를 포함하는 딕셔너리
- Returns:
- None
- """
- def sweep_train(config: Namespace = config) -> None:
- """
- Sweep agent 선언시 function에 전달되는 함수
-
- Args:
- config(Namespace): 모델 학습에 필요한 hyperparmeter를 포함하는 딕셔너리
- Returns:
- None
- """
- wandb.init(
- entity=config.wandb['entity'],
- project=config.wandb['sweep_project_name']
- )
-
- sweep_config = wandb.config
-
- seed_everything(config.seed)
-
- # load model and tokenizer
- model_name = config.model['name']
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- # 1. load dataset
- # 2. preprocess dataset
- # 3. tokenize dataset
- revision = config.dataloader['revision']
- input_format = sweep_config['input_format']
- prompt = sweep_config['prompt']
- type_transform = sweep_config['type_transform']
-
- train_dataset, train_raw_label = load_train_dataset(
- split=config.dataloader['train_split'],
- revision=revision,
- tokenizer=tokenizer,
- input_format=input_format,
- prompt=prompt,
- type_transform=type_transform,
- )
- dev_dataset, dev_raw_label = load_train_dataset(
- split=config.dataloader['valid_split'],
- revision=revision,
- tokenizer=tokenizer,
- input_format=input_format,
- prompt=prompt,
- type_transform=type_transform,
- )
-
- train_label = label_to_num(train_raw_label)
- dev_label = label_to_num(dev_raw_label)
-
- # 4. make Dataset object
- re_train_dataset = REDataset(train_dataset, train_label)
- re_dev_dataset = REDataset(dev_dataset, dev_label)
-
- device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
- print(device)
-
- # 5. import model
- # setting model hyperparameter
- model_module = __import__('model.model', fromlist=[config.model['variant']])
- model_class = getattr(model_module, config.model['variant'])
- # Available customized classes:
- # BaseREModel, BiLSTMREModel, BiGRUREModel
- model = model_class(config, len(tokenizer))
-
- print(model.model_config)
-
- model.parameters
- model.to(device)
-
- # 6. training arguments 설정
- ## 사용한 option 외에도 다양한 option들이 있습니다.
- ## https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요.
- training_args = TrainingArguments(
- # 기본 설정
- output_dir=config.trainer['output_dir'], # 모델 저장 디렉토리
- report_to=('wandb' if config.use_wandb else 'none'), # wandb 사용 여부
- fp16=True, # 16-bit floating point precision
-
- # 학습 설정
- num_train_epochs=sweep_config['epochs'], # 전체 훈련 epoch 수
- learning_rate=sweep_config['lr'], # learning rate
- weight_decay=config.optimizer['weight_decay'], # weight decay
- adam_beta2=sweep_config['adam_beta2'], # AdamW 옵티마이저의 beta2 하이퍼파라미터
-
- # 배치 사이즈 설정
- per_device_train_batch_size=config.dataloader['batch_size'], # 훈련 중 장치 당 batch size
- per_device_eval_batch_size=config.dataloader['batch_size'], # 평가 중 장치 당 batch size
-
- # 스케줄링 설정
- warmup_ratio=sweep_config['warmup_ratio'], # learning rate scheduler의 warmup 비율
- # warmup_steps=config.lr_scheduler['warmup_steps'], # number of warmup steps for learning rate scheduler
-
- # 로깅 설정
- logging_dir=config.trainer['logging_dir'], # 로그 저장 디렉토리
- logging_steps=config.trainer['logging_steps'], # 로그 저장 스텝
-
- # 모델 저장 설정
- save_total_limit=config.trainer['save_total_limit'], # 전체 저장 모델 수 제한
- save_steps=config.trainer['save_steps'], # 모델 저장 스텝
- save_strategy=config.trainer['save_strategy'],
-
- # 평가 설정
- evaluation_strategy=config.trainer['evaluation_strategy'], # 훈련 중 평가 전략
- eval_steps=config.trainer['evaluation_steps'], # 평가 스텝
- load_best_model_at_end=True,
- )
-
- # 7. trainer 설정
- # 8. evaluate 함수 설정
- trainer = RETrainer(
- model=model, # the instantiated 🤗 Transformers model to be trained
- args=training_args, # training arguments, defined above
- train_dataset=re_train_dataset, # training dataset
- eval_dataset=re_dev_dataset, # evaluation dataset
- compute_metrics=compute_metrics, # define metrics function
- # callbacks=([WandbCallback()] if config.use_wandb else []),
- # callbacks=[EarlyStoppingCallback(early_stopping_patience=config.trainer['early_stop'])],
- loss_cfg=config.loss,
- )
-
- # 9. train model
- trainer.train()
- # 10. save model
- trainer.save_model(config.trainer['model_dir'])
-
- sweep_id = wandb.sweep(
- sweep=config.sweep_config
- )
-
- wandb.agent(
- sweep_id=sweep_id,
- function=sweep_train,
- count=config.wandb['sweep_count']
- )
-
-
-if __name__ == '__main__':
- try:
- config_path = sys.argv[1]
- except IndexError:
- config_path = './config.yaml'
- config = parse_arguments(config_path)
-
- now = datetime.now(pytz.timezone('Asia/Seoul'))
- run_name = f'{config.run_name}_{now.strftime("%d-%H-%M")}'
-
+import sys
+import pickle
+import pytz
+from datetime import datetime
+import wandb
+
+import torch
+from transformers import (
+ AutoTokenizer,
+ EarlyStoppingCallback,
+ TrainingArguments,
+)
+
+from argparse import Namespace
+
+from utils.args import *
+from load_data.load_data import *
+from model.model import *
+from model.metric import *
+from trainer.trainer import *
+from utils.utils import *
+
+from typing import Any
+
+
+def main(config: Namespace) -> None:
+ """
+ Sweep 초기화 및 Wandb sweep agent 선언
+
+ Args:
+ config(Namespace): 모델 학습에 필요한 hyperparameter를 포함하는 딕셔너리
+ Returns:
+ None
+ """
+ def sweep_train(config: Namespace = config) -> None:
+ """
+ Sweep agent 선언시 function에 전달되는 함수
+
+ Args:
+            config(Namespace): 모델 학습에 필요한 hyperparameter를 포함하는 딕셔너리
+ Returns:
+ None
+ """
+ wandb.init(
+ entity=config.wandb['entity'],
+ project=config.wandb['sweep_project_name']
+ )
+
+ sweep_config = wandb.config
+
+ seed_everything(config.seed)
+
+ # load model and tokenizer
+ model_name = config.model['name']
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # 1. load dataset
+ # 2. preprocess dataset
+ # 3. tokenize dataset
+ revision = config.dataloader['revision']
+ input_format = sweep_config['input_format']
+ prompt = sweep_config['prompt']
+ type_transform = sweep_config['type_transform']
+
+ train_dataset, train_raw_label = load_train_dataset(
+ split=config.dataloader['train_split'],
+ revision=revision,
+ tokenizer=tokenizer,
+ input_format=input_format,
+ prompt=prompt,
+ type_transform=type_transform,
+ )
+ dev_dataset, dev_raw_label = load_train_dataset(
+ split=config.dataloader['valid_split'],
+ revision=revision,
+ tokenizer=tokenizer,
+ input_format=input_format,
+ prompt=prompt,
+ type_transform=type_transform,
+ )
+
+ train_label = label_to_num(train_raw_label)
+ dev_label = label_to_num(dev_raw_label)
+
+ # 4. make Dataset object
+ re_train_dataset = REDataset(train_dataset, train_label)
+ re_dev_dataset = REDataset(dev_dataset, dev_label)
+
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+ print(device)
+
+ # 5. import model
+ # setting model hyperparameter
+ model_module = __import__('model.model', fromlist=[config.model['variant']])
+ model_class = getattr(model_module, config.model['variant'])
+ # Available customized classes:
+ # BaseREModel, BiLSTMREModel, BiGRUREModel
+ model = model_class(config, len(tokenizer))
+
+ print(model.model_config)
+
+ model.to(device)
+
+ # 6. training arguments 설정
+ ## 사용한 option 외에도 다양한 option들이 있습니다.
+ ## https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요.
+ training_args = TrainingArguments(
+ # 기본 설정
+ output_dir=config.trainer['output_dir'], # 모델 저장 디렉토리
+ report_to=('wandb' if config.use_wandb else 'none'), # wandb 사용 여부
+ fp16=True, # 16-bit floating point precision
+
+ # 학습 설정
+ num_train_epochs=sweep_config['epochs'], # 전체 훈련 epoch 수
+ learning_rate=sweep_config['lr'], # learning rate
+ weight_decay=config.optimizer['weight_decay'], # weight decay
+ adam_beta2=sweep_config['adam_beta2'], # AdamW 옵티마이저의 beta2 하이퍼파라미터
+
+ # 배치 사이즈 설정
+ per_device_train_batch_size=config.dataloader['batch_size'], # 훈련 중 장치 당 batch size
+ per_device_eval_batch_size=config.dataloader['batch_size'], # 평가 중 장치 당 batch size
+
+ # 스케줄링 설정
+ warmup_ratio=sweep_config['warmup_ratio'], # learning rate scheduler의 warmup 비율
+ # warmup_steps=config.lr_scheduler['warmup_steps'], # number of warmup steps for learning rate scheduler
+
+ # 로깅 설정
+ logging_dir=config.trainer['logging_dir'], # 로그 저장 디렉토리
+ logging_steps=config.trainer['logging_steps'], # 로그 저장 스텝
+
+ # 모델 저장 설정
+ save_total_limit=config.trainer['save_total_limit'], # 전체 저장 모델 수 제한
+ save_steps=config.trainer['save_steps'], # 모델 저장 스텝
+ save_strategy=config.trainer['save_strategy'],
+
+ # 평가 설정
+ evaluation_strategy=config.trainer['evaluation_strategy'], # 훈련 중 평가 전략
+ eval_steps=config.trainer['evaluation_steps'], # 평가 스텝
+ load_best_model_at_end=True,
+ )
+
+ # 7. trainer 설정
+ # 8. evaluate 함수 설정
+ trainer = RETrainer(
+ model=model, # the instantiated 🤗 Transformers model to be trained
+ args=training_args, # training arguments, defined above
+ train_dataset=re_train_dataset, # training dataset
+ eval_dataset=re_dev_dataset, # evaluation dataset
+ compute_metrics=compute_metrics, # define metrics function
+ # callbacks=([WandbCallback()] if config.use_wandb else []),
+ # callbacks=[EarlyStoppingCallback(early_stopping_patience=config.trainer['early_stop'])],
+ loss_cfg=config.loss,
+ )
+
+ # 9. train model
+ trainer.train()
+ # 10. save model
+ trainer.save_model(config.trainer['model_dir'])
+
+ sweep_id = wandb.sweep(
+ sweep=config.sweep_config
+ )
+
+ wandb.agent(
+ sweep_id=sweep_id,
+ function=sweep_train,
+ count=config.wandb['sweep_count']
+ )
+
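+# Sketch of the sweep configuration this script expects under `sweep_config` in config.yaml
+# (key names inferred from the wandb.config accesses above; values are examples only):
+#
+#   sweep_config:
+#     method: bayes
+#     metric: {name: eval/micro f1 score, goal: maximize}
+#     parameters:
+#       input_format: {values: [default, typed_entity_marker_punct]}
+#       prompt: {values: [default, s_and_o]}
+#       type_transform: {values: [false, true]}
+#       epochs: {values: [3, 5]}
+#       lr: {values: [1.0e-5, 2.0e-5, 3.0e-5]}
+#       adam_beta2: {values: [0.98, 0.999]}
+#       warmup_ratio: {values: [0.0, 0.1]}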
+
+if __name__ == '__main__':
+ try:
+ config_path = sys.argv[1]
+ except IndexError:
+ config_path = './config.yaml'
+ config = parse_arguments(config_path)
+
+ now = datetime.now(pytz.timezone('Asia/Seoul'))
+ run_name = f'{config.run_name}_{now.strftime("%d-%H-%M")}'
+
main(config)
\ No newline at end of file
diff --git a/trainer/trainer.py b/trainer/trainer.py
index cd0b221..0cc63d3 100644
--- a/trainer/trainer.py
+++ b/trainer/trainer.py
@@ -1,41 +1,41 @@
-import torch
-from transformers import Trainer
-
-
-class RETrainer(Trainer):
- def __init__(self, loss_cfg=None, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.loss_cfg = loss_cfg
-
- def compute_loss(self, model, inputs, return_outputs=False):
- device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu:0')
-
- labels = inputs.pop('labels')
- outputs = model(**inputs)
- logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0]
-
- # 인덱스에 맞춰서 과거 ouput을 다 저장
- if self.args.past_index >= 0:
- self._past= outputs[self.args.past_index]
-
- # 커스텀 loss 정의
- if self.loss_cfg['type'] == 'CrossEntropyLoss':
- loss_fct = torch.nn.functional.cross_entropy
- elif self.loss_cfg['type'] == 'WeightedCrossEntropyLoss':
- loss_fct = torch.nn.CrossEntropyLoss(weight=torch.Tensor(self.loss_cfg['weights']).to(device))
- else:
- loss_module = __import__('model.loss', fromlist=[self.loss_cfg['type']])
- loss_class = getattr(loss_module, self.loss_cfg['type'])
- if self.loss_cfg['type'] == 'LovaszSoftmaxLoss':
- loss_fct = loss_class()
- elif self.loss_cfg['type'] == 'FocalLoss':
- loss_fct = loss_class(self.loss_cfg['focal_alpha'], self.loss_cfg['focal_gamma'])
- elif self.loss_cfg['type'] == 'WeightedFocalLoss':
- loss_fct = loss_class(alpha = torch.Tensor(self.loss_cfg['weight_focal_alpha']).to(device), gamma= self.loss_cfg['focal_gamma'])
- elif self.loss_cfg['type'] == 'MulticlassDiceLoss':
- loss_fct = loss_class(self.loss_cfg['dice_smooth'])
- else:
- raise ValueError('Unsupported loss type')
-
- loss = loss_fct(logits, labels).to(device)
- return (loss, outputs) if return_outputs else loss
+import torch
+from transformers import Trainer
+
+
+class RETrainer(Trainer):
+ def __init__(self, loss_cfg=None, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.loss_cfg = loss_cfg
+
+ def compute_loss(self, model, inputs, return_outputs=False):
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+ labels = inputs.pop('labels')
+ outputs = model(**inputs)
+ logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0]
+
+        # 인덱스에 맞춰서 과거 output을 다 저장
+        if self.args.past_index >= 0:
+            self._past = outputs[self.args.past_index]
+
+ # 커스텀 loss 정의
+ if self.loss_cfg['type'] == 'CrossEntropyLoss':
+ loss_fct = torch.nn.functional.cross_entropy
+ elif self.loss_cfg['type'] == 'WeightedCrossEntropyLoss':
+ loss_fct = torch.nn.CrossEntropyLoss(weight=torch.Tensor(self.loss_cfg['weights']).to(device))
+ else:
+ loss_module = __import__('model.loss', fromlist=[self.loss_cfg['type']])
+ loss_class = getattr(loss_module, self.loss_cfg['type'])
+ if self.loss_cfg['type'] == 'LovaszSoftmaxLoss':
+ loss_fct = loss_class()
+ elif self.loss_cfg['type'] == 'FocalLoss':
+ loss_fct = loss_class(self.loss_cfg['focal_alpha'], self.loss_cfg['focal_gamma'])
+ elif self.loss_cfg['type'] == 'WeightedFocalLoss':
+                loss_fct = loss_class(alpha=torch.Tensor(self.loss_cfg['weight_focal_alpha']).to(device), gamma=self.loss_cfg['focal_gamma'])
+ elif self.loss_cfg['type'] == 'MulticlassDiceLoss':
+ loss_fct = loss_class(self.loss_cfg['dice_smooth'])
+ else:
+ raise ValueError('Unsupported loss type')
+
+ loss = loss_fct(logits, labels).to(device)
+ return (loss, outputs) if return_outputs else loss
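+
+
+# Examples of the loss_cfg dictionary consumed above (keys inferred from the branches;
+# values are illustrative):
+#   {'type': 'CrossEntropyLoss'}
+#   {'type': 'FocalLoss', 'focal_alpha': 1.0, 'focal_gamma': 2.0}
+#   {'type': 'WeightedFocalLoss', 'weight_focal_alpha': [1.0] * 30, 'focal_gamma': 2.0}
+#   {'type': 'MulticlassDiceLoss', 'dice_smooth': 1e-5}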
diff --git a/utils/args.py b/utils/args.py
index b6b6a4f..b35ea72 100755
--- a/utils/args.py
+++ b/utils/args.py
@@ -1,16 +1,16 @@
-import yaml
-from argparse import Namespace
-
-
-def parse_arguments(config_path: str) -> Namespace:
- """config.json 파일의 내용을 argparse.Namespace 객체로 변환.
-
- Returns:
- args (argparse.Namespace): config.json 파일의 내용을 포함하는 Namespace 객체.
- """
-
- with open(config_path, "r") as f:
- config = yaml.load(f, Loader=yaml.FullLoader)
-
- args = Namespace(**config)
- return args
+import yaml
+from argparse import Namespace
+
+
+def parse_arguments(config_path: str) -> Namespace:
+ """config.json 파일의 내용을 argparse.Namespace 객체로 변환.
+
+ Returns:
+ args (argparse.Namespace): config.json 파일의 내용을 포함하는 Namespace 객체.
+ """
+
+ with open(config_path, "r") as f:
+ config = yaml.load(f, Loader=yaml.FullLoader)
+
+ args = Namespace(**config)
+ return args
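+
+
+# Usage sketch: config = parse_arguments('./config.yaml') exposes the top-level YAML keys
+# as attributes, e.g. config.model['name'] or config.trainer['output_dir'], as used
+# elsewhere in this repository.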
diff --git a/utils/utils.py b/utils/utils.py
index 9075d1c..addae29 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,154 +1,154 @@
-import logging
-import os
-import re
-import random
-from argparse import Namespace
-from typing import Tuple
-
-import numpy as np
-import torch
-import wandb
-from wandb import AlertLevel
-
-log = logging.getLogger(__name__)
-
-def seed_everything(seed: int, workers: bool = False) -> int:
- log.info(f"Global seed set to {seed}")
- os.environ["PL_GLOBAL_SEED"] = str(seed)
- random.seed(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- torch.cuda.manual_seed_all(seed)
-
- os.environ["PL_SEED_WORKERS"] = f"{int(workers)}"
- return seed
-
-
-def init_wandb(config: Namespace, run_name: str) -> None:
- if not config.use_wandb:
- return
-
- wandb.init(
- entity=config.wandb['entity'],
- project=config.wandb['project_name'],
- name=run_name,
- config=config,
- )
- wandb.alert(title='start', level=AlertLevel.INFO, text=f'{run_name}')
-
-
-def alert_wandb(config: Namespace, run_name: str, title: str) -> None:
- if config.use_wandb:
- wandb.alert(title=title, level=AlertLevel.INFO, text=f'{run_name}')
-
-
-def to_hangul(sent) -> Tuple[str, str]:
- """
- entity명을 한글로 변경
- """
- dic = {
- "ORG" : "조직",
- "PER" : "사람",
- "DAT" : "시간",
- "LOC" : "장소",
- "POH" : "기타",
- "NOH" : "수량",
- }
-
- sub = eval(sent['subject_entity'])
- obj = eval(sent['object_entity'])
-
- sub['type'] = dic[sub['type']]
- obj['type'] = dic[obj['type']]
-
- sent['subject_entity'] = str(sub)
- sent['object_entity'] = str(obj)
-
- return sent['subject_entity'], sent['object_entity']
-
-
-def marker(sent, input_format: str) -> str:
- """dataframe에서 하나의 row 내의 정보들을 조합해 마킹한 sentence를 만드는 함수"""
-
- # str 타입에서 dict 뽑아내기
- sub = eval(sent['subject_entity'])
- obj = eval(sent['object_entity'])
-
- # 인덱스 뽑아와서 entity 구분하기
- indices = sorted([sub['start_idx'], sub['end_idx'], obj['start_idx'], obj['end_idx']])
- indices[1] += 1
- indices[3] += 1
-
- def split_string_by_index(string, indices):
- substrings = []
- start_index = 0
- for index in indices:
- substrings.append(string[start_index:index])
- start_index = index
- substrings.append(string[start_index:])
- return substrings
-
- split_sent = split_string_by_index(sent['sentence'], indices)
-
- # entity에 마킹하기
- lst = []
- if input_format == 'entity_mask':
- for i in split_sent:
- if i == sub['word']:
- sub_token = f'[SUBJ-{sub["type"]}]'
- lst.append(sub_token)
- elif i == obj['word']:
- obj_token = f'[OBJ-{obj["type"]}]'
- lst.append(obj_token)
- else:
- lst.append(i)
-
- elif input_format == 'entity_marker':
- for i in split_sent:
- if i == sub['word']:
- new_sub = ['[E1] '] + [sub['word']] + [' [/E1]']
- lst.append(new_sub)
- elif i == obj['word']:
- new_obj = ['[E2] '] + [obj['word']] + [' [/E2]']
- lst.append(new_obj)
- else:
- lst.append(i)
-
- elif input_format == 'entity_marker_punct':
- for i in split_sent:
- if i == sub['word']:
- new_sub = ['@ '] + [sub['word']] + [' @']
- lst.append(new_sub)
- elif i == obj['word']:
- new_obj = ['# '] + [obj['word']] + [' #']
- lst.append(new_obj)
- else:
- lst.append(i)
-
- elif input_format == 'typed_entity_marker':
- for i in split_sent:
- if i == sub['word']:
- new_sub = [' '] + [sub['word']] + [' ']
- lst.append(new_sub)
- elif i == obj['word']:
- new_obj = [' '] + [obj['word']] + [' ']
- lst.append(new_obj)
- else:
- lst.append(i)
-
- elif input_format == 'typed_entity_marker_punct':
- for i in split_sent:
- if i == sub['word']:
- new_sub = ['@ '] + [' * '] + [sub['type'].lower()] + [' * '] + [sub['word']] + [' @ ']
- lst.append(new_sub)
- elif i == obj['word']:
- new_obj = [' # '] + [' ^ '] + [obj['type'].lower()] + [' ^ '] + [obj['word']] + [' # ']
- lst.append(new_obj)
- else:
- lst.append(i)
-
- # 최종 sentence로 만들고 공백 처리하기
- sentence = ''.join(str(item) if isinstance(item, str) else ''.join(item) for item in lst)
- sentence = re.sub(r'\s+', ' ', sentence)
-
- return sentence
+import ast
+import logging
+import os
+import re
+import random
+from argparse import Namespace
+from typing import Tuple
+
+import numpy as np
+import torch
+import wandb
+from wandb import AlertLevel
+
+log = logging.getLogger(__name__)
+
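+# Seed every RNG in use (Python, NumPy, PyTorch CPU/CUDA) and export the PL_* env vars,
+# following the same contract as pytorch_lightning.seed_everything.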
+def seed_everything(seed: int, workers: bool = False) -> int:
+ log.info(f"Global seed set to {seed}")
+ os.environ["PL_GLOBAL_SEED"] = str(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+
+ os.environ["PL_SEED_WORKERS"] = f"{int(workers)}"
+ return seed
+
+
+def init_wandb(config: Namespace, run_name: str) -> None:
+ if not config.use_wandb:
+ return
+
+ wandb.init(
+ entity=config.wandb['entity'],
+ project=config.wandb['project_name'],
+ name=run_name,
+ config=config,
+ )
+ wandb.alert(title='start', level=AlertLevel.INFO, text=f'{run_name}')
+
+
+def alert_wandb(config: Namespace, run_name: str, title: str) -> None:
+ if config.use_wandb:
+ wandb.alert(title=title, level=AlertLevel.INFO, text=f'{run_name}')
+
+
+def to_hangul(sent) -> Tuple[str, str]:
+ """
+ entity명을 한글로 변경
+ """
+ dic = {
+ "ORG" : "조직",
+ "PER" : "사람",
+ "DAT" : "시간",
+ "LOC" : "장소",
+ "POH" : "기타",
+ "NOH" : "수량",
+ }
+
+ sub = ast.literal_eval(sent['subject_entity'])
+ obj = ast.literal_eval(sent['object_entity'])
+
+ sub['type'] = dic[sub['type']]
+ obj['type'] = dic[obj['type']]
+
+ sent['subject_entity'] = str(sub)
+ sent['object_entity'] = str(obj)
+
+ return sent['subject_entity'], sent['object_entity']
+
+
+def marker(sent, input_format: str) -> str:
+ """dataframe에서 하나의 row 내의 정보들을 조합해 마킹한 sentence를 만드는 함수"""
+
+ # Parse the entity dicts from their string representations
+ sub = ast.literal_eval(sent['subject_entity'])
+ obj = ast.literal_eval(sent['object_entity'])
+
+ # Collect the entity boundary indices to split the sentence around them
+ indices = sorted([sub['start_idx'], sub['end_idx'], obj['start_idx'], obj['end_idx']])
+ indices[1] += 1
+ indices[3] += 1
+
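+ # Helper: cut the sentence at the collected indices so each entity span becomes its own substring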
+ def split_string_by_index(string, indices):
+ substrings = []
+ start_index = 0
+ for index in indices:
+ substrings.append(string[start_index:index])
+ start_index = index
+ substrings.append(string[start_index:])
+ return substrings
+
+ split_sent = split_string_by_index(sent['sentence'], indices)
+
+ # Mark the entities
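+ # Supported schemes: entity_mask, entity_marker, entity_marker_punct,
+ # typed_entity_marker, typed_entity_marker_punct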
+ lst = []
+ if input_format == 'entity_mask':
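+ # Replace each entity word with a single mask token such as [SUBJ-PER] or [OBJ-ORG]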
+ for i in split_sent:
+ if i == sub['word']:
+ sub_token = f'[SUBJ-{sub["type"]}]'
+ lst.append(sub_token)
+ elif i == obj['word']:
+ obj_token = f'[OBJ-{obj["type"]}]'
+ lst.append(obj_token)
+ else:
+ lst.append(i)
+
+ elif input_format == 'entity_marker':
+ for i in split_sent:
+ if i == sub['word']:
+ new_sub = ['[E1] '] + [sub['word']] + [' [/E1]']
+ lst.append(new_sub)
+ elif i == obj['word']:
+ new_obj = ['[E2] '] + [obj['word']] + [' [/E2]']
+ lst.append(new_obj)
+ else:
+ lst.append(i)
+
+ elif input_format == 'entity_marker_punct':
+ for i in split_sent:
+ if i == sub['word']:
+ new_sub = ['@ '] + [sub['word']] + [' @']
+ lst.append(new_sub)
+ elif i == obj['word']:
+ new_obj = ['# '] + [obj['word']] + [' #']
+ lst.append(new_obj)
+ else:
+ lst.append(i)
+
+ elif input_format == 'typed_entity_marker':
+ # Wrap entities in typed marker tokens (token format assumed to follow the
+ # common <S:TYPE> ... </S:TYPE> / <O:TYPE> ... </O:TYPE> convention)
+ for i in split_sent:
+ if i == sub['word']:
+ new_sub = [f'<S:{sub["type"]}> '] + [sub['word']] + [f' </S:{sub["type"]}>']
+ lst.append(new_sub)
+ elif i == obj['word']:
+ new_obj = [f'<O:{obj["type"]}> '] + [obj['word']] + [f' </O:{obj["type"]}>']
+ lst.append(new_obj)
+ else:
+ lst.append(i)
+
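+ # The punct variant below yields e.g. "@ * per * <subject word> @ ... # ^ org ^ <object word> # ..."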
+ elif input_format == 'typed_entity_marker_punct':
+ for i in split_sent:
+ if i == sub['word']:
+ new_sub = ['@ '] + [' * '] + [sub['type'].lower()] + [' * '] + [sub['word']] + [' @ ']
+ lst.append(new_sub)
+ elif i == obj['word']:
+ new_obj = [' # '] + [' ^ '] + [obj['type'].lower()] + [' ^ '] + [obj['word']] + [' # ']
+ lst.append(new_obj)
+ else:
+ lst.append(i)
+
+ # Join everything into the final sentence and collapse repeated whitespace
+ sentence = ''.join(str(item) if isinstance(item, str) else ''.join(item) for item in lst)
+ sentence = re.sub(r'\s+', ' ', sentence)
+
+ return sentence