From a8ec3e427fd4ef3df92c1f9c840013e32ec59b4d Mon Sep 17 00:00:00 2001 From: Ipadeola Ladipo Ezekiel <105759894+rileydrizzy@users.noreply.github.com> Date: Fri, 15 Dec 2023 10:03:58 +0100 Subject: [PATCH] merge (#9) * update * updates * updates * [add] updates * [add] updates * updates * updates * [add] updates * merge --- .github/workflows/run_units_test.yml | 13 +- .gitignore | 6 +- .gitpod.yml | 10 - linguify_yb/data/.gitkeep => Dockerfile | 0 Makefile | 10 +- README.md | 24 +- linguify_yb/src/config/config.yaml => app.py | 0 test_inference.py => inference.py | 0 linguify_yb/README.md | 23 +- linguify_yb/data/dev_samples.json | 9 - linguify_yb/src/benchmark.py | 65 ----- linguify_yb/src/config.py | 6 - linguify_yb/src/dataset/dataset_loader.py | 119 --------- linguify_yb/src/dataset/preprocess.py | 28 --- linguify_yb/src/main.py | 101 -------- linguify_yb/src/trainer.py | 137 ---------- linguify_yb/src/utils/logger_util.py | 2 +- pyproject.toml | 4 +- requirements.txt | 171 ++++++------- run_setup.sh | 7 + set_environment_variables_template.sh | 2 + signa2text/README.md | 25 +- .../metrics.py => signa2text/data/.gitkeep | 0 .../data/dataset_paths.json | 0 signa2text/data/dev_samples.json | 9 + .../development/code_dev.ipynb | 0 .../development/data_dev.ipynb | 0 .../development/dev.ipynb | 0 .../development/trans_dev.ipynb | 0 .../notebooks/analyasis.ipynb | 0 signa2text/run_train.sh | 8 + signa2text/src/benchmark.py | 117 +++++++++ signa2text/src/config.py | 6 + .../src/config/config.yaml | 0 signa2text/src/dataset/dataset_loader.py | 237 ++++++++++++++++++ signa2text/src/dataset/dataset_paths.py | 46 ++++ .../src/dataset/frames_config.py | 56 ++--- signa2text/src/dataset/preprocess.py | 61 +++++ {linguify_yb => signa2text}/src/dev_data.py | 6 +- .../src/evalute.py | 0 signa2text/src/main.py | 95 +++++++ .../__init__.py => signa2text/src/metrics.py | 0 signa2text/src/{utils => models}/__init__.py | 0 .../src/models/baseline_transformer.py | 108 ++++---- .../src/models/model_loader.py | 12 +- .../src/models/static_transfromer.py | 0 signa2text/src/tests/__init__.py | 0 .../src/tests/test_data_ingestion.py | 36 ++- .../src/tests/test_model.py | 0 signa2text/src/tests/test_pipeline.py | 0 signa2text/src/trainer.py | 132 ++++++++++ signa2text/src/utils/logger_util.py | 3 +- {linguify_yb => signa2text}/src/utils/util.py | 10 +- version.txt | 1 + 54 files changed, 987 insertions(+), 718 deletions(-) delete mode 100644 .gitpod.yml rename linguify_yb/data/.gitkeep => Dockerfile (100%) rename linguify_yb/src/config/config.yaml => app.py (100%) rename test_inference.py => inference.py (100%) delete mode 100644 linguify_yb/data/dev_samples.json delete mode 100644 linguify_yb/src/benchmark.py delete mode 100644 linguify_yb/src/config.py delete mode 100644 linguify_yb/src/dataset/dataset_loader.py delete mode 100644 linguify_yb/src/dataset/preprocess.py delete mode 100644 linguify_yb/src/main.py delete mode 100644 linguify_yb/src/trainer.py create mode 100644 run_setup.sh rename linguify_yb/src/metrics.py => signa2text/data/.gitkeep (100%) rename {linguify_yb => signa2text}/data/dataset_paths.json (100%) create mode 100644 signa2text/data/dev_samples.json rename {linguify_yb => signa2text}/development/code_dev.ipynb (100%) rename {linguify_yb => signa2text}/development/data_dev.ipynb (100%) rename {linguify_yb => signa2text}/development/dev.ipynb (100%) rename {linguify_yb => signa2text}/development/trans_dev.ipynb (100%) rename {linguify_yb => signa2text}/notebooks/analyasis.ipynb (100%) create 
mode 100644 signa2text/run_train.sh
create mode 100644 signa2text/src/benchmark.py
create mode 100644 signa2text/src/config.py
rename linguify_yb/src/models/__init__.py => signa2text/src/config/config.yaml (100%)
create mode 100644 signa2text/src/dataset/dataset_loader.py
create mode 100644 signa2text/src/dataset/dataset_paths.py
rename {linguify_yb => signa2text}/src/dataset/frames_config.py (58%)
create mode 100644 signa2text/src/dataset/preprocess.py
rename {linguify_yb => signa2text}/src/dev_data.py (96%)
rename linguify_yb/src/models/static_transfromer.py => signa2text/src/evalute.py (100%)
create mode 100644 signa2text/src/main.py
rename linguify_yb/src/tests/__init__.py => signa2text/src/metrics.py (100%)
rename signa2text/src/{utils => models}/__init__.py (100%)
rename {linguify_yb => signa2text}/src/models/baseline_transformer.py (78%)
rename {linguify_yb => signa2text}/src/models/model_loader.py (68%)
rename linguify_yb/src/tests/test_pipeline.py => signa2text/src/models/static_transfromer.py (100%)
create mode 100644 signa2text/src/tests/__init__.py
rename {linguify_yb => signa2text}/src/tests/test_data_ingestion.py (70%)
rename {linguify_yb => signa2text}/src/tests/test_model.py (100%)
create mode 100644 signa2text/src/tests/test_pipeline.py
create mode 100644 signa2text/src/trainer.py
rename {linguify_yb => signa2text}/src/utils/util.py (90%)
diff --git a/.github/workflows/run_units_test.yml b/.github/workflows/run_units_test.yml
index 43fa0cb2..034dd036 100644
--- a/.github/workflows/run_units_test.yml
+++ b/.github/workflows/run_units_test.yml
@@ -1,5 +1,3 @@
-name: Units Tests
-
 on:
   push:
     branches:
@@ -22,8 +20,14 @@ jobs:
         uses: actions/setup-python@v2
         with:
           python-version: 3.10
+      - name: Install dependencies
+        run: |
+          python -m venv venv
+          source venv/bin/activate
+          python -m pip install --upgrade pip
+          python -m pip install -r requirements.txt
 
-      - name: Pytest
+      - name: Run Pytest
        run: |
-          cd linguify
-          pytest
\ No newline at end of file
+          cd signa2text
+          pytest
diff --git a/.gitignore b/.gitignore
index aa4ae93d..7b0f4fb0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -90,9 +90,14 @@ target/
 # pytest cache
 .pytest_cache/
 
+# misc
+.gitpod.yml
+poetry.lock
+
 # Data and models
 data/*/*
 models/*
+kaggle
 !.gitkeep
 !dataset_paths.json
 !dev_samples.json
@@ -114,7 +119,6 @@ yb2audio/data/*/*
 
 # Development Enviroment
 dev.py
-#development
 dev_env.txt
 
 # Keys
diff --git a/.gitpod.yml b/.gitpod.yml
deleted file mode 100644
index 63a3e4fd..00000000
--- a/.gitpod.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-# This configuration file was automatically generated by Gitpod.
-# Please adjust to your needs (see https://www.gitpod.io/docs/introduction/learn-gitpod/gitpod-yaml)
-# and commit this file to your remote git repository to share the goodness with others.
-
-# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart
-
-tasks:
-  - init: make
-
-
diff --git a/linguify_yb/data/.gitkeep b/Dockerfile
similarity index 100%
rename from linguify_yb/data/.gitkeep
rename to Dockerfile
diff --git a/Makefile b/Makefile
index 44a6076c..fc07bc95 100644
--- a/Makefile
+++ b/Makefile
@@ -13,16 +13,16 @@ setup:
	poetry install
	poetry add pre-commit
	python pre-commit install
-	@echo "Environment setup complete"
+	@echo "Environment setup complete"
 
 precommit:
	@echo "Running precommit on all files"
	python pre-commit run --all-files
 
-export:
+export_:
	@echo "Exporting dependencies to requirements file"
	poetry export --without-hashes -f requirements.txt --output requirements.txt
 
-backup: # To push to Github without running precommit
-	git commit --no-verify -m "updates"
-	git push origin main
+run_container:
+	@echo "Running Docker Container"
+
diff --git a/README.md b/README.md
index 320d59a3..82fd2a3d 100644
--- a/README.md
+++ b/README.md
@@ -40,25 +40,29 @@ Effective communication is a cornerstone of societal cohesion, and this project
 
 ```bash
 # Clone this repository
-$ git clone
+$ git clone
 
 # Go into the repository
 $ cd
 
 # Install dependencies
-$ make setup
+$ . ./run_setup.sh
 
-# activate virtual enviroment
-$ source $(poetry env info --path)/bin/activate
 ```
 
 ### Project Roadmap
 
 Here's a glimpse of the exciting features we plan to implement in the coming weeks:
 
-- [x] Add project's documentation
-- [] Develop a Proof of Concept System
-- [] Deployment of Proof of Concept System
+| Feature                  | Description                                       | Status      |
+| ------------------------ | ------------------------------------------------- | ----------- |
+| SignText Model           | Implement the training of the SignText model      | In Progress |
+| Deployment of the System | Develop and deploy the system to Google Cloud.    | Planned     |
+| User Interface           | Develop a friendly and functional user interface  | Planned     |
+
+## How to Contribute
+
+We welcome contributions from the community. If you're interested in contributing, please refer to the [Contributing Guidelines](CONTRIBUTING.md).
## Acknowledgments @@ -68,9 +72,13 @@ I would like to acknowledge the outstanding contributions of : **Email:** **GitHub:** [@tejuafonja](https://github.com/tejuafonja) -## Contact +## Support and Contact + +If you have questions or need assistance, feel free to reach out to: **Name:** **Ipadeola Ezekiel Ladipo** **Email:** **GitHub:** [@rileydrizzy](https://github.com/rileydrizzy) **Linkdeln:** [Ipadeola Ladipo](https://www.linkedin.com/in/ladipo-ipadeola/) + +--- diff --git a/linguify_yb/src/config/config.yaml b/app.py similarity index 100% rename from linguify_yb/src/config/config.yaml rename to app.py diff --git a/test_inference.py b/inference.py similarity index 100% rename from test_inference.py rename to inference.py diff --git a/linguify_yb/README.md b/linguify_yb/README.md index a5a84d35..d427164d 100644 --- a/linguify_yb/README.md +++ b/linguify_yb/README.md @@ -1,30 +1,11 @@ -# Linguify-YB +# Signa-Text [![LICENSE](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE) [![Python](https://img.shields.io/badge/python-3.6-blue.svg?style=flat-square)](https://www.python.org/) [![PyTorch](https://img.shields.io/badge/PyTorch-1.7.0-orange)](https://pytorch.org/) -![image/gif](https://github.com/rileydrizzy/Cohort8-Ransom-Kuti-Ladipo/blob/main/images/sign%20lang.gif) +![image/gif]() ## Project description ***Overview:*** \ - -## Project Roadmap - -- **[Month Year]:** Project Initiation -- **[Month Year]:** Core Functionality Completion -- **[Month Year]:** User Interface Design Completion -- **[Month Year]:** Data Integration Completion -- **[Month Year]:** Testing and Quality Assurance Completion -- **[Month Year]:** Deployment to Production - -## How to Contribute - -We welcome contributions from the community. If you're interested in contributing, please refer to the [Contributing Guidelines](CONTRIBUTING.md). - -## Support and Contact - -If you have questions or need assistance, feel free to reach out to [Your Contact Information]. 
- ---- diff --git a/linguify_yb/data/dev_samples.json b/linguify_yb/data/dev_samples.json deleted file mode 100644 index 93702d14..00000000 --- a/linguify_yb/data/dev_samples.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "train_files": [ - "data/asl-fingerspelling/train_landmarks/1019715464.parquet", - "data/asl-fingerspelling/train_landmarks/1021040628.parquet" - ], - "valid_files": [ - "data/asl-fingerspelling/train_landmarks/105143404.parquet" - ] -} \ No newline at end of file diff --git a/linguify_yb/src/benchmark.py b/linguify_yb/src/benchmark.py deleted file mode 100644 index 9b83888a..00000000 --- a/linguify_yb/src/benchmark.py +++ /dev/null @@ -1,65 +0,0 @@ -"""doc -""" -from torchprofile import profile_macs -from torch import nn - - -Byte = 8 -KiB = 1024 * Byte -MiB = 1024 * KiB -GiB = 1024 * MiB - - -class BenchMarker: - """_summary_""" - - def __init__(self) -> None: - pass - - def get_model_macs(self, model, inputs=None) -> int: - """ - calculate the MACS of a model - """ - return profile_macs(model, inputs) - - def get_model_sparsity(self, model: nn.Module) -> float: - """ - calculate the sparsity of the given model - sparsity = #zeros / #elements = 1 - #nonzeros / #elements - """ - num_nonzeros, num_elements = 0, 0 - for param in model.parameters(): - num_nonzeros += param.count_nonzero() - num_elements += param.numel() - return 1 - float(num_nonzeros) / num_elements - - def get_num_parameters(self, model: nn.Module, count_nonzero_only=False) -> int: - """ - calculate the total number of parameters of model - :param count_nonzero_only: only count nonzero weights - """ - num_counted_elements = 0 - for param in model.parameters(): - if count_nonzero_only: - num_counted_elements += param.count_nonzero() - else: - num_counted_elements += param.numel() - return num_counted_elements - - def get_model_size( - self, model: nn.Module, data_width=32, count_nonzero_only=False - ) -> int: - """ - calculate the model size in bits - :param data_width: #bits per element - :param count_nonzero_only: only count nonzero weights - """ - return self.get_num_parameters(model, count_nonzero_only) * data_width - - def runner(self, model): - model_macs = self.get_model_macs(model) - model_sparsity = self.get_model_sparsity(model) - model_num_params = self.get_num_parameters(model) - model_size = self.get_model_size(model) - - return diff --git a/linguify_yb/src/config.py b/linguify_yb/src/config.py deleted file mode 100644 index 98a1d336..00000000 --- a/linguify_yb/src/config.py +++ /dev/null @@ -1,6 +0,0 @@ -"""doc -""" -from pydantic import BaseModel - -class Data(BaseModel): - \ No newline at end of file diff --git a/linguify_yb/src/dataset/dataset_loader.py b/linguify_yb/src/dataset/dataset_loader.py deleted file mode 100644 index afaa302f..00000000 --- a/linguify_yb/src/dataset/dataset_loader.py +++ /dev/null @@ -1,119 +0,0 @@ -"""doc -""" - -import json - -import numpy as np -import pandas as pd -import pyarrow.parquet as pq -import torch -from torch.nn import functional as F -from torch.utils.data import DataLoader, Dataset - -from dataset.frames_config import FEATURE_COLUMNS, FRAME_LEN, LHAND_IDX, RHAND_IDX -from dataset.preprocess import clean_frames_process - -PHRASE_PATH = "/kaggle/input/asl-fingerspelling/character_to_prediction_index.json" -METADATA = "/kaggle/input/asl-fingerspelling/train.csv" - -with open(PHRASE_PATH, "r", encoding="utf-8") as f: - character_to_num = json.load(f) - -PAD_TOKEN = "P" -START_TOKEN = "<" -END_TOKEN = ">" -PAD_TOKEN_IDX = 59 -START_TOKEN_IDX = 60 
-END_TOKEN_IDX = 61 - -character_to_num[PAD_TOKEN] = PAD_TOKEN_IDX -character_to_num[START_TOKEN] = START_TOKEN_IDX -character_to_num[END_TOKEN] = END_TOKEN_IDX -num_to_character = {j: i for i, j in character_to_num.items()} - - -class TokenHashTable: - def __init__( - self, word2index_mapping=character_to_num, index2word_mapping=num_to_character - ): - self.word2index = word2index_mapping - self.index2word = index2word_mapping - - def _indexesfromsentence(self, sentence): - return [self.word2index[word] for word in sentence] - - def sentence_to_tensor(self, sentence): - indexes = self._indexesfromsentence(sentence) - return torch.tensor(indexes, dtype=torch.long) - - def index_to_sentence(self, indexes_list): - if torch.is_tensor(indexes_list): - indexes_list = indexes_list.tolist() - words = [self.index2word[idx] for idx in indexes_list] - return words - - -def read_file(file, file_id, landmarks_metadata_path): - phrase_list = [] - frames_list = [] - metadata_train_dataframe = pd.read_csv(landmarks_metadata_path) - file_id_df = metadata_train_dataframe.loc[ - metadata_train_dataframe["file_id"] == file_id - ] - saved_parueat_df = pq.read_table( - file, columns=["sequence_id"] + FEATURE_COLUMNS - ).to_pandas() - for seq_id, phrase in zip(file_id_df.sequence_id, file_id_df.phrase): - frames = saved_parueat_df[saved_parueat_df.index == seq_id].to_numpy() - # NaN - frames_list.append(torch.tensor(frames)) - phrase_list.append(phrase) - return (frames_list, phrase_list) - - -class LandmarkDataset(Dataset): - def __init__(self, file_path, file_id, table, transform=True): - self.landmarks_metadata_path = METADATA - self.frames, self.labels = read_file( - file_path, file_id, self.landmarks_metadata_path - ) - self.trans = transform - self.table = table - - def _label_pre(self, label_sample): - sample = START_TOKEN + label_sample + END_TOKEN - new_phrase = self.table.tensorfromsentence(list(sample)) - ans = F.pad( - input=new_phrase, - pad=[0, 64 - new_phrase.shape[0]], - mode="constant", - value=PAD_TOKEN_IDX, - ) - return ans - - def __len__(self): - return len(self.labels) - - def __getitem__(self, idx): - if torch.is_tensor(idx): - idx = idx.tolist() - phrase = self.labels[idx] - frames = self.frames[idx] - - if self.trans: - phrase = self._label_pre(phrase) - frames = clean_frames_process(frames) - return frames, phrase - - -def get_dataloader(file_path, file_id, batch_size=32, num_workers_=1): - lookup_table = TokenHashTable(character_to_num, num_to_character) - dataset = LandmarkDataset(file_path, file_id, lookup_table, transform=True) - - dataloader = DataLoader( - dataset, - batch_size=batch_size, - num_workers=num_workers_, - pin_memory=True, - ) - return dataloader diff --git a/linguify_yb/src/dataset/preprocess.py b/linguify_yb/src/dataset/preprocess.py deleted file mode 100644 index 192dd73a..00000000 --- a/linguify_yb/src/dataset/preprocess.py +++ /dev/null @@ -1,28 +0,0 @@ -"""doc -""" -import torch -from torch.nn import functional as F - -# TODO Clean up code, add comments and docs -# TODO remove print and debug statements - - -def clean_frames_process( - x, max_frame_len=128, n_hand_landmarks=21, n_pose_landmarks=33, n_face_landmarks=40 -): - x = x[:max_frame_len] - x = torch.where(torch.isnan(x), torch.zeros_like(x), x) - n_frames = x.size(0) - lhand = x[:, 0:63].view(n_frames, 3, n_hand_landmarks).transpose(1, 2) - rhand = x[:, 63:126].view(n_frames, 3, n_hand_landmarks).transpose(1, 2) - pose = x[:, 126:225].view(n_frames, 3, n_pose_landmarks).transpose(1, 2) - face = x[:, 
225:345].view(n_frames, 3, n_face_landmarks).transpose(1, 2) - - x = torch.cat([lhand, rhand, pose, face], axis=1) - x = x.view(n_frames, 345) - if n_frames < max_frame_len: - # Calculate the padding on the first dimension from the bottom - padding_bottom = max(0, max_frame_len - x.size(0)) - # Pad the tensor along the first dimension from the bottom - x = F.pad(x, (0, 0, 0, padding_bottom)) - return x diff --git a/linguify_yb/src/main.py b/linguify_yb/src/main.py deleted file mode 100644 index ac12cfc8..00000000 --- a/linguify_yb/src/main.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -doc - -# Usage: -# python -m src/train.py \ -# --epochs 10 \ -# --batch 512 \ -""" -# TODO Complete and refactor code for distributed training - -import os -import json - -import numpy as np -import torch -import wandb -from torch import nn - -from utils.util import get_device_strategy, parse_args, set_seed -from utils.logger_util import logger -from models.model_loader import ModelLoader -from dataset.dataset_loader import get_dataloader -import trainer - -try: - dataset_paths = "data/dev_samples.json" # On kaggle replace with "data/dataset_paths.json" to train on full data - with open(dataset_paths, "r", encoding="utf-8") as json_file: - data_dict = json.load(json_file) - LANDMARK_DIR = "/kaggle/input/asl-fingerspelling/train_landmarks" - MODEL_DIR = "model.pt" - - # Training dataset - train_dataset = data_dict["train_files"] - train_file_ids = [os.path.basename(file) for file in train_dataset] - train_file_ids = [ - int(file_name.replace(".parquet", "")) for file_name in train_file_ids - ] - assert len(train_dataset) == len( - train_file_ids - ), "Failed import of Train files path " - TRAIN_DS_FILES = list(zip(train_dataset, train_file_ids)) - - # Validation dataset - valid_dataset = data_dict["valid_files"] - valid_file_ids = [os.path.basename(file) for file in valid_dataset] - valid_file_ids = [ - int(file_name.replace(".parquet", "")) for file_name in valid_file_ids - ] - assert len(train_dataset) == len( - train_file_ids - ), "Failed Import of Valid Files path" - VALID_DS_FILES = list(zip(valid_dataset, valid_file_ids)) -except AssertionError as asset_error: - logger.exception(f"failed {asset_error}") - - -def main(arg): - logger.info(f"Starting training on {arg.model}") - # To ensure reproducibility of the training process - set_seed() - DEVICE = get_device_strategy(tpu=arg.tpu) - logger.info(f"Training on {DEVICE} for {arg.epochs} epochs.") - - model = ModelLoader().get_model(arg.model) - - optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) - criterion = nn.CrossEntropyLoss(label_smoothing=0.1) - - # Optimizes given model/function using TorchDynamo and specified backend - torch.compile(model) - - logger.info("training") - wandb.init( - project="ASL-project", - config={ - "learning_rate": 0.01, - "architecture": "Test Model", - "dataset": "Google ASL Landmarks", - "epochs": 12, - }, - ) - - wandb.watch(model) - try: - train( - model=arg.model, - optim=optimizer, - loss_func=criterion, - n_epochs=arg.epochs, - batch=arg.batch, - device=DEVICE, - ) - logger.success(f"Training completed: {arg.epochs} epochs on {DEVICE}.") - - except Exception as error: - logger.exception(f"Training failed due to an {error}.") - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/linguify_yb/src/trainer.py b/linguify_yb/src/trainer.py deleted file mode 100644 index 3634140c..00000000 --- a/linguify_yb/src/trainer.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -doc - -# Usage: -# python -m 
src/train.py \ -# --epochs 10 \ -# --batch 512 \ -""" -# TODO Complete and refactor code for distributed training - -import os -import json - -import numpy as np -import torch -import wandb -from torch import nn - -from utils.logger_util import logger - - -def train(model, optim, loss_func, n_epochs, batch, device,): - - model.to(device) - - train_losses = [] - val_losses = [] - val_dataloader = # get_dataloader(TRAIN_FILES[0][0], TRAIN_FILES[0][1], batch_size=batch) - for epoch in range(n_epochs): - logger.info(f"Training on epoch {epoch}.") - total_epochs = epoch - file_train_loss = [] - for file, file_id in TRAIN_DS_FILES: - train_dataloader = # get_dataloader(file, file_id, batch_size=batch) - - # Performs training using mini-batches - train_loss = mini_batch( - model, train_dataloader, optim, loss_func, device, validation=False - ) - file_train_loss.append(train_loss) - train_loss = np.mean(file_train_loss) - train_losses.append(train_loss) - - # Performs evaluation using mini-batches - logger.info("Starting validation.") - with torch.no_grad(): - val_loss = mini_batch( - model, val_dataloader, optim, loss_func, device, validation=True - ) - val_losses.append(val_loss) - - wandb.log( - { - "train_loss": train_loss, - "val_loss": val_loss, - "epoch": epoch, - } - ) - - if epoch // 2 == 0: - logger.info("Initiating checkpoint. Saving model and optimizer states.") - save_checkpoint( - MODEL_DIR, model, optim, total_epochs, train_losses, val_losses - ) - - -def mini_batch( - model, dataloader, mini_batch_optim, loss_func, device, validation=False -): - # The mini-batch can be used with both loaders - # The argument `validation`defines which loader and - # corresponding step function is going to be used - if validation: - step_func = val_step_func(model, loss_func) - else: - step_func = train_step_func(model, mini_batch_optim, loss_func) - - # Once the data loader and step function, this is the same - # mini-batch loop we had before - mini_batch_losses = [] - for x_batch, y_batch in dataloader: - x_batch = x_batch.to(device) - y_batch = y_batch.to(device) - loss = step_func(x=x_batch, y=y_batch) - mini_batch_losses.append(loss) - loss = np.mean(mini_batch_losses) - return loss - - -def train_step_func(model, optim_, loss_func): - def perform_train_step_fn(x, y): - model.train() - preds = model(x) - loss = loss_func(preds, y) - loss.backward() - optim_.step() - optim_.zero_grad() - return loss.item() - - return perform_train_step_fn - - -def val_step_func(model, loss_func): - def perform_val_step_fn(x, y): - model.eval() - preds = model(x) - loss = loss_func(preds, y) - return loss.item() - - return perform_val_step_fn - - -def save_checkpoint(filename, model, optimizer, total_epochs, train_losses, val_losses): - # Builds dictionary with all elements for resuming training - checkpoint = { - "epoch": total_epochs, - "model_state_dict": model.state_dict(), - "optimizer_state_dict": optimizer.state_dict(), - "loss": train_losses, - "val_loss": val_losses, - } - - torch.save(checkpoint, filename) - - -def load_checkpoint(model, optimizer, filename): - # Loads dictionary - checkpoint = torch.load(filename) - - # Restore state for model and optimizer - model.load_state_dict(checkpoint["model_state_dict"]) - optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - - total_epochs = checkpoint["epoch"] - losses = checkpoint["loss"] - val_losses = checkpoint["val_loss"] - return model diff --git a/linguify_yb/src/utils/logger_util.py b/linguify_yb/src/utils/logger_util.py index 
a7e057e6..618d207b 100644
--- a/linguify_yb/src/utils/logger_util.py
+++ b/linguify_yb/src/utils/logger_util.py
@@ -10,7 +10,7 @@
     - `logger.warning("Warning message")`
     - `logger.error("Error message")`
     - `logger.critical("Critical message")`
-    - `logger.success("success messgae")`
+    - `logger.success("success message")`
 """
 from pathlib import Path
diff --git a/pyproject.toml b/pyproject.toml
index 524497fc..7220de66 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,6 @@ loguru = "^0.7.2"
 wandb = "^0.15.12"
 transformers = "^4.34.0"
 pandas = "^2.1.1"
-ray = {extras = ["data", "serve", "train", "tune"], version = "^2.7.1"}
 hydra-core = "^1.3.2"
 tensorflow = "^2.14.0"
 torch = "^2.1.0"
@@ -22,6 +21,9 @@ torchvision = "^0.16.0"
 ipykernel = "^6.26.0"
 opencv-python = "^4.8.1.78"
 torchprofile = "^0.0.4"
+pydantic = "^2.5.2"
+pytest = "^7.4.3"
+pyarrow = "^14.0.1"
 
 
 [tool.poetry.group.dev.dependencies]
diff --git a/requirements.txt b/requirements.txt
index 69a12e79..f4610364 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,162 +1,143 @@
 absl-py==2.0.0 ; python_version >= "3.10" and python_version < "3.12"
-aiohttp-cors==0.7.0 ; python_version >= "3.10" and python_version < "3.12"
-aiohttp==3.8.6 ; python_version >= "3.10" and python_version < "3.12"
-aiorwlock==1.3.0 ; python_version >= "3.10" and python_version < "3.12"
-aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "3.12"
-ansicon==1.89.0 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Windows"
+annotated-types==0.6.0 ; python_version >= "3.10" and python_version < "3.12"
 antlr4-python3-runtime==4.9.3 ; python_version >= "3.10" and python_version < "3.12"
-anyio==3.7.1 ; python_version >= "3.10" and python_version < "3.12"
 appdirs==1.4.4 ; python_version >= "3.10" and python_version < "3.12"
-appnope==0.1.3 ; python_version >= "3.10" and python_version < "3.12" and (platform_system == "Darwin" or sys_platform == "darwin")
-asttokens==2.4.0 ; python_version >= "3.10" and python_version < "3.12"
+appnope==0.1.3 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Darwin"
+asttokens==2.4.1 ; python_version >= "3.10" and python_version < "3.12"
 astunparse==1.6.3 ; python_version >= "3.10" and python_version < "3.12"
-async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.12"
-attrs==23.1.0 ; python_version >= "3.10" and python_version < "3.12"
-backcall==0.2.0 ; python_version >= "3.10" and python_version < "3.12"
 bleach==6.1.0 ; python_version >= "3.10" and python_version < "3.12"
-blessed==1.20.0 ; python_version >= "3.10" and python_version < "3.12"
-cachetools==5.3.1 ; python_version >= "3.10" and python_version < "3.12"
-certifi==2023.7.22 ; python_version >= "3.10" and python_version < "3.12"
+cachetools==5.3.2 ; python_version >= "3.10" and python_version < "3.12"
+certifi==2023.11.17 ; python_version >= "3.10" and python_version < "3.12"
 cffi==1.16.0 ; python_version >= "3.10" and python_version < "3.12" and implementation_name == "pypy"
 cfgv==3.4.0 ; python_version >= "3.10" and python_version < "3.12"
-charset-normalizer==3.3.0 ; python_version >= "3.10" and python_version < "3.12"
+charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "3.12"
 click==8.1.7 ; python_version >= "3.10" and python_version < "3.12"
 colorama==0.4.6 ; python_version >= "3.10" and python_version < "3.12" and (sys_platform == "win32" or platform_system == "Windows")
-colorful==0.5.5 ; python_version >= "3.10" and python_version < "3.12"
-comm==0.1.4 ; python_version >= "3.10" and python_version < "3.12" +comm==0.2.0 ; python_version >= "3.10" and python_version < "3.12" debugpy==1.8.0 ; python_version >= "3.10" and python_version < "3.12" decorator==5.1.1 ; python_version >= "3.10" and python_version < "3.12" -distlib==0.3.7 ; python_version >= "3.10" and python_version < "3.12" +distlib==0.3.8 ; python_version >= "3.10" and python_version < "3.12" docker-pycreds==0.4.0 ; python_version >= "3.10" and python_version < "3.12" -exceptiongroup==1.1.3 ; python_version >= "3.10" and python_version < "3.11" -executing==2.0.0 ; python_version >= "3.10" and python_version < "3.12" -fastapi==0.103.2 ; python_version >= "3.10" and python_version < "3.12" -filelock==3.12.4 ; python_version >= "3.10" and python_version < "3.12" +exceptiongroup==1.2.0 ; python_version >= "3.10" and python_version < "3.11" +executing==2.0.1 ; python_version >= "3.10" and python_version < "3.12" +filelock==3.13.1 ; python_version >= "3.10" and python_version < "3.12" flatbuffers==23.5.26 ; python_version >= "3.10" and python_version < "3.12" -frozenlist==1.4.0 ; python_version >= "3.10" and python_version < "3.12" -fsspec==2023.9.2 ; python_version >= "3.10" and python_version < "3.12" +fsspec==2023.12.2 ; python_version >= "3.10" and python_version < "3.12" gast==0.5.4 ; python_version >= "3.10" and python_version < "3.12" -gitdb==4.0.10 ; python_version >= "3.10" and python_version < "3.12" -gitpython==3.1.38 ; python_version >= "3.10" and python_version < "3.12" -google-api-core==2.12.0 ; python_version >= "3.10" and python_version < "3.12" -google-auth-oauthlib==1.0.0 ; python_version >= "3.10" and python_version < "3.12" -google-auth==2.23.3 ; python_version >= "3.10" and python_version < "3.12" +gitdb==4.0.11 ; python_version >= "3.10" and python_version < "3.12" +gitpython==3.1.40 ; python_version >= "3.10" and python_version < "3.12" +google-auth-oauthlib==1.2.0 ; python_version >= "3.10" and python_version < "3.12" +google-auth==2.25.2 ; python_version >= "3.10" and python_version < "3.12" google-pasta==0.2.0 ; python_version >= "3.10" and python_version < "3.12" -googleapis-common-protos==1.61.0 ; python_version >= "3.10" and python_version < "3.12" -gpustat==1.1.1 ; python_version >= "3.10" and python_version < "3.12" -grpcio==1.59.0 ; python_version >= "3.10" and python_version < "3.12" -h11==0.14.0 ; python_version >= "3.10" and python_version < "3.12" +grpcio==1.60.0 ; python_version >= "3.10" and python_version < "3.12" h5py==3.10.0 ; python_version >= "3.10" and python_version < "3.12" -huggingface-hub==0.17.3 ; python_version >= "3.10" and python_version < "3.12" +huggingface-hub==0.19.4 ; python_version >= "3.10" and python_version < "3.12" hydra-core==1.3.2 ; python_version >= "3.10" and python_version < "3.12" -identify==2.5.30 ; python_version >= "3.10" and python_version < "3.12" -idna==3.4 ; python_version >= "3.10" and python_version < "3.12" -ipykernel==6.26.0 ; python_version >= "3.10" and python_version < "3.12" -ipython==8.16.1 ; python_version >= "3.10" and python_version < "3.12" +identify==2.5.33 ; python_version >= "3.10" and python_version < "3.12" +idna==3.6 ; python_version >= "3.10" and python_version < "3.12" +iniconfig==2.0.0 ; python_version >= "3.10" and python_version < "3.12" +ipykernel==6.27.1 ; python_version >= "3.10" and python_version < "3.12" +ipython==8.18.1 ; python_version >= "3.10" and python_version < "3.12" jedi==0.19.1 ; python_version >= "3.10" and python_version < "3.12" jinja2==3.1.2 ; 
python_version >= "3.10" and python_version < "3.12" -jinxed==1.2.0 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Windows" -jsonschema-specifications==2023.7.1 ; python_version >= "3.10" and python_version < "3.12" -jsonschema==4.19.1 ; python_version >= "3.10" and python_version < "3.12" -jupyter-client==8.4.0 ; python_version >= "3.10" and python_version < "3.12" -jupyter-core==5.4.0 ; python_version >= "3.10" and python_version < "3.12" +jupyter-client==8.6.0 ; python_version >= "3.10" and python_version < "3.12" +jupyter-core==5.5.0 ; python_version >= "3.10" and python_version < "3.12" kaggle==1.5.16 ; python_version >= "3.10" and python_version < "3.12" -keras==2.14.0 ; python_version >= "3.10" and python_version < "3.12" +keras==2.15.0 ; python_version >= "3.10" and python_version < "3.12" libclang==16.0.6 ; python_version >= "3.10" and python_version < "3.12" loguru==0.7.2 ; python_version >= "3.10" and python_version < "3.12" -markdown==3.5 ; python_version >= "3.10" and python_version < "3.12" +markdown==3.5.1 ; python_version >= "3.10" and python_version < "3.12" markupsafe==2.1.3 ; python_version >= "3.10" and python_version < "3.12" matplotlib-inline==0.1.6 ; python_version >= "3.10" and python_version < "3.12" ml-dtypes==0.2.0 ; python_version >= "3.10" and python_version < "3.12" mpmath==1.3.0 ; python_version >= "3.10" and python_version < "3.12" -msgpack==1.0.7 ; python_version >= "3.10" and python_version < "3.12" -multidict==6.0.4 ; python_version >= "3.10" and python_version < "3.12" nest-asyncio==1.5.8 ; python_version >= "3.10" and python_version < "3.12" -networkx==3.2 ; python_version >= "3.10" and python_version < "3.12" +networkx==3.2.1 ; python_version >= "3.10" and python_version < "3.12" nodeenv==1.8.0 ; python_version >= "3.10" and python_version < "3.12" -numpy==1.26.1 ; python_version >= "3.10" and python_version < "3.12" -nvidia-ml-py==12.535.108 ; python_version >= "3.10" and python_version < "3.12" +numpy==1.26.2 ; python_version >= "3.10" and python_version < "3.12" +nvidia-cublas-cu12==12.1.3.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-cuda-cupti-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-cuda-runtime-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-cudnn-cu12==8.9.2.26 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-cufft-cu12==11.0.2.54 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-curand-cu12==10.3.2.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-cusolver-cu12==11.4.5.107 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-cusparse-cu12==12.1.0.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-nccl-cu12==2.18.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and 
python_version < "3.12" +nvidia-nvjitlink-cu12==12.3.101 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +nvidia-nvtx-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" oauthlib==3.2.2 ; python_version >= "3.10" and python_version < "3.12" omegaconf==2.3.0 ; python_version >= "3.10" and python_version < "3.12" -opencensus-context==0.1.3 ; python_version >= "3.10" and python_version < "3.12" -opencensus==0.11.3 ; python_version >= "3.10" and python_version < "3.12" opencv-python==4.8.1.78 ; python_version >= "3.10" and python_version < "3.12" opendatasets==0.1.22 ; python_version >= "3.10" and python_version < "3.12" opt-einsum==3.3.0 ; python_version >= "3.10" and python_version < "3.12" packaging==23.2 ; python_version >= "3.10" and python_version < "3.12" -pandas==2.1.1 ; python_version >= "3.10" and python_version < "3.12" +pandas==2.1.4 ; python_version >= "3.10" and python_version < "3.12" parso==0.8.3 ; python_version >= "3.10" and python_version < "3.12" pathtools==0.1.2 ; python_version >= "3.10" and python_version < "3.12" -pexpect==4.8.0 ; python_version >= "3.10" and python_version < "3.12" and sys_platform != "win32" -pickleshare==0.7.5 ; python_version >= "3.10" and python_version < "3.12" +pexpect==4.9.0 ; python_version >= "3.10" and python_version < "3.12" and sys_platform != "win32" pillow==10.1.0 ; python_version >= "3.10" and python_version < "3.12" platformdirs==3.11.0 ; python_version >= "3.10" and python_version < "3.12" -pre-commit==3.5.0 ; python_version >= "3.10" and python_version < "3.12" -prometheus-client==0.17.1 ; python_version >= "3.10" and python_version < "3.12" -prompt-toolkit==3.0.39 ; python_version >= "3.10" and python_version < "3.12" -protobuf==4.24.4 ; python_version >= "3.10" and python_version < "3.12" +pluggy==1.3.0 ; python_version >= "3.10" and python_version < "3.12" +pre-commit==3.6.0 ; python_version >= "3.10" and python_version < "3.12" +prompt-toolkit==3.0.42 ; python_version >= "3.10" and python_version < "3.12" +protobuf==4.23.4 ; python_version >= "3.10" and python_version < "3.12" psutil==5.9.6 ; python_version >= "3.10" and python_version < "3.12" ptyprocess==0.7.0 ; python_version >= "3.10" and python_version < "3.12" and sys_platform != "win32" pure-eval==0.2.2 ; python_version >= "3.10" and python_version < "3.12" -py-spy==0.3.14 ; python_version >= "3.10" and python_version < "3.12" -pyarrow==13.0.0 ; python_version >= "3.10" and python_version < "3.12" +pyarrow==14.0.1 ; python_version >= "3.10" and python_version < "3.12" pyasn1-modules==0.3.0 ; python_version >= "3.10" and python_version < "3.12" -pyasn1==0.5.0 ; python_version >= "3.10" and python_version < "3.12" +pyasn1==0.5.1 ; python_version >= "3.10" and python_version < "3.12" pycparser==2.21 ; python_version >= "3.10" and python_version < "3.12" and implementation_name == "pypy" -pydantic==1.10.13 ; python_version >= "3.10" and python_version < "3.12" -pygments==2.16.1 ; python_version >= "3.10" and python_version < "3.12" +pydantic-core==2.14.5 ; python_version >= "3.10" and python_version < "3.12" +pydantic==2.5.2 ; python_version >= "3.10" and python_version < "3.12" +pygments==2.17.2 ; python_version >= "3.10" and python_version < "3.12" +pytest==7.4.3 ; python_version >= "3.10" and python_version < "3.12" python-dateutil==2.8.2 ; python_version >= "3.10" and python_version < "3.12" python-slugify==8.0.1 ; 
python_version >= "3.10" and python_version < "3.12" pytz==2023.3.post1 ; python_version >= "3.10" and python_version < "3.12" pywin32==306 ; sys_platform == "win32" and platform_python_implementation != "PyPy" and python_version >= "3.10" and python_version < "3.12" pyyaml==6.0.1 ; python_version >= "3.10" and python_version < "3.12" -pyzmq==25.1.1 ; python_version >= "3.10" and python_version < "3.12" -ray[data,serve,train,tune]==2.7.1 ; python_version >= "3.10" and python_version < "3.12" -referencing==0.30.2 ; python_version >= "3.10" and python_version < "3.12" +pyzmq==25.1.2 ; python_version >= "3.10" and python_version < "3.12" regex==2023.10.3 ; python_version >= "3.10" and python_version < "3.12" requests-oauthlib==1.3.1 ; python_version >= "3.10" and python_version < "3.12" requests==2.31.0 ; python_version >= "3.10" and python_version < "3.12" -rpds-py==0.10.6 ; python_version >= "3.10" and python_version < "3.12" rsa==4.9 ; python_version >= "3.10" and python_version < "3.12" -safetensors==0.4.0 ; python_version >= "3.10" and python_version < "3.12" -sentry-sdk==1.32.0 ; python_version >= "3.10" and python_version < "3.12" +safetensors==0.4.1 ; python_version >= "3.10" and python_version < "3.12" +sentry-sdk==1.39.0 ; python_version >= "3.10" and python_version < "3.12" setproctitle==1.3.3 ; python_version >= "3.10" and python_version < "3.12" -setuptools==68.2.2 ; python_version >= "3.10" and python_version < "3.12" +setuptools==69.0.2 ; python_version >= "3.10" and python_version < "3.12" six==1.16.0 ; python_version >= "3.10" and python_version < "3.12" -smart-open==6.4.0 ; python_version >= "3.10" and python_version < "3.12" smmap==5.0.1 ; python_version >= "3.10" and python_version < "3.12" -sniffio==1.3.0 ; python_version >= "3.10" and python_version < "3.12" stack-data==0.6.3 ; python_version >= "3.10" and python_version < "3.12" -starlette==0.27.0 ; python_version >= "3.10" and python_version < "3.12" sympy==1.12 ; python_version >= "3.10" and python_version < "3.12" -tensorboard-data-server==0.7.1 ; python_version >= "3.10" and python_version < "3.12" -tensorboard==2.14.1 ; python_version >= "3.10" and python_version < "3.12" -tensorboardx==2.6.2.2 ; python_version >= "3.10" and python_version < "3.12" -tensorflow-estimator==2.14.0 ; python_version >= "3.10" and python_version < "3.12" +tensorboard-data-server==0.7.2 ; python_version >= "3.10" and python_version < "3.12" +tensorboard==2.15.1 ; python_version >= "3.10" and python_version < "3.12" +tensorflow-estimator==2.15.0 ; python_version >= "3.10" and python_version < "3.12" tensorflow-io-gcs-filesystem==0.34.0 ; python_version >= "3.10" and python_version < "3.12" -tensorflow==2.14.0 ; python_version >= "3.10" and python_version < "3.12" -termcolor==2.3.0 ; python_version >= "3.10" and python_version < "3.12" +tensorflow==2.15.0 ; python_version >= "3.10" and python_version < "3.12" +termcolor==2.4.0 ; python_version >= "3.10" and python_version < "3.12" text-unidecode==1.3 ; python_version >= "3.10" and python_version < "3.12" -tokenizers==0.14.1 ; python_version >= "3.10" and python_version < "3.12" -torch==2.1.0 ; python_version >= "3.10" and python_version < "3.12" -torchaudio==2.1.0 ; python_version >= "3.10" and python_version < "3.12" +tokenizers==0.15.0 ; python_version >= "3.10" and python_version < "3.12" +tomli==2.0.1 ; python_version >= "3.10" and python_version < "3.11" +torch==2.1.1 ; python_version >= "3.10" and python_version < "3.12" +torchaudio==2.1.1 ; python_version >= "3.10" and python_version 
< "3.12" torchprofile==0.0.4 ; python_version >= "3.10" and python_version < "3.12" -torchvision==0.16.0 ; python_version >= "3.10" and python_version < "3.12" -tornado==6.3.3 ; python_version >= "3.10" and python_version < "3.12" +torchvision==0.16.1 ; python_version >= "3.10" and python_version < "3.12" +tornado==6.4 ; python_version >= "3.10" and python_version < "3.12" tqdm==4.66.1 ; python_version >= "3.10" and python_version < "3.12" -traitlets==5.11.2 ; python_version >= "3.10" and python_version < "3.12" -transformers==4.34.1 ; python_version >= "3.10" and python_version < "3.12" -typing-extensions==4.8.0 ; python_version >= "3.10" and python_version < "3.12" +traitlets==5.14.0 ; python_version >= "3.10" and python_version < "3.12" +transformers==4.36.0 ; python_version >= "3.10" and python_version < "3.12" +triton==2.1.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "3.12" +typing-extensions==4.9.0 ; python_version >= "3.10" and python_version < "3.12" tzdata==2023.3 ; python_version >= "3.10" and python_version < "3.12" -urllib3==2.0.7 ; python_version >= "3.10" and python_version < "3.12" -uvicorn==0.23.2 ; python_version >= "3.10" and python_version < "3.12" +urllib3==2.1.0 ; python_version >= "3.10" and python_version < "3.12" virtualenv==20.21.0 ; python_version >= "3.10" and python_version < "3.12" wandb==0.15.12 ; python_version >= "3.10" and python_version < "3.12" -watchfiles==0.21.0 ; python_version >= "3.10" and python_version < "3.12" -wcwidth==0.2.8 ; python_version >= "3.10" and python_version < "3.12" +wcwidth==0.2.12 ; python_version >= "3.10" and python_version < "3.12" webencodings==0.5.1 ; python_version >= "3.10" and python_version < "3.12" -werkzeug==3.0.0 ; python_version >= "3.10" and python_version < "3.12" -wheel==0.41.2 ; python_version >= "3.10" and python_version < "3.12" +werkzeug==3.0.1 ; python_version >= "3.10" and python_version < "3.12" +wheel==0.42.0 ; python_version >= "3.10" and python_version < "3.12" win32-setctime==1.1.0 ; python_version >= "3.10" and python_version < "3.12" and sys_platform == "win32" wrapt==1.14.1 ; python_version >= "3.10" and python_version < "3.12" -yarl==1.9.2 ; python_version >= "3.10" and python_version < "3.12" diff --git a/run_setup.sh b/run_setup.sh new file mode 100644 index 00000000..2744c0b9 --- /dev/null +++ b/run_setup.sh @@ -0,0 +1,7 @@ +echo "Installing..." 
+curl -sSL https://install.python-poetry.org | python -
+echo "Activating virtual environment"
+poetry install
+poetry shell
+pre-commit install
+echo "Environment setup complete"
diff --git a/set_environment_variables_template.sh b/set_environment_variables_template.sh
index 2c80793d..488d881a 100644
--- a/set_environment_variables_template.sh
+++ b/set_environment_variables_template.sh
@@ -5,3 +5,5 @@ export KAGGLE_USERNAME=username
 export KAGGLE_KEY=xxxxxxxxxxxxxx
 #replace with WANDB key
 export WANDB_API_KEY=xxxxxxxxxxxxxx
+#
+export GOOGLE=
diff --git a/signa2text/README.md b/signa2text/README.md
index d427164d..6cc7a3e5 100644
--- a/signa2text/README.md
+++ b/signa2text/README.md
@@ -1,11 +1,30 @@
-# Signa-Text
+# Linguify-YB
 
 [![LICENSE](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
 [![Python](https://img.shields.io/badge/python-3.6-blue.svg?style=flat-square)](https://www.python.org/)
 [![PyTorch](https://img.shields.io/badge/PyTorch-1.7.0-orange)](https://pytorch.org/)
 
-![image/gif]()
+![image/gif](https://github.com/rileydrizzy/Cohort8-Ransom-Kuti-Ladipo/blob/main/images/sign%20lang.gif)
 
 ## Project description
 
-***Overview:*** \
+***Overview:***
+
+
+## Project Roadmap
+
+
+## How to Contribute
+
+We welcome contributions from the community. If you're interested in contributing, please refer to the [Contributing Guidelines](CONTRIBUTING.md).
+
+## Support and Contact
+
+If you have questions or need assistance, feel free to reach out to:
+
+**Name:** **Ipadeola Ezekiel Ladipo**
+**Email:**
+**GitHub:** [@rileydrizzy](https://github.com/rileydrizzy)
+**LinkedIn:** [Ipadeola Ladipo](https://www.linkedin.com/in/ladipo-ipadeola/)
+
+---
diff --git a/linguify_yb/src/metrics.py b/signa2text/data/.gitkeep
similarity index 100%
rename from linguify_yb/src/metrics.py
rename to signa2text/data/.gitkeep
diff --git a/linguify_yb/data/dataset_paths.json b/signa2text/data/dataset_paths.json
similarity index 100%
rename from linguify_yb/data/dataset_paths.json
rename to signa2text/data/dataset_paths.json
diff --git a/signa2text/data/dev_samples.json b/signa2text/data/dev_samples.json
new file mode 100644
index 00000000..a1f8506d
--- /dev/null
+++ b/signa2text/data/dev_samples.json
@@ -0,0 +1,9 @@
+{
+    "train_files": [
+        "kaggle/input/asl-fingerspelling/train_landmarks/1019715464.parquet",
+        "kaggle/input/asl-fingerspelling/train_landmarks/1021040628.parquet"
+    ],
+    "valid_files": [
+        "kaggle/input/asl-fingerspelling/train_landmarks/105143404.parquet"
+    ]
+}
\ No newline at end of file
diff --git a/linguify_yb/development/code_dev.ipynb b/signa2text/development/code_dev.ipynb
similarity index 100%
rename from linguify_yb/development/code_dev.ipynb
rename to signa2text/development/code_dev.ipynb
diff --git a/linguify_yb/development/data_dev.ipynb b/signa2text/development/data_dev.ipynb
similarity index 100%
rename from linguify_yb/development/data_dev.ipynb
rename to signa2text/development/data_dev.ipynb
diff --git a/linguify_yb/development/dev.ipynb b/signa2text/development/dev.ipynb
similarity index 100%
rename from linguify_yb/development/dev.ipynb
rename to signa2text/development/dev.ipynb
diff --git a/linguify_yb/development/trans_dev.ipynb b/signa2text/development/trans_dev.ipynb
similarity index 100%
rename from linguify_yb/development/trans_dev.ipynb
rename to signa2text/development/trans_dev.ipynb
diff --git a/linguify_yb/notebooks/analyasis.ipynb b/signa2text/notebooks/analyasis.ipynb
similarity index 100%
rename from
linguify_yb/notebooks/analyasis.ipynb rename to signa2text/notebooks/analyasis.ipynb diff --git a/signa2text/run_train.sh b/signa2text/run_train.sh new file mode 100644 index 00000000..9504319f --- /dev/null +++ b/signa2text/run_train.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Display a header with script information +echo "=== Running Train Script ===" + +torchrun --standalone --nproc_per_node=1 src/main.py --model_name test_model --epoch 2 +#torchrun --standalone --nproc_per_node=1 src/main.py +#--epochs 10 --batch 512 \ No newline at end of file diff --git a/signa2text/src/benchmark.py b/signa2text/src/benchmark.py new file mode 100644 index 00000000..02a70d52 --- /dev/null +++ b/signa2text/src/benchmark.py @@ -0,0 +1,117 @@ +""" +Module for benchmarking a PyTorch model. + +This module provides a `BenchMarker` class for analyzing model metrics such as +Multiply-Accumulates(MACs), sparsity, the number of parameters, and model size. + +Classes: +- BenchMarker: A class for benchmarking a PyTorch model. + +Functions: +- get_model_macs: Calculate the MACs (Multiply-Accumulates) of a model. +- get_model_sparsity: Calculate the sparsity of a model. +- get_num_parameters: Calculate the total number of parameters of a model. +- get_model_size: Calculate the size of a model in bits. + + +""" +from torchprofile import profile_macs +from torch import nn + +Byte = 8 +KiB = 1024 * Byte +MiB = 1024 * KiB +GiB = 1024 * MiB + + +class BenchMarker: + """ + Benchmarking class to analyze model metrics such as MACs, + sparsity, number of parameters, and model size. + """ + + def __init__(self) -> None: + pass + + def get_model_macs(self, model, inputs=None) -> int: + """ + Calculate the Multiply-Accumulates (MACs) of a model. + + Parameters: + - model: The PyTorch model. + - inputs: The input tensor to the model. + + Returns: + - int: The number of MACs. + """ + return profile_macs(model, inputs) + + def get_model_sparsity(self, model: nn.Module) -> float: + """ + Calculate the sparsity of the given model. + + Sparsity is defined as 1 - (number of non-zeros / total number of elements). + + Parameters: + - model: The PyTorch model. + + Returns: + - float: The sparsity of the model. + """ + num_nonzeros, num_elements = 0, 0 + for param in model.parameters(): + num_nonzeros += param.count_nonzero() + num_elements += param.numel() + return 1 - float(num_nonzeros) / num_elements + + def get_num_parameters(self, model: nn.Module, count_nonzero_only=False) -> int: + """ + Calculate the total number of parameters of the model. + + Parameters: + - model: The PyTorch model. + - count_nonzero_only: If True, count only nonzero weights. + + Returns: + - int: The total number of parameters. + """ + num_counted_elements = 0 + for param in model.parameters(): + if count_nonzero_only: + num_counted_elements += param.count_nonzero() + else: + num_counted_elements += param.numel() + return num_counted_elements + + def get_model_size( + self, model: nn.Module, data_width=32, count_nonzero_only=False + ) -> int: + """ + Calculate the model size in bits. + + Parameters: + - model: The PyTorch model. + - data_width: Number of bits per element. + - count_nonzero_only: If True, count only nonzero weights. + + Returns: + - int: The model size in bits. + """ + return self.get_num_parameters(model, count_nonzero_only) * data_width + + def runner(self, model): + """ + Run the benchmark on the given model. + + Parameters: + - model: The PyTorch model. 
+ + Returns: + - tuple: A tuple containing the model metrics + """ + model_macs = self.get_model_macs(model) + model_sparsity = self.get_model_sparsity(model) + model_num_params = self.get_num_parameters(model) + model_size = self.get_model_size(model) + + return model_macs, model_sparsity, model_num_params, model_size diff --git a/signa2text/src/config.py b/signa2text/src/config.py new file mode 100644 index 00000000..34ab84f8 --- /dev/null +++ b/signa2text/src/config.py @@ -0,0 +1,6 @@ +"""doc +""" +from pydantic import BaseModel + +class Arg_type(BaseModel): + save_every: int diff --git a/linguify_yb/src/models/__init__.py b/signa2text/src/config/config.yaml similarity index 100% rename from linguify_yb/src/models/__init__.py rename to signa2text/src/config/config.yaml diff --git a/signa2text/src/dataset/dataset_loader.py b/signa2text/src/dataset/dataset_loader.py new file mode 100644 index 00000000..de7cc4eb --- /dev/null +++ b/signa2text/src/dataset/dataset_loader.py @@ -0,0 +1,237 @@ +""" +Module to define datasets and dataloaders for ASL Fingerspelling project. + +Classes: +- TokenHashTable: A class for handling token-to-index and index-to-token mappings. +- LandmarkDataset: A dataset class for ASL Fingerspelling frames,\ + including methods for processing and cleaning frames. + +Functions: +- read_file: Read data from file based on file_id_list and landmarks_metadata_path. +- get_dataset: Create a dataset with token-to-index mapping. +- prepare_dataloader: Prepare a dataloader with distributed sampling. +""" + + +import json +import pandas as pd +import pyarrow.parquet as pq +import torch +from torch.nn import functional as F +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.distributed import DistributedSampler +from dataset.frames_config import FEATURE_COLUMNS +from dataset.preprocess import clean_frames_process + +# File paths for metadata and phrase-to-index mapping +PHRASE_PATH = "/kaggle/input/asl-fingerspelling/character_to_prediction_index.json" +METADATA = "/kaggle/input/asl-fingerspelling/train.csv" + +# Load phrase-to-index mapping +with open(PHRASE_PATH, "r", encoding="utf-8") as f: + character_to_num = json.load(f) + +# Define special tokens and their corresponding indices +PAD_TOKEN = "P" +START_TOKEN = "<" +END_TOKEN = ">" +PAD_TOKEN_IDX = 59 +START_TOKEN_IDX = 60 +END_TOKEN_IDX = 61 + +# Add special tokens to the mapping +character_to_num[PAD_TOKEN] = PAD_TOKEN_IDX +character_to_num[START_TOKEN] = START_TOKEN_IDX +character_to_num[END_TOKEN] = END_TOKEN_IDX + +# Create a mapping from index to character +num_to_character = {j: i for i, j in character_to_num.items()} + + +class TokenHashTable: + def __init__( + self, word2index_mapping=character_to_num, index2word_mapping=num_to_character + ): + """ + Initialize a TokenHashTable to handle token-to-index and index-to-token mapping. + + Parameters: + word2index_mapping (dict): Mapping from word to index. + index2word_mapping (dict): Mapping from index to word. + """ + self.word2index = word2index_mapping + self.index2word = index2word_mapping + + def _indexesfromsentence(self, sentence): + """ + Convert a sentence into a list of corresponding indices. + + Parameters: + sentence (list): List of words in a sentence. + + Returns: + list: List of indices corresponding to words in the sentence. + """ + return [self.word2index[word] for word in sentence] + + def tensorfromsentence(self, sentence): + """ + Convert a sentence into a tensor of indices. 
+
+        Parameters:
+            sentence (list): List of words in a sentence.
+
+        Returns:
+            torch.Tensor: Tensor of indices.
+        """
+        indexes = self._indexesfromsentence(sentence)
+        return torch.tensor(indexes, dtype=torch.long)
+
+    def indexes_to_sentence(self, indexes_list):
+        """
+        Convert a list of indices into a list of corresponding words.
+
+        Parameters:
+            indexes_list (list or torch.Tensor): List or tensor of indices.
+
+        Returns:
+            list: List of words corresponding to the indices.
+        """
+        if torch.is_tensor(indexes_list):
+            indexes_list = indexes_list.tolist()
+        words = [self.index2word[idx] for idx in indexes_list]
+        return words
+
+
+def read_file(file_id_list, landmarks_metadata_path):
+    """
+    Read data from file based on file_id_list and landmarks_metadata_path.
+
+    Parameters:
+        file_id_list (list): List of tuples containing file paths and corresponding file_ids.
+        landmarks_metadata_path (str): Path to the metadata file.
+
+    Returns:
+        tuple: A tuple containing lists of frames and phrases.
+    """
+    phrase_list = []
+    frames_list = []
+    for file, file_id in file_id_list:
+        metadata_train_dataframe = pd.read_csv(landmarks_metadata_path)
+        file_id_df = metadata_train_dataframe.loc[
+            metadata_train_dataframe["file_id"] == file_id
+        ]
+        saved_parquet_df = pq.read_table(
+            file, columns=["sequence_id"] + FEATURE_COLUMNS
+        ).to_pandas()
+        for seq_id, phrase in zip(file_id_df.sequence_id, file_id_df.phrase):
+            frames = saved_parquet_df[saved_parquet_df.index == seq_id].to_numpy()
+            # Handle NaN values
+            frames_list.append(torch.tensor(frames))
+            phrase_list.append(phrase)
+    return frames_list, phrase_list
+
+
+class LandmarkDataset(Dataset):
+    def __init__(self, file_path, table, transform=True):
+        """
+        Initialize a LandmarkDataset.
+
+        Parameters:
+        - file_path (list): List of (file_path, file_id) tuples to load.
+        - table (TokenHashTable): Token lookup table for encoding phrases.
+        - transform (bool, optional): Whether to preprocess frames and labels, by default True.
+        """
+        self.landmarks_metadata_path = METADATA
+        self.frames, self.labels = read_file(file_path, self.landmarks_metadata_path)
+        self.trans = transform
+        self.table = table
+
+    def _label_pre(self, label_sample):
+        """
+        Preprocess label samples.
+
+        Parameters:
+        - label_sample (str): Phrase to encode.
+
+        Returns:
+        - torch.Tensor: Token indices wrapped in start/end tokens and padded to length 64.
+        """
+        sample = START_TOKEN + label_sample + END_TOKEN
+        new_phrase = self.table.tensorfromsentence(list(sample))
+        ans = F.pad(
+            input=new_phrase,
+            pad=[0, 64 - new_phrase.shape[0]],
+            mode="constant",
+            value=PAD_TOKEN_IDX,
+        )
+        return ans
+
+    def __len__(self):
+        return len(self.labels)
+
+    def __getitem__(self, idx):
+        if torch.is_tensor(idx):
+            idx = idx.tolist()
+        phrase = self.labels[idx]
+        frames = self.frames[idx]
+
+        if self.trans:
+            phrase = self._label_pre(phrase)
+            frames = clean_frames_process(frames)
+        return frames, phrase
+
+
+def get_dataset(file_path):
+    """
+    Create a dataset with token-to-index mapping.
+
+    Parameters:
+    - file_path (list): List of (file_path, file_id) tuples to load.
+
+    Returns:
+    - LandmarkDataset: Dataset over the given files.
+    """
+    lookup_table = TokenHashTable(character_to_num, num_to_character)
+    dataset = LandmarkDataset(file_path, lookup_table, transform=True)
+    return dataset
+
+
+def prepare_dataloader(dataset: Dataset, batch_size: int, num_workers_: int = 1):
+    """
+    Prepare a dataloader with distributed sampling.
+
+    Parameters:
+        dataset (Dataset): The dataset to load.
+        batch_size (int): Number of samples per batch.
+        num_workers_ (int, optional): Number of workers for data loading, by default 1.
+
+    Returns:
+        DataLoader: A DataLoader instance for the specified dataset.
+ """ + return DataLoader( + dataset, + batch_size=batch_size, + pin_memory=True, + num_workers=num_workers_, + sampler=DistributedSampler(dataset), + ) + + +#! A dataset class for debugging the train pipeline +class TestDataset(Dataset): + def __init__(self, size): + self.size = size + self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)] + + def __len__(self): + return self.size + + def __getitem__(self, index): + return self.data[index] + + +#! Function to get a test dataset for debugging train pipeline +def get_test_dataset(): + dataset = TestDataset + return dataset diff --git a/signa2text/src/dataset/dataset_paths.py b/signa2text/src/dataset/dataset_paths.py new file mode 100644 index 00000000..3ea6fb0f --- /dev/null +++ b/signa2text/src/dataset/dataset_paths.py @@ -0,0 +1,46 @@ +"""doc +""" +import os +import json +from utils.logger_util import logger + + +def get_dataset_paths(): + """_summary_ + + Returns + ------- + _type_ + _description_ + """ + try: + # On kaggle replace with "data/dataset_paths.json" to train on full data + dataset_paths = "data/dev_samples.json" + with open(dataset_paths, "r", encoding="utf-8") as json_file: + dataset_paths_dict = json.load(json_file) + + # Training dataset + train_dataset_dict = dataset_paths_dict["train_files"] + train_file_ids = [os.path.basename(file) for file in train_dataset_dict] + train_file_ids = [ + int(file_name.replace(".parquet", "")) for file_name in train_file_ids + ] + assert len(train_dataset_dict) == len( + train_file_ids + ), "Failed getting Train files path" + train_ds_files = list(zip(train_dataset_dict, train_file_ids)) + + # Validation dataset + valid_dataset_dict = dataset_paths_dict["valid_files"] + valid_file_ids = [os.path.basename(file) for file in valid_dataset_dict] + valid_file_ids = [ + int(file_name.replace(".parquet", "")) for file_name in valid_file_ids + ] + assert len(train_dataset_dict) == len( + train_file_ids + ), "Failed getting of Valid Files path" + valid_ds_files = list(zip(valid_dataset_dict, valid_file_ids)) + + return train_ds_files, valid_ds_files + except AssertionError as asset_error: + logger.exception(f"Failed due to {asset_error}") diff --git a/linguify_yb/src/dataset/frames_config.py b/signa2text/src/dataset/frames_config.py similarity index 58% rename from linguify_yb/src/dataset/frames_config.py rename to signa2text/src/dataset/frames_config.py index 2f277968..a60ef371 100644 --- a/linguify_yb/src/dataset/frames_config.py +++ b/signa2text/src/dataset/frames_config.py @@ -1,51 +1,22 @@ -"""doc +""" +Module to define constants and lists related to ASL Fingerspelling frame features. 
+Variables:
+- FRAME_LEN: number of frames each sequence is clipped or padded to.
+- LIP: face-mesh indices of the lip landmarks.
+- FEATURE_COLUMNS: ordered list of landmark feature column names.
 """
-
+# Number of frames each sequence is clipped or padded to
 FRAME_LEN = 128
+
+# Indices corresponding to lip features
 LIP = [
-    61,
-    185,
-    40,
-    39,
-    37,
-    0,
-    267,
-    269,
-    270,
-    409,
-    291,
-    146,
-    91,
-    181,
-    84,
-    17,
-    314,
-    405,
-    321,
-    375,
-    78,
-    191,
-    80,
-    81,
-    82,
-    13,
-    312,
-    311,
-    310,
-    415,
-    95,
-    88,
-    178,
-    87,
-    14,
-    317,
-    402,
-    318,
-    324,
-    308,
+    61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291, 146, 91, 181, 84, 17, 314, 405, 321,
+    375, 78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 95, 88, 178, 87, 14, 317, 402, 318,
+    324, 308,
 ]
+# Feature names for different body parts
 FRAME = ["frame"]
 N_LHAND = (
     [f"x_left_hand_{i}" for i in range(21)]
@@ -69,4 +40,5 @@
     + [f"z_face_{i}" for i in LIP]
 )
+# Combined list of feature columns
 FEATURE_COLUMNS = FRAME + N_LHAND + N_RHAND + N_POSE + N_FACE
diff --git a/signa2text/src/dataset/preprocess.py b/signa2text/src/dataset/preprocess.py
new file mode 100644
index 00000000..7e927b9f
--- /dev/null
+++ b/signa2text/src/dataset/preprocess.py
@@ -0,0 +1,61 @@
+"""
+Module to define a function for cleaning and processing ASL Fingerspelling frames.
+
+Functions:
+- clean_frames_process: clean, reshape, and pad a frame tensor to a fixed length.
+"""
+
+import torch
+from torch.nn import functional as F
+
+
+def clean_frames_process(
+    frames,
+    max_frame_len=128,
+    n_hand_landmarks=21,
+    n_pose_landmarks=33,
+    n_face_landmarks=40,
+):
+    """Clean and process ASL Fingerspelling frames.
+
+    Parameters
+    ----------
+    frames : torch.Tensor
+        Input tensor containing frames.
+    max_frame_len : int, optional
+        Maximum length of frames, by default 128
+    n_hand_landmarks : int, optional
+        Number of hand landmarks, by default 21
+    n_pose_landmarks : int, optional
+        Number of pose landmarks, by default 33
+    n_face_landmarks : int, optional
+        Number of face landmarks, by default 40
+
+    Returns
+    -------
+    torch.Tensor
+        Cleaned and processed frames tensor.
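+
+    Example
+    -------
+    Illustrative shapes only: a raw (num_frames, 345) landmark tensor is
+    clipped or padded to (max_frame_len, 345).
+
+    >>> clean_frames_process(torch.rand(160, 345)).shape
+    torch.Size([128, 345])
+    >>> clean_frames_process(torch.rand(40, 345)).shape
+    torch.Size([128, 345])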
+ """ + # Clip frames to the maximum length + frames = frames[:max_frame_len] + # Replace NaN values with zeros + frames = torch.where(torch.isnan(frames), torch.zeros_like(frames), frames) + + # Split the tensor into different body part landmarks + lhand = frames[:, 0:63].view(frames.size(0), 3, n_hand_landmarks).transpose(1, 2) + rhand = frames[:, 63:126].view(frames.size(0), 3, n_hand_landmarks).transpose(1, 2) + pose = frames[:, 126:225].view(frames.size(0), 3, n_pose_landmarks).transpose(1, 2) + face = frames[:, 225:345].view(frames.size(0), 3, n_face_landmarks).transpose(1, 2) + + # Concatenate the landmarks along the specified axis + frames = torch.cat([lhand, rhand, pose, face], axis=1) + # Reshape the tensor + frames = frames.view(frames.size(0), 345) + + if frames.size(0) < max_frame_len: + # Calculate the padding on the first dimension from the bottom + padding_bottom = max(0, max_frame_len - frames.size(0)) + # Pad the tensor along the first dimension from the bottom + frames = F.pad(frames, (0, 0, 0, padding_bottom)) + + return frames diff --git a/linguify_yb/src/dev_data.py b/signa2text/src/dev_data.py similarity index 96% rename from linguify_yb/src/dev_data.py rename to signa2text/src/dev_data.py index a861d8b0..6f15731a 100644 --- a/linguify_yb/src/dev_data.py +++ b/signa2text/src/dev_data.py @@ -16,7 +16,7 @@ from utils.logger_util import logger -DATA_DIR = "data/asl-fingerspelling/" +DATA_DIR = "kaggle/input/asl-fingerspelling/" data_files = ["train.csv", "character_to_prediction_index.json"] train_landmarks = ["1019715464.parquet", "1021040628.parquet", "105143404.parquet"] TRAIN_LANDMARKS_DIR = "train_landmarks/" @@ -30,7 +30,7 @@ "-f", "FILE", "-p", - "data/raw/", + f"{DATA_DIR}", ] @@ -98,7 +98,7 @@ def main(): try: logger.info(f"Current Available space {check_storage()}GB") for file in data_files: - logger.info(f"Downloading{file} in {DATA_DIR}") + logger.info(f"Downloading {file} in {DATA_DIR}") COMMAND[6] = file unzipfile_path = DATA_DIR + file + ".zip" downlaod_file(COMMAND, unzipfile_path, DATA_DIR) diff --git a/linguify_yb/src/models/static_transfromer.py b/signa2text/src/evalute.py similarity index 100% rename from linguify_yb/src/models/static_transfromer.py rename to signa2text/src/evalute.py diff --git a/signa2text/src/main.py b/signa2text/src/main.py new file mode 100644 index 00000000..bdfeb63b --- /dev/null +++ b/signa2text/src/main.py @@ -0,0 +1,95 @@ +""" +Module for distributed training with PyTorch using Distributed Data Parallel (DDP). + +""" + +# TODO cleanup and complete documentation +# TODO Complete and refactor code for distributed training +# TODO remove test model and test data + +import torch + +from torch import nn + +from utils.util import parse_args, set_seed +from utils.logger_util import logger +from models.model_loader import ModelLoader +from dataset.dataset_loader import get_dataset, prepare_dataloader, get_test_dataset +from dataset.dataset_paths import get_dataset_paths +from trainer import Trainer, ddp_setup +from torch.distributed import destroy_process_group + + +def load_train_objs(model_name, files=None): + """ + Load training objects, including the model, optimizer, dataset, and criterion. + + Parameters: + - model_name (str): Name of the model to be loaded. + - files: Optional parameter for specifying files. + + Returns: + - model: The loaded model. + - optimizer_: The optimizer for training. + - dataset: The training dataset. + - criterion: The loss criterion for training. 
+ """ + model = ModelLoader().get_model(model_name) + + # Optimizes given model/function using TorchDynamo and specified backend + torch.compile(model) + optimizer_ = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + criterion = nn.CrossEntropyLoss(label_smoothing=0.1) + dataset = get_test_dataset() # get_dataset(files) + return model, optimizer_, dataset, criterion + + +def main(model_name: str, save_every: int, total_epochs: int, batch_size: int): + """ + Main function for training a model. + + Parameters: + - model_name (str): Name of the model to be trained. + - save_every (int): Frequency of saving the model during training. + - total_epochs (int): Total number of training epochs. + - batch_size (int): Batch size for training. + """ + logger.info(f"Starting training on {model_name}, epoch -> {total_epochs}") + logger.info(f"Batch Size -> {batch_size}, model saved every -> {save_every} epoch") + + # To ensure reproducibility of the training process + set_seed() + + try: + # train, valid = get_dataset_paths() + ddp_setup() + dataset, model, optimizer, criterion = load_train_objs(model_name) + train_dataset = prepare_dataloader( + dataset, + batch_size, + ) + trainer = Trainer( + model=model, + train_data=train_dataset, + optimizer=optimizer, + save_every=save_every, + loss_func=criterion, + ) + + trainer.train(total_epochs) + destroy_process_group() + + logger.success(f"Training completed: {total_epochs} epochs on.") + except Exception as error: + logger.exception(f"Training failed due to {error}.") + + +if __name__ == "__main__": + arg = parse_args() + logger.info(f"{arg.model_name}") + main( + model_name=arg.model_name, + save_every=arg.save_every, + total_epochs=arg.epochs, + batch_size=arg.batch, + ) diff --git a/linguify_yb/src/tests/__init__.py b/signa2text/src/metrics.py similarity index 100% rename from linguify_yb/src/tests/__init__.py rename to signa2text/src/metrics.py diff --git a/signa2text/src/utils/__init__.py b/signa2text/src/models/__init__.py similarity index 100% rename from signa2text/src/utils/__init__.py rename to signa2text/src/models/__init__.py diff --git a/linguify_yb/src/models/baseline_transformer.py b/signa2text/src/models/baseline_transformer.py similarity index 78% rename from linguify_yb/src/models/baseline_transformer.py rename to signa2text/src/models/baseline_transformer.py index 5fc151e2..9147d431 100644 --- a/linguify_yb/src/models/baseline_transformer.py +++ b/signa2text/src/models/baseline_transformer.py @@ -18,41 +18,46 @@ class TokenEmbedding(nn.Module): - """Embed the tokens with postion encoding""" + """Embed the tokens with position encoding""" def __init__(self, num_vocab, maxlen, embedding_dim): - """_summary_ - + """ Parameters ---------- num_vocab : int - number of vocabulary + number of character vocabulary maxlen : int - maximuin length of sequence + maximum length of sequence embedding_dim : int embedding output dimension """ super().__init__() self.token_embed_layer = nn.Embedding(num_vocab, embedding_dim) - self.postion_embed_layer = nn.Embedding(maxlen, embedding_dim) + self.position_embed_layer = nn.Embedding(maxlen, embedding_dim) def forward(self, x): - """_summary_ - + """ Parameters ---------- x : tensors - _description_ + input tensor with shape (batch_size, sequence_length) Returns ------- tensors - _description_ + embedded tensor with shape (batch_size, sequence_length, embedding_dim) """ - maxlen = x.size(-1) + batch_size, maxlen = x.size() + + # Token embedding x = self.token_embed_layer(x) + + # Positional 
encoding positions = torch.arange(0, maxlen).to(x.device) - positions = self.postion_embed_layer(positions) + positions = ( + self.position_embed_layer(positions).unsqueeze(0).expand(batch_size, -1, -1) + ) + return x + positions @@ -97,27 +102,27 @@ def forward(self, x): class TransformerEncoder(nn.Module): - """_summary_""" + """Transformer Encoder Module""" def __init__( self, embedding_dim, num_heads, feed_forward_dim, - rate=0.1, + dropout_rate=0.1, ): - """_summary_ + """Initialize the Transformer Encoder Parameters ---------- - embedding_dim : _type_ - _description_ - num_heads : _type_ - _description_ - feed_forward_dim : _type_ - _description_ - rate : float, optional - _description_, by default 0.1 + embedding_dim : int + Dimension of input embeddings + num_heads : int + Number of attention heads in the multi-head attention layer + feed_forward_dim : int + Dimension of the feed-forward layer + dropout_rate : float, optional + Dropout rate, by default 0.1 """ super().__init__() self.multi_attention = nn.MultiheadAttention(embedding_dim, num_heads) @@ -129,31 +134,55 @@ def __init__( self.layernorm1 = nn.LayerNorm(embedding_dim, eps=1e-6) self.layernorm2 = nn.LayerNorm(embedding_dim, eps=1e-6) - self.dropout1 = nn.Dropout(rate) - self.dropout2 = nn.Dropout(rate) + self.dropout1 = nn.Dropout(dropout_rate) + self.dropout2 = nn.Dropout(dropout_rate) def forward(self, inputs_x): + # Multi-head attention multi_attention_out, _ = self.multi_attention(inputs_x, inputs_x, inputs_x) multi_attention_out = self.dropout1(multi_attention_out) + + # Residual connection and layer normalization out1 = self.layernorm1(inputs_x + multi_attention_out) + # Feed-forward layer ffn_out = self.ffn(out1) ffn_out = self.dropout2(ffn_out) + + # Residual connection and layer normalization x = self.layernorm2(out1 + ffn_out) + return x class TransformerDecoder(nn.Module): - """_summary_""" + """Transformer Decoder Module""" def __init__(self, embedding_dim, num_heads, feed_forward_dim, dropout_rate=0.1): + """Initialize the Transformer Decoder + + Parameters + ---------- + embedding_dim : int + Dimension of input embeddings + num_heads : int + Number of attention heads in the multi-head attention layer + feed_forward_dim : int + Dimension of the feed-forward layer + dropout_rate : float, optional + Dropout rate, by default 0.1 + """ super().__init__() self.num_heads_ = num_heads self.layernorm1 = nn.LayerNorm(embedding_dim, eps=1e-6) self.layernorm2 = nn.LayerNorm(embedding_dim, eps=1e-6) self.layernorm3 = nn.LayerNorm(embedding_dim, eps=1e-6) - self.decoder_multi_attention = nn.MultiheadAttention(embedding_dim, num_heads) - self.encoder_multi_attention = nn.MultiheadAttention(embedding_dim, num_heads) + self.decoder_multi_attention = nn.MultiheadAttention( + embedding_dim, num_heads, batch_first=True + ) + self.encoder_multi_attention = nn.MultiheadAttention( + embedding_dim, num_heads, batch_first=True + ) self.decoder_dropout = nn.Dropout(0.5) self.encoder_dropout = nn.Dropout(dropout_rate) self.ffn_dropout = nn.Dropout(dropout_rate) @@ -163,7 +192,7 @@ def __init__(self, embedding_dim, num_heads, feed_forward_dim, dropout_rate=0.1) nn.Linear(feed_forward_dim, embedding_dim), ) - def _causal_attention_mask(self, sequence_length, batch_size=1, device=None): + def _causal_attention_mask(self, sequence_length, batch_size, device=None): mask = torch.triu(torch.ones(sequence_length, sequence_length), diagonal=1).to( device ) @@ -172,15 +201,10 @@ def _causal_attention_mask(self, sequence_length, batch_size=1, 
device=None):
         )
         return mask
 
-    def forward(
-        self,
-        encoder_out,
-        src_target_,
-    ):
-        input_shape = src_target_.size()
-        batch_size = 1  # input_shape[0]
-        seq_len = input_shape[0]
-        x_device = src_target_.device
+    def forward(self, encoder_out, src_target):
+        input_shape = src_target.size()
+        batch_size, seq_len, _ = input_shape
+        x_device = src_target.device
 
         # Mask
         causal_mask = self._causal_attention_mask(
@@ -188,12 +212,9 @@ def forward(
         )
 
         target_att, _ = self.decoder_multi_attention(
-            src_target_, src_target_, src_target_, attn_mask=causal_mask
-        )
-        target_norm_out = self.layernorm1(
-            src_target_ + self.decoder_dropout(target_att)
+            src_target, src_target, src_target, attn_mask=causal_mask
         )
-
+        target_norm_out = self.layernorm1(src_target + self.decoder_dropout(target_att))
         encoder_out, _ = self.encoder_multi_attention(
             target_norm_out, encoder_out, encoder_out
         )
@@ -201,6 +222,7 @@ def forward(
 
         ffn_out = self.ffn(enc_out_norm)
         ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out))
+
         return ffn_out_norm
diff --git a/linguify_yb/src/models/model_loader.py b/signa2text/src/models/model_loader.py
similarity index 68%
rename from linguify_yb/src/models/model_loader.py
rename to signa2text/src/models/model_loader.py
index aabc97b7..529690f8 100644
--- a/linguify_yb/src/models/model_loader.py
+++ b/signa2text/src/models/model_loader.py
@@ -3,15 +3,21 @@
 """
 from models.baseline_transformer import ASLTransformer
+import torch
+
+
+def test_model():
+    # A tiny stand-in network for smoke-testing the training loop;
+    # nn.Sequential takes modules as positional arguments, not a list
+    model = torch.nn.Sequential(
+        torch.nn.Linear(20, 100), torch.nn.Linear(100, 10), torch.nn.Linear(10, 5)
+    )
+    return model
 
 
 class ModelLoader:
     """Model Loader"""
 
     def __init__(self):
-        self.models = {
-            "asl_transfomer": ASLTransformer(),
-        }
+        self.models = {"asl_transformer": ASLTransformer(), "test_model": test_model()}
 
     def get_model(self, model_name):
         """build and retrieve the model instance
diff --git a/linguify_yb/src/tests/test_pipeline.py b/signa2text/src/models/static_transfromer.py
similarity index 100%
rename from linguify_yb/src/tests/test_pipeline.py
rename to signa2text/src/models/static_transfromer.py
diff --git a/signa2text/src/tests/__init__.py b/signa2text/src/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/linguify_yb/src/tests/test_data_ingestion.py b/signa2text/src/tests/test_data_ingestion.py
similarity index 70%
rename from linguify_yb/src/tests/test_data_ingestion.py
rename to signa2text/src/tests/test_data_ingestion.py
index aaa5eedf..ab9d97f5 100644
--- a/linguify_yb/src/tests/test_data_ingestion.py
+++ b/signa2text/src/tests/test_data_ingestion.py
@@ -3,7 +3,6 @@
 
 import pytest
 import torch
-from torch.utils.data import DataLoader
 from src.dataset.frames_config import FRAME_LEN
 from src.dataset.preprocess import clean_frames_process
 from src.dataset.dataset_loader import TokenHashTable
@@ -31,15 +30,34 @@ def test_token_hash_table():
     token_table = TokenHashTable()
     sample_sentence = "this is a test run"
     sample_sentence_len = len(sample_sentence)
-    sample_sentence_token = [60,51,39,40,50,0,40,50,0,32,0,51,36,50,51,0,49,52,45,61]
-    # Padding the token
-    sample_sentence_token = sample_sentence_token + (
-        [59] * (64 - len(sample_sentence_token))
-    )
-    sample_sentence_token = torch.tensor(sample_sentence_token)
+    # Expected token indices for the raw sentence (no start/end/pad tokens)
+    sample_sentence_token = [
+        51, 39, 40, 50, 0, 40, 50, 0, 32, 0, 51, 36, 50, 51, 0, 49, 52, 45,
+    ]
+    sample_sentence_token = torch.tensor(sample_sentence_token, dtype=torch.long)
     tokenize_result = 
token_table.sentence_to_tensor(sample_sentence)
+    assert sample_sentence_len == len(tokenize_result)
+    # The encoded indices should match the expected tokens exactly
+    assert torch.equal(sample_sentence_token, tokenize_result)
 
-    # Assert that clean_frames is a PyTorch tensor
+    # Assert that tokenize_result is a PyTorch tensor
     assert torch.is_tensor(tokenize_result), "is not PyTorch tensor"
diff --git a/linguify_yb/src/tests/test_model.py b/signa2text/src/tests/test_model.py
similarity index 100%
rename from linguify_yb/src/tests/test_model.py
rename to signa2text/src/tests/test_model.py
diff --git a/signa2text/src/tests/test_pipeline.py b/signa2text/src/tests/test_pipeline.py
new file mode 100644
index 00000000..e69de29b
diff --git a/signa2text/src/trainer.py b/signa2text/src/trainer.py
new file mode 100644
index 00000000..c7c9587e
--- /dev/null
+++ b/signa2text/src/trainer.py
@@ -0,0 +1,132 @@
+"""
+Module for distributed training with PyTorch using Distributed Data Parallel (DDP).
+
+Classes:
+- Trainer: A class for training neural network models in a distributed setup.
+
+Functions:
+- ddp_setup: Setup Distributed Data Parallel (DDP) for training.
+"""
+
+# TODO Complete and refactor code for distributed training
+
+import os
+
+import torch
+from torch.utils.data import DataLoader
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed import init_process_group
+from utils.logger_util import logger
+
+
+def ddp_setup():
+    """
+    Setup Distributed Data Parallel (DDP) for training.
+    """
+    init_process_group(backend="nccl")
+    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+
+
+class Trainer:
+    def __init__(
+        self,
+        model: torch.nn.Module,
+        train_data: DataLoader,
+        optimizer: torch.optim.Optimizer,
+        save_every: int,
+        loss_func,
+    ):
+        """
+        Initialize a Trainer instance.
+
+        Parameters:
+        - model (torch.nn.Module): The neural network model.
+        - train_data (DataLoader): The DataLoader for training data.
+        - optimizer (torch.optim.Optimizer): The optimizer for training.
+        - save_every (int): Save a snapshot of the model every `save_every` epochs.
+        - loss_func: The loss function for training.
+        """
+        self.gpu_id = int(os.environ["LOCAL_RANK"])
+        self.model = model.to(self.gpu_id)
+        self.train_data = train_data
+        self.optimizer = optimizer
+        self.loss_func = loss_func
+        self.save_every = save_every
+        self.epochs_run = 0
+        self.snapshot_path = "snapshot.pt"
+        if os.path.exists(self.snapshot_path):
+            logger.info("Loading snapshot")
+            self._load_snapshot(self.snapshot_path)
+
+        self.model = DDP(self.model, device_ids=[self.gpu_id])
+
+    def _load_snapshot(self, snapshot_path):
+        """
+        Load a snapshot of the model.
+
+        Parameters:
+        - snapshot_path (str): Path to the snapshot file.
+        """
+        loc = f"cuda:{self.gpu_id}"
+        snapshot = torch.load(snapshot_path, map_location=loc)
+        self.model.load_state_dict(snapshot["MODEL_STATE"])
+        self.epochs_run = snapshot["EPOCHS_RUN"]
+        logger.info(f"Resuming training from snapshot at Epoch {self.epochs_run}")
+
+    def _run_batch(self, source, targets):
+        """
+        Run a training batch.
+
+        Parameters:
+        - source (torch.Tensor): Input batch moved to this process's GPU.
+        - targets (torch.Tensor): Target batch moved to this process's GPU.
+        """
+        self.optimizer.zero_grad()
+        output = self.model(source)
+        loss = self.loss_func(output, targets)
+        loss.backward()
+        self.optimizer.step()
+
+    def _run_epoch(self, epoch):
+        """
+        Run a training epoch.
+
+        Parameters:
+        - epoch (int): The current epoch.
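+
+        Note: the sampler's set_epoch(epoch) must be called before iterating
+        so that DistributedSampler reshuffles the data each epoch.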
+ """ + b_sz = len(next(iter(self.train_data))[0]) + logger.info( + f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}" + ) + self.train_data.sampler.set_epoch(epoch) + for source, targets in self.train_data: + source = source.to(self.gpu_id) + targets = targets.to(self.gpu_id) + self._run_batch(source, targets) + + def _save_snapshot(self, epoch): + """ + Save a snapshot of the model. + + Parameters: + - epoch (int): The current epoch. + """ + snapshot = { + "MODEL_STATE": self.model.module.state_dict(), + "EPOCHS_RUN": epoch, + } + torch.save(snapshot, self.snapshot_path) + logger.info(f"Epoch {epoch} | Training snapshot saved at {self.snapshot_path}") + + def train(self, max_epochs: int): + """ + Train the model for a specified number of epochs. + + Parameters: + - max_epochs (int): The maximum number of epochs to train. + """ + for epoch in range(self.epochs_run, max_epochs): + self._run_epoch(epoch) + if self.gpu_id == 0 and epoch % self.save_every == 0: + self._save_snapshot(epoch) diff --git a/signa2text/src/utils/logger_util.py b/signa2text/src/utils/logger_util.py index 618d207b..f81d88f9 100644 --- a/signa2text/src/utils/logger_util.py +++ b/signa2text/src/utils/logger_util.py @@ -10,7 +10,8 @@ - `logger.warning("Warning message")` - `logger.error("Error message")` - `logger.critical("Critical message")` - - `logger.succues("success messgae")` + - `logger.exception(" ")` + - `logger.success("success messgae")` """ from pathlib import Path diff --git a/linguify_yb/src/utils/util.py b/signa2text/src/utils/util.py similarity index 90% rename from linguify_yb/src/utils/util.py rename to signa2text/src/utils/util.py index c4371b7b..cd27952f 100644 --- a/linguify_yb/src/utils/util.py +++ b/signa2text/src/utils/util.py @@ -39,8 +39,8 @@ def parse_args(): ) parser.add_argument( - "--model", - default="asl_transfomer", + "--model_name", + default="baseline_transfomer", type=str, metavar="N", help="name of model to train", @@ -72,6 +72,12 @@ def parse_args(): type=bool, help="Path to the checkpoint for resuming training", ) + parser.add_argument( + "--save_every", + default= 2, + type=int, + help="", + ) args = parser.parse_args() return args diff --git a/version.txt b/version.txt index e69de29b..8a9ecc2e 100644 --- a/version.txt +++ b/version.txt @@ -0,0 +1 @@ +0.0.1 \ No newline at end of file