diff --git a/nlp/__init__.py b/nlp/__init__.py
index 3c59c3c..3653ff3 100644
--- a/nlp/__init__.py
+++ b/nlp/__init__.py
@@ -14,11 +14,11 @@
 # such as where data will be downloaded from.
 # here is an example.
 def write_default_config(path):
-    w = open(path, 'wt')
-    w.write('[data]\n')
-    w.write('url = https://drive.google.com/drive/folders/1gF0E9E8w1x-yz5FvxS8zFZlSNIivYfhT/train.csv\n')
-    w.write('file = %s%s%s\n' % (nlp_path, os.path.sep, 'train.csv'))
-    w.close()
+    with open(path, 'wt') as w:
+        w.write('[data]\n')
+        w.write('url = https://www.dropbox.com/scl/fi/8afm3cbr1ui1j3qrtv1u9/train.csv?rlkey=d0y73zduv1ira37d5xyd0sg2m&st=tfkqctcq&dl=1\n')  # Corrected URL
+        w.write('file = %s%s%s\n' % (os.path.dirname(path), os.path.sep, 'nli.csv'))  # Corrected 'file' option
+
 
 # Find NLP_HOME path
 if 'NLP_HOME' in os.environ:
diff --git a/nlp/cli.py b/nlp/cli.py
index a273e40..df2982d 100644
--- a/nlp/cli.py
+++ b/nlp/cli.py
@@ -5,6 +5,7 @@
 import glob
 import pickle
 import sys
+import os
 
 import numpy as np
 import pandas as pd
@@ -18,7 +19,7 @@
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import classification_report
 
-from . import clf_path, config, config_path
+from . import clf_path, config, config_path, write_default_config
 
 model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -41,29 +42,52 @@ def web(port):
     from .app import app
     app.run(host='0.0.0.0', debug=True, port=port)
+
 @main.command('dl-data')
 def dl_data():
     """
     Download training/testing data.
     """
 
-    print("Config file path:", config_path)
-    config.read(config_path) # Reload the configuration
-    # data_url = config.get('data', 'url')
-    # data_file = config.get('data', 'file')
-    data_url = 'https://www.dropbox.com/scl/fi/8afm3cbr1ui1j3qrtv1u9/train.csv?rlkey=d0y73zduv1ira37d5xyd0sg2m&dl=0'
-    data_file = '/Users/petersapountzis/.nlp/nli_train.csv'
-    print('downloading from %s to %s' % (data_url, data_file))
+    # Rewrite the default configuration to make sure it's updated
+    write_default_config(config_path)
+
+    # Now reload the configuration to check the new values
+    config.read(config_path)
+    data_url = config.get('data', 'url')
+    data_file = config.get('data', 'file')
+
+    print("configuration content:")
+    print("URL:", data_url)
+    print("File:", data_file)
+
+    # Proceed with the data download
+    print('Downloading from %s to %s' % (data_url, data_file))
     r = requests.get(data_url)
+    r.raise_for_status()  # Ensure successful request
     with open(data_file, 'wt') as f:
         f.write(r.text)
-
+
+
 def load_and_tokenize_data(file_path):
     df = pd.read_csv(file_path)
+    print("Columns in CSV:", df.columns)  # Display column names
+    print("Number of rows:", len(df))  # Display number of rows
+
+
+    # Check if required columns are present
+    required_columns = ['premise', 'hypothesis', 'label']
+    missing_columns = [col for col in required_columns if col not in df.columns]
+
+    if missing_columns:
+        raise KeyError(f"Missing required columns: {', '.join(missing_columns)}")
+
     df['premise'] = df['premise'].astype(str)
     df['hypothesis'] = df['hypothesis'].astype(str)
+
     tokenized_data = tokenizer(df['premise'].tolist(), df['hypothesis'].tolist(), padding=True, truncation=True, return_tensors="pt")
     labels = torch.tensor(df['label'].values)
+
     return tokenized_data, labels
 
 
 def train_model(data_file):
@@ -90,6 +114,7 @@ def train_model(data_file):
 
 @main.command('stats')
+# TODO: update stats function for my df
 def stats():
     """ Read the data files and print interesting
     statistics.
     """
@@ -100,9 +125,17 @@ def stats():
     print(df.partisan.value_counts())
 
 @main.command('train')
-@click.argument('data_file', type=click.Path(exists=True))
-def train(data_file):
+def train():
     """Train the NLI classifier."""
+    config.read(config_path)  # Reload the configuration to get the correct file path
+
+    # Get the file path for training data from the configuration
+    data_file = config.get('data', 'file')
+
+    if not data_file or not os.path.exists(data_file):
+        raise FileNotFoundError("Training data file not found. Please run 'nlp dl-data' first.")
+
+    # Proceed with training using the correct data file
     train_model(data_file)
     print("Training complete.")
 
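Note: `train` no longer takes a `data_file` argument; it resolves the training file from the config that `dl-data` writes, so `dl-data` must run first. A minimal sketch of how the two commands chain, using click's test runner (this assumes the package is importable as `nlp` and that `nlp.cli` exposes the `main` group used by the decorators above):

    from click.testing import CliRunner
    from nlp.cli import main

    runner = CliRunner()
    # 'dl-data' rewrites the default config, then downloads the CSV
    # to the file path recorded under the [data] section.
    print(runner.invoke(main, ['dl-data']).output)
    # 'train' reads the same [data] file entry and trains on it.
    print(runner.invoke(main, ['train']).output)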