### This is an inference script which uses a fine-tuned model. Can anybody help me incorporate some lines of code so that, at the end, it also generates a Shapley (SHAP) plot?

```python
import os
import json
import argparse
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from typing import List, Dict
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, precision_score, recall_score
import random


def set_seed(seed: int):
    """Set seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
class PredictionDataset(Dataset):
    """Dataset for predictions."""

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, kmer: int = -1):
        super(PredictionDataset, self).__init__()
        # Load data from the disk; each line is "<sequence>,<label>" with no header row
        with open(data_path, "r") as f:
            lines = [line.strip().split(',') for line in f.readlines()]
        self.texts = [line[0] for line in lines]
        self.true_labels = [int(line[1]) for line in lines]  # labels are in the second column
        if kmer != -1:
            # Generate k-mer string
            self.texts = [generate_kmer_str(text, kmer) for text in self.texts]
        output = tokenizer(
            self.texts,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        self.input_ids = output["input_ids"]
        self.attention_mask = output["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(
            input_ids=self.input_ids[i],
            attention_mask=self.attention_mask[i],
            true_label=torch.tensor(self.true_labels[i]),
        )
def generate_kmer_str(sequence: str, k: int) -> str:
    """Generate k-mer string from DNA sequence."""
    return " ".join([sequence[i:i+k] for i in range(len(sequence) - k + 1)])
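
# Example (hypothetical 5-base sequence, k = 3): the overlapping windows are
#   generate_kmer_str("ATCGA", 3) -> "ATC TCG CGA"
# i.e. len(sequence) - k + 1 space-separated k-mers.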
def predict(model_path: str, data_path: str, kmer: int):
    # Set seed for reproducibility
    set_seed(42)

    # Load the model and tokenizer with trust_remote_code=True
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_path,
        trust_remote_code=True,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Load test data
    test_dataset = PredictionDataset(
        data_path=data_path,
        tokenizer=tokenizer,
        kmer=kmer,
    )

    # Prepare data loader
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Move the model to the GPU once, if available (not inside the batch loop)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Run prediction
    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            true_labels = batch["true_label"].numpy()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()

            all_predictions.extend(preds.tolist())
            all_true_labels.extend(true_labels.tolist())

    # Calculate metrics (macro averaging weights each class equally)
    accuracy = accuracy_score(all_true_labels, all_predictions)
    f1 = f1_score(all_true_labels, all_predictions, average="macro", zero_division=0)
    mcc = matthews_corrcoef(all_true_labels, all_predictions)
    precision = precision_score(all_true_labels, all_predictions, average="macro", zero_division=0)
    recall = recall_score(all_true_labels, all_predictions, average="macro", zero_division=0)

    # Save metrics and predictions
    results = {
        "predictions": all_predictions,
        "metrics": {
            "accuracy": accuracy,
            "f1_score": f1,
            "mcc": mcc,
            "precision": precision,
            "recall": recall,
        },
    }
    results_path = os.path.join(model_path, "predictions.json")
    with open(results_path, "w") as f:
        json.dump(results, f, indent=4)
    print(f"Predictions and metrics saved to {results_path}")
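
# Worked example of the macro averaging above (illustrative numbers): with
# y_true = [0, 0, 0, 1] and y_pred = [0, 0, 1, 1], class-0 F1 = 0.800 and
# class-1 F1 = 0.667, so macro F1 = (0.800 + 0.667) / 2 ≈ 0.733, even though
# class 0 has three times the support.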
def main():
    parser = argparse.ArgumentParser(description="Run predictions with a fine-tuned model.")
    parser.add_argument("--model_path", type=str, required=True, help="Path to the fine-tuned model.")
    parser.add_argument("--data_path", type=str, required=True, help="Path to the data for prediction.")
    parser.add_argument("--kmer", type=int, default=-1, help="k-mer size for sequence input. Default is -1 (no k-mer).")
    args = parser.parse_args()
    predict(args.model_path, args.data_path, args.kmer)
if __name__ == "__main__":
    main()
```
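
One way to get the Shapley plot is the `shap` library's text explainer, which can wrap a Hugging Face `text-classification` pipeline directly. Below is a minimal sketch, not a drop-in guarantee: it assumes `shap` and `matplotlib` are installed, that the checkpoint is compatible with the standard pipeline despite `trust_remote_code`, and that the class index `1` and the output file names are placeholders to adjust. It would go at the end of `predict()`, where `model`, `tokenizer`, `test_dataset`, and `model_path` are all in scope:

```python
import shap
import matplotlib.pyplot as plt

# Wrap the fine-tuned model as a text-classification pipeline so SHAP can
# treat it as a black-box function over raw strings.
pipe = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    top_k=None,  # return a score for every class, not just the top one
)

# Explain a small subset only; SHAP over a transformer is slow on full test sets.
explainer = shap.Explainer(pipe)
shap_values = explainer(test_dataset.texts[:20])

# Per-token contribution plot for the first sequence. display=False makes
# shap.plots.text return the HTML markup (in-place rendering needs a notebook).
html = shap.plots.text(shap_values[0], display=False)
with open(os.path.join(model_path, "shap_text_plot.html"), "w") as f:
    f.write(html)

# Mean SHAP value per token for one class across the explained examples
# (class index 1 is an assumption -- pick the class you care about).
shap.plots.bar(shap_values[:, :, 1].mean(0), show=False)
plt.savefig(os.path.join(model_path, "shap_bar_plot.png"), bbox_inches="tight")
plt.close()
```

`shap.plots.text` shows per-token contributions for a single sequence, while the bar plot aggregates mean SHAP values over the explained examples; both follow the text-pipeline workflow in SHAP's documentation. Note that if `--kmer` is set, the explanations are over k-mer tokens rather than single bases.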