Shapley plot #126

Open
Anshullllllll opened this issue Nov 9, 2024 · 0 comments
Anshullllllll commented Nov 9, 2024

This is an inference script which uses a fine-tuned model. Can anybody help me incorporate some lines of code so that, at the end, it will also generate a Shapley (SHAP) plot?
import os
import json
import argparse
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from typing import List, Dict
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, precision_score, recall_score
import random

def set_seed(seed: int):
    """Set seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Define the PredictionDataset class

class PredictionDataset(Dataset):
    """Dataset for predictions."""

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, kmer: int = -1):
        super(PredictionDataset, self).__init__()

        # Load data from the disk
        with open(data_path, "r") as f:
            lines = [line.strip().split(',') for line in f.readlines()]

        self.texts = [line[0] for line in lines]
        self.true_labels = [int(line[1]) for line in lines]  # Assuming labels are in the second column

        if kmer != -1:
            # Generate k-mer string
            self.texts = [generate_kmer_str(text, kmer) for text in self.texts]

        output = tokenizer(
            self.texts,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )

        self.input_ids = output["input_ids"]
        self.attention_mask = output["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(
            input_ids=self.input_ids[i],
            attention_mask=self.attention_mask[i],
            true_label=torch.tensor(self.true_labels[i])
        )

def generate_kmer_str(sequence: str, k: int) -> str:
    """Generate k-mer string from DNA sequence."""
    return " ".join([sequence[i:i+k] for i in range(len(sequence) - k + 1)])

def predict(model_path: str, data_path: str, kmer: int):
    # Set seed for reproducibility
    set_seed(42)

    # Load the model and tokenizer with trust_remote_code=True
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_path,
        trust_remote_code=True
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Load test data
    test_dataset = PredictionDataset(
        data_path=data_path,
        tokenizer=tokenizer,
        kmer=kmer
    )

    # Prepare data loader
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Run prediction
    model.eval()
    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            true_labels = batch["true_label"].numpy()

            # Move to GPU if available
            if torch.cuda.is_available():
                model.cuda()
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()

            all_predictions.extend(preds.tolist())
            all_true_labels.extend(true_labels.tolist())

    # Calculate metrics
    accuracy = accuracy_score(all_true_labels, all_predictions)
    f1 = f1_score(all_true_labels, all_predictions, average="macro", zero_division=0)
    mcc = matthews_corrcoef(all_true_labels, all_predictions)
    precision = precision_score(all_true_labels, all_predictions, average="macro", zero_division=0)
    recall = recall_score(all_true_labels, all_predictions, average="macro", zero_division=0)

    # Save metrics and predictions
    results = {
        "predictions": all_predictions,
        "metrics": {
            "accuracy": accuracy,
            "f1_score": f1,
            "mcc": mcc,
            "precision": precision,
            "recall": recall
        }
    }

    results_path = os.path.join(model_path, "predictions.json")
    with open(results_path, "w") as f:
        json.dump(results, f, indent=4)

    print(f"Predictions and metrics saved to {results_path}")

def main():
    parser = argparse.ArgumentParser(description="Run predictions with a fine-tuned model.")
    parser.add_argument("--model_path", type=str, required=True, help="Path to the fine-tuned model.")
    parser.add_argument("--data_path", type=str, required=True, help="Path to the data for prediction.")
    parser.add_argument("--kmer", type=int, default=-1, help="k-mer size for sequence input. Default is -1 (no k-mer).")

    args = parser.parse_args()

    predict(model_path=args.model_path, data_path=args.data_path, kmer=args.kmer)

if __name__ == "__main__":
    main()
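
A minimal sketch of one way to generate the plot, assuming the shap package is installed (pip install shap) and these lines are appended at the end of predict(), where model, tokenizer, and test_dataset are still in scope. Wrapping the model in a transformers text-classification pipeline lets shap.Explainer build a text masker from the tokenizer; the subset size of 10 and the shap_plot.html filename are arbitrary choices here:

import shap

# Wrap the fine-tuned model in a pipeline so SHAP can call it on raw
# strings and receive a score for every class.
classifier = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    top_k=None,  # return scores for all classes, which SHAP expects
)

explainer = shap.Explainer(classifier)

# Explaining is slow, so start with a small subset of the test sequences.
shap_values = explainer(test_dataset.texts[:10])

# Save the token-level explanation as an HTML report.
html = shap.plots.text(shap_values, display=False)
with open(os.path.join(model_path, "shap_plot.html"), "w") as f:
    f.write(html)

Note that with kmer != -1 the "tokens" in the plot will be k-mer strings rather than raw bases, and a static summary (e.g. shap.plots.bar on the values for one class) may be easier to export as an image than the interactive text plot.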
