diff --git a/src/data_processing.py b/src/data_processing.py
index ccff97c..20eb352 100644
--- a/src/data_processing.py
+++ b/src/data_processing.py
@@ -89,7 +89,7 @@ def preprocess_images(
         allow_empty = True
     else:
         allow_empty = False
-    
+
     crop_annotation = process_image(
         image_path=image_path,
         annotation_df=annotation_df,
@@ -141,6 +141,12 @@ def process_image(
 
     full_path = os.path.join(root_dir, image_path)
 
+    # Check if all xmin values are 0, indicating empty annotations
+    if annotation_df is not None and all(annotation_df['xmin'] == 0):
+        allow_empty = True
+    else:
+        allow_empty = False
+
     crop_annotation = preprocess.split_raster(
         path_to_raster=full_path,
         annotations_file=annotation_df,
diff --git a/src/model.py b/src/model.py
index 0724640..ab67ac6 100644
--- a/src/model.py
+++ b/src/model.py
@@ -6,13 +6,11 @@
 import warnings
 from logging import warn
 import math
-from datetime import datetime
 
 # Third party imports
 import dask.array as da
 import pandas as pd
 from deepforest import main, visualize
-from deepforest.utilities import read_file
 from pytorch_lightning.loggers import CometLogger
 
 # Local imports
@@ -87,7 +85,6 @@ def create_train_test(annotations, train_test_split = 0.1):
         pd.DataFrame: A DataFrame containing training annotations.
         pd.DataFrame: A DataFrame containing validation annotations.
     """
-    tmpdir = tempfile.gettempdir()
     # split train images into 90% train and 10% validation for each class as much as possible
     test_images = []
     validation_df = None
@@ -164,7 +161,7 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro
     model.create_trainer()
 
     with comet_logger.experiment.context_manager("train_images"):
-        non_empty_train_annotations = train_annotations[train_annotations.xmax.notnull()]
+        non_empty_train_annotations = train_annotations[~(train_annotations.xmax==0)]
         if non_empty_train_annotations.empty:
             pass
         else:
@@ -178,7 +175,7 @@
             model.trainer.fit(model)
 
     with comet_logger.experiment.context_manager("post-training prediction"):
-        for image_path in test_annotations.image_path.sample(5):
+        for image_path in test_annotations.image_path.head(5):
             prediction = model.predict_image(path = os.path.join(train_image_dir, image_path))
             if prediction is None:
                 continue
diff --git a/src/pipeline.py b/src/pipeline.py
index 3a05c38..c6c7c5f 100644
--- a/src/pipeline.py
+++ b/src/pipeline.py
@@ -37,7 +37,7 @@ def run(self):
         label_propagator.through_time(new_annotations)
 
         if self.config.train.validation_csv_path is not None:
-            validation_df = pd.read_csv(self.config.validation_csv_path)
+            validation_df = pd.read_csv(self.config.train.validation_csv_path)
         else:
             validation_df = None
 
diff --git a/src/pipeline_evaluation.py b/src/pipeline_evaluation.py
index 6ccf9d6..1fa69be 100644
--- a/src/pipeline_evaluation.py
+++ b/src/pipeline_evaluation.py
@@ -6,11 +6,12 @@
 import pandas as pd
 
 class PipelineEvaluation:
-    def __init__(self, model, detect_ground_truth_dir=None, classify_confident_ground_truth_dir=None, classify_uncertain_ground_truth_dir=None, detection_true_positive_threshold=0.8, detection_false_positive_threshold=0.5, classification_avg_score=0.5, target_labels=None, patch_size=450, patch_overlap=0, min_score=0.5):
+    def __init__(self, model, image_dir, detect_ground_truth_dir=None, classify_confident_ground_truth_dir=None, classify_uncertain_ground_truth_dir=None, detection_true_positive_threshold=0.8, detection_false_positive_threshold=0.5, classification_avg_score=0.5, target_labels=None, patch_size=450, patch_overlap=0, min_score=0.5):
         """Initialize pipeline evaluation.
 
         Args:
             model: Trained model for making predictions
+            image_dir (str): Directory containing images
             detect_ground_truth_dir (str): Directory containing detection ground truth annotation CSV files
             classify_confident_ground_truth_dir (str): Directory containing confident classification ground truth annotation CSV files
             classify_uncertain_ground_truth_dir (str): Directory containing uncertain classification ground truth annotation CSV files
@@ -28,7 +29,10 @@ def __init__(self, model, detect_ground_truth_dir=None, classify_confident_groun
         self.patch_size = patch_size
         self.patch_overlap = patch_overlap
         self.min_score = min_score
-
+        self.detection_ground_truth_dir = detect_ground_truth_dir
+        self.confident_classification_ground_truth_dir = classify_confident_ground_truth_dir
+        self.uncertain_classification_ground_truth_dir = classify_uncertain_ground_truth_dir
+        self.image_dir = image_dir
         self.detection_annotations_df = gather_data(detect_ground_truth_dir)
         self.confident_classification_annotations_df = gather_data(classify_confident_ground_truth_dir)
         self.uncertain_classification_annotations_df = gather_data(classify_uncertain_ground_truth_dir)
@@ -56,9 +60,10 @@ def _format_targets(self, annotations_df):
         return targets
 
     def evaluate_detection(self):
+        full_image_paths = [self.image_dir + "/" + image_path for image_path in self.detection_annotations_df.image_path.tolist()]
         preds = predict(
-            model=self.model,
-            image_paths=self.detection_annotations_df.image_path.tolist(),
+            m=self.model,
+            image_paths=full_image_paths,
             patch_size=self.patch_size,
             patch_overlap=self.patch_overlap,
             min_score=self.min_score
@@ -67,15 +72,19 @@
         targets = self._format_targets(self.detection_annotations_df)
         self.mAP.update(preds=preds, target=targets)
 
-        return self.mAP.compute()
+        results = {"mAP": self.mAP.compute()}
+
+        return results
 
     def confident_classification_accuracy(self):
         self.classification_accuracy.update(self.classification_confident_annotations_df)
-        return self.classification_accuracy.compute()
+        results = {"confident_classification_accuracy": self.classification_accuracy.compute()}
+        return results
 
     def uncertain_classification_accuracy(self):
         self.classification_accuracy.update(self.classification_uncertain_annotations_df)
-        return self.classification_accuracy.compute()
+        results = {"uncertain_classification_accuracy": self.classification_accuracy.compute()}
+        return results
 
     def target_classification_accuracy(self):
         # Combine confident and uncertain classifications
@@ -83,7 +92,8 @@
         if self.target_classes is not None:
             self.confident_classification_accuracy.update(combined_annotations_df, self.target_classes)
             self.uncertain_classification_accuracy.update(combined_annotations_df, self.target_classes)
-            return self.confident_classification_accuracy.compute(), self.uncertain_classification_accuracy.compute()
+            results = {"target_classification_accuracy": {"confident_classification_accuracy": self.confident_classification_accuracy.compute(), "target_uncertain_classification_accuracy": self.uncertain_classification_accuracy.compute()}}
+            return results
         else:
             return None, None
 
@@ -92,9 +102,16 @@ def evaluate(self):
         """
         Evaluate pipeline performance for both detection and classification
         """
-        self.detection_results = self.evaluate_detection()
-        self.confident_classification_results = self.confident_classification_accuracy()
-        self.uncertain_classification_results = self.uncertain_classification_accuracy()
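+        # Run each evaluation step, then collect the metrics into a single results dictionary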
+        target_classification_results = None
+        detection_results = self.evaluate_detection()
+        confident_classification_results = self.confident_classification_accuracy()
+        uncertain_classification_results = self.uncertain_classification_accuracy()
+        if self.target_classes is not None:
+            target_classification_results = self.target_classification_accuracy()
+
+        results = {"detection": detection_results, "confident_classification": confident_classification_results, "uncertain_classification": uncertain_classification_results, "target_classification": target_classification_results}
+
+        return results
 
     def check_success(self):
         """Check if pipeline performance is satisfactory"""
diff --git a/tests/conftest.py b/tests/conftest.py
index 8710258..630fa90 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,23 +40,39 @@ def config(tmpdir_factory):
         shutil.copy("tests/data/" + f, cfg.train.train_image_dir)
 
     # Create sample bounding box annotations
-    data = {
-        'image_path': ['empty.jpg', 'birds.jpg', 'birds_val.jpg'],
+    train_data = {
+        'image_path': ['empty.jpg', 'birds.jpg', 'birds.jpg'],
         'xmin': [0, 200, 150],
-        'ymin': [0, 300, 250], 
+        'ymin': [0, 300, 250],
         'xmax': [0, 300, 250],
         'ymax': [0, 400, 350],
-        'label': ['Bird', 'Bird', 'Bird'],
+        'label': ['Bird', 'Bird1', 'Bird2'],
         'annotator': ['test_user', 'test_user', 'test_user']
     }
 
-    # Create DataFrame
-    df = pd.DataFrame(data)
+    val_data = {
+        'image_path': ['birds_val.jpg', 'birds_val.jpg'],
+        'xmin': [150, 150],
+        'ymin': [250, 250],
+        'xmax': [250, 250],
+        'ymax': [350, 350],
+        'label': ['Bird1', 'Bird2'],
+        'annotator': ['test_user', 'test_user']
+    }
+
+    # Create DataFrames
+    train_df = pd.DataFrame(train_data)
+    val_df = pd.DataFrame(val_data)
+
+    # Save training data to CSV
+    train_csv_path = os.path.join(cfg.train.train_csv_folder, 'training_data.csv')
+    train_df.to_csv(train_csv_path, index=False)
 
-    # Save to CSV in the configured training directory
-    csv_path = os.path.join(cfg.train.train_csv_folder, 'training_data.csv')
-    df.to_csv(csv_path, index=False)
+    # Save validation data to CSV
+    val_csv_path = os.path.join(cfg.train.train_csv_folder, 'validation.csv')
+    val_df.to_csv(val_csv_path, index=False)
+    cfg.train.validation_csv_path = val_csv_path
 
     cfg.train.fast_dev_run = True
     cfg.checkpoint = "bird"
     cfg.train.checkpoint_dir = tmpdir_factory.mktemp("checkpoints").strpath
@@ -64,16 +80,16 @@
     # Create detection annotations
     cfg.pipeline_evaluation.detect_ground_truth_dir = tmpdir_factory.mktemp("detection_annotations").strpath
     csv_path = os.path.join(cfg.pipeline_evaluation.detect_ground_truth_dir, 'detection_annotations.csv')
-    df.to_csv(csv_path, index=False)
+    val_df.to_csv(csv_path, index=False)
 
     # Create classification annotations
     cfg.pipeline_evaluation.classify_confident_ground_truth_dir = tmpdir_factory.mktemp("confident_classification_annotations").strpath
     csv_path = os.path.join(cfg.pipeline_evaluation.classify_confident_ground_truth_dir, 'confident_classification_annotations.csv')
-    df.to_csv(csv_path, index=False)
+    val_df.to_csv(csv_path, index=False)
 
     cfg.pipeline_evaluation.classify_uncertain_ground_truth_dir = tmpdir_factory.mktemp("uncertain_classification_annotations").strpath
     csv_path = os.path.join(cfg.pipeline_evaluation.classify_uncertain_ground_truth_dir, 'uncertain_classification_annotations.csv')
-    df.to_csv(csv_path, index=False)
+    val_df.to_csv(csv_path, index=False)
 
     return cfg
diff --git a/tests/test_pipeline_evaluation.py b/tests/test_pipeline_evaluation.py
index e69de29..40f59c7 100644
--- a/tests/test_pipeline_evaluation.py
+++ b/tests/test_pipeline_evaluation.py
@@ -0,0 +1,27 @@
+from src.pipeline_evaluation import PipelineEvaluation
+from deepforest import main
+
+def test_pipeline_evaluation(config):
+    m = main.deepforest()
+    pipeline_evaluation = PipelineEvaluation(model=m, **config.pipeline_evaluation)
+    performance = pipeline_evaluation.evaluate()
+
+def test_check_success(config):
+    m = main.deepforest()
+    pipeline_evaluation = PipelineEvaluation(model=m, **config.pipeline_evaluation)
+    assert pipeline_evaluation.check_success() is False
+
+def test_evaluate_detection(config):
+    m = main.deepforest()
+    pipeline_evaluation = PipelineEvaluation(model=m, **config.pipeline_evaluation)
+    detection_results = pipeline_evaluation.evaluate_detection()
+    assert detection_results["mAP"] is not None
+
+def test_confident_classification_accuracy(config):
+    m = main.deepforest()
+    pipeline_evaluation = PipelineEvaluation(model=m, **config.pipeline_evaluation)
+    confident_classification_accuracy = pipeline_evaluation.confident_classification_accuracy()
+    assert confident_classification_accuracy is not None
+
+