Commit
mocked eval tests pass, cropmodel fails
bw4sz committed Nov 18, 2024
1 parent c9a5ba3 commit 996e282
Showing 10 changed files with 301 additions and 181 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -1,6 +1,6 @@
# ML Pipeline Project
# Bird Detection Pipeline

A modular machine learning pipeline for data processing, model training, evaluation, and deployment with pre-annotation prediction capabilities for Bureau of Ocean Energy Management (BOEM) data.
A machine learning pipeline for detecting and annotating birds in aerial imagery.

## Project Structure

@@ -29,7 +29,6 @@ project_root/
│   ├── test_model_deployment.py
│   ├── test_monitoring.py
│   ├── test_reporting.py
│   └── test_pre_annotation_prediction.py
├── conf/                  # Configuration files
│   └── config.yaml        # Main configuration file
@@ -54,7 +53,6 @@ project_root/
- **model_deployment.py**: Manages model deployment
- **monitoring.py**: Provides monitoring and logging capabilities
- **reporting.py**: Generates reports for pipeline results
- **pre_annotation_prediction.py**: Handles pre-annotation model predictions
- **annotation/**: Contains annotation-related functionality
- **pipeline.py**: Implements the annotation pipeline

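For context, a minimal sketch of how the renamed pipeline might be launched. It assumes the project wires conf/config.yaml through Hydra (the `cfg: DictConfig` hint in src/pipeline.py suggests OmegaConf); the entry-point module and decorator arguments are assumptions, not part of this commit:

```python
# Hypothetical entry point -- assumes Hydra; not part of this commit.
import hydra
from omegaconf import DictConfig

from src.pipeline import Pipeline

@hydra.main(config_path="conf", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    pipeline = Pipeline(cfg)  # connects to Label Studio and the SFTP server
    pipeline.run()            # train, evaluate, and queue images for annotation

if __name__ == "__main__":
    main()
```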
15 changes: 12 additions & 3 deletions conf/config.yaml
@@ -27,26 +27,35 @@ propagate:
  time_threshold_seconds: 5
  distance_threshold_pixels: 50

train:
detection_model:
  validation_csv_path:
  train_csv_folder:
  train_image_dir:
  crop_image_dir:
  under_sample_ratio: 0
  limit_empty_frac: 0
  fast_dev_run: false
  checkpoint_dir:
  labels:
    - "Bird"

classification_model:
  validation_csv_path:
  train_csv_folder:
  under_sample_ratio: 0
  fast_dev_run: false

pipeline_evaluation:
  detect_ground_truth_dir:
  classify_confident_ground_truth_dir:
  classify_uncertain_ground_truth_dir:
  # This is an average mAP threshold for now; we may want to add a per-IoU threshold in the future.
  detection_true_positive_threshold: 0.8
  detection_false_positive_threshold: 0.5
  classification_avg_score: 0.5
  target_labels:
  image_dir:

reporting:
  report_dir:

active_learning:
  images_to_annotate_dir:
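The active_learning block is truncated in this diff. Judging from the keys that src/pipeline.py reads below, it needs at least the following fields; the values here are illustrative placeholders, not taken from the repository:

```yaml
# Sketch only -- keys inferred from src/pipeline.py; values are placeholders.
active_learning:
  images_to_annotate_dir:
  patch_size: 450            # tile size passed to predict()
  patch_overlap: 0.05        # fractional overlap between adjacent tiles
  confident_threshold: 0.5   # score cutoff separating confident from uncertain predictions
```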
29 changes: 15 additions & 14 deletions src/model.py
@@ -184,13 +184,13 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro

    return model

def preprocess_and_train(config, validation_df=None):
def preprocess_and_train(config, validation_df=None, model_type="detection"):
    """Preprocess data and train model.
    Args:
        config: Configuration object containing training parameters
        validation_df (pd.DataFrame): A DataFrame containing validation annotations.
        model_type (str): The type of model to train. Defaults to "detection".
    Returns:
        trained_model: Trained model object
    """
@@ -257,7 +257,7 @@ def get_latest_checkpoint(checkpoint_dir, annotations):

    return m

def _predict_list_(image_paths, min_score, patch_size, patch_overlap, model_path, m=None):
def _predict_list_(image_paths, patch_size, patch_overlap, model_path, m=None, crop_model=None):
    if model_path:
        m = load(model_path)
    else:
@@ -270,20 +270,21 @@ def _predict_list_(image_paths, min_score, patch_size, patch_overlap, model_path

    predictions = []
    for image_path in image_paths:
        try:
            prediction = m.predict_tile(raster_path=image_path, return_plot=False, patch_size=patch_size, patch_overlap=patch_overlap)
        except ValueError:
            continue
        prediction = prediction[prediction.score > min_score]
        predictions.append(prediction)
        prediction = m.predict_tile(raster_path=image_path, return_plot=False, patch_size=patch_size, patch_overlap=patch_overlap, crop_model=crop_model)
        if prediction is None:
            prediction = pd.DataFrame({"image_path": image_path, "xmin": [None], "ymin": [None], "xmax": [None], "ymax": [None], "label": [None], "score": [None]})
        predictions.append(prediction)

    return predictions
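Because images with no detections now contribute a placeholder row of None values instead of being skipped, downstream consumers have to drop those rows before computing metrics or uploading annotations. A minimal sketch, assuming the concatenated output of predict():

```python
import pandas as pd

def drop_empty_placeholders(combined: pd.DataFrame) -> pd.DataFrame:
    # Placeholder rows emitted by _predict_list_ carry score=None,
    # so keeping only non-null scores removes them.
    return combined[combined["score"].notna()]
```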

def predict(image_paths, patch_size, patch_overlap, min_score, m=None, model_path=None, dask_client=None):
def predict(image_paths, patch_size, patch_overlap, m=None, model_path=None, dask_client=None, crop_model=None):
    """Predict bounding boxes for images
    Args:
        m (main.deepforest): A trained deepforest model.
        image_paths (list): A list of image paths.
        crop_model (main.deepforest): A trained deepforest model for classification.
        model_path (str): The path to a model checkpoint.
        dask_client (dask.distributed.Client): A dask client for parallel prediction.
    Returns:
        list: A list of image predictions.
    """
Expand All @@ -304,16 +305,16 @@ def update_sys_path():
                image_paths=block.compute(),
                patch_size=patch_size,
                patch_overlap=patch_overlap,
                min_score=min_score,
                model_path=model_path,
                m=m)
                m=m,
                crop_model=crop_model)
            block_futures.append(block_future)
        # Get results
        predictions = []
        for block_result in block_futures:
            block_result = block_result.result()
            predictions.append(pd.concat(block_result))
    else:
        predictions = _predict_list_(image_paths=image_paths, patch_size=patch_size, patch_overlap=patch_overlap, min_score=min_score, model_path=model_path, m=m)
        predictions = _predict_list_(image_paths=image_paths, patch_size=patch_size, patch_overlap=patch_overlap, model_path=model_path, m=m, crop_model=crop_model)

    return predictions
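A usage sketch of the new predict() signature with both models; the checkpoint paths are hypothetical, and the load() import assumes the same helper that _predict_list_ calls:

```python
import pandas as pd

from src.model import load, predict

# Hypothetical checkpoint paths, for illustration only.
detector = load("checkpoints/detection/model_20241118.ckpt")
classifier = load("checkpoints/classification/model_20241118.ckpt")

predictions = predict(
    image_paths=["data/images/flight_001.tif"],
    patch_size=450,
    patch_overlap=0.05,
    m=detector,
    crop_model=classifier,  # classifies a crop of each detected box
)
combined = pd.concat(predictions)
```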
48 changes: 29 additions & 19 deletions src/pipeline.py
Expand Up @@ -7,9 +7,8 @@
from src.active_learning import choose_train_images, choose_test_images
from src import propagate
from src import label_studio
from src.model import preprocess_and_train
from src.model import preprocess_and_train, predict
from src.pipeline_evaluation import PipelineEvaluation
from src.pre_annotation_prediction import PreAnnotationPrediction
from src.reporting import Reporting

class Pipeline:
@@ -19,9 +18,9 @@ def __init__(self, cfg: DictConfig):
        self.label_studio_project = label_studio.connect_to_label_studio(url=self.config.label_studio.url, project_name=self.config.label_studio.project_name)
        self.sftp_client = label_studio.create_sftp_client(**self.config.server)

    def save_model(self, model):
    def save_model(self, model, directory):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        checkpoint_path = os.path.join(self.config.train.checkpoint_dir, f"model_{timestamp}.ckpt")
        checkpoint_path = os.path.join(directory, f"model_{timestamp}.ckpt")
        model.trainer.save_checkpoint(checkpoint_path)

    def run(self):
@@ -36,35 +35,46 @@ def run(self):
        label_propagator = propagate.LabelPropagator(**self.config.propagate)
        label_propagator.through_time(new_annotations)

        if self.config.train.validation_csv_path is not None:
            validation_df = pd.read_csv(self.config.train.validation_csv_path)
        if self.config.detection_model.validation_csv_path is not None:
            validation_df = pd.read_csv(self.config.detection_model.validation_csv_path)
        else:
            validation_df = None

        trained_model = preprocess_and_train(self.config, validation_df=validation_df)
        self.save_model(trained_model)
        trained_detection_model = preprocess_and_train(self.config, validation_df=validation_df, model_type="detection")
        trained_classification_model = preprocess_and_train(self.config, validation_df=validation_df, model_type="classification")

        pipeline_monitor = PipelineEvaluation(model=trained_model, **self.config.pipeline_evaluation)
        self.save_model(trained_detection_model, self.config.detection_model.checkpoint_dir)
        self.save_model(trained_classification_model, self.config.classification_model.checkpoint_dir)

        pipeline_monitor = PipelineEvaluation(model=trained_detection_model, **self.config.pipeline_evaluation)
        performance = pipeline_monitor.evaluate()

        reporting = Reporting()
        reporting = Reporting(self.config.reporting.report_dir)
        reporting.generate_reports(pipeline_monitor)

        if pipeline_monitor.check_success():
            print("Pipeline performance is satisfactory, exiting")
            return None
        else:
            train_images_to_annotate = choose_train_images(performance, trained_model, **self.config.active_learning)
            train_images_to_annotate = choose_train_images(performance, trained_detection_model, **self.config.active_learning)
            test_images_to_annotate = choose_test_images(performance, **self.config.active_testing)

            pre_annotation = PreAnnotationPrediction(train_images_to_annotate)
            confident_annotations, uncertain_annotations = pre_annotation.predict_and_divide(train_images_to_annotate)
            predictions = predict(
                m=trained_detection_model,
                crop_model=trained_classification_model,
                image_paths=train_images_to_annotate,
                patch_size=self.config.active_learning.patch_size,
                patch_overlap=self.config.active_learning.patch_overlap,
            )
            combined_predictions = pd.concat(predictions)

            print(f"Images requiring human review: {len(confident_annotations)}")
            print(f"Images auto-annotated: {len(uncertain_annotations)}")
            # Split predictions into confident and uncertain
            confident_predictions = combined_predictions[combined_predictions["score"] > self.config.active_learning.confident_threshold]
            uncertain_predictions = combined_predictions[combined_predictions["score"] <= self.config.active_learning.confident_threshold]

            # Run the annotation pipeline
            label_studio.upload_to_label_studio(self.sftp_client, uncertain_annotations, **self.config)
            label_studio.upload_to_label_studio(self.sftp_client, test_images_to_annotate, **self.config)
            print(f"Images requiring human review: {len(uncertain_predictions)}")
            print(f"Images auto-annotated: {len(confident_predictions)}")

            reporting.generate_reports(trained_model)
            # Run the annotation pipeline
            label_studio.upload_to_label_studio(self.sftp_client, uncertain_predictions, **self.config)
            label_studio.upload_to_label_studio(self.sftp_client, test_images_to_annotate, **self.config)
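The commit message says the mocked evaluation tests pass while the crop model still fails. A mocked test of the new two-model flow might look like the sketch below; the test name, the cfg fixture, and the patch targets are assumptions rather than the repository's actual tests:

```python
from unittest.mock import MagicMock, patch

from src.pipeline import Pipeline

@patch("src.pipeline.Reporting")
@patch("src.pipeline.PipelineEvaluation")
@patch("src.pipeline.propagate")
@patch("src.pipeline.label_studio")           # avoid real Label Studio/SFTP connections
@patch("src.pipeline.preprocess_and_train")
def test_run_trains_both_models(mock_train, mock_ls, mock_prop, mock_eval, mock_report, cfg):
    mock_train.return_value = MagicMock()
    mock_eval.return_value.check_success.return_value = True  # exit early after evaluation

    pipeline = Pipeline(cfg)
    pipeline.save_model = MagicMock()         # skip checkpoint I/O
    pipeline.run()

    model_types = [call.kwargs["model_type"] for call in mock_train.call_args_list]
    assert model_types == ["detection", "classification"]
```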
