diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 528da84f..0565b82e 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v3.3.0
+ rev: v4.5.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
@@ -8,13 +8,13 @@ repos:
- id: trailing-whitespace
exclude: .ipynb_checkpoints|data/Gaia_hp8_densitymap.fits
- repo: https://github.com/python/black
- rev: 22.3.0
+ rev: 24.2.0
hooks:
- id: black
pass_filenames: true
exclude: .ipynb_checkpoints|data|^.fits
- repo: https://github.com/pycqa/flake8
- rev: 3.8.4
+ rev: 7.0.0
hooks:
- id: flake8
pass_filenames: true
diff --git a/.requirements/dev.txt b/.requirements/dev.txt
deleted file mode 100644
index b663b35f..00000000
--- a/.requirements/dev.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-deepdiff>=5.0
-gsutil>=4.60
-keras-tuner>=1.0.2
-matplotlib>=3.3
-pytest>=6.1.2
-questionary>=1.8.1
-scikit-learn>=0.24.1
-tensorflow>=2.14.0
-wandb>=0.12.1
diff --git a/.requirements/doc.txt b/.requirements/doc.txt
deleted file mode 100644
index 83f9f419..00000000
--- a/.requirements/doc.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-h5py>=3.10.0
-astropy>=5.2.2
-fast-histogram>=0.11
-fire>=0.4.0
-healpy>=1.16.2
-Jinja2<=3.1
-myst-parser>=0.18.1
-pandas>=1.2
-penquins>=2.3.1
-pre-commit>=3.2.2
-pyyaml>=5.3.1
-sphinx>=4.2
-sphinx_press_theme>=0.8.0
-tdtax>=0.1.6
-tables>=3.7
-pyarrow>=9.0.0
-numba>=0.56.4
-numpy>=1.23,<1.24
-cesium>=0.11.1
-xgboost>=1.7.5
-seaborn>=0.12.2
-pydot>=1.4.2
-jupyter>=1.0.0
diff --git a/README.md b/README.md
index 22778d6d..2d22b201 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,14 @@
-# SCoPe: ZTF source classification project
+# SCoPe: ZTF Source Classification Project
[![arXiv](https://img.shields.io/badge/arXiv-2102.11304-brightgreen)](https://arxiv.org/abs/2102.11304)
[![arXiv](https://img.shields.io/badge/arXiv-2009.14071-brightgreen)](https://arxiv.org/abs/2009.14071)
+[![arXiv](https://img.shields.io/badge/arXiv-2312.00143-brightgreen)](https://arxiv.org/abs/2312.00143)
-The documentation is hosted at [https://zwickytransientfacility.github.io/scope-docs/](https://zwickytransientfacility.github.io/scope-docs/). To generate HTML files of the documentation locally, run `./scope.py doc`
+`scope-ml` uses machine learning to classify light curves from the Zwicky Transient Facility ([ZTF](https://www.ztf.caltech.edu)). The documentation is hosted at [https://zwickytransientfacility.github.io/scope-docs/](https://zwickytransientfacility.github.io/scope-docs/). To generate HTML files of the documentation locally, clone the repository and run `scope-doc` after installing.
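+
+A minimal sketch of a local documentation build (assumes the repository has been cloned into `scope`):
+
+```
+pip install scope-ml
+cd scope
+scope-doc
+```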
## Funding
We gratefully acknowledge previous and current support from the U.S. National Science Foundation (NSF) Harnessing the Data Revolution (HDR) Institute for Accelerated AI Algorithms for Data-Driven Discovery (A3D3) under Cooperative Agreement No. PHY-2117997.
-
-
+
+
diff --git a/tools/SCoPe_data_analysis_plots.ipynb b/SCoPe_data_analysis_plots.ipynb
similarity index 100%
rename from tools/SCoPe_data_analysis_plots.ipynb
rename to SCoPe_data_analysis_plots.ipynb
diff --git a/config.defaults.yaml b/config.defaults.yaml
index e4b25239..909921fa 100644
--- a/config.defaults.yaml
+++ b/config.defaults.yaml
@@ -1731,6 +1731,15 @@ training:
eval_metric: 'auc'
early_stopping_rounds: 10
num_boost_round: 999
+ plot_params:
+ cm_include_count: False
+ cm_include_percent: True
+ annotate_scores: False
+ dnn:
+ dense_branch: True
+ conv_branch: True
+ loss: 'binary_crossentropy'
+ optimizer: 'adam'
classes:
# phenomenological classes
vnv:
diff --git a/dev-requirements.txt b/dev-requirements.txt
new file mode 100644
index 00000000..ae9ca6ff
--- /dev/null
+++ b/dev-requirements.txt
@@ -0,0 +1,5 @@
+pytest>=6.1.2
+pre-commit>=3.5.0
+sphinx>=4.2
+sphinx_press_theme>=0.8.0
+poetry>=1.7.1
diff --git a/doc/developer.md b/doc/developer.md
index 6957058f..68b18156 100644
--- a/doc/developer.md
+++ b/doc/developer.md
@@ -1,6 +1,23 @@
# Installation/Developer Guidelines
-## Initial steps
+## Science users
+- Create and activate a virtual/conda environment with Python 3.11, e.g.:
+ ```shell script
+ conda create -n scope-env python=3.11
+ conda activate scope-env
+ ```
+- Install the latest release of `scope-ml` from PyPI:
+ ```shell script
+ pip install scope-ml
+ ```
+- In the directory of your choice, run the initialization script. This will create the required directories and copy the necessary files to run the code:
+ ```shell script
+ scope-initialize
+ ```
+- Change directories to `scope` and modify `config.yaml` to finish the initialization process. This config file is used by default when running all scripts. You can also specify another config file using the `--config-path` argument.
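+
+Putting these steps together, a minimal sketch of the workflow (the directory and environment names below are just examples):
+```shell script
+conda create -n scope-env python=3.11
+conda activate scope-env
+pip install scope-ml
+mkdir scope_run && cd scope_run
+scope-initialize
+cd scope
+# edit config.yaml (e.g. tokens, training set path), then verify the installation:
+scope-test-limited
+```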
+
+
+## Developers/contributors
-- Create your own fork the [scope repository](https://github.com/ZwickyTransientFacility/scope) by clicking the "fork" button. Then, decide whether you would like to use HTTPS (easier for beginners) or SSH.
+- Create your own fork of the [scope repository](https://github.com/ZwickyTransientFacility/scope) by clicking the "fork" button. Then, decide whether you would like to use HTTPS (easier for beginners) or SSH.
- Following one set of instructions below, clone (download) your copy of the repository, and set up a remote called `upstream` that points to the main `scope` repository.
@@ -21,9 +38,9 @@ git clone git@github.com:/scope.git && cd scope
git remote add upstream git@github.com:ZwickyTransientFacility/scope.git
```
-## Setting up your environment (Windows/Linux/macOS)
+### Setting up your environment (Windows/Linux/macOS)
-### Use a package manager for installation
+#### Use a package manager for installation
We currently recommend running `scope` with Python 3.11. You may want to begin your installation by creating/activating a virtual environment, for example using conda. We specifically recommend installing miniforge3 (https://github.com/conda-forge/miniforge).
@@ -34,9 +51,9 @@ conda create -n scope-env -c conda-forge python=3.11
conda activate scope-env
```
-### Update your `PYTHONPATH`
+#### (Optional): Update your `PYTHONPATH`
-Ensure that Python can import from `scope` by modifying the `PYTHONPATH` environment variable. Use a simple text editor like `nano` to modify the appropriate file (depending on which shell you are using). For example, if using bash, run `nano ~/.bash_profile` and add the following line:
+If you plan to import from `scope` in your own code, ensure that Python can find it by adding the repository to the `PYTHONPATH` environment variable. Use a simple text editor like `nano` to modify the appropriate file (depending on which shell you are using). For example, if using bash, run `nano ~/.bash_profile` and add the following line:
```bash
export PYTHONPATH="$PYTHONPATH:$HOME/scope"
@@ -44,13 +61,20 @@ export PYTHONPATH="$PYTHONPATH:$HOME/scope"
Save the updated file (`Ctrl+O` in `nano`) and close/reopen your terminal for this change to be recognized. Then `cd` back into scope and activate your `scope-env` again.
-### Install pre-commit
+### Install required packages
+
+Ensure you are in the `scope` directory that contains `pyproject.toml`. Then, install the required python packages by running:
+```bash
+pip install .
+```
+
+#### Install dev requirements, pre-commit hook
We use `black` to format the code and `flake8` to verify that code complies with [PEP8](https://www.python.org/dev/peps/pep-0008/).
-Please install our pre-commit hook as follows:
+Please install our dev requirements and pre-commit hook as follows:
```shell script
-pip install pre-commit
+pip install -r dev-requirements.txt
pre-commit install
```
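+
+To lint the entire tree manually (rather than only staged changes), you can also run:
+```shell script
+pre-commit run --all-files
+```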
@@ -60,14 +84,7 @@ code.
The pre-commit hook will lint *changes* made to the source.
-## Install required packages
-
-Install the required python packages by running:
-```bash
-pip install -r requirements.txt
-```
-
-### Create and modify config.yaml
+#### Create and modify config.yaml
From the included config.defaults.yaml, make a copy called config.yaml:
@@ -77,14 +94,15 @@ cp config.defaults.yaml config.yaml
Edit config.yaml to include Kowalski instance and Fritz tokens in the associated empty `token:` fields.
-### Testing
-Run `./scope.py test` to test your installation. Note that for the test to pass, you will need access to the Kowalski database. If you do not have Kowalski access, you can run `./scope.py test_limited` to run a more limited (but still useful) set of tests.
+#### Testing
+Run `scope-test` to test your installation. Note that for the test to pass, you will need access to the Kowalski database. If you do not have Kowalski access, you can run `scope-test-limited` to run a more limited (but still useful) set of tests.
### Troubleshooting
-Upon encountering installation/testing errors, manually install the package in question using `conda install xxx` , and remove it from `.requirements/dev.txt`. After that, re-run `pip install -r requirements.txt` to continue.
+Upon encountering installation/testing errors, manually install the package in question using `conda install xxx`, then re-run `pip install .` to continue.
-### Known issues
-- Across all platforms, we are currently aware of `scope` dependency issues with Python 3.11.
+#### Known issues
+- If using GPU-accelerated period-finding algorithms for feature generation, you will need to install [periodfind](https://github.com/ZwickyTransientFacility/periodfind) separately from the source.
+- Across all platforms, we are currently aware of `scope` dependency issues with Python 3.12.
- Anaconda continues to cause problems with environment setup.
- Using `pip` to install `healpy` on an arm64 Mac can raise an error upon import. We recommend including `h5py` as a requirement during the creation of your `conda` environment.
- On Windows machines, `healpy` and `cesium` raise errors upon installation.
@@ -93,7 +111,7 @@ Upon encountering installation/testing errors, manually install the package in q
If the installation continues to raise errors, update the conda environment and try again.
-## How to contribute
+### How to contribute
Contributions to `scope` are made through [GitHub Pull Requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests), a set of proposed commits (or patches):
@@ -144,7 +162,7 @@ Developers may merge `main` into their branch as many times as they want to.
1. Once the pull request has been reviewed and approved by at least one team member, it will be merged into `scope`.
-## Contributing Field Guide sections
+### Contributing Field Guide sections
If you would like to contribute a Field Guide section, please follow the steps below.
diff --git a/doc/quickstart.md b/doc/quickstart.md
index 6fad4ae1..c3184e70 100644
--- a/doc/quickstart.md
+++ b/doc/quickstart.md
@@ -1,16 +1,18 @@
# Quick Start Guide
-This guide is intended to facilitate quick interactions with SCoPe code after you have completed the **Installation/Developer Guidelines** section. More detailed usage info can be found in the **Usage** section. **All of the following examples assume that SCoPe is installed in your home directory. If the `scope` directory is located elsewhere, adjust the example code as necessary.**
+This guide is intended to facilitate quick interactions with SCoPe code after you have completed the **Installation/Developer Guidelines** section. More detailed usage info can be found in the **Usage** section.
## Modify `config.yaml`
To start out, provide SCoPe your training set's filepath using the `training:` `dataset:` field in `config.yaml`. The path should be a partial one starting within the `scope` directory. For example, if your training set `trainingSet.parquet` is within the `tools` directory (which itself is within `scope`), provide `tools/trainingSet.parquet` in the `dataset:` field.
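+
+For example, the corresponding entry in `config.yaml` would look like the sketch below (other keys under `training:` omitted):
+
+```
+training:
+  dataset: tools/trainingSet.parquet
+```
+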
+When running scripts, `scope` will by default use the `config.yaml` file in your current directory. You can specify a different config file by providing its path to any installed script using the `--config-path` argument.
+
## Training
Train an XGBoost binary classifier using the following code:
```
-./scope.py train --tag=vnv --algorithm=xgb --group=ss23 --period_suffix=ELS_ECE_EAOV --epochs=30 --verbose --save --plot --skip_cv
+scope-train --tag vnv --algorithm xgb --group ss23 --period-suffix ELS_ECE_EAOV --epochs 30 --verbose --save --plot --skip-cv
```
### Arguments:
@@ -20,34 +22,34 @@ Train an XGBoost binary classifier using the following code:
`--group`: if `--save` is passed, training results are saved to the group/directory named here.
-`--period_suffix`: SCoPe determines light curve periods using GPU-accelerated algorithms. These algorithms include a Lomb-Scargle approach (ELS), Conditional Entropy (ECE), Analysis of Variance (AOV), and an approach nesting all three (ELS_ECE_EAOV). Periodic features are stored with the suffix specified here.
+`--period-suffix`: SCoPe determines light curve periods using GPU-accelerated algorithms. These algorithms include a Lomb-Scargle approach (ELS), Conditional Entropy (ECE), Analysis of Variance (AOV), and an approach nesting all three (ELS_ECE_EAOV). Periodic features are stored with the suffix specified here.
-`--min_count`: requires at least min_count positive examples to run training.
+`--min-count`: requires at least min_count positive examples to run training.
`--epochs`: neural network training takes an --epochs argument that is set to 30 here.
***Notes:***
-- *The above training runs the XGB algorithm by default and skips cross-validation in the interest of time. For a full run, you can remove the `--skip_cv` argument to run a cross-validated grid search of XGB hyperparameters during training.*
+- *The above training runs the XGB algorithm by default and skips cross-validation in the interest of time. For a full run, you can remove the `--skip-cv` argument to run a cross-validated grid search of XGB hyperparameters during training.*
-- *DNN hyperparameters are optimized using a different approach - Weights and Biases Sweeps (https://docs.wandb.ai/guides/sweeps). The results of these sweeps are the default hyperparameters in the config file. To run another round of sweeps for DNN, create a WandB account and set the `--run_sweeps` keyword in the call to `scope.py train`.*
+- *DNN hyperparameters are optimized using a different approach - Weights and Biases Sweeps (https://docs.wandb.ai/guides/sweeps). The results of these sweeps are the default hyperparameters in the config file. To run another round of sweeps for DNN, create a WandB account and set the `--run-sweeps` keyword in the call to `scope-train`.*
- *SCoPe DNN training does not provide feature importance information (due to the hidden layers of the network). Feature importance is possible to estimate for neural networks, but it is more computationally expensive compared to this "free" information from XGB.*
### Train multiple classifiers with one script
-Create a shell script that contains multiple calls to `scope.py train`:
+Create a shell script that contains multiple calls to `scope-train`:
```
-./scope.py create_training_script --filename=train_xgb.sh --min_count=1000 --algorithm=xgb --period_suffix=ELS_ECE_EAOV --add_keywords="--save --plot --group=ss23 --epochs=30 --skip_cv"
+create-training-script --filename train_xgb.sh --min-count 1000 --algorithm xgb --period-suffix ELS_ECE_EAOV --add-keywords "--save --plot --group ss23 --epochs 30 --skip-cv"
```
-Modify the permissions of this script by running `chmod +x train_xgb.sh`. Run the generated training script in a terminal window (using e.g. `./train_xgb.sh`) to train multiple label sequentially.
+Modify the permissions of this script by running `chmod +x train_xgb.sh`. Run the generated training script in a terminal window (using e.g. `./train_xgb.sh`) to train multiple classifiers sequentially.
***Note:***
-- *The code will throw an error if the training script filename already exists.*
+- *The code will raise an error if the training script filename already exists.*
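+
+The generated `train_xgb.sh` contains one `scope-train` call per class; a hypothetical excerpt (the class tags and keywords will reflect your config and `--add-keywords` string) looks like:
+```
+scope-train --tag vnv --algorithm xgb --period-suffix ELS_ECE_EAOV --save --plot --group ss23 --epochs 30 --skip-cv
+scope-train --tag rrlyr --algorithm xgb --period-suffix ELS_ECE_EAOV --save --plot --group ss23 --epochs 30 --skip-cv
+```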
### Running training on HPC resources
-`train_algorithm_slurm.py` and `train_algorithm_job_submission.py` can be used generate and submit `slurm` scripts to train all classifiers in parallel using HPC resources.
+`train-algorithm-slurm` and `train-algorithm-job-submission` can be used to generate and submit `slurm` scripts to train all classifiers in parallel using HPC resources.
## Plotting Classifier Performance
SCoPe saves diagnostic plots and json files to report each classifier's performance. The below code shows the location of the validation set results for one classifier.
@@ -82,10 +84,10 @@ This code may also be placed in a loop over multiple labels to compare each clas
## Inference
-Use `tools/inference.py` to run inference on a field (297) of features (within a directory called `generated_features`). The classifiers used for this inference are within the `ss23` directory/group specified during training.
+Use `run-inference` to run inference on a field (297) of features (in this example, located in a directory called `generated_features`). The classifiers used for this inference are within the `ss23` directory/group specified during training.
```
-./scope.py create_inference_script --filename=get_all_preds_xgb.sh --group_name=ss23 --algorithm=xgb --period_suffix=ELS_ECE_EAOV --feature_directory=generated_features
+create-inference-script --filename get_all_preds_xgb.sh --group-name ss23 --algorithm xgb --period-suffix ELS_ECE_EAOV --feature-directory generated_features
```
Modify the permissions of this script using `chmod +x get_all_preds_xgb.sh`, then run on the desired field:
@@ -94,12 +96,12 @@ Modify the permissions of this script using `chmod +x get_all_preds_xgb.sh`, the
```
***Notes:***
-- *`scope.py create_inference_script` will throw an error if the inference script filename already exists.*
+- *`create-inference-script` will raise an error if the inference script filename already exists.*
- *Inference begins by imputing missing features using the strategies specified in the `features:` section of the config file.*
### Running inference on HPC resources
-`run_inference_slurm.py` and `run_inference_job_submission.py` can be used generate and submit `slurm` scripts to run inference for all classifiers in parallel using HPC resources.*
+`run-inference-slurm` and `run-inference-job-submission` can be used to generate and submit `slurm` scripts to run inference for all classifiers in parallel using HPC resources.
## Examining predictions
diff --git a/doc/usage.md b/doc/usage.md
index ca920dcd..a143e68e 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -5,49 +5,49 @@
- Create HDF5 file for single CCD/quad pair in a field:
```sh
-./get_quad_ids.py --catalog ZTF_source_features_DR16 --field 301 --ccd 2 --quad 3 --minobs 20 --skip 0 --limit 10000
+get-quad-ids --catalog ZTF_source_features_DR16 --field 301 --ccd 2 --quad 3 --minobs 20 --skip 0 --limit 10000
```
- Create multiple HDF5 files for some CCD/quad pairs in a field:
```sh
-./get_quad_ids.py --catalog ZTF_source_features_DR16 --field 301 --multi-quads --ccd-range 1 8 --quad-range 2 4 --minobs 20 --limit 10000
+get-quad-ids --catalog ZTF_source_features_DR16 --field 301 --multi-quads --ccd-range 1 8 --quad-range 2 4 --minobs 20 --limit 10000
```
- Create multiple HDF5 files for all CCD/quad pairs in a field:
```sh
-./get_quad_ids.py --catalog ZTF_source_features_DR16 --field 301 --multi-quads --minobs 20 --limit 10000
+get-quad-ids --catalog ZTF_source_features_DR16 --field 301 --multi-quads --minobs 20 --limit 10000
```
- Create single HDF5 file for all sources in a field:
```sh
-./get_quad_ids.py --catalog ZTF_source_features_DR16 --field 301 --whole-field
+get-quad-ids --catalog ZTF_source_features_DR16 --field 301 --whole-field
```
## Download SCoPe features for ZTF fields/CCDs/quadrants
-- First, run `get_quad_ids.py` for desired fields/ccds/quads.
+- First, run `get-quad-ids` for desired fields/ccds/quads.
- Download features for all sources in a field:
```sh
-./tools/get_features.py --field 301 --whole-field
+get-features --field 301 --whole-field
```
- Download features for all sources in a field, imputing missing features using the strategies in `config.yaml`:
```sh
-./tools/get_features.py --field 301 --whole-field --impute-missing-features
+get-features --field 301 --whole-field --impute-missing-features
```
- Download features for a range of ccd/quads individually:
```sh
-./tools/get_features.py --field 301 --ccd-range 1 2 --quad-range 3 4
+get-features --field 301 --ccd-range 1 2 --quad-range 3 4
```
- Download features for a single pair of ccd/quad:
```sh
-./tools/get_features.py --field 301 --ccd-range 1 --quad-range 2
+get-features --field 301 --ccd-range 1 --quad-range 2
```
@@ -59,16 +59,16 @@ please refer to [arxiv:2102.11304](https://arxiv.org/pdf/2102.11304.pdf).
-- The training pipeline can be invoked with the `scope.py` utility. For example:
+- The training pipeline can be invoked with the `scope-train` command. For example:
```sh
-./scope.py train --tag=vnv --path_dataset=data/training/dataset.d15.csv --batch_size=64 --epochs=100 --verbose=1 --pre_trained_model=models/experiment/vnv/vnv.20221117_001502.h5
+scope-train --tag vnv --path-dataset data/training/dataset.d15.csv --batch-size 64 --epochs 100 --verbose 1 --pre-trained-model models/experiment/vnv/vnv.20221117_001502.h5
```
-Refer to `./scope.py train --help` for details.
+Refer to `scope-train --help` for details.
- All the necessary metadata/configuration could be defined in `config.yaml` under `training`,
-but could also be overridden with optional `scope.py train` arguments, e.g.
-`./scope.py train ... --batch_size=32 --threshold=0.6 ...`.
+but could also be overridden with optional `scope-train` arguments, e.g.
+`scope-train ... --batch-size 32 --threshold 0.6 ...`.
-- By default, the pipeline uses the `DNN` models defined in `scope/nn.py` using the tensorflow's `keras` functional API. SCoPe also supports an implementation of XGBoost (set `--algorithm=xgb`; see `scope/xgb.py`).
+- By default, the pipeline uses the `DNN` models defined in `scope/nn.py` with TensorFlow's `keras` functional API. SCoPe also supports an implementation of XGBoost (set `--algorithm xgb`; see `scope/xgb.py`).
- If `--save` is specified during `DNN` training, an HDF5 file of the model's layers and weights will be saved. This file can be directly used for additional training and inferencing. For `XGB`, a json file will save the model along with a `.params` file with the model parameters.
- The `Dataset` class defined in `scope.utils` hides the complexity of our dataset handling "under the rug".
- You can request access to a Google Drive folder containing the latest trained models [here](https://drive.google.com/drive/folders/1_oLBxveioKtw7LyMJfism745USe9tEGZ?usp=sharing).
@@ -77,47 +77,47 @@ but could also be overridden with optional `scope.py train` arguments, e.g.
These are referenced in `config.yaml` under `training.classes..features`.
- Feature stats to be used for feature scaling/standardization before training
- is defined in `config.yaml` under `feature_stats`.
+ are either computed by the code (default) or defined in `config.yaml` under `feature_stats`.
- We use [Weights & Biases](https://wandb.com) to track experiments.
Project details and access credentials can be defined in `config.yaml` under `wandb`.
-Initially, SCoPe used a `bash` script to train all classifier families:
+Initially, SCoPe used a `bash` script to train all classifier families, e.g.:
```sh
for class in pnp longt i fla ew eb ea e agn bis blyr ceph dscu lpv mir puls rrlyr rscvn srv wuma yso; \
do echo $class; \
for state in 1 2 3 4 5 6 7 8 9 42; \
- do ./scope.py train \
- --tag=$class --path_dataset=data/training/dataset.d15.csv \
- --scale_features=min_max --batch_size=64 \
- --epochs=300 --patience=30 --random_state=$state \
- --verbose=1 --gpu=1 --conv_branch=true --save; \
+ do scope-train \
+ --tag $class --path-dataset data/training/dataset.d15.csv \
+ --scale-features min_max --batch-size 64 \
+ --epochs 300 --patience 30 --random-state $state \
+ --verbose 1 --gpu 1 --conv-branch --save; \
done; \
done;
```
-Now, a training script containing one line per class to be trained can be generated by running `./scope.py create_training_script`, for example:
+Now, a training script containing one line per class to be trained can be generated by running `create-training-script`, for example:
```bash
-./scope.py create_training_script --filename='train_dnn.sh' --min_count=100 --pre_trained_group_name='experiment' --add_keywords='--save --batch_size=32 --group=new_experiment --period_suffix=ELS_ECE_EAOV'
+create-training-script --filename train_dnn.sh --min-count 100 --pre-trained-group-name experiment --add-keywords '--save --batch-size 32 --group new_experiment --period-suffix ELS_ECE_EAOV'
```
-A path to the training set may be provided as input to this method or otherwise taken from `config.yaml` (`training: dataset:`). To continue training on existing models, specify the `--pre_trained_group_name` keyword containing the models in `create_training_script`. If training on a feature collection containing multiple sets of periodic features (from different algorithms), set the suffix corresponding to the desired algorithm using `--period_suffix` or the `features: info: period_suffix:` field in the config file. The string specified in `--add_keywords` serves as a catch-all for additional keywords that the user wishes to be included in each line of the script.
+A path to the training set may be provided as input to this method or otherwise taken from `config.yaml` (`training: dataset:`). To continue training on existing models, specify the `--pre-trained-group-name` keyword containing the models in `create-training-script`. If training on a feature collection containing multiple sets of periodic features (from different algorithms), set the suffix corresponding to the desired algorithm using `--period-suffix` or the `features: info: period_suffix:` field in the config file. The string specified in `--add-keywords` serves as a catch-all for additional keywords that the user wishes to be included in each line of the script.
-If `--pre_trained_group_name` is specified and the `--train_all` keyword is set, the output script will train all classes specified in `config.yaml` regardless of whether they have a pre-trained model. If `--train_all` is not set (the default), the script will limit training to classes that have an existing trained model.
+If `--pre-trained-group-name` is specified and the `--train-all` keyword is set, the output script will train all classes specified in `config.yaml` regardless of whether they have a pre-trained model. If `--train-all` is not set (the default), the script will limit training to classes that have an existing trained model.
## Running inference
Running inference requires the following steps: download ids of a field, download (or generate) features for all downloaded ids, run inference for all available trained models, e.g:
```
-./tools/get_quad_ids.py --field= --whole_field
-./tools/get_features.py --field= --whole_field --impute_missing_features
+get-quad-ids --field <field_number> --whole-field
+get-features --field <field_number> --whole-field --impute-missing-features
```
OR
```
-./tools/generate_features.py --field --ccd --quad --doGPU
+generate-features --field <field_number> --ccd <ccd_number> --quad <quad_number> --doGPU
```
-The optimal way to run inference is through an inference script generated by running `./scope.py create_inference_script` with the appropriate arguments. After creating the script and adding the needed permissions (e.g. using `chmod +x`), the commands to run inference on the field `` are (in order):
+The optimal way to run inference is through an inference script generated by running `create-inference-script` with the appropriate arguments. After creating the script and adding the needed permissions (e.g. using `chmod +x`), the commands to run inference on the field `<field_number>` are (in order):
```
./get_all_preds.sh
```
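+
+For reference, an inference script like `get_all_preds.sh` can be generated beforehand along these lines (the group, algorithm and feature-directory names are placeholders mirroring the quickstart example):
+```
+create-inference-script --filename get_all_preds.sh --group-name ss23 --algorithm xgb --period-suffix ELS_ECE_EAOV --feature-directory generated_features
+```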
@@ -173,45 +173,44 @@ The fields associated with each key are `fritz_label` (containing the associated
```
## Generating features
-Code has been adapted from [ztfperiodic](https://github.com/mcoughlin/ztfperiodic) and other sources to calculate basic and Fourier stats for light curves along with other features. This allows new features to be generated with SCoPe, both locally and using GPU cluster resources. The feature generation script is contained within `tools/generate_features.py`.
+Code has been adapted from [ztfperiodic](https://github.com/mcoughlin/ztfperiodic) and other sources to calculate basic and Fourier stats for light curves along with other features. This allows new features to be generated with SCoPe, both locally and using GPU cluster resources. The feature generation script is run using the `generate-features` command.
Currently, the basic stats are calculated via `tools/featureGeneration/lcstats.py`, and a host of period-finding algorithms are available in `tools/featureGeneration/periodsearch.py`. Among the CPU-based period-finding algorithms, there is not yet support for `AOV_cython`. For the `AOV` algorithm to work, run `source build.sh` in the `tools/featureGeneration/pyaov/` directory, then copy the newly created `.so` file (`aov.cpython-310-darwin.so` or similar) to `lib/python3.10/site-packages/` or equivalent within your environment. The GPU-based algorithms require CUDA support (so Mac GPUs are not supported).
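+
+A sketch of the `AOV` build steps described above (the exact site-packages path depends on your environment and Python version):
+```sh
+cd tools/featureGeneration/pyaov
+source build.sh
+# copy the compiled extension into the active environment's site-packages
+cp aov.cpython-*.so "$(python -c 'import site; print(site.getsitepackages()[0])')"
+```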
inputs:
-1. --source_catalog* : name of Kowalski catalog containing ZTF sources (str)
-2. --alerts_catalog* : name of Kowalski catalog containing ZTF alerts (str)
-3. --gaia_catalog* : name of Kowalski catalog containing Gaia data (str)
-4. --bright_star_query_radius_arcsec : maximum angular distance from ZTF sources to query nearby bright stars in Gaia (float)
-5. --xmatch_radius_arcsec : maximum angular distance from ZTF sources to match external catalog sources (float)
-6. --kowalski_instances* : dictionary containing {names of Kowalski instances : authenticated penquins.Kowalski objects} (dict)
-7. --limit : maximum number of sources to process in batch queries / statistics calculations (int)
-8. --period_algorithms* : dictionary containing names of period algorithms to run. Normally specified in config - if specified here, should be a (list)
-9. --period_batch_size : maximum number of sources to simultaneously perform period finding (int)
-10. --doCPU : flag to run config-specified CPU period algorithms (bool)
-11. --doGPU : flag to run config-specified GPU period algorithms (bool)
-12. --samples_per_peak : number of samples per periodogram peak (int)
-13. --doScaleMinPeriod : for period finding, scale min period based on min_cadence_minutes (bool). Otherwise, set --max_freq to desired value
-14. --doRemoveTerrestrial : remove terrestrial frequencies from period-finding analysis (bool)
-15. --Ncore : number of CPU cores to parallelize queries (int)
-16. --field : ZTF field to run (int)
-17. --ccd : ZTF ccd to run (int)
-18. --quad : ZTF quadrant to run (int)
-19. --min_n_lc_points : minimum number of points required to generate features for a light curve (int)
-20. --min_cadence_minutes : minimum cadence between light curve points. Higher-cadence data are dropped except for the first point in the sequence (float)
-21. --dirname : name of generated feature directory (str)
-22. --filename : prefix of each feature filename (str)
-23. --doCesium : flag to compute config-specified cesium features in addition to default list (bool)
-24. --doNotSave : flag to avoid saving generated features (bool)
-25. --stop_early : flag to stop feature generation before entire quadrant is run. Pair with --limit to run small-scale tests (bool)
-26. --doQuadrantFile : flag to use a generated file containing [jobID, field, ccd, quad] columns instead of specifying --field, --ccd and --quad (bool)
-27. --quadrant_file : name of quadrant file in the generated_features/slurm directory or equivalent (str)
-28. --quadrant_index : number of job in quadrant file to run (int)
-29. --doSpecificIDs: flag to perform feature generation for ztf_id column in config-specified file (bool)
-30. --skipCloseSources: flag to skip removal of sources too close to bright stars via Gaia (bool)
-31. --top_n_periods: number of (E)LS, (E)CE periods to pass to (E)AOV if using (E)LS_(E)CE_(E)AOV algorithm (int)
-32. --max_freq: maximum frequency [1 / days] to use for period finding (float). Overridden by --doScaleMinPeriod
-33. --fg_dataset*: path to parquet, hdf5 or csv file containing specific sources for feature generation (str)
-34. --max_timestamp_hjd*: maximum timestamp of queried light curves, HJD (float)
+1. --source-catalog* : name of Kowalski catalog containing ZTF sources (str)
+2. --alerts-catalog* : name of Kowalski catalog containing ZTF alerts (str)
+3. --gaia-catalog* : name of Kowalski catalog containing Gaia data (str)
+4. --bright-star-query-radius-arcsec : maximum angular distance from ZTF sources to query nearby bright stars in Gaia (float)
+5. --xmatch-radius-arcsec : maximum angular distance from ZTF sources to match external catalog sources (float)
+6. --limit : maximum number of sources to process in batch queries / statistics calculations (int)
+7. --period-algorithms* : names of period algorithms to run. Normally specified in the config as a dictionary; if specified here, should be given as a list (list)
+8. --period-batch-size : maximum number of sources to simultaneously perform period finding (int)
+9. --doCPU : flag to run config-specified CPU period algorithms (bool)
+10. --doGPU : flag to run config-specified GPU period algorithms (bool)
+11. --samples-per-peak : number of samples per periodogram peak (int)
+12. --doScaleMinPeriod : for period finding, scale min period based on min-cadence-minutes (bool). Otherwise, set --max-freq to desired value
+13. --doRemoveTerrestrial : remove terrestrial frequencies from period-finding analysis (bool)
+14. --Ncore : number of CPU cores to parallelize queries (int)
+15. --field : ZTF field to run (int)
+16. --ccd : ZTF ccd to run (int)
+17. --quad : ZTF quadrant to run (int)
+18. --min-n-lc-points : minimum number of points required to generate features for a light curve (int)
+19. --min-cadence-minutes : minimum cadence between light curve points. Higher-cadence data are dropped except for the first point in the sequence (float)
+20. --dirname : name of generated feature directory (str)
+21. --filename : prefix of each feature filename (str)
+22. --doCesium : flag to compute config-specified cesium features in addition to default list (bool)
+23. --doNotSave : flag to avoid saving generated features (bool)
+24. --stop-early : flag to stop feature generation before entire quadrant is run. Pair with --limit to run small-scale tests (bool)
+25. --doQuadrantFile : flag to use a generated file containing [jobID, field, ccd, quad] columns instead of specifying --field, --ccd and --quad (bool)
+26. --quadrant-file : name of quadrant file in the generated_features/slurm directory or equivalent (str)
+27. --quadrant-index : number of job in quadrant file to run (int)
+28. --doSpecificIDs: flag to perform feature generation for ztf_id column in config-specified file (bool)
+29. --skipCloseSources: flag to skip removal of sources too close to bright stars via Gaia (bool)
+30. --top-n-periods: number of (E)LS, (E)CE periods to pass to (E)AOV if using (E)LS_(E)CE_(E)AOV algorithm (int)
+31. --max-freq: maximum frequency [1 / days] to use for period finding (float). Overridden by --doScaleMinPeriod
+32. --fg-dataset*: path to parquet, hdf5 or csv file containing specific sources for feature generation (str)
+33. --max-timestamp-hjd*: maximum timestamp of queried light curves, HJD (float)
output:
feature_df : dataframe containing generated features
@@ -222,7 +221,7 @@ feature_df : dataframe containing generated features
The following is an example of running the feature generation script locally:
```
-./generate_features.py --field 301 --ccd 2 --quad 4 --source_catalog ZTF_sources_20230109 --alerts_catalog ZTF_alerts --gaia_catalog Gaia_EDR3 --bright_star_query_radius_arcsec 300.0 --xmatch_radius_arcsec 2.0 --query_size_limit 10000 --period_batch_size 1000 --samples_per_peak 10 --Ncore 4 --min_n_lc_points 50 --min_cadence_minutes 30.0 --dirname generated_features --filename gen_features --doCPU --doRemoveTerrestrial --doCesium
+generate-features --field 301 --ccd 2 --quad 4 --source-catalog ZTF_sources_20230109 --alerts-catalog ZTF_alerts --gaia-catalog Gaia_EDR3 --bright-star-query-radius-arcsec 300.0 --xmatch-radius-arcsec 2.0 --query-size-limit 10000 --period-batch-size 1000 --samples-per-peak 10 --Ncore 4 --min-n-lc-points 50 --min-cadence-minutes 30.0 --dirname generated_features --filename gen_features --doCPU --doRemoveTerrestrial --doCesium
```
Setting `--doCPU` will run the config-specified CPU period algorithms on each source. Setting `--doGPU` instead will do likewise with the specified GPU algorithms. If neither of these keywords is set, the code will assign a value of `1.0` to each period and compute Fourier statistics using that number.
@@ -230,34 +229,36 @@ Setting `--doCPU` will run the config-specified CPU period algorithms on each so
-Below is an example run the script using a job/quadrant file (containing [job id, field, ccd, quad] columns) instead of specifying field/ccd/quad directly:
+Below is an example of running the script using a job/quadrant file (containing [job id, field, ccd, quad] columns) instead of specifying field/ccd/quad directly:
```
-/home/bhealy/scope/tools/generate_features.py --source_catalog ZTF_sources_20230109 --alerts_catalog ZTF_alerts --gaia_catalog Gaia_EDR3 --bright_star_query_radius_arcsec 300.0 --xmatch_radius_arcsec 2.0 --query_size_limit 10000 --period_batch_size 1000 --samples_per_peak 10 --Ncore 20 --min_n_lc_points 50 --min_cadence_minutes 30.0 --dirname generated_features_DR15 --filename gen_features --doGPU --doRemoveTerrestrial --doCesium --doQuadrantFile --quadrant_file slurm.dat --quadrant_index 5738
+generate-features --source-catalog ZTF_sources_20230109 --alerts-catalog ZTF_alerts --gaia-catalog Gaia_EDR3 --bright-star-query-radius-arcsec 300.0 --xmatch-radius-arcsec 2.0 --query-size-limit 10000 --period-batch-size 1000 --samples-per-peak 10 --Ncore 20 --min-n-lc-points 50 --min-cadence-minutes 30.0 --dirname generated_features_DR15 --filename gen_features --doGPU --doRemoveTerrestrial --doCesium --doQuadrantFile --quadrant-file slurm.dat --quadrant-index 5738
```
### Slurm scripts
-For large-scale feature generation, `generate_features.py` is intended to be run on a high-performance computing cluster. Often these clusters require jobs to be submitted using a utility like `slurm` (Simple Linux Utility for Resource Management) to generate scripts. These scripts contain information about the type, amount and duration of computing resources to allocate to the user.
+For large-scale feature generation, `generate-features` is intended to be run on a high-performance computing cluster. These clusters often require jobs to be submitted using a utility like `slurm` (Simple Linux Utility for Resource Management). The resulting `slurm` scripts contain information about the type, amount and duration of computing resources to allocate to the user.
-Scope's `generate_features_slurm.py` code creates two slurm scripts: (1) runs single instance of `generate_features.py`, and (2) runs the `generate_features_job_submission.py` which submits multiple jobs in parallel, periodically checking to see if additional jobs can be started. See below for more information about these components of feature generation.
+SCoPe's `generate-features-slurm` code creates two slurm scripts: the first runs a single instance of `generate-features`, and the second runs `generate-features-job-submission`, which submits multiple jobs in parallel, periodically checking whether additional jobs can be started. See below for more information about these components of feature generation.
-`generate_features_slurm.py` can receive all of the arguments used by `generate_features.py`. These arguments are passed to the instances of feature generation begun by running slurm script (1). There are also additional arguments specific to cluster resource management:
+`generate-features-slurm` can receive all of the arguments used by `generate-features`. These arguments are passed to the instances of feature generation begun by running slurm script (1). There are also additional arguments specific to cluster resource management:
inputs:
-1. --job_name : name of submitted jobs (str)
-2. --cluster_name : name of HPC cluster (str)
-3. --partition_type : cluster partition to use (str)
+1. --job-name : name of submitted jobs (str)
+2. --cluster-name : name of HPC cluster (str)
+3. --partition-type : cluster partition to use (str)
4. --nodes : number of nodes to request (int)
5. --gpus : number of GPUs to request (int)
-6. --memory_GB : amount of memory to request in GB (int)
-7. --time : amount of time before instance times out (str)
-8. --mail_user: user's email address for job updates (str)
-9. --account_name : name of account having HPC allocation (str)
-10. --python_env_name : name of Python environment to activate before running `generate_features.py` (str)
-11. --kowalski_instance_name : name of Kowalski instance containing ZTF source catalog (str)
+6. --memory-GB : amount of memory to request in GB (int)
+7. --submit-memory-GB : Memory allocation to request for job submission (int)
+8. --time : amount of time before instance times out (str)
+9. --mail-user: user's email address for job updates (str)
+10. --account-name : name of account having HPC allocation (str)
+11. --python-env-name : name of Python environment to activate before running `generate-features` (str)
12. --generateQuadrantFile : flag to map fields/ccds/quads containing sources to job numbers, save file (bool)
-13. --max_instances : maximum number of HPC instances to run in parallel (int)
-14. --wait_time_minutes : amount of time to wait between status checks in minutes (float)
-15. --doSubmitLoop : flag to run loop initiating instances until out of jobs (hard on Kowalski)
-16. --runParallel : flag to run jobs in parallel using slurm [recommended]. Otherwise, run in series on a single instance
-17. --user : if using slurm, your username. This will be used to periodically run `squeue` and list your running jobs (str)
+13. --field-list : space-separated list of fields for which to generate quadrant file. If None, all populated fields included (int)
+14. --max-instances : maximum number of HPC instances to run in parallel (int)
+15. --wait-time-minutes : amount of time to wait between status checks in minutes (float)
+16. --doSubmitLoop : flag to run loop initiating instances until out of jobs (hard on Kowalski)
+17. --runParallel : flag to run jobs in parallel using slurm [recommended]. Otherwise, run in series on a single instance
+18. --user : if using slurm, your username. This will be used to periodically run `squeue` and list your running jobs (str)
+19. --submit-interval-minutes : Time to wait between job submissions, minutes (float)
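+
+A hypothetical invocation combining these resource arguments (cluster-specific values are placeholders; the usual `generate-features` arguments may be appended as well):
+```
+generate-features-slurm --job-name gen_features --cluster-name <cluster> --partition-type <partition> --nodes 1 --gpus 1 --memory-GB 64 --time 48:00:00 --mail-user <email> --account-name <account> --python-env-name scope-env --max-instances 20 --wait-time-minutes 5 --runParallel --user <username> --doGPU
+```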
## Feature definitions
### Selected phenomenological feature definitions
@@ -265,21 +266,21 @@ inputs:
| name | definition |
| ---- | ---------- |
|ad | Anderson-Darling statistic |
-|chi2red | Reduced chi^2 |
-|f1_BIC | Bayesian information criterion, first order (Fourier analysis) |
-|f1_a | a coefficient, first order (Fourier analysis) |
-|f1_amp | Amplitude, first order (Fourier analysis) |
-|f1_b | b coefficient (Fourier analysis) |
-|f1_phi0 | Zero-phase, first order (Fourier analysis) |
-|f1_power | Power, first order (Fourier analysis) |
-|f1_relamp1 | Relative amplitude, first order (Fourier analysis) |
-|f1_relamp2 | Relative amplitude, second order (Fourier analysis) |
-|f1_relamp3 | Relative amplitude, third order (Fourier analysis) |
-|f1_relamp4 | Relative amplitude, fourth order (Fourier analysis) |
-|f1_relphi1 | Relative phase, first order (Fourier analysis) |
-|f1_relphi2 | Relative phase, second order (Fourier analysis) |
-|f1_relphi3 | Relative phase, third order (Fourier analysis) |
-|f1_relphi4 | Relative phase, fourth order (Fourier analysis) |
+|chi2red | Reduced chi^2 after mean subtraction |
+|f1_BIC | Bayesian information criterion of best-fitting series (Fourier analysis) |
+|f1_a | a coefficient of best-fitting series (Fourier analysis) |
+|f1_amp | Amplitude of best-fitting series (Fourier analysis) |
+|f1_b | b coefficient of best-fitting series (Fourier analysis) |
+|f1_phi0 | Zero-phase of best-fitting series (Fourier analysis) |
+|f1_power | Normalized chi^2 of best-fitting series (Fourier analysis) |
+|f1_relamp1 | Relative amplitude, first harmonic (Fourier analysis) |
+|f1_relamp2 | Relative amplitude, second harmonic (Fourier analysis) |
+|f1_relamp3 | Relative amplitude, third harmonic (Fourier analysis) |
+|f1_relamp4 | Relative amplitude, fourth harmonic (Fourier analysis) |
+|f1_relphi1 | Relative phase, first harmonic (Fourier analysis) |
+|f1_relphi2 | Relative phase, second harmonic (Fourier analysis) |
+|f1_relphi3 | Relative phase, third harmonic (Fourier analysis) |
+|f1_relphi4 | Relative phase, fourth harmonic (Fourier analysis) |
|i60r | Mag ratio between 20th, 80th percentiles |
|i70r | Mag ratio between 15th, 85th percentiles |
|i80r | Mag ratio between 10th, 90th percentiles |
@@ -315,6 +316,7 @@ inputs:
| AllWISE_w3mpro | AllWISE W3 mag |
| AllWISE_w4mpro | AllWISE W4 mag |
| Gaia_EDR3__parallax | Gaia parallax |
+| Gaia_EDR3__parallax_error | Gaia parallax error |
| Gaia_EDR3__phot_bp_mean_mag | Gaia BP mag |
| Gaia_EDR3__phot_bp_rp_excess_factor | Gaia BP-RP excess factor |
| Gaia_EDR3__phot_g_mean_mag | Gaia G mag |
@@ -340,12 +342,12 @@ It is useful to know the classifications of any persistent ZTF sources that are
To set up a `cron` job, first run `EDITOR=emacs crontab -e`. You can replace `emacs` with your text editor of choice as long as it is installed on your machine. This command will open a text file in which to place `cron` commands. An example command is as follows:
```bash
-0 */2 * * * ~/scope/gcn_cronjob.py > ~/scope/log_gcn_cronjob.txt 2>&1
+0 */2 * * * cd scope && ~/miniforge3/envs/scope-env/bin/python ~/scope/gcn_cronjob.py > ~/scope/log_gcn_cronjob.txt 2>&1
```
Above, the `0 */2 * * *` means that this command will run every two hours, on minute 0 of that hour. Time increments increase from left to right; in this example, the five numbers are minute, hour, day (of month), month, day (of week). The `*/2` means that the hour has to be divisible by 2 for the job to run. Check out [crontab.guru](https://crontab.guru) to learn more about `cron` timing syntax.
-Next in the line, `~/scope/gcn_cronjob.py` is the command that gets run. The `>` character forwards the output from the command (e.g. what your script prints) into a log file in a specific location (here `~/scope/log_gcn_cronjob.txt`). Finally, the `2>&1` suppresses 'emails' from `cron` about the status of your job (unnecessary since the log is being saved to the user-specified file).
+Next in the line, we change directories to `scope` in order for the code to access our `config.yaml` file located in this directory. Then, `~/miniforge3/envs/scope-env/bin/python ~/scope/gcn_cronjob.py` is the command that gets run (using the Python environment installed in `scope-env`). The `>` character forwards the output from the command (e.g. what your script prints) into a log file in a specific location (here `~/scope/log_gcn_cronjob.txt`). Finally, the `2>&1` suppresses 'emails' from `cron` about the status of your job (unnecessary since the log is being saved to the user-specified file).
Save the text file once you finish modifying it to install the cron job. **Ensure that the last line of your file is a newline to avoid issues when running.** Your computer may pop up a window to which you should respond in the affirmative in order to successfully initialize the job. To check which `cron` jobs have been installed, run `crontab -l`. To uninstall your jobs, run `crontab -r`.
@@ -355,18 +357,18 @@ Because `cron` runs in a simple environment, the usual details of environment se
```
PYTHONPATH = /Users/username/scope
-0 */2 * * * /opt/homebrew/bin/gtimeout 2h ~/miniforge3/envs/scope-env/bin/python ~/scope/gcn_cronjob.py > ~/scope/log_gcn_cronjob.txt 2>&1
+0 */2 * * * /opt/homebrew/bin/gtimeout 2h ~/miniforge3/envs/scope-env/bin/python scope-gcn-cronjob > ~/scope/log_gcn_cronjob.txt 2>&1
```
In the first line above, the `PYTHONPATH` environment variable is defined to include the `scope` directory. Without this line, any code that imports from `scope` will throw an error, since the user's usual `PYTHONPATH` variable is not accessed in the `cron` environment.
-The second line begins with the familiar `cron` timing pattern described above. It continues by specifying the a maximum runtime of 2 hours before timing out using the `gtimeout` command. On a Mac, this can be installed with `homebrew` by running `brew install coreutils`. Note that the full path to `gtimeout` must be specified. After the timeout comes the call to the `gcn_cronjob.py` script. Note that the usual `#/usr/bin/env python` line at the top of SCoPe's python scripts does not work within the `cron` environment. Instead, `python` must be explicitly specified, and in order to have access to the modules installed in `scope-env` we must provide a full path like the one above (`~/miniforge3/envs/scope-env/bin/python`). The line concludes by sending the script's output to a dedicated log file. This file gets overwritten each time the script runs.
+The second line begins with the familiar `cron` timing pattern described above. It continues by specifying a maximum runtime of 2 hours before timing out using the `gtimeout` command. On a Mac, this can be installed with `homebrew` by running `brew install coreutils`. Note that the full path to `gtimeout` must be specified. After the timeout comes the call to the `gcn_cronjob.py` script. Note that the usual `#!/usr/bin/env python` line at the top of SCoPe's python scripts does not work within the `cron` environment. Instead, `python` must be explicitly specified, and in order to have access to the modules and scripts installed in `scope-env` we must provide a full path like the one above (`~/miniforge3/envs/scope-env/bin/python`). The line concludes by sending the script's output to a dedicated log file. This file gets overwritten each time the script runs.
### Check if `cron` job is running
It can be useful to know whether the script within a cron job is currently running. One way to do this for `gcn_cronjob.py` is to run the command `ps aux | grep gcn_cronjob.py`. This will always return one item (representing the command you just ran), but if the script is currently running you will see more than one item.
## Local feature generation/inference
-SCoPe contains a script that runs local feature generation and inference on sources specified in an input file. Example input files are contained within the `tools` directory (`local_scope_radec.csv` and `local_scope_ztfid.csv`). After receiving either ra/dec coordinates or ZTF light curve IDs (plus an object ID for each entry), the `run_scope_local.py` script will generate features and run inference using existing trained models, saving the results to timestamped directories. This script accepts most arguments from `generate_features.py` and `inference.py`. Additional inputs specific to this script are listed below.
+SCoPe contains a script that runs local feature generation and inference on sources specified in an input file. Example input files are contained within the `tools` directory (`local_scope_radec.csv` and `local_scope_ztfid.csv`). After receiving either ra/dec coordinates or ZTF light curve IDs (plus an object ID for each entry), the `run-scope-local` script will generate features and run inference using existing trained models, saving the results to timestamped directories. This script accepts most arguments from `generate-features` and `scope-inference`. Additional inputs specific to this script are listed below.
inputs:
1. --path-dataset : path (from base scope directory or fully qualified) to parquet, hdf5 or csv file containing specific sources (str)
@@ -380,28 +382,28 @@ current_dt : formatted datetime string used to label output directories
### Example usage
```
-./run_scope_local.py --path-dataset tools/local_scope_ztfid.csv --doCPU --doRemoveTerrestrial --scale_features min_max --group-names DR16_stats nobalance_DR16_DNN_stats --algorithms xgb
+run-scope-local --path-dataset tools/local_scope_ztfid.csv --doCPU --doRemoveTerrestrial --scale-features min_max --group-names DR16_stats nobalance_DR16_DNN_stats --algorithms xgb
-./run_scope_local.py --path-dataset tools/local_scope_radec.csv --doCPU --write_csv --doRemoveTerrestrial --group-names DR16_stats nobalance_DR16_DNN_stats --algorithms xgb dnn
+run-scope-local --path-dataset tools/local_scope_radec.csv --doCPU --write-csv --doRemoveTerrestrial --group-names DR16_stats nobalance_DR16_DNN_stats --algorithms xgb dnn
```
-## Scope Download Classification
+## scope-download-classification
inputs:
1. --file : CSV file containing obj_id and/or ra dec coordinates. Set to "parse" to download sources by group id.
-2. --group_ids : target group id(s) on Fritz for download (if CSV file not provided)
+2. --group-ids : target group id(s) on Fritz for download, space-separated (if CSV file not provided)
3. --start : Index or page number (if in "parse" mode) to begin downloading (optional)
-4. --merge_features : Flag to merge features from Kowalski with downloaded sources
-5. --features_catalog : Name of features catalog to query
-6. --features_limit : Limit on number of sources to query at once
-7. --taxonomy_map : Filename of taxonomy mapper (JSON format)
-8. --output_dir : Name of directory to save downloaded files
-9. --output_filename : Name of file containing merged classifications and features
-10. --output_format : Output format of saved files, if not specified in (9). Must be one of parquet, h5, or csv.
-11. --get_ztf_filters : Flag to add ZTF filter IDs (separate catalog query) to default features
-12. --impute_missing_features : Flag to impute missing features using scope.utils.impute_features
-13. --update_training_set : if downloading an active learning sample, update the training set with the new classification based on votes
-14. --updated_training_set_prefix : Prefix to add to updated training set file
-15. --min_vote_diff : Minimum number of net votes (upvotes - downvotes) to keep an active learning classification. Caution: if zero, all classifications of reviewed sources will be added
+4. --merge-features : Flag to merge features from Kowalski with downloaded sources
+5. --features-catalog : Name of features catalog to query
+6. --features-limit : Limit on number of sources to query at once
+7. --taxonomy-map : Filename of taxonomy mapper (JSON format)
+8. --output-dir : Name of directory to save downloaded files
+9. --output-filename : Name of file containing merged classifications and features
+10. --output-format : Output format of saved files, if not specified in (9). Must be one of parquet, h5, or csv.
+11. --get-ztf-filters : Flag to add ZTF filter IDs (separate catalog query) to default features
+12. --impute-missing-features : Flag to impute missing features using scope.utils.impute_features
+13. --update-training-set : if downloading an active learning sample, update the training set with the new classification based on votes
+14. --updated-training-set-prefix : Prefix to add to updated training set file
+15. --min-vote-diff : Minimum number of net votes (upvotes - downvotes) to keep an active learning classification. Caution: if zero, all classifications of reviewed sources will be added
process:
1. if CSV file provided, query by object ids or ra, dec
@@ -411,60 +413,60 @@ process:
5. if merge_features, query Kowalski and merge sources with features, saving new CSV file
6. Fritz sources with multiple associated ZTF IDs will generate multiple rows in the merged feature file
7. To skip the source download part of the code, provide an input CSV file containing columns named 'obj_id', 'classification', 'probability', 'period_origin', 'period', 'ztf_id_origin', and 'ztf_id'.
-8. Set `--update_training_set` to read the config-specified training set and merge new sources/classifications from an active learning group
+8. Set `--update-training-set` to read the config-specified training set and merge new sources/classifications from an active learning group
output: data with new columns appended.
```sh
-./scope_download_classification.py --file sample.csv --group_ids 360 361 --start 10 --merge_features True --features_catalog ZTF_source_features_DR16 --features_limit 5000 --taxonomy_map golden_dataset_mapper.json --output_dir fritzDownload --output_filename merged_classifications_features --output_format parquet -get_ztf_filters --impute_missing_features
+scope-download-classification --file sample.csv --group-ids 360 361 --start 10 --merge-features True --features-catalog ZTF_source_features_DR16 --features-limit 5000 --taxonomy-map golden_dataset_mapper.json --output-dir fritzDownload --output-filename merged_classifications_features --output-format parquet --get-ztf-filters --impute-missing-features
```
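An illustrative active-learning download combining the vote-based options above (the group id, prefix, and vote threshold here are placeholders, not recommendations):
```sh
scope-download-classification --file sample.csv --group-ids 1544 --merge-features True --features-catalog ZTF_source_features_DR16 --taxonomy-map golden_dataset_mapper.json --output-format parquet --update-training-set --updated-training-set-prefix AL_ --min-vote-diff 1
```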
-## Scope Download GCN Sources
+## scope-download-gcn-sources
inputs:
1. --dateobs: unique dateObs of GCN event (str)
-2. --group_ids: group ids to query sources [all if not specified] (list)
-3. --days_range: max days past event to search for sources (float)
-4. --radius_arcsec: radius [arcsec] around new sources to search for existing ZTF sources (float)
-5. --save_filename: filename to save source ids/coordinates (str)
+2. --group-ids: group ids to query sources, space-separated [all if not specified] (list)
+3. --days-range: max days past event to search for sources (float)
+4. --radius-arcsec: radius [arcsec] around new sources to search for existing ZTF sources (float)
+5. --save-filename: filename to save source ids/coordinates (str)
process:
1. query all sources associated with GCN event
2. get fritz names, ras and decs for each page of sources
-3. save json file in a useful format to use with `generate_features.py --doSpecificIDs`
+3. save json file in a useful format to use with `generate-features --doSpecificIDs`
```sh
-./scope_download_gcn_sources.py --dateobs 2023-05-21T05:30:43
+scope-download-gcn-sources --dateobs 2023-05-21T05:30:43
```
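A fuller invocation sketch using the options above (the values shown are placeholders, not defaults):
```sh
scope-download-gcn-sources --dateobs 2023-05-21T05:30:43 --group-ids 1544 --days-range 7.0 --radius-arcsec 0.5 --save-filename fritzDownload/specific_ids_GCN_sources
```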
-## Scope Upload Classification
+## scope-upload-classification
inputs:
1. --file : path to CSV, HDF5 or Parquet file containing ra, dec, period, and labels
-2. --group_ids : target group id(s) on Fritz for upload
+2. --group-ids : target group id(s) on Fritz for upload, space-separated
3. --classification : Name(s) of input file columns containing classification probabilities (one column per label). Set this to "read" to automatically upload all classes specified in the taxonomy mapper at once.
-4. --taxonomy_map : Filename of taxonomy mapper (JSON format)
+4. --taxonomy-map : Filename of taxonomy mapper (JSON format)
5. --comment : Comment to post (if specified)
6. --start : Index to start uploading (zero-based)
7. --stop : Index to stop uploading (inclusive)
-8. --classification_origin: origin of classifications. If 'SCoPe' (default), Fritz will apply custom color-coding
-9. --skip_phot : flag to skip photometry upload (skips for existing sources only)
-10. --post_survey_id : flag to post an annotation for the Gaia, AllWISE or PS1 id associated with each source
-11. --survey_id_origin : Annotation origin name for survey_id
-12. --p_threshold : Probability threshold for posted classification (values must be >= than this number to post)
-13. --match_ids : flag to match input and existing survey_id values during upload. It is recommended to instead match obj_ids (see next line)
-14. --use_existing_obj_id : flag to use existing source names in a column named 'obj_id' (a coordinate-based ID is otherwise generated by default)
-15. --post_upvote : flag to post an upvote to newly uploaded classifications. Not recommended when posting automated classifications for active learning.
-16. --check_labelled_box : flag to check the 'labelled' box for each source when uploading classifications. Not recommended when posting automated classifications for active learning.
-17. --write_obj_id : flag to output a copy of the input file with an 'obj_id' column containing the coordinate-based IDs for each posted object. Use this file as input for future uploads to add to this column.
-18. --result_dir : name of directory where upload results file is saved. Default is 'fritzUpload' within the tools directory.
-19. --result_filetag: name of tag appended to the result filename. Default is 'fritzUpload'.
-20. --result_format : result file format; one of csv, h5 or parquet. Default is parquet.
-21. --replace_classifications : flag to delete each source's existing classifications before posting new ones.
-22. --radius_arcsec: photometry search radius for uploaded sources.
-23. --no_ml: flag to post classifications that do not originate from an ML classifier.
-24. --post_phot_as_comment: flag to post photometry as a comment on the source (bool)
-25. --post_phasefolded_phot: flag to post phase-folded photometry as comment in addition to time series (bool)
-26. --phot_dirname: name of directory in which to save photometry plots (str)
-27. --instrument_name: name of instrument used for observations (str)
+8. --classification-origin: origin of classifications. If 'SCoPe' (default), Fritz will apply custom color-coding
+9. --skip-phot : flag to skip photometry upload (skips for existing sources only)
+10. --post-survey-id : flag to post an annotation for the Gaia, AllWISE or PS1 id associated with each source
+11. --survey-id-origin : Annotation origin name for survey_id
+12. --p-threshold : Probability threshold for posted classification (values must be >= than this number to post)
+13. --match-ids : flag to match input and existing survey_id values during upload. It is recommended to instead match obj_ids (see next line)
+14. --use-existing-obj-id : flag to use existing source names in a column named 'obj_id' (a coordinate-based ID is otherwise generated by default)
+15. --post-upvote : flag to post an upvote to newly uploaded classifications. Not recommended when posting automated classifications for active learning.
+16. --check-labelled-box : flag to check the 'labelled' box for each source when uploading classifications. Not recommended when posting automated classifications for active learning.
+17. --write-obj-id : flag to output a copy of the input file with an 'obj_id' column containing the coordinate-based IDs for each posted object. Use this file as input for future uploads to add to this column.
+18. --result-dir : name of directory where upload results file is saved. Default is 'fritzUpload' within the tools directory.
+19. --result-filetag: name of tag appended to the result filename. Default is 'fritzUpload'.
+20. --result-format : result file format; one of csv, h5 or parquet. Default is parquet.
+21. --replace-classifications : flag to delete each source's existing classifications before posting new ones.
+22. --radius-arcsec: photometry search radius for uploaded sources.
+23. --no-ml: flag to post classifications that do not originate from an ML classifier.
+24. --post-phot-as-comment: flag to post photometry as a comment on the source (bool)
+25. --post-phasefolded-phot: flag to post phase-folded photometry as comment in addition to time series (bool)
+26. --phot-dirname: name of directory in which to save photometry plots (str)
+27. --instrument-name: name of instrument used for observations (str)
process:
0. include Kowalski host, port, protocol, and token or username+password in config.yaml
@@ -476,14 +478,14 @@ process:
6. (post comment to each uploaded source)
```sh
-./scope_upload_classification.py --file sample.csv --group_ids 500 250 750 --classification variable flaring --taxonomy_map map.json --comment confident --start 35 --stop 50 --skip_phot --p_threshold 0.9 --write_obj_id --result_format csv --use_existing_obj_id --post_survey_id --replace_classifications
+scope-upload-classification --file sample.csv --group-ids 500 250 750 --classification variable flaring --taxonomy-map map.json --comment confident --start 35 --stop 50 --skip-phot --p-threshold 0.9 --write-obj-id --result-format csv --use-existing-obj-id --post-survey-id --replace-classifications
```
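As a further sketch (the file name and group id are placeholders), an automated-style upload that skips photometry posting but attaches light curves as comments, without upvoting or checking the labelled box:
```sh
scope-upload-classification --file merged_GCN_sources.parquet --group-ids 1544 --classification read --taxonomy-map fritz_mapper.json --use-existing-obj-id --skip-phot --p-threshold 0.7 --radius-arcsec 0.5 --post-phot-as-comment --post-phasefolded-phot
```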
-## Scope Manage Annotation
+## scope-manage-annotation
inputs:
1. --action : one of "post", "update", or "delete"
2. --source : ZTF ID or path to .csv file with multiple objects (ID column "obj_id")
-3. --target : group id(s) on Fritz
+3. --group-ids : target group id(s) on Fritz, space-separated
4. --origin : name of annotation
5. --key : name of annotation
6. --value : value of annotation (required for "post" and "update" - if source is a .csv file, value will auto-populate from `source[key]`)
@@ -494,10 +496,10 @@ process:
3. confirm changes with printed messages
```sh
-./scope_manage_annotation.py --action post --source sample.csv --group_ids 200 300 400 --origin revisedperiod --key period
+scope-manage-annotation --action post --source sample.csv --group-ids 200 300 400 --origin revisedperiod --key period
```
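A single-source sketch (the ZTF ID and value are placeholders) that updates an existing annotation with the same options:
```sh
scope-manage-annotation --action update --source ZTF18aaaaaaa --group-ids 200 --origin revisedperiod --key period --value 1.2345
```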
-## Scope Upload Disagreements
+## Scope Upload Disagreements (deprecated)
inputs:
1. dataset
2. group id on Fritz
diff --git a/gcn_cronjob.py b/gcn_cronjob.py
index 3c89cb69..29ed47f6 100755
--- a/gcn_cronjob.py
+++ b/gcn_cronjob.py
@@ -5,21 +5,20 @@
from datetime import datetime, timedelta
import argparse
import pathlib
-import yaml
from tools.scope_download_gcn_sources import download_gcn_sources
import os
-from scope.utils import read_parquet
+from scope.utils import read_parquet, parse_load_config
import numpy as np
import warnings
import json
+from scope.scope_class import Scope
+from tools.combine_preds import combine_preds
+from tools.scope_upload_classification import upload_classification
-BASE_DIR = pathlib.Path(__file__).parent.absolute()
NUM_PER_PAGE = 100
-
-config_path = BASE_DIR / "config.yaml"
-with open(config_path) as config_yaml:
- config = yaml.load(config_yaml, Loader=yaml.FullLoader)
+BASE_DIR = pathlib.Path.cwd()
+config = parse_load_config()
def query_gcn_events(
@@ -28,24 +27,23 @@ def query_gcn_events(
post_group_ids: list = [1544],
days_range: float = 7.0,
radius_arcsec: float = 0.5,
- save_filename: str = 'tools/fritzDownload/specific_ids_GCN_sources',
+ save_filename: str = 'fritzDownload/specific_ids_GCN_sources',
taxonomy_map: str = 'tools/fritz_mapper.json',
combined_preds_dirname: str = 'GCN_dnn_xgb',
dateobs: str = None,
p_threshold: float = 0.7,
username: str = 'bhealy',
- generated_features_dirname: str = 'generated_features_gcn_sources',
+ generated_features_dirname: str = 'generated_features_GCN_sources',
partition: str = 'gpu-debug',
doNotPost: bool = False,
agg_method: str = 'mean',
dnn_preds_directory: str = 'GCN_dnn',
xgb_preds_directory: str = 'GCN_xgb',
- path_to_python: str = '~/miniforge3/envs/scope-env/bin/python',
checkpoint_filename: str = 'gcn_sources_checkpoint.json',
checkpoint_refresh_days: float = 180.0,
ignore_checkpoint: bool = False,
):
-
+ scope = Scope()
currentDate = datetime.utcnow()
current_dt = currentDate.strftime("%Y-%m-%dT%H:%M:%S")
@@ -133,7 +131,6 @@ def query_gcn_events(
# EM+GW group on Fritz
post_group_ids.append(1544)
- post_group_ids_str = "".join([f"{x} " for x in post_group_ids]).strip()
print(f'Running for event {dateobs}...')
# Colons can confuse the file system; replace them for saving
@@ -141,8 +138,7 @@ def query_gcn_events(
# Check for existing sources file
filepath = (
- BASE_DIR
- / f'tools/fritzDownload/specific_ids_GCN_sources.{save_dateobs}.parquet'
+ BASE_DIR / f'fritzDownload/specific_ids_GCN_sources.{save_dateobs}.parquet'
)
if filepath.exists():
existing_sources = read_parquet(filepath)
@@ -251,34 +247,77 @@ def query_gcn_events(
print(
"Consolidating DNN and XGB classification results for Fritz..."
)
- os.system(
- f"{path_to_python} {BASE_DIR}/scope.py select_fritz_sample --fields='{save_dateobs}_specific_ids' --group='DR16' --algorithm='xgb' \
- --probability_threshold=0 --consol_filename='inference_results_{save_dateobs}' --al_directory='GCN' \
- --al_filename='GCN_sources_{save_dateobs}' --write_consolidation_results --select_top_n --doAllSources --write_csv"
- )
- os.system(
- f"{path_to_python} {BASE_DIR}/scope.py select_fritz_sample --fields='{save_dateobs}_specific_ids' --group='nobalance_DR16_DNN' --algorithm='dnn' \
- --probability_threshold=0 --consol_filename='inference_results_{save_dateobs}' --al_directory='GCN' \
- --al_filename='GCN_sources_{save_dateobs}' --write_consolidation_results --select_top_n --doAllSources --write_csv"
- )
+ try:
+ generator = scope.select_fritz_sample(
+ fields=[f"{save_dateobs}_specific_ids"],
+ group="DR16_importance",
+ algorithm="xgb",
+ probability_threshold=0.0,
+ consol_filename=f"inference_results_{save_dateobs}",
+ al_directory="GCN",
+ al_filename=f"GCN_sources_{save_dateobs}",
+ write_consolidation_results=True,
+ select_top_n=True,
+ doAllSources=True,
+ write_csv=True,
+ )
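+ # select_fritz_sample returns a generator; iterate it so the consolidation step runs to completion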
+ [x for x in generator]
+
+ generator = scope.select_fritz_sample(
+ fields=[f"{save_dateobs}_specific_ids"],
+ group="nobalance_DR16_DNN",
+ algorithm="dnn",
+ probability_threshold=0.0,
+ consol_filename=f"inference_results_{save_dateobs}",
+ al_directory="GCN",
+ al_filename=f"GCN_sources_{save_dateobs}",
+ write_consolidation_results=True,
+ select_top_n=True,
+ doAllSources=True,
+ write_csv=True,
+ )
+ [x for x in generator]
+
+ except Exception as e:
+ print(f"Exception raised during select_fritz_sample: {e}")
print("Combining DNN and XGB preds...")
- os.system(
- f"{path_to_python} {BASE_DIR}/tools/combine_preds.py --dateobs {save_dateobs} --combined_preds_dirname {combined_preds_dirname}/{save_dateobs} \
- --merge_dnn_xgb --write_csv --p_threshold {p_threshold} --agg_method {agg_method} --dnn_directory {dnn_preds_directory} \
- --xgb_directory {xgb_preds_directory}"
- )
+
+ try:
+ combine_preds(
+ dateobs=save_dateobs,
+ combined_preds_dirname=f"{combined_preds_dirname}/{save_dateobs}",
+ merge_dnn_xgb=True,
+ write_csv=True,
+ p_threshold=p_threshold,
+ agg_method=agg_method,
+ dnn_directory=dnn_preds_directory,
+ xgb_directory=xgb_preds_directory,
+ )
+ except Exception as e:
+ print(f"Exception raised during combine_preds: {e}")
if not doNotPost:
print(
f"Uploading classifications with p > {p_threshold}. Posting light curves as comments."
)
- os.system(
- f"{path_to_python} {BASE_DIR}/tools/scope_upload_classification.py --file {BASE_DIR}/{combined_preds_dirname}/{save_dateobs}/merged_GCN_sources_{save_dateobs}.parquet \
- --classification read --taxonomy_map {BASE_DIR}/{taxonomy_map} --skip_phot --use_existing_obj_id --group_ids {post_group_ids_str} --radius_arcsec {radius_arcsec} \
- --p_threshold {p_threshold} --post_phot_as_comment --post_phasefolded_phot"
- )
+
+ try:
+ upload_classification(
+ file=f"{BASE_DIR}/{combined_preds_dirname}/{save_dateobs}/merged_GCN_sources_{save_dateobs}.parquet",
+ classification="read",
+ taxonomy_map=f"{BASE_DIR}/{taxonomy_map}",
+ skip_phot=True,
+ use_existing_obj_id=True,
+ group_ids=post_group_ids,
+ radius_arcsec=radius_arcsec,
+ p_threshold=p_threshold,
+ post_phot_as_comment=True,
+ post_phasefolded_phot=True,
+ )
+ except Exception as e:
+ print(f"Exception raised during upload_classification: {e}")
print(f"Finished for {dateobs}.")
@@ -296,7 +335,7 @@ def query_gcn_events(
json.dump(checkpoint_dict, f)
-if __name__ == '__main__':
+def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -306,45 +345,45 @@ def query_gcn_events(
help="Number of days before today to query GCN events",
)
parser.add_argument(
- "--query_group_ids",
+ "--query-group-ids",
type=int,
nargs='+',
default=[],
help="group ids to query sources (all if not specified)",
)
parser.add_argument(
- "--post_group_ids",
+ "--post-group-ids",
type=int,
nargs='+',
default=[1544],
help="group ids to post source classifications (EM+GW group if not specified)",
)
parser.add_argument(
- "--days_range",
+ "--days-range",
type=float,
default=7.0,
help="max days past event to search for sources",
)
parser.add_argument(
- "--radius_arcsec",
+ "--radius-arcsec",
type=float,
default=0.5,
help="radius around new sources to search for existing ZTF sources",
)
parser.add_argument(
- "--save_filename",
+ "--save-filename",
type=str,
- default='tools/fritzDownload/specific_ids_GCN_sources',
+ default='fritzDownload/specific_ids_GCN_sources',
help="filename to save source ids/coordinates",
)
parser.add_argument(
- "--taxonomy_map",
+ "--taxonomy-map",
type=str,
default='tools/fritz_mapper.json',
help="path to taxonomy map for uploading classifications to Fritz",
)
parser.add_argument(
- "--combined_preds_dirname",
+ "--combined-preds-dirname",
type=str,
default='GCN_dnn_xgb',
help="dirname in which to save combined preds files",
@@ -356,7 +395,7 @@ def query_gcn_events(
help="If querying specific dateobs, specify here to override daysAgo.",
)
parser.add_argument(
- "--p_threshold",
+ "--p-threshold",
type=float,
default=0.7,
help="minimum classification probability to post to Fritz",
@@ -368,7 +407,7 @@ def query_gcn_events(
help="Username for compute resources (e.g. Expanse)",
)
parser.add_argument(
- "--generated_features_dirname",
+ "--generated-features-dirname",
type=str,
default='generated_features_GCN_sources',
help="dirname containing generated GCN source features",
@@ -385,48 +424,47 @@ def query_gcn_events(
help="If set, run analysis but do not post classifications. Useful for testing",
)
parser.add_argument(
- "--agg_method",
+ "--agg-method",
type=str,
default='mean',
help="Aggregation method for classification probabilities (mean or max)",
)
parser.add_argument(
- "--dnn_preds_directory",
+ "--dnn-preds-directory",
type=str,
default='GCN_dnn',
help="dirname in which dnn preds are saved",
)
parser.add_argument(
- "--xgb_preds_directory",
+ "--xgb-preds-directory",
type=str,
default='GCN_xgb',
help="dirname in which xgb preds preds are saved",
)
parser.add_argument(
- "--path_to_python",
- type=str,
- default='~/miniforge3/envs/scope-env/bin/python',
- help="path to python within scope environment (run 'which python' while your scope environment is active to find)",
- )
- parser.add_argument(
- "--checkpoint_filename",
+ "--checkpoint-filename",
type=str,
default='gcn_sources_checkpoint.json',
help="filename containing source ids already classified",
)
parser.add_argument(
- "--checkpoint_refresh_days",
+ "--checkpoint-refresh-days",
type=float,
default=180.0,
help="days after checkpoint start_date to delete json file and re-generate",
)
parser.add_argument(
- "--ignore_checkpoint",
+ "--ignore-checkpoint",
action='store_true',
help="If set, ignore current classified sources listed in checkpoint file (bool)",
)
- args = parser.parse_args()
+ return parser
+
+
+if __name__ == "__main__":
+ parser = get_parser()
+ args, _ = parser.parse_known_args()
query_gcn_events(
daysAgo=args.daysAgo,
@@ -446,7 +484,6 @@ def query_gcn_events(
agg_method=args.agg_method,
dnn_preds_directory=args.dnn_preds_directory,
xgb_preds_directory=args.xgb_preds_directory,
- path_to_python=args.path_to_python,
checkpoint_filename=args.checkpoint_filename,
checkpoint_refresh_days=args.checkpoint_refresh_days,
ignore_checkpoint=args.ignore_checkpoint,
diff --git a/tools/kowalski_query_examples.ipynb b/kowalski_query_examples.ipynb
similarity index 100%
rename from tools/kowalski_query_examples.ipynb
rename to kowalski_query_examples.ipynb
diff --git a/pyproject.toml b/pyproject.toml
index 0324e470..ff94686e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,90 @@
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
[tool.black]
-target-version = ['py37', 'py38']
+target-version = ['py39', 'py310', 'py311']
skip-string-normalization = true
+
+[tool.poetry]
+name = "scope-ml"
+version = "0.9.0"
+description = "SCoPe: ZTF Source Classification Project"
+readme = "README.md"
+authors = ["Brian F. Healy, Michael W. Coughlin, Ashish A. Mahabal, Theophile J. du Laz, Andrew Drake, Matthew J. Graham, Lynne A. Hillenbrand, Jan van Roestel, Paula Szkody et al."]
+maintainers = ["Brian F. Healy "]
+license = "MIT"
+repository = "https://github.com/ZwickyTransientFacility/scope"
+documentation = "https://zwickytransientfacility.github.io/scope-docs/"
+packages = [
+ {include = "scope"},
+ {include = "tools"},
+]
+exclude = ["**/*.parquet", "**/*.csv", "**/*.ipynb", "**/*.json", "**/*.h5", "**/*.yaml", "**/*.html", "**/*.txt"]
+include = ["config.defaults.yaml", "tools/golden_dataset_mapper.json", "tools/fritz_mapper.json", "tools/DNN_AL_mapper.json", "tools/XGB_AL_mapper.json", "tools/local_scope_ztfid.csv", "tools/local_scope_radec.csv"]
+
+[tool.poetry.dependencies]
+python = "^3.9, <3.12"
+deepdiff = ">=5.0"
+gsutil = ">=4.60"
+keras-tuner = ">=1.0.2"
+matplotlib = ">=3.3"
+questionary = ">=1.8.1"
+scikit-learn = ">=0.24.1"
+tensorflow = ">=2.14.0,<=2.15.0"
+wandb = ">=0.12.1"
+h5py = ">=3.10.0"
+astropy = ">=5.2.2"
+fast-histogram = ">=0.11"
+healpy = ">=1.16.2"
+jinja2 = "<=3.1"
+myst-parser = ">=0.18.1"
+pandas = ">=1.2"
+penquins = ">=2.3.1"
+pyyaml = ">=5.3.1"
+tdtax = ">=0.1.6"
+pyarrow = ">=9.0.0"
+numba = ">=0.56.4"
+numpy = ">=1.23,<1.24"
+cesium = ">=0.11.1"
+xgboost = ">=1.7.5"
+seaborn = ">=0.12.2"
+pydot = ">=1.4.2"
+notebook = ">=7.0.6"
+tables = ">=3.7,<3.9.2"
+
+[tool.poetry.dev-dependencies]
+pre-commit = ">=3.5.0"
+pytest = ">=6.1.2"
+sphinx = ">=4.2"
+sphinx-press-theme = ">=0.8.0"
+poetry = ">=1.7.1"
+
+[tool.poetry.scripts]
+scope-initialize = "scope.__init__:initialize"
+scope-develop = "scope._instantiate:develop"
+scope-lint = "scope.scope_class:Scope.lint"
+scope-doc = "scope._instantiate:doc"
+scope-train = "scope._instantiate:train"
+create-training-script = "scope._instantiate:create_training_script"
+assemble-training-stats = "scope._instantiate:assemble_training_stats"
+create-inference-script = "scope._instantiate:create_inference_script"
+select-fritz-sample = "scope._instantiate:select_fritz_sample"
+scope-test-limited = "scope._instantiate:test_limited"
+scope-test = "scope._instantiate:test"
+scope-download-classification = "tools.scope_download_classification:main"
+scope-upload-classification = "tools.scope_upload_classification:main"
+scope-manage-annotation = "tools.scope_manage_annotation:main"
+post-taxonomy = "tools.taxonomy:main"
+generate-features = "tools.generate_features:main"
+generate-features-slurm = "tools.generate_features_slurm:main"
+generate-features-job-submission = "tools.generate_features_job_submission:main"
+train-algorithm-slurm = "tools.train_algorithm_slurm:main"
+train-algorithm-job-submission = "tools.train_algorithm_job_submission:main"
+run-inference = "tools.inference:main"
+run-inference-slurm = "tools.run_inference_slurm:main"
+run-inference-job-submission = "tools.run_inference_job_submission:main"
+combine-preds = "tools.combine_preds:main"
+get-quad-ids = "tools.get_quad_ids:main"
+run-scope-local = "tools.run_scope_local:main"
+analyze-logs = "tools.analyze_logs:main"
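Assuming the published package name matches the `[tool.poetry]` metadata above, a minimal post-install sketch using these console scripts (the class tag is a placeholder; `scope-initialize` is defined in `scope/__init__.py` below):
```sh
pip install scope-ml
scope-initialize        # creates a 'scope' working directory and copies config/mapper files
cd scope
# add tokens to config.yaml, then run the entry points, e.g.:
scope-train --tag <class> --algorithm dnn
```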
diff --git a/requirements.txt b/requirements.txt
index 1c61c5a5..966080d2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,27 @@
--r .requirements/dev.txt
--r .requirements/doc.txt
+h5py>=3.10.0
+astropy>=5.2.2
+fast-histogram>=0.11
+healpy>=1.16.2
+Jinja2<=3.1
+myst-parser>=0.18.1
+pandas>=1.2
+penquins>=2.3.1
+pyyaml>=5.3.1
+tdtax>=0.1.6
+tables>=3.7,<3.9.2
+pyarrow>=9.0.0
+numba>=0.56.4
+numpy>=1.23,<1.24
+cesium>=0.11.1
+xgboost>=1.7.5
+seaborn>=0.12.2
+pydot>=1.4.2
+notebook>=7.0.6
+deepdiff>=5.0
+gsutil>=4.60
+keras-tuner>=1.0.2
+matplotlib>=3.3
+questionary>=1.8.1
+scikit-learn>=0.24.1
+tensorflow>=2.14.0,<=2.15.0
+wandb>=0.12.1
diff --git a/scope/__init__.py b/scope/__init__.py
index 0ffb151c..8956345c 100644
--- a/scope/__init__.py
+++ b/scope/__init__.py
@@ -1,13 +1,12 @@
-from .nn import *
-from .utils import *
-from .models import *
-from .fritz import *
+import shutil
+import os
+import site
# Below code adapted from https://github.com/skyportal/skyportal/blob/main/skyportal/__init__.py
# 2022-10-18
-__version__ = '0.5.dev0'
+__version__ = "0.9.0"
-if 'dev' in __version__:
+if "dev" in __version__:
# Append last commit date and hash to dev version information, if available
import subprocess
@@ -15,7 +14,7 @@
try:
p = subprocess.Popen(
- ['git', 'log', '-1', '--format="%h %aI"'],
+ ["git", "log", "-1", '--format="%h %aI"'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=os.path.dirname(__file__),
@@ -26,15 +25,67 @@
out, err = p.communicate()
if p.returncode == 0:
git_hash, git_date = (
- out.decode('utf-8')
+ out.decode("utf-8")
.strip()
- .replace('"', '')
- .split('T')[0]
- .replace('-', '')
+ .replace('"', "")
+ .split("T")[0]
+ .replace("-", "")
.split()
)
- __version__ = '+'.join(
- [tag for tag in __version__.split('+') if not tag.startswith('git')]
+ __version__ = "+".join(
+ [tag for tag in __version__.split("+") if not tag.startswith("git")]
)
- __version__ += f'+git{git_date}.{git_hash}'
+ __version__ += f"+git{git_date}.{git_hash}"
+
+
+def initialize():
+ """create directories, copy config and data files"""
+ main_dir = "scope"
+ scope_dirs = ["tools"]
+ os.makedirs(main_dir, exist_ok=True)
+ for directory in scope_dirs:
+ os.makedirs(f"{main_dir}/{directory}", exist_ok=True)
+
+ site_packages_path = site.getsitepackages()[0]
+ default_config_name = "config.defaults.yaml"
+ copied_config_name = "config.yaml"
+ tools_dir = "tools"
+ mappers = [
+ "golden_dataset_mapper.json",
+ "fritz_mapper.json",
+ "DNN_AL_mapper.json",
+ "XGB_AL_mapper.json",
+ "local_scope_ztfid.csv",
+ "local_scope_radec.csv",
+ ]
+
+ print()
+ # Copy config defaults to new directory structure if needed
+ if not os.path.exists(f"{main_dir}/{copied_config_name}"):
+ shutil.copy(
+ f"{site_packages_path}/{default_config_name}",
+ f"{main_dir}/{default_config_name}",
+ )
+ shutil.copy(
+ f"{site_packages_path}/{default_config_name}",
+ f"{main_dir}/{copied_config_name}",
+ )
+ print(
+ f"Created new '{copied_config_name}' config file. Please customize/add tokens there before running scope."
+ )
+ else:
+ print(
+ f"Warning: {copied_config_name} already exists in the '{main_dir}' directory."
+ )
+
+ print()
+ for mapper in mappers:
+ print(f"Copying default data '{mapper}' to '{main_dir}/{tools_dir}'")
+ shutil.copy(
+ f"{site_packages_path}/{tools_dir}/{mapper}",
+ f"{main_dir}/{tools_dir}/{mapper}",
+ )
+
+ print()
+ print(f"scope-ml initialized. Run scripts from '{main_dir}' directory.")
diff --git a/scope/_instantiate.py b/scope/_instantiate.py
new file mode 100644
index 00000000..ce0b4052
--- /dev/null
+++ b/scope/_instantiate.py
@@ -0,0 +1,40 @@
+# For use by pip-installed scope package
+from scope.scope_class import Scope
+
+scope = Scope()
+
+
+def develop():
+ scope.develop()
+
+
+def doc():
+ scope.doc()
+
+
+def train():
+ scope.parse_run_train()
+
+
+def create_training_script():
+ scope.parse_run_create_training_script()
+
+
+def assemble_training_stats():
+ scope.parse_run_assemble_training_stats()
+
+
+def create_inference_script():
+ scope.parse_run_create_inference_script()
+
+
+def select_fritz_sample():
+ scope.parse_run_select_fritz_sample()
+
+
+def test_limited():
+ scope.test_limited()
+
+
+def test():
+ scope.parse_run_test()
diff --git a/scope/fritz.py b/scope/fritz.py
index bd6252a1..0df37fea 100755
--- a/scope/fritz.py
+++ b/scope/fritz.py
@@ -1,19 +1,17 @@
import urllib
import requests
-import pathlib
-import yaml
import time
from typing import Optional, Mapping
import numpy as np
import pandas as pd
from requests.exceptions import InvalidJSONError, JSONDecodeError
from urllib3.exceptions import ProtocolError
-
+from scope.utils import parse_load_config
# define the baseurl and set the fritz token to connect
-config_path = pathlib.Path(__file__).parent.parent.absolute() / "config.yaml"
-with open(config_path) as config_yaml:
- config = yaml.load(config_yaml, Loader=yaml.FullLoader)
+
+config = parse_load_config()
+
BASE_URL = f"{config['fritz']['protocol']}://{config['fritz']['host']}/"
MAX_ATTEMPTS = config['fritz']['max_attempts']
SLEEP_TIME = config['fritz']['sleep_time']
diff --git a/scope/nn.py b/scope/nn.py
index c7193077..b23b7fe2 100644
--- a/scope/nn.py
+++ b/scope/nn.py
@@ -11,7 +11,7 @@
auc,
precision_recall_curve,
)
-from scope.utils import make_confusion_matrix, plot_roc, plot_pr
+from .utils import make_confusion_matrix, plot_roc, plot_pr
import numpy as np
import wandb
import json
@@ -269,43 +269,43 @@ def build_model(
# fixme: for now, simply use Keras' Functional API
if (not dense_branch) and (not conv_branch):
- raise ValueError('model must have at least one branch')
+ raise ValueError("model must have at least one branch")
features_input = tf.keras.Input(
- shape=kwargs.get("features_input_shape", (40,)), name='features'
+ shape=kwargs.get("features_input_shape", (40,)), name="features"
)
dmdt_input = tf.keras.Input(
- shape=kwargs.get("dmdt_input_shape", (26, 26, 1)), name='dmdt'
+ shape=kwargs.get("dmdt_input_shape", (26, 26, 1)), name="dmdt"
)
# dense branch to digest features
if dense_branch:
- x_dense = tf.keras.layers.Dense(256, activation='relu', name='dense_fc_1')(
+ x_dense = tf.keras.layers.Dense(256, activation="relu", name="dense_fc_1")(
features_input
)
x_dense = tf.keras.layers.Dropout(0.25)(x_dense)
- x_dense = tf.keras.layers.Dense(32, activation='relu', name='dense_fc_2')(
+ x_dense = tf.keras.layers.Dense(32, activation="relu", name="dense_fc_2")(
x_dense
)
# CNN branch to digest dmdt
if conv_branch:
x_conv = tf.keras.layers.SeparableConv2D(
- 16, (3, 3), activation='relu', name='conv_conv_1'
+ 16, (3, 3), activation="relu", name="conv_conv_1"
)(dmdt_input)
# x_conv = tf.keras.layers.Dropout(0.25)(x_conv)
x_conv = tf.keras.layers.SeparableConv2D(
- 16, (3, 3), activation='relu', name='conv_conv_2'
+ 16, (3, 3), activation="relu", name="conv_conv_2"
)(x_conv)
x_conv = tf.keras.layers.Dropout(0.25)(x_conv)
x_conv = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x_conv)
x_conv = tf.keras.layers.SeparableConv2D(
- 32, (3, 3), activation='relu', name='conv_conv_3'
+ 32, (3, 3), activation="relu", name="conv_conv_3"
)(x_conv)
# x_conv = tf.keras.layers.Dropout(0.25)(x_conv)
x_conv = tf.keras.layers.SeparableConv2D(
- 32, (3, 3), activation='relu', name='conv_conv_4'
+ 32, (3, 3), activation="relu", name="conv_conv_4"
)(x_conv)
x_conv = tf.keras.layers.Dropout(0.25)(x_conv)
# x_conv = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x_conv)
@@ -322,10 +322,10 @@ def build_model(
x = tf.keras.layers.Dropout(0.4)(x)
# one more dense layer?
- x = tf.keras.layers.Dense(16, activation='relu', name='fc_1')(x)
+ x = tf.keras.layers.Dense(16, activation="relu", name="fc_1")(x)
# Logistic regression to output the final score
- x = tf.keras.layers.Dense(1, activation='sigmoid', name='score')(x)
+ x = tf.keras.layers.Dense(1, activation="sigmoid", name="score")(x)
m = tf.keras.Model(inputs=[features_input, dmdt_input], outputs=x)
@@ -385,13 +385,13 @@ def assign_datasets(
val_dataset,
wandb_token,
):
- self.meta['features_input_shape'] = features_input_shape
- self.meta['train_dataset_repeat'] = train_dataset_repeat
- self.meta['val_dataset_repeat'] = val_dataset_repeat
- self.meta['steps_per_epoch_train'] = steps_per_epoch_train
- self.meta['steps_per_epoch_val'] = steps_per_epoch_val
- self.meta['train_dataset'] = train_dataset
- self.meta['val_dataset'] = val_dataset
+ self.meta["features_input_shape"] = features_input_shape
+ self.meta["train_dataset_repeat"] = train_dataset_repeat
+ self.meta["val_dataset_repeat"] = val_dataset_repeat
+ self.meta["steps_per_epoch_train"] = steps_per_epoch_train
+ self.meta["steps_per_epoch_val"] = steps_per_epoch_val
+ self.meta["train_dataset"] = train_dataset
+ self.meta["val_dataset"] = val_dataset
wandb.login(key=wandb_token)
@@ -399,7 +399,7 @@ def sweep(
self,
):
wandb.init(
- job_type='sweep',
+ job_type="sweep",
)
wandb_epochs = wandb.config.epochs
@@ -420,7 +420,7 @@ def sweep(
wandb_decay = wandb.config.decay
self.setup(
- features_input_shape=self.meta['features_input_shape'],
+ features_input_shape=self.meta["features_input_shape"],
dense_branch=wandb_dense_branch,
conv_branch=wandb_conv_branch,
dmdt_input_shape=(26, 26, 1),
@@ -440,36 +440,36 @@ def sweep(
)
self.train(
- train_dataset=self.meta['train_dataset_repeat'],
- val_dataset=self.meta['val_dataset_repeat'],
- steps_per_epoch_train=self.meta['steps_per_epoch_train'],
- steps_per_epoch_val=self.meta['steps_per_epoch_val'],
+ train_dataset=self.meta["train_dataset_repeat"],
+ val_dataset=self.meta["val_dataset_repeat"],
+ steps_per_epoch_train=self.meta["steps_per_epoch_train"],
+ steps_per_epoch_val=self.meta["steps_per_epoch_val"],
epochs=wandb_epochs,
)
- stats_train = self.evaluate(self.meta['train_dataset'], name='train', verbose=0)
- stats_val = self.evaluate(self.meta['val_dataset'], name='val', verbose=0)
+ stats_train = self.evaluate(self.meta["train_dataset"], name="train", verbose=0)
+ stats_val = self.evaluate(self.meta["val_dataset"], name="val", verbose=0)
wandb.log(
{
- 'dense_branch': wandb_dense_branch,
- 'conv_branch': wandb_conv_branch,
- 'loss': wandb_loss,
- 'optimizer': wandb_optimizer,
- 'lr': wandb_lr,
- 'momentum': wandb_momentum,
- 'monitor': wandb_monitor,
- 'patience': wandb_patience,
- 'callbacks': wandb_callbacks,
- 'run_eagerly': wandb_run_eagerly,
- 'beta_1': wandb_beta_1,
- 'beta_2': wandb_beta_2,
- 'epsilon': wandb_epsilon,
- 'amsgrad': wandb_amsgrad,
- 'decay': wandb_decay,
- 'epochs': wandb_epochs,
- 'train_loss': stats_train[0],
- 'val_loss': stats_val[0],
+ "dense_branch": wandb_dense_branch,
+ "conv_branch": wandb_conv_branch,
+ "loss": wandb_loss,
+ "optimizer": wandb_optimizer,
+ "lr": wandb_lr,
+ "momentum": wandb_momentum,
+ "monitor": wandb_monitor,
+ "patience": wandb_patience,
+ "callbacks": wandb_callbacks,
+ "run_eagerly": wandb_run_eagerly,
+ "beta_1": wandb_beta_1,
+ "beta_2": wandb_beta_2,
+ "epsilon": wandb_epsilon,
+ "amsgrad": wandb_amsgrad,
+ "decay": wandb_decay,
+ "epochs": wandb_epochs,
+ "train_loss": stats_train[0],
+ "val_loss": stats_val[0],
}
)
@@ -499,14 +499,14 @@ def train(
verbose=verbose,
)
- def evaluate(self, eval_dataset, name='test', **kwargs):
+ def evaluate(self, eval_dataset, name="test", **kwargs):
y_eval = np.concatenate([y for _, y in eval_dataset], axis=0)
y_pred = np.around(self.predict(eval_dataset, name=f"_{name}", **kwargs))
- self.meta[f'y_{name}'] = y_eval
+ self.meta[f"y_{name}"] = y_eval
# Generate confusion matrix
- self.meta[f'cm_{name}'] = confusion_matrix(y_eval, y_pred, normalize='all')
+ self.meta[f"cm_{name}"] = confusion_matrix(y_eval, y_pred, normalize="all")
return self.model.evaluate(eval_dataset, **kwargs)
@@ -514,9 +514,9 @@ def predict(self, X, name=None, **kwargs):
y_pred = self.model.predict(X)
if name is not None:
- self.meta[f'y_pred{name}'] = y_pred
+ self.meta[f"y_pred{name}"] = y_pred
else:
- self.meta['y_pred'] = y_pred
+ self.meta["y_pred"] = y_pred
return y_pred
@@ -534,7 +534,7 @@ def save(
output_path: str = "./",
output_format: str = "h5",
plot: bool = False,
- names: list = ['train', 'val', 'test'],
+ names: list = ["train", "val", "test"],
cm_include_count: bool = False,
cm_include_percent: bool = True,
annotate_scores: bool = False,
@@ -549,8 +549,8 @@ def save(
output_path.mkdir(parents=True, exist_ok=True)
output_name = self.name if not tag else tag
- if not output_name.endswith('.h5'):
- output_name += '.h5'
+ if not output_name.endswith(".h5"):
+ output_name += ".h5"
self.model.save(output_path / output_name, save_format=output_format)
stats_dct = {}
@@ -561,48 +561,48 @@ def save(
path = output_path / f"{tag}_plots" / name
if not path.exists():
path.mkdir(parents=True, exist_ok=True)
- cmpdf = tag + '_cm.pdf'
- recallpdf = tag + '_recall.pdf'
- rocpdf = tag + '_roc.pdf'
- stats_json = tag + '_stats.json'
+ cmpdf = tag + "_cm.pdf"
+ recallpdf = tag + "_recall.pdf"
+ rocpdf = tag + "_roc.pdf"
+ stats_json = tag + "_stats.json"
- if self.meta[f'cm_{name}'] is not None:
- cname = tag.split('.')[0]
+ if self.meta[f"cm_{name}"] is not None:
+ cname = tag.split(".")[0]
accuracy, precision, recall, f1_score = make_confusion_matrix(
- self.meta[f'cm_{name}'],
+ self.meta[f"cm_{name}"],
figsize=(8, 6),
cbar=False,
count=cm_include_count,
percent=cm_include_percent,
- categories=['not ' + cname, cname],
+ categories=["not " + cname, cname],
annotate_scores=annotate_scores,
)
- stats_dct['accuracy'] = accuracy
- stats_dct['precision'] = precision
- stats_dct['recall'] = recall
- stats_dct['f1_score'] = f1_score
- sns.set_context('talk')
+ stats_dct["accuracy"] = accuracy
+ stats_dct["precision"] = precision
+ stats_dct["recall"] = recall
+ stats_dct["f1_score"] = f1_score
+ sns.set_context("talk")
plt.title(cname)
- plt.savefig(path / cmpdf, bbox_inches='tight')
+ plt.savefig(path / cmpdf, bbox_inches="tight")
plt.close()
- y_compare = self.meta.get(f'y_{name}', None)
- y_pred = self.meta.get(f'y_pred_{name}', None)
+ y_compare = self.meta.get(f"y_{name}", None)
+ y_pred = self.meta.get(f"y_pred_{name}", None)
if (y_compare is not None) & (y_pred is not None):
fpr, tpr, _ = roc_curve(y_compare, y_pred)
roc_auc = auc(fpr, tpr)
precision, recall, _ = precision_recall_curve(y_compare, y_pred)
- stats_dct['roc_auc'] = roc_auc
+ stats_dct["roc_auc"] = roc_auc
plot_roc(fpr, tpr, roc_auc)
- plt.savefig(path / rocpdf, bbox_inches='tight')
+ plt.savefig(path / rocpdf, bbox_inches="tight")
plt.close()
plot_pr(recall, precision)
- plt.savefig(path / recallpdf, bbox_inches='tight')
+ plt.savefig(path / recallpdf, bbox_inches="tight")
plt.close()
- with open(path / stats_json, 'w') as f:
+ with open(path / stats_json, "w") as f:
json.dump(stats_dct, f)
diff --git a/scope.py b/scope/scope_class.py
similarity index 55%
rename from scope.py
rename to scope/scope_class.py
index 840e0a7b..b9ac4fa6 100755
--- a/scope.py
+++ b/scope/scope_class.py
@@ -1,30 +1,26 @@
#!/usr/bin/env python
from contextlib import contextmanager
import datetime
-from deepdiff import DeepDiff
-import fire
import numpy as np
import os
import pandas as pd
import pathlib
from penquins import Kowalski
-from pprint import pprint
-import questionary
import subprocess
import sys
import tdtax
from typing import Optional, Sequence, Union
-import yaml
-from scope.utils import (
+from .utils import (
forgiving_true,
- load_config,
read_hdf,
read_parquet,
write_parquet,
+ parse_load_config,
)
-from scope.fritz import radec_to_iau_name
+from .fritz import radec_to_iau_name
import json
import shutil
+import argparse
@contextmanager
@@ -46,56 +42,15 @@ def status(message):
print(f"\r[✓] {message}")
-def check_configs(config_wildcards: Sequence = ("config.*yaml",)):
- """
- - Check if config files exist
- - Offer to use the config files that match the wildcards
- - For config.yaml, check its contents against the defaults to make sure nothing is missing/wrong
-
- :param config_wildcards:
- :return:
- """
- path = pathlib.Path(__file__).parent.absolute()
-
- for config_wildcard in config_wildcards:
- config = config_wildcard.replace("*", "")
- # use config defaults if configs do not exist?
- if not (path / config).exists():
- answer = questionary.select(
- f"{config} does not exist, do you want to use one of the following"
- " (not recommended without inspection)?",
- choices=[p.name for p in path.glob(config_wildcard)],
- ).ask()
- subprocess.run(["cp", f"{path / answer}", f"{path / config}"])
-
- # check contents of config.yaml WRT config.defaults.yaml
- if config == "config.yaml":
- with open(path / config.replace(".yaml", ".defaults.yaml")) as config_yaml:
- config_defaults = yaml.load(config_yaml, Loader=yaml.FullLoader)
- with open(path / config) as config_yaml:
- config_wildcard = yaml.load(config_yaml, Loader=yaml.FullLoader)
- deep_diff = DeepDiff(config_defaults, config_wildcard, ignore_order=True)
- difference = {
- k: v for k, v in deep_diff.items() if k in ("dictionary_item_removed",)
- }
- if len(difference) > 0:
- print("config.yaml structure differs from config.defaults.yaml")
- pprint(difference)
- raise KeyError("Fix config.yaml before proceeding")
-
-
class Scope:
def __init__(self):
- # check configuration
- with status("Checking configuration"):
- check_configs(config_wildcards=["config.*yaml"])
-
- self.base_path = pathlib.Path(__file__).parent.absolute()
-
- self.config = load_config(self.base_path / "config.yaml")
+ # load configuration
+ with status("Loading configuration"):
+ self.base_path = pathlib.Path.cwd()
+ self.config = parse_load_config()
self.default_path_dataset = (
- self.base_path / self.config['training']['dataset']
+ self.base_path / self.config["training"]["dataset"]
)
# use tokens specified as env vars (if exist)
@@ -113,15 +68,15 @@ def __init__(self):
hosts = [
x
- for x in self.config['kowalski']['hosts']
- if self.config['kowalski']['hosts'][x]['token'] is not None
+ for x in self.config["kowalski"]["hosts"]
+ if self.config["kowalski"]["hosts"][x]["token"] is not None
]
instances = {
host: {
- 'protocol': self.config['kowalski']['protocol'],
- 'port': self.config['kowalski']['port'],
- 'host': f'{host}.caltech.edu',
- 'token': self.config['kowalski']['hosts'][host]['token'],
+ "protocol": self.config["kowalski"]["protocol"],
+ "port": self.config["kowalski"]["port"],
+ "host": f"{host}.caltech.edu",
+ "token": self.config["kowalski"]["hosts"][host]["token"],
}
for host in hosts
}
@@ -158,8 +113,8 @@ def _get_features(
if catalog is None:
catalog = self.config["kowalski"]["collections"]["features"]
- period_colname = 'period'
- if not ((period_suffix is None) | (period_suffix == 'None')):
+ period_colname = "period"
+ if not ((period_suffix is None) | (period_suffix == "None")):
period_colname = f"{period_colname}_{period_suffix}"
features_dct = {}
@@ -186,7 +141,7 @@ def _get_features(
if len(responses[name]) > 0:
response = responses[name]
if response.get("status", "error") == "success":
- features_response = response.get('data').get(catalog)
+ features_response = response.get("data").get(catalog)
features_dct.update(features_response)
features_nearest = [v[0] for k, v in features_response.items() if len(v) > 0]
df = pd.DataFrame.from_records(features_nearest)
@@ -246,7 +201,7 @@ def _get_nearest_gaia(
if len(responses[name]) > 0:
response = responses[name]
if response.get("status", "error") == "success":
- gaia_response = response.get('data').get(catalog)
+ gaia_response = response.get("data").get(catalog)
gaia_dct.update(gaia_response)
gaia_nearest = [v[0] for k, v in gaia_dct.items() if len(v) > 0]
df = pd.DataFrame.from_records(gaia_nearest)
@@ -320,7 +275,7 @@ def _get_light_curve_data(
if len(responses[name]) > 0:
response = responses[name]
if response.get("status", "error") == "success":
- lcs = response.get('data').get(catalog).get('target')
+ lcs = response.get("data").get(catalog).get("target")
light_curves_raw += lcs
light_curves = []
@@ -368,7 +323,7 @@ def lint(cls):
def doc(self):
"""Build docs"""
- from scope.utils import (
+ from .utils import (
make_tdtax_taxonomy,
plot_gaia_density,
plot_gaia_hr,
@@ -376,11 +331,11 @@ def doc(self):
plot_periods,
)
- period_suffix_config = self.config['features']['info']['period_suffix']
+ period_suffix_config = self.config["features"]["info"]["period_suffix"]
# generate taxonomy.html
with status("Generating taxonomy visualization"):
- path_static = pathlib.Path(__file__).parent.absolute() / "doc" / "_static"
+ path_static = self.base_path / "doc" / "_static"
if not path_static.exists():
path_static.mkdir(parents=True, exist_ok=True)
tdtax.write_viz(
@@ -410,10 +365,10 @@ def doc(self):
# example periods
with status("Generating example period histograms"):
- path_doc_data = pathlib.Path(__file__).parent.absolute() / "doc" / "data"
+ path_doc_data = self.base_path / "doc" / "data"
# stored as ra/decs in csv format under /data/golden
- golden_sets = pathlib.Path(__file__).parent.absolute() / "data" / "golden"
+ golden_sets = self.base_path / "data" / "golden"
for golden_set in golden_sets.glob("*.csv"):
golden_set_name = golden_set.stem
positions = pd.read_csv(golden_set).to_numpy().tolist()
@@ -438,15 +393,11 @@ def doc(self):
# example skymaps for all Golden sets
with status("Generating skymaps diagrams for Golden sets"):
- path_doc_data = pathlib.Path(__file__).parent.absolute() / "doc" / "data"
+ path_doc_data = self.base_path / "doc" / "data"
- path_gaia_density = (
- pathlib.Path(__file__).parent.absolute()
- / "data"
- / "Gaia_hp8_densitymap.fits"
- )
+ path_gaia_density = self.base_path / "data" / "Gaia_hp8_densitymap.fits"
# stored as ra/decs in csv format under /data/golden
- golden_sets = pathlib.Path(__file__).parent.absolute() / "data" / "golden"
+ golden_sets = self.base_path / "data" / "golden"
for golden_set in golden_sets.glob("*.csv"):
golden_set_name = golden_set.stem
positions = pd.read_csv(golden_set).to_numpy().tolist()
@@ -459,7 +410,7 @@ def doc(self):
# example light curves
with status("Generating example light curves"):
- path_doc_data = pathlib.Path(__file__).parent.absolute() / "doc" / "data"
+ path_doc_data = self.base_path / "doc" / "data"
for sample_object_name, sample_object in self.config["docs"][
"field_guide"
@@ -479,13 +430,10 @@ def doc(self):
# example HR diagrams for all Golden sets
with status("Generating HR diagrams for Golden sets"):
path_gaia_hr_histogram = (
- pathlib.Path(__file__).parent.absolute()
- / "doc"
- / "data"
- / "gaia_hr_histogram.dat"
+ self.base_path / "doc" / "data" / "gaia_hr_histogram.dat"
)
# stored as ra/decs in csv format under /data/golden
- golden_sets = pathlib.Path(__file__).parent.absolute() / "data" / "golden"
+ golden_sets = self.base_path / "data" / "golden"
for golden_set in golden_sets.glob("*.csv"):
golden_set_name = golden_set.stem
positions = pd.read_csv(golden_set).to_numpy().tolist()
@@ -503,11 +451,11 @@ def doc(self):
@staticmethod
def fetch_models(gcs_path: str = "gs://ztf-scope/models"):
"""
- Fetch SCoPe models from GCP
+ (deprecated) Fetch SCoPe models from GCP
:return:
"""
- path_models = pathlib.Path(__file__).parent / "models"
+ path_models = pathlib.Path.cwd() / "models"
if not path_models.exists():
path_models.mkdir(parents=True, exist_ok=True)
@@ -527,11 +475,11 @@ def fetch_models(gcs_path: str = "gs://ztf-scope/models"):
@staticmethod
def fetch_datasets(gcs_path: str = "gs://ztf-scope/datasets"):
"""
- Fetch SCoPe datasets from GCP
+ (deprecated) Fetch SCoPe datasets from GCP
:return:
"""
- path_datasets = pathlib.Path(__file__).parent / "data" / "training"
+ path_datasets = pathlib.Path.cwd() / "data" / "training"
if not path_datasets.exists():
path_datasets.mkdir(parents=True, exist_ok=True)
@@ -548,26 +496,310 @@ def fetch_datasets(gcs_path: str = "gs://ztf-scope/datasets"):
if p.returncode != 0:
raise RuntimeError("Failed to fetch SCoPe datasets")
+ def parse_run_train(self):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--tag",
+ type=str,
+ help="classifier designation, refers to 'class' in config.taxonomy",
+ )
+ parser.add_argument(
+ "--path-dataset",
+ type=str,
+ help="local path to .parquet, .h5 or .csv file with the dataset",
+ )
+ parser.add_argument(
+ "--algorithm",
+ type=str,
+ default="dnn",
+ help="name of ML algorithm to use",
+ )
+ parser.add_argument(
+ "--gpu",
+ type=int,
+ help="GPU id to use, zero-based. check tf.config.list_physical_devices('GPU') for available devices",
+ )
+ parser.add_argument(
+ "--verbose",
+ action="store_true",
+ help="if set, print verbose output",
+ )
+ parser.add_argument(
+ "--job-type",
+ type=str,
+ default="train",
+ help="name of job type for WandB",
+ )
+ parser.add_argument(
+ "--group",
+ type=str,
+ default="experiment",
+ help="name of group for WandB",
+ )
+ parser.add_argument(
+ "--run-sweeps",
+ action="store_true",
+ help="if set, run WandB sweeps instead of training",
+ )
+ parser.add_argument(
+ "--period-suffix",
+ type=str,
+ help="suffix of period/Fourier features to use for training",
+ )
+ parser.add_argument(
+ "--threshold",
+ type=float,
+ help="classification threshold separating positive from negative examples",
+ )
+ parser.add_argument(
+ "--balance",
+ type=float,
+ default=-1,
+ help="factor by which to weight majority vs. minority examples",
+ )
+ parser.add_argument(
+ "--weight-per-class",
+ action="store_true",
+ help="if set, weight training data based on fraction of positive/negative samples",
+ )
+ parser.add_argument(
+ "--scale-features",
+ type=str,
+ help="method by which to scale input features (min_max or median_std)",
+ )
+ parser.add_argument(
+ "--test-size",
+ type=float,
+ help="fractional size of test set, taken from initial learning set",
+ )
+ parser.add_argument(
+ "--val-size",
+ type=float,
+ help="fractional size of val set, taken from initial learning set less test set",
+ )
+ parser.add_argument(
+ "--random-state",
+ type=int,
+ help="random seed to set for reproducibility",
+ )
+ parser.add_argument(
+ "--feature-stats",
+ type=str,
+ help="feature stats to use to standardize features. If set to 'config', source feature stats from values in config file. Otherwise, compute them from data, taking balance into account",
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ help="batch size to use for training",
+ )
+ parser.add_argument(
+ "--shuffle-buffer-size",
+ type=int,
+ help="buffer size to use when shuffling training set",
+ )
+ parser.add_argument(
+ "--epochs",
+ type=int,
+ help="number of training epochs",
+ )
+ parser.add_argument(
+ "--float-convert-types",
+ type=int,
+ nargs=2,
+ help="convert floats from a to b bits (e.g. 64 32)",
+ )
+ parser.add_argument(
+ "--lr",
+ type=float,
+ help="dnn learning rate",
+ )
+ parser.add_argument(
+ "--beta-1",
+ type=float,
+ help="dnn beta_1",
+ )
+ parser.add_argument(
+ "--beta-2",
+ type=float,
+ help="dnn beta_2",
+ )
+ parser.add_argument(
+ "--epsilon",
+ type=float,
+ help="dnn epsilon",
+ )
+ parser.add_argument(
+ "--decay",
+ type=float,
+ help="dnn decay",
+ )
+ parser.add_argument(
+ "--momentum",
+ type=float,
+ help="dnn momentum",
+ )
+ parser.add_argument(
+ "--monitor",
+ type=str,
+ help="dnn monitor quantity",
+ )
+ parser.add_argument(
+ "--patience",
+ type=int,
+ help="dnn patience (in epochs)",
+ )
+ parser.add_argument(
+ "--callbacks",
+ type=str,
+ nargs="+",
+ help="dnn callbacks",
+ )
+ parser.add_argument(
+ "--run-eagerly",
+ action="store_true",
+ help="dnn run_eagerly",
+ )
+ parser.add_argument(
+ "--pre-trained-model",
+ type=str,
+ help="name of dnn pre-trained model to load, if any",
+ )
+ parser.add_argument(
+ "--save",
+ action="store_true",
+ help="if set, save trained model",
+ )
+ parser.add_argument(
+ "--plot",
+ action="store_true",
+ help="if set, generate/save diagnostic training plots",
+ )
+ parser.add_argument(
+ "--weights-only",
+ action="store_true",
+ help="if set and pre-trained model specified, load only weights",
+ )
+ parser.add_argument(
+ "--skip-cv",
+ action="store_true",
+ help="if set, skip XGB cross-validation",
+ )
+
+ args, _ = parser.parse_known_args()
+ self.train(**vars(args))
+
+ # args to add for ds.make (override config-specified values)
+ # threshold
+ # balance
+ # weight_per_class (test this to make sure it works as intended)
+ # scale_features
+ # test_size
+ # val_size
+ # random_state
+ # feature_stats
+ # batch_size
+ # shuffle_buffer_size
+ # epochs
+ # float_convert_types
+
+ # Args to add with descriptions (or references to tf docs)
+ # lr
+ # beta_1
+ # beta_2
+ # epsilon
+ # decay
+ # amsgrad
+ # momentum
+ # monitor
+ # patience
+ # callbacks
+ # run_eagerly
+ # pre_trained_model
+ # save
+ # plot
+ # weights_only
+
def train(
self,
tag: str,
path_dataset: Union[str, pathlib.Path] = None,
- algorithm: str = 'DNN',
+ algorithm: str = "dnn",
gpu: Optional[int] = None,
verbose: bool = False,
- job_type: str = 'train',
- group: str = 'experiment',
+ job_type: str = "train",
+ group: str = "experiment",
run_sweeps: bool = False,
+ period_suffix: str = None,
+ threshold: float = 0.7,
+ balance: Union[float, str] = -1,
+ weight_per_class=False,
+ scale_features: str = "min_max",
+ test_size: float = 0.1,
+ val_size: float = 0.1,
+ random_state: int = 42,
+ feature_stats: str = None,
+ batch_size: int = 64,
+ shuffle_buffer_size: int = 512,
+ epochs: int = 100,
+ float_convert_types: list = [64, 32],
+ lr: float = 3e-4,
+ beta_1: float = 0.9,
+ beta_2: float = 0.999,
+ epsilon: float = 1e-7,
+ decay: float = 0.0,
+ amsgrad: float = 3e-4,
+ momentum: float = 0.9,
+ monitor: str = "val_loss",
+ patience: int = 20,
+ callbacks: list = ["reduce_lr_on_plateau", "early_stopping"],
+ run_eagerly: bool = False,
+ pre_trained_model: str = None,
+ save: bool = False,
+ plot: bool = False,
+ weights_only: bool = False,
+ skip_cv: bool = False,
**kwargs,
):
"""Train classifier
- :param tag: classifier designation, refers to "class" in config.taxonomy
- :param path_dataset: local path to .parquet, .h5 or .csv file with the dataset
- :param algorithm: name of ML algorithm to use
- :param gpu: GPU id to use, zero-based. check tf.config.list_physical_devices('GPU') for available devices
- :param verbose:
- :param kwargs: refer to utils.DNN.setup and utils.Dataset.make
+ :param tag: classifier designation, refers to "class" in config.taxonomy (str)
+ :param path_dataset: local path to .parquet, .h5 or .csv file with the dataset (str)
+ :param algorithm: name of ML algorithm to use (str)
+ :param gpu: GPU id to use, zero-based. check tf.config.list_physical_devices('GPU') for available devices (int)
+ :param verbose: if set, print verbose output (bool)
+ :param job_type: name of job type for WandB (str)
+ :param group: name of group for WandB (str)
+ :param run_sweeps: if set, run WandB sweeps instead of training (bool)
+ :param period_suffix: suffix of period/Fourier features to use for training (str)
+ :param threshold: classification threshold separating positive from negative examples (float)
+ :param balance: factor by which to weight majority vs. minority examples (float or None)
+ :param weight_per_class: if set, weight training data based on fraction of positive/negative samples (bool)
+ :param scale_features: method by which to scale input features [min_max or median_std] (str)
+ :param test_size: fractional size of test set, taken from initial learning set (float)
+ :param val_size: fractional size of val set, taken from learning set less test set (float)
+ :param random_state: random seed to set for reproducibility (int)
+ :param feature_stats: feature stats to use to standardize features. If set to 'config', source feature stats from values in config file. Otherwise, compute them from data, taking balance into account (str)
+ :param batch_size: batch size to use for training (int)
+ :param shuffle_buffer_size: buffer size to use when shuffling training set (int)
+ :param epochs: number of training epochs (int)
+ :param float_convert_types: convert from a-bit to b-bit [e.g. 64 to 32] (list)
+ :param lr: dnn learning rate (float)
+ :param beta_1: dnn beta_1 (float)
+ :param beta_2: dnn beta_2 (float)
+ :param epsilon: dnn epsilon (float)
+ :param decay: dnn decay (float)
+ :param amsgrad: dnn amsgrad (float)
+ :param momentum: dnn momentum (float)
+ :param monitor: dnn monitor quantity (str)
+ :param patience: dnn patience [in epochs] (int)
+ :param callbacks: dnn callbacks (list)
+ :param run_eagerly: dnn run_eagerly (bool)
+ :param pre_trained_model: name of dnn pre-trained model to load, if any (str)
+ :param save: if set, save trained model (bool)
+ :param plot: if set, generate/save diagnostic training plots (bool)
+ :param weights_only: if set and pre-trained model specified, load only weights (bool)
+ :param skip_cv: if set, skip XGB cross-validation (bool)
+
:return:
"""
@@ -584,36 +816,36 @@ def train(
import wandb
from wandb.keras import WandbCallback
- from scope.nn import DNN
- from scope.xgb import XGB
- from scope.utils import Dataset
+ from .nn import DNN
+ from .xgb import XGB
+ from .utils import Dataset
if path_dataset is None:
path_dataset = self.default_path_dataset
- label_params = self.config["training"]["classes"][tag]
- train_config_xgb = self.config["training"]['xgboost']
+ config_params = self.config["training"]["classes"][tag]
+ train_config_dnn = self.config["training"]["dnn"]
+ train_config_xgb = self.config["training"]["xgboost"]
- period_suffix = kwargs.get(
- 'period_suffix', self.config['features']['info']['period_suffix']
- )
+ if period_suffix is None:
+ period_suffix = self.config["features"]["info"]["period_suffix"]
- if algorithm in ['DNN', 'NN', 'dnn', 'nn']:
- algorithm = 'dnn'
- elif algorithm in ['XGB', 'xgb', 'XGBoost', 'xgboost', 'XGBOOST']:
- algorithm = 'xgb'
+ if algorithm in ["DNN", "NN", "dnn", "nn"]:
+ algorithm = "dnn"
+ elif algorithm in ["XGB", "xgb", "XGBoost", "xgboost", "XGBOOST"]:
+ algorithm = "xgb"
else:
- raise ValueError('Current supported algorithms are DNN and XGB.')
+ raise ValueError("Current supported algorithms are DNN and XGB.")
- all_features = self.config["features"][label_params["features"]]
+ all_features = self.config["features"][config_params["features"]]
features = [
key for key in all_features if forgiving_true(all_features[key]["include"])
]
- if not ((period_suffix is None) | (period_suffix == 'None')):
- periodic_bool = [all_features[x]['periodic'] for x in features]
+ if not ((period_suffix is None) | (period_suffix == "None")):
+ periodic_bool = [all_features[x]["periodic"] for x in features]
for j, name in enumerate(features):
if periodic_bool[j]:
- features[j] = f'{name}_{period_suffix}'
+ features[j] = f"{name}_{period_suffix}"
ds = Dataset(
tag=tag,
@@ -621,32 +853,36 @@ def train(
features=features,
verbose=verbose,
algorithm=algorithm,
- **kwargs,
- )
-
- label = label_params["label"]
-
- # values from kwargs override those defined in config. if latter is absent, use reasonable default
- threshold = kwargs.get("threshold", label_params.get("threshold", 0.5))
- balance = kwargs.get("balance", label_params.get("balance", None))
- weight_per_class = kwargs.get(
- "weight_per_class", label_params.get("weight_per_class", False)
+ period_suffix=period_suffix,
)
- scale_features = kwargs.get("scale_features", "min_max")
- test_size = kwargs.get("test_size", label_params.get("test_size", 0.1))
- val_size = kwargs.get("val_size", label_params.get("val_size", 0.1))
- random_state = kwargs.get("random_state", label_params.get("random_state", 42))
- feature_stats = kwargs.get("feature_stats", None)
- if feature_stats == 'config':
+ label = config_params["label"]
+
+ # values from argparse args override those defined in config. if latter is absent, use reasonable default
+ if threshold is None:
+ threshold = config_params.get("threshold", 0.7)
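+ # balance == -1 marks 'not provided'; fall back to the config value (which may be None)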
+ if balance == -1:
+ balance = config_params.get("balance", None)
+ if not weight_per_class:
+ weight_per_class = config_params.get("weight_per_class", False)
+ if scale_features is None:
+ scale_features = config_params.get("scale_features", "min_max")
+ if test_size is None:
+ test_size = config_params.get("test_size", 0.1)
+ if val_size is None:
+ val_size = config_params.get("val_size", 0.1)
+ if random_state is None:
+ random_state = config_params.get("random_state", 42)
+ if feature_stats == "config":
feature_stats = self.config.get("feature_stats", None)
-
- batch_size = kwargs.get("batch_size", label_params.get("batch_size", 64))
- shuffle_buffer_size = kwargs.get(
- "shuffle_buffer_size", label_params.get("shuffle_buffer_size", 512)
- )
- epochs = kwargs.get("epochs", label_params.get("epochs", 100))
- float_convert_types = kwargs.get("float_convert_types", (64, 32))
+ if batch_size is None:
+ batch_size = config_params.get("batch_size", 64)
+ if shuffle_buffer_size is None:
+ shuffle_buffer_size = config_params.get("shuffle_buffer_size", 512)
+ if epochs is None:
+ epochs = config_params.get("epochs", 100)
+ if float_convert_types is None:
+ float_convert_types = config_params.get("float_convert_types", [64, 32])
datasets, indexes, steps_per_epoch, class_weight = ds.make(
target_label=label,
@@ -664,32 +900,53 @@ def train(
float_convert_types=float_convert_types,
)
- # Define default hyperparameters for model
- dense_branch = kwargs.get("dense_branch", True)
- conv_branch = kwargs.get("conv_branch", True)
- loss = kwargs.get("loss", "binary_crossentropy")
- optimizer = kwargs.get("optimizer", "adam")
- lr = float(kwargs.get("lr", 3e-4))
- beta_1 = kwargs.get("beta_1", 0.9)
- beta_2 = kwargs.get("beta_2", 0.999)
- epsilon = kwargs.get("epsilon", 1e-7) # None?
- decay = kwargs.get("decay", 0.0)
- amsgrad = kwargs.get("amsgrad", 3e-4)
- momentum = float(kwargs.get("momentum", 0.9))
- monitor = kwargs.get("monitor", "val_loss")
- patience = int(kwargs.get("patience", 20))
- callbacks = kwargs.get("callbacks", ("reduce_lr_on_plateau", "early_stopping"))
- run_eagerly = kwargs.get("run_eagerly", False)
- pre_trained_model = kwargs.get("pre_trained_model")
- save = kwargs.get("save", False)
- plot = kwargs.get("plot", False)
- weights_only = kwargs.get("weights_only", False)
- skip_cv = kwargs.get("skip_cv", False)
+ if lr is None:
+ lr = float(config_params.get("lr", 3e-4))
+ if beta_1 is None:
+ beta_1 = float(config_params.get("beta_1", 0.9))
+ if beta_2 is None:
+ beta_2 = float(config_params.get("beta_2", 0.999))
+ if epsilon is None:
+ epsilon = float(config_params.get("epsilon", 1e-7))
+ if decay is None:
+ decay = float(config_params.get("decay", 0.0))
+ if amsgrad is None:
+ amsgrad = forgiving_true(config_params.get("amsgrad", False))
+ if momentum is None:
+ momentum = float(config_params.get("momentum", 0.9))
+ if monitor is None:
+ monitor = config_params.get("monitor", "val_loss")
+ if patience is None:
+ patience = int(config_params.get("patience", 20))
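+ # Fall back to config-defined callbacks when none are passed; normalize to a tuple either way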
+ if callbacks is None:
+ callbacks = tuple(
+ config_params.get(
+ "callbacks", ["reduce_lr_on_plateau", "early_stopping"]
+ )
+ )
+ else:
+ callbacks = tuple(callbacks)
+ if not run_eagerly:
+ run_eagerly = config_params.get("run_eagerly", False)
+ if pre_trained_model is None:
+ pre_trained_model = config_params.get("pre_trained_model")
+ if not save:
+ save = config_params.get("save", False)
+ if not plot:
+ plot = config_params.get("plot", False)
+ if not weights_only:
+ weights_only = config_params.get("weights_only", False)
+
+ # Define default parameters for all DNN models
+ dense_branch = train_config_dnn.get("dense_branch", True)
+ conv_branch = train_config_dnn.get("conv_branch", True)
+ loss = train_config_dnn.get("loss", "binary_crossentropy")
+ optimizer = train_config_dnn.get("optimizer", "adam")
# xgb-specific arguments (descriptions adapted from https://xgboost.readthedocs.io/en/stable/parameter.html and https://xgboost.readthedocs.io/en/stable/python/python_api.html)
# max_depth: maximum depth of a tree
- max_depth_config = train_config_xgb['gridsearch_params_start_stop_step'].get(
- 'max_depth', [3, 8, 2]
+ max_depth_config = train_config_xgb["gridsearch_params_start_stop_step"].get(
+ "max_depth", [3, 8, 2]
)
max_depth_start = max_depth_config[0]
max_depth_stop = max_depth_config[1]
@@ -697,66 +954,68 @@ def train(
# min_child_weight: minimum sum of instance weight (hessian) needed in a child
min_child_weight_config = train_config_xgb[
- 'gridsearch_params_start_stop_step'
- ].get('min_child_weight', [1, 6, 2])
+ "gridsearch_params_start_stop_step"
+ ].get("min_child_weight", [1, 6, 2])
min_child_weight_start = min_child_weight_config[0]
min_child_weight_stop = min_child_weight_config[1]
min_child_weight_step = min_child_weight_config[2]
# eta = kwargs.get("xgb_eta", 0.1)
- eta_list = train_config_xgb['other_training_params'].get(
- 'eta_list', [0.3, 0.2, 0.1, 0.05]
+ eta_list = train_config_xgb["other_training_params"].get(
+ "eta_list", [0.3, 0.2, 0.1, 0.05]
)
# subsample: Subsample ratio of the training instances (setting to 0.5 means XGBoost would randomly sample half of the training data prior to growing trees)
- # subsample = kwargs.get("xgb_subsample", 0.7)
- subsample_config = train_config_xgb['gridsearch_params_start_stop_step'].get(
- 'subsample', [6, 11, 2]
+ subsample_config = train_config_xgb["gridsearch_params_start_stop_step"].get(
+ "subsample", [6, 11, 2]
)
subsample_start = subsample_config[0]
subsample_stop = subsample_config[1]
subsample_step = subsample_config[2]
# colsample_bytree: subsample ratio of columns when constructing each tree.
- # colsample_bytree = kwargs.get("xgb_colsample_bytree", 0.7)
colsample_bytree_config = train_config_xgb[
- 'gridsearch_params_start_stop_step'
- ].get('subsample', [6, 11, 2])
+ "gridsearch_params_start_stop_step"
+ ].get("colsample_bytree", [6, 11, 2])
colsample_bytree_start = colsample_bytree_config[0]
colsample_bytree_stop = colsample_bytree_config[1]
colsample_bytree_step = colsample_bytree_config[2]
# confusion matrix plotting parameters:
- cm_include_count = kwargs.get("cm_include_count", False)
- cm_include_percent = kwargs.get("cm_include_percent", True)
- annotate_scores = kwargs.get("annotate_scores", False)
+ cm_include_count = train_config_xgb["plot_params"].get(
+ "cm_include_count", False
+ )
+ cm_include_percent = train_config_xgb["plot_params"].get(
+ "cm_include_percent", True
+ )
+ annotate_scores = train_config_xgb["plot_params"].get("annotate_scores", False)
# seed: random seed
- seed = train_config_xgb['other_training_params'].get('seed', 42)
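+ # Reuse the train/val/test split random_state so cross-validation folds are reproducible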
+ seed = random_state
# nfold: number of folds during cross-validation
- nfold = train_config_xgb['other_training_params'].get('nfold', 5)
+ nfold = train_config_xgb["other_training_params"].get("nfold", 5)
# metrics: evaluation metrics to use during cross-validation
- metrics = train_config_xgb['other_training_params'].get('metrics', ['auc'])
+ metrics = train_config_xgb["other_training_params"].get("metrics", ["auc"])
# objective: name of learning objective
- objective = train_config_xgb['other_training_params'].get(
+ objective = train_config_xgb["other_training_params"].get(
"objective", "binary:logistic"
)
# eval_metric: Evaluation metrics for validation data
- eval_metric = train_config_xgb['other_training_params'].get(
+ eval_metric = train_config_xgb["other_training_params"].get(
"eval_metric", "auc"
)
# early_stopping_rounds: Validation metric needs to improve at least once in every early_stopping_rounds round(s) to continue training
- early_stopping_rounds = train_config_xgb['other_training_params'].get(
+ early_stopping_rounds = train_config_xgb["other_training_params"].get(
"early_stopping_rounds", 10
)
# num_boost_round: Number of boosting iterations
- num_boost_round = train_config_xgb['other_training_params'].get(
+ num_boost_round = train_config_xgb["other_training_params"].get(
"num_boost_round", 999
)
@@ -765,14 +1024,16 @@ def train(
conv_branch = forgiving_true(conv_branch)
run_eagerly = forgiving_true(run_eagerly)
save = forgiving_true(save)
+ plot = forgiving_true(plot)
+ cm_include_count = forgiving_true(cm_include_count)
+ cm_include_percent = forgiving_true(cm_include_percent)
+ annotate_scores = forgiving_true(annotate_scores)
time_tag = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")
- output_path = (
- pathlib.Path(__file__).parent.absolute() / f"models_{algorithm}" / group
- )
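+ # Save trained models under the scope base path: models_<algorithm>/<group>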
+ output_path = self.base_path / f"models_{algorithm}" / group
- if algorithm == 'dnn':
+ if algorithm == "dnn":
classifier = DNN(name=tag)
@@ -786,17 +1047,17 @@ def train(
steps_per_epoch_val=steps_per_epoch["val"],
train_dataset=datasets["train"],
val_dataset=datasets["val"],
- wandb_token=self.config['wandb']['token'],
+ wandb_token=self.config["wandb"]["token"],
)
wandb.login(key=self.config["wandb"]["token"])
# Define sweep config
- sweep_configuration = self.config['wandb']['sweep_config_dnn']
- sweep_configuration['name'] = f"{group}-{tag}-{time_tag}"
+ sweep_configuration = self.config["wandb"]["sweep_config_dnn"]
+ sweep_configuration["name"] = f"{group}-{tag}-{time_tag}"
- entity = self.config['wandb']['entity']
- project = self.config['wandb']['project']
+ entity = self.config["wandb"]["entity"]
+ project = self.config["wandb"]["project"]
# Set up sweep/id
sweep_id = wandb.sweep(
@@ -808,27 +1069,27 @@ def train(
wandb.agent(sweep_id, function=classifier.sweep)
print(
- 'Sweep complete. Adjust hyperparameters in config file and run scope.py train again without the --run_sweeps flag.'
+ "Sweep complete. Adjust hyperparameters in config file and run scope-train again without the --run-sweeps flag."
)
# Stop sweep job
try:
- print('Stopping sweep.')
+ print("Stopping sweep.")
os.system(
- f'python -m wandb sweep --stop {entity}/{project}/{sweep_id}'
+ f"python -m wandb sweep --stop {entity}/{project}/{sweep_id}"
)
except Exception:
- print('Sweep already stopped.')
+ print("Sweep already stopped.")
return
if pre_trained_model is not None:
classifier.load(pre_trained_model, weights_only=weights_only)
model_input = classifier.model.input
- training_set_inputs = datasets['train'].element_spec[0]
+ training_set_inputs = datasets["train"].element_spec[0]
# Compare input shapes with model inputs
print(
- 'Comparing shapes of input features with inputs for existing model...'
+ "Comparing shapes of input features with inputs for existing model..."
)
for inpt in model_input:
inpt_name = inpt.name
@@ -836,7 +1097,7 @@ def train(
inpt_shape.assert_is_compatible_with(
training_set_inputs[inpt_name].shape
)
- print('Input shapes are consistent.')
+ print("Input shapes are consistent.")
classifier.set_callbacks(callbacks, tag, **kwargs)
else:
@@ -913,17 +1174,17 @@ def train(
verbose=verbose,
)
- elif algorithm == 'xgb':
+ elif algorithm == "xgb":
# XGB-specific code
- X_train = ds.df_ds.loc[indexes['train']][features]
- y_train = ds.target[indexes['train']]
+ X_train = ds.df_ds.loc[indexes["train"]][features]
+ y_train = ds.target[indexes["train"]]
- X_val = ds.df_ds.loc[indexes['val']][features]
- y_val = ds.target[indexes['val']]
+ X_val = ds.df_ds.loc[indexes["val"]][features]
+ y_val = ds.target[indexes["val"]]
- X_test = ds.df_ds.loc[indexes['test']][features]
- y_test = ds.target[indexes['test']]
+ X_test = ds.df_ds.loc[indexes["test"]][features]
+ y_test = ds.target[indexes["test"]]
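+ # Class-weight ratio used to weight positive examples (XGBoost's scale_pos_weight)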
scale_pos_weight = class_weight[1] / class_weight[0]
@@ -968,27 +1229,27 @@ def train(
if verbose:
print("Evaluating on train/val/test sets:")
# TODO: there should not need to be this algorithm-based split in the call to classifier.evaluate()
- if algorithm == 'xgb':
- stats_train = classifier.evaluate(X_train, y_train, name='train')
- stats_val = classifier.evaluate(X_val, y_val, name='val')
- stats_test = classifier.evaluate(X_test, y_test, name='test')
+ if algorithm == "xgb":
+ stats_train = classifier.evaluate(X_train, y_train, name="train")
+ stats_val = classifier.evaluate(X_val, y_val, name="val")
+ stats_test = classifier.evaluate(X_test, y_test, name="test")
else:
stats_train = classifier.evaluate(
- datasets["train"], name='train', verbose=verbose
+ datasets["train"], name="train", verbose=verbose
)
stats_val = classifier.evaluate(
- datasets["val"], name='val', verbose=verbose
+ datasets["val"], name="val", verbose=verbose
)
stats_test = classifier.evaluate(
- datasets["test"], name='test', verbose=verbose
+ datasets["test"], name="test", verbose=verbose
)
- print('training stats: ', stats_train)
- print('validation stats: ', stats_val)
+ print("training stats: ", stats_train)
+ print("validation stats: ", stats_val)
if verbose:
- print('test stats: ', stats_test)
+ print("test stats: ", stats_test)
- if algorithm == 'DNN':
+ if algorithm == "DNN":
param_names = (
"loss",
"tp",
@@ -1043,16 +1304,66 @@ def train(
return time_tag
+ def parse_run_create_training_script(self):
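+ """Parse command-line arguments for create-training-script and run create_training_script."""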
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--filename",
+ type=str,
+ default="train_script.sh",
+ help="filename of shell script (must not currently exist)",
+ )
+ parser.add_argument(
+ "--algorithm",
+ type=str,
+ default="dnn",
+ help="name of algorithm to use for training",
+ )
+ parser.add_argument(
+ "--min-count",
+ type=int,
+ default=100,
+ help="minimum number of positive examples to include in script",
+ )
+ parser.add_argument(
+ "--path-dataset",
+ type=str,
+ help="local path to .parquet, .h5 or .csv file with the dataset, if not provided in config.yaml",
+ )
+ parser.add_argument(
+ "--pre-trained-group-name",
+ type=str,
+ help="name of group containing pre-trained models within models directory",
+ )
+ parser.add_argument(
+ "--add-keywords",
+ type=str,
+ default="",
+ help="str containing additional training keywords to append to each line in the script",
+ )
+ parser.add_argument(
+ "--train-all",
+ action="store_true",
+ help="if group_name is specified, set this keyword to train all classes regardless of whether a trained model exists",
+ )
+ parser.add_argument(
+ "--period-suffix",
+ type=str,
+ help="suffix of period/Fourier features to use for training",
+ )
+
+ args, _ = parser.parse_known_args()
+ self.create_training_script(**vars(args))
+
def create_training_script(
self,
- filename: str = 'train_script.sh',
- algorithm: str = 'dnn',
+ filename: str = "train_script.sh",
+ algorithm: str = "dnn",
min_count: int = 100,
path_dataset: str = None,
pre_trained_group_name: str = None,
- add_keywords: str = '',
+ add_keywords: str = "",
train_all: bool = False,
- **kwargs,
+ period_suffix: str = None,
):
"""
Create training shell script from classes in config file meeting minimum count requirement
@@ -1064,63 +1375,61 @@ def create_training_script(
:param pre_trained_group_name: name of group containing pre-trained models within models directory (str)
:param add_keywords: str containing additional training keywords to append to each line in the script
:param train_all: if group_name is specified, set this keyword to train all classes regardeless of whether a trained model exists (bool)
+ :param period_suffix: suffix of period/Fourier features to use for training (str)
:return:
- :examples: ./scope.py create_training_script --filename='train_dnn.sh' --algorithm='dnn' --min_count=1000 \
- --path_dataset='tools/fritzDownload/merged_classifications_features.parquet' --add_keywords='--save --plot --group=groupname'
+ :examples: create-training-script --filename train_dnn.sh --algorithm dnn --min-count 1000 \
+ --path-dataset tools/fritzDownload/merged_classifications_features.parquet --add-keywords '--save --plot --group groupname'
- ./scope.py create_training_script --filename='train_xgb.sh' --algorithm='xgb' --min_count=100 \
- --add_keywords='--save --plot --batch_size=32 --group=groupname'
+ create-training-script --filename train_xgb.sh --algorithm xgb --min-count 100 \
+ --add-keywords '--save --plot --batch-size 32 --group groupname'
"""
path = str(self.base_path / filename)
phenom_tags = []
ontol_tags = []
- period_suffix = kwargs.get(
- 'period_suffix', self.config['features']['info']['period_suffix']
- )
+ if period_suffix is None:
+ period_suffix = self.config["features"]["info"]["period_suffix"]
if path_dataset is None:
- dataset_name = self.config['training']['dataset']
+ dataset_name = self.config["training"]["dataset"]
path_dataset = str(self.base_path / dataset_name)
- if path_dataset.endswith('.parquet'):
+ if path_dataset.endswith(".parquet"):
dataset = read_parquet(path_dataset)
- elif path_dataset.endswith('.h5'):
+ elif path_dataset.endswith(".h5"):
dataset = read_hdf(path_dataset)
- elif path_dataset.endswith('.csv'):
+ elif path_dataset.endswith(".csv"):
dataset = pd.read_csv(path_dataset)
else:
raise ValueError(
- 'Dataset in config file must end with .parquet, .h5 or .csv'
+ "Dataset in config file must end with .parquet, .h5 or .csv"
)
- with open(path, 'x') as script:
+ with open(path, "x") as script:
- script.write('#!/bin/bash\n')
+ script.write("#!/bin/bash\n")
- for tag in self.config['training']['classes'].keys():
- label = self.config['training']['classes'][tag]['label']
- threshold = self.config['training']['classes'][tag]['threshold']
- branch = self.config['training']['classes'][tag]['features']
+ for tag in self.config["training"]["classes"].keys():
+ label = self.config["training"]["classes"][tag]["label"]
+ threshold = self.config["training"]["classes"][tag]["threshold"]
+ branch = self.config["training"]["classes"][tag]["features"]
num_pos = np.sum(dataset[label] > threshold)
if num_pos > min_count:
print(
- f'Label {label}: {num_pos} positive examples with P > {threshold}'
+ f"Label {label}: {num_pos} positive examples with P > {threshold}"
)
- if branch == 'phenomenological':
+ if branch == "phenomenological":
phenom_tags += [tag]
else:
ontol_tags += [tag]
if pre_trained_group_name is not None:
group_path = (
- pathlib.Path(__file__).parent.absolute()
- / f'models_{algorithm}'
- / pre_trained_group_name
+ self.base_path / f"models_{algorithm}" / pre_trained_group_name
)
gen = os.walk(group_path)
model_tags = [tag[1] for tag in gen]
@@ -1134,112 +1443,228 @@ def create_training_script(
set.intersection(set(ontol_tags), set(model_tags))
)
- script.write('# Phenomenological\n')
+ script.write("# Phenomenological\n")
for tag in phenom_tags:
if tag in phenom_hasmodel:
- tag_file_gen = (group_path / tag).glob('*.h5')
+ tag_file_gen = (group_path / tag).glob("*.h5")
most_recent_file = max(
[file for file in tag_file_gen], key=os.path.getctime
).name
script.writelines(
- f'./scope.py train --tag={tag} --algorithm={algorithm} --path_dataset={path_dataset} --pre_trained_model=models/{pre_trained_group_name}/{tag}/{most_recent_file} --period_suffix={period_suffix} --verbose {add_keywords} \n'
+ f"scope-train --tag {tag} --algorithm {algorithm} --path_dataset {path_dataset} --pre_trained_model models/{pre_trained_group_name}/{tag}/{most_recent_file} --period_suffix {period_suffix} --verbose {add_keywords} \n"
)
elif train_all:
script.writelines(
- f'./scope.py train --tag={tag} --algorithm={algorithm} --path_dataset={path_dataset} --period_suffix={period_suffix} --verbose {add_keywords} \n'
+ f"scope-train --tag {tag} --algorithm {algorithm} --path_dataset {path_dataset} --period_suffix {period_suffix} --verbose {add_keywords} \n"
)
- script.write('# Ontological\n')
+ script.write("# Ontological\n")
for tag in ontol_tags:
if tag in ontol_hasmodel:
- tag_file_gen = (group_path / tag).glob('*.h5')
+ tag_file_gen = (group_path / tag).glob("*.h5")
most_recent_file = max(
[file for file in tag_file_gen], key=os.path.getctime
).name
script.writelines(
- f'./scope.py train --tag={tag} --algorithm={algorithm} --path_dataset={path_dataset} --pre_trained_model=models/{pre_trained_group_name}/{tag}/{most_recent_file} --period_suffix={period_suffix} --verbose {add_keywords} \n'
+ f"scope-train --tag {tag} --algorithm {algorithm} --path_dataset {path_dataset} --pre_trained_model models/{pre_trained_group_name}/{tag}/{most_recent_file} --period_suffix {period_suffix} --verbose {add_keywords} \n"
)
elif train_all:
script.writelines(
- f'./scope.py train --tag={tag} --algorithm={algorithm} --path_dataset={path_dataset} --period_suffix={period_suffix} --verbose {add_keywords} \n'
+ f"scope-train --tag {tag} --algorithm {algorithm} --path_dataset {path_dataset} --period_suffix {period_suffix} --verbose {add_keywords} \n"
)
else:
- script.write('# Phenomenological\n')
+ script.write("# Phenomenological\n")
script.writelines(
[
- f'./scope.py train --tag={tag} --algorithm={algorithm} --path_dataset={path_dataset} --period_suffix={period_suffix} --verbose {add_keywords} \n'
+ f"scope-train --tag {tag} --algorithm {algorithm} --path_dataset {path_dataset} --period_suffix {period_suffix} --verbose {add_keywords} \n"
for tag in phenom_tags
]
)
- script.write('# Ontological\n')
+ script.write("# Ontological\n")
script.writelines(
[
- f'./scope.py train --tag={tag} --algorithm={algorithm} --path_dataset={path_dataset} --period_suffix={period_suffix} --verbose {add_keywords} \n'
+ f"scope-train --tag {tag} --algorithm {algorithm} --path_dataset {path_dataset} --period_suffix {period_suffix} --verbose {add_keywords} \n"
for tag in ontol_tags
]
)
+ print(f"Wrote training script to {path}.")
+
+ def parse_run_assemble_training_stats(self):
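+ """Parse command-line arguments for assemble-training-stats and run assemble_training_stats."""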
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--group-name",
+ type=str,
+ default="experiment",
+ help="trained model group name",
+ )
+ parser.add_argument(
+ "--algorithm",
+ type=str,
+ default="dnn",
+ help="name of ML algorithm",
+ )
+ parser.add_argument(
+ "--set-name",
+ type=str,
+ default="val",
+ help="one of train, val or test",
+ )
+ parser.add_argument(
+ "--importance-directory",
+ type=str,
+ default="xgb_feature_importance",
+ help="name of directory to save XGB feature importance",
+ )
+ parser.add_argument(
+ "--stats-directory",
+ type=str,
+ default="stats",
+ help="name of directory to save training stats",
+ )
+
+ args, _ = parser.parse_known_args()
+ self.assemble_training_stats(**vars(args))
def assemble_training_stats(
self,
- group_name: str = 'experiment',
- algorithm: str = 'dnn',
- set_name: str = 'val',
- importance_directory: str = 'xgb_feature_importance',
- stats_directory: str = 'stats',
+ group_name: str = "experiment",
+ algorithm: str = "dnn",
+ set_name: str = "val",
+ importance_directory: str = "xgb_feature_importance",
+ stats_directory: str = "stats",
):
+ """
+ Assemble training stats from individual class results
+
+ :param group_name: trained model group name (str)
+ :param algorithm: name of ML algorithm (str)
+ :param set_name: one of train, val or test (str)
+ :param importance_directory: name of directory to save XGB feature importance (str)
+ :param stats_directory: name of directory to save training stats (str)
+
+ :return:
+
+ :example: assemble-training-stats --group-name DR16 --algorithm xgb --set-name test \
+ --importance-directory xgb_importance --stats-directory xgb_stats
+ """
base_path = self.base_path
- group_path = base_path / f'models_{algorithm}' / group_name
+ group_path = base_path / f"models_{algorithm}" / group_name
- if algorithm in ['xgb', 'xgboost', 'XGB', 'XGBoost']:
+ if algorithm in ["xgb", "xgboost", "XGB", "XGBoost"]:
importance_path = base_path / importance_directory
importance_path.mkdir(exist_ok=True)
# XGB feature importance
- labels = [x for x in group_path.iterdir() if x.name != '.DS_Store']
+ labels = [x for x in group_path.iterdir() if x.name != ".DS_Store"]
statpaths = []
for label in labels:
statpaths.append(
- [x for x in label.glob(f'*plots/{set_name}/*impvars.json')][0]
+ [x for x in label.glob(f"*plots/{set_name}/*impvars.json")][0]
)
for statpath in statpaths:
strpath = str(statpath)
- os.system(f'cp {strpath} {importance_path}/.')
+ os.system(f"cp {strpath} {importance_path}/.")
# DNN/XGB stats
stats_path = base_path / f"{algorithm}_{stats_directory}"
stats_path.mkdir(exist_ok=True)
- labels = [x for x in group_path.iterdir() if x.name != '.DS_Store']
+ labels = [x for x in group_path.iterdir() if x.name != ".DS_Store"]
statpaths = []
for label in labels:
statpaths.append(
- [x for x in label.glob(f'*plots/{set_name}/*stats.json')][0]
+ [x for x in label.glob(f"*plots/{set_name}/*stats.json")][0]
)
for statpath in statpaths:
strpath = str(statpath)
- os.system(f'cp {strpath} {stats_path}/.')
+ os.system(f"cp {strpath} {stats_path}/.")
+
+ print("Finished assembling stats.")
+
+ def parse_run_create_inference_script(self):
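+ """Parse command-line arguments for create-inference-script and run create_inference_script."""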
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--filename",
+ type=str,
+ default="get_all_preds_dnn.sh",
+ help="filename of shell script (must not currently exist)",
+ )
+ parser.add_argument(
+ "--group-name",
+ type=str,
+ default="experiment",
+ help="name of group containing trained models within models directory",
+ )
+ parser.add_argument(
+ "--algorithm",
+ type=str,
+ default="dnn",
+ help="algorithm to use in script",
+ )
+ parser.add_argument(
+ "--scale-features",
+ type=str,
+ default="min_max",
+ help="method to scale features (currently 'min_max' or 'median_std')",
+ )
+ parser.add_argument(
+ "--feature-directory",
+ type=str,
+ default="features",
+ help="name of directory containing downloaded or generated features",
+ )
+ parser.add_argument(
+ "--write-csv",
+ action="store_true",
+ help="if set, write CSV file in addition to parquet",
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=100000,
+ help="batch size to use when reading feature files",
+ )
+ parser.add_argument(
+ "--use-custom-python",
+ action="store_true",
+ help="if True, the call to run-inference will be preceded by a specific path to python",
+ )
+ parser.add_argument(
+ "--path-to-python",
+ type=str,
+ default="~/miniforge3/envs/scope-env/bin/python",
+ help="if --use-custom-python is set (e.g. for a cron job), path to custom python installation",
+ )
+ parser.add_argument(
+ "--period-suffix",
+ type=str,
+ help="suffix of period/Fourier features to use for inference",
+ )
+
+ args, _ = parser.parse_known_args()
+ self.create_inference_script(**vars(args))
def create_inference_script(
self,
- filename: str = 'get_all_preds_dnn.sh',
- group_name: str = 'experiment',
- algorithm: str = 'dnn',
- scale_features: str = 'min_max',
- feature_directory: str = 'features',
+ filename: str = "get_all_preds_dnn.sh",
+ group_name: str = "experiment",
+ algorithm: str = "dnn",
+ scale_features: str = "min_max",
+ feature_directory: str = "features",
write_csv: bool = False,
batch_size: int = 100000,
use_custom_python: bool = False,
- path_to_python: str = '~/miniforge3/envs/scope-env/bin/python',
- **kwargs,
+ path_to_python: str = "~/miniforge3/envs/scope-env/bin/python",
+ period_suffix: str = None,
):
"""
- Create inference shell script
+ Save shell script to use when running inference
:param filename: filename of shell script (must not currently exist) (str)
:param group_name: name of group containing trained models within models directory (str)
@@ -1248,84 +1673,84 @@ def create_inference_script(
:param feature_directory: name of directory containing downloaded or generated features (str)
:param write_csv: if True, write CSV file in addition to parquet (bool)
:param batch_size: batch size to use when reading feature files (int)
- :param use_custom_python: if True, the call to inference.py will be preceded by a specific path to python (bool)
+ :param use_custom_python: if True, the call to run-inference will be preceded by a specific path to python (bool)
:param path_to_python: if use_custom_python is set (e.g. for a cron job), path to custom python installation (str)
+ :param period_suffix: suffix of period/Fourier features to use for inference (str)
:return:
- Saves shell script to use when running inference
- :example: ./scope.py create_inference_script --filename='get_all_preds_dnn.sh' --group_name='experiment' \
- --algorithm='dnn' --feature_directory='generated_features'
+ :example: create-inference-script --filename get_all_preds_dnn.sh --group-name experiment \
+ --algorithm dnn --feature-directory generated_features
"""
-
base_path = self.base_path
path = str(base_path / filename)
- group_path = base_path / f'models_{algorithm}' / group_name
+ group_path = base_path / f"models_{algorithm}" / group_name
- addtl_args = ''
+ addtl_args = ""
if write_csv:
- addtl_args += '--write_csv'
+ addtl_args += "--write-csv"
gen = os.walk(group_path)
model_tags = [tag[1] for tag in gen]
model_tags = model_tags[0]
- period_suffix = kwargs.get(
- 'period_suffix', self.config['features']['info']['period_suffix']
- )
+ if period_suffix is None:
+ period_suffix = self.config["features"]["info"]["period_suffix"]
if not use_custom_python:
- path_to_python = ''
+ path_to_python = ""
- with open(path, 'x') as script:
- script.write('#!/bin/bash\n')
+ with open(path, "x") as script:
+ script.write("#!/bin/bash\n")
script.write(
- '# Call script followed by field number, e.g: ./get_all_preds_dnn.sh 301\n'
+ "# Call script followed by field number, e.g: ./get_all_preds_dnn.sh 301\n"
)
- paths_models_str = ''
- model_class_names_str = ''
+ paths_models_str = ""
+ model_class_names_str = ""
- if algorithm in ['dnn', 'DNN', 'nn', 'NN']:
- algorithm = 'dnn'
+ if algorithm in ["dnn", "DNN", "nn", "NN"]:
+ algorithm = "dnn"
script.write('echo "dnn inference"\n')
# Select most recent model for each tag
for tag in model_tags:
- tag_file_gen = (group_path / tag).glob('*.h5')
+ tag_file_gen = (group_path / tag).glob("*.h5")
most_recent_file = max(
[file for file in tag_file_gen], key=os.path.getctime
).name
- paths_models_str += f'{str(base_path)}/models_{algorithm}/{group_name}/{tag}/{most_recent_file} '
- model_class_names_str += f'{tag} '
+ paths_models_str += f"{str(base_path)}/models_{algorithm}/{group_name}/{tag}/{most_recent_file} "
+ model_class_names_str += f"{tag} "
script.write(
- f'echo -n "Running inference..." && {path_to_python} {str(base_path)}/tools/inference.py --paths_models {paths_models_str} --model_class_names {model_class_names_str} --field $1 --whole_field --flag_ids --scale_features {scale_features} --feature_directory {feature_directory} --period_suffix {period_suffix} --batch_size {batch_size} {addtl_args} && echo "done"\n'
+ f'echo -n "Running inference..." && {path_to_python} run-inference --paths-models {paths_models_str} --model-class-names {model_class_names_str} --field $1 --whole-field --flag-ids --scale-features {scale_features} --feature-directory {feature_directory} --period-suffix {period_suffix} --batch-size {batch_size} {addtl_args} && echo "done"\n'
)
- elif algorithm in ['XGB', 'xgb', 'XGBoost', 'xgboost', 'XGBOOST']:
- algorithm = 'xgb'
+ elif algorithm in ["XGB", "xgb", "XGBoost", "xgboost", "XGBOOST"]:
+ algorithm = "xgb"
script.write('echo "xgb inference"\n')
for tag in model_tags:
- tag_file_gen = (group_path / tag).glob('*.json')
+ tag_file_gen = (group_path / tag).glob("*.json")
most_recent_file = max(
[file for file in tag_file_gen], key=os.path.getctime
).name
- paths_models_str += f'{str(base_path)}/models_{algorithm}/{group_name}/{tag}/{most_recent_file} '
- model_class_names_str += f'{tag} '
+ paths_models_str += f"{str(base_path)}/models_{algorithm}/{group_name}/{tag}/{most_recent_file} "
+ model_class_names_str += f"{tag} "
script.write(
- f'echo -n "Running inference..." && {path_to_python} {str(base_path)}/tools/inference.py --paths_models {paths_models_str} --model_class_names {model_class_names_str} --scale_features {scale_features} --feature_directory {feature_directory} --period_suffix {period_suffix} --batch_size {batch_size} --xgb_model --field $1 --whole_field --flag_ids {addtl_args} && echo "done"\n'
+ f'echo -n "Running inference..." && {path_to_python} run-inference --paths-models {paths_models_str} --model-class-names {model_class_names_str} --scale-features {scale_features} --feature-directory {feature_directory} --period-suffix {period_suffix} --batch-size {batch_size} --xgb-model --field $1 --whole-field --flag-ids {addtl_args} && echo "done"\n'
)
else:
- raise ValueError('algorithm must be dnn or xgb')
+ raise ValueError("algorithm must be dnn or xgb")
+
+ print(f"Wrote inference script to {path}")
def consolidate_inference_results(
self,
dataset: pd.DataFrame,
- statistic: str = 'mean',
+ statistic: str = "mean",
):
"""
Consolidate inference results from multiple rows to one per source (called in select_fritz_sample)
@@ -1340,147 +1765,147 @@ def consolidate_inference_results(
# Begin with Gaia EDR3 ID
# If no Gaia ID, use AllWISE
# If no AllWISE, use PS1
- withGaiaID = dataset[dataset['Gaia_EDR3___id'] != 0].reset_index(drop=True)
- nanGaiaID = dataset[dataset['Gaia_EDR3___id'] == 0].reset_index(drop=True)
+ withGaiaID = dataset[dataset["Gaia_EDR3___id"] != 0].reset_index(drop=True)
+ nanGaiaID = dataset[dataset["Gaia_EDR3___id"] == 0].reset_index(drop=True)
- withAllWiseID = nanGaiaID[nanGaiaID['AllWISE___id'] != 0].reset_index(drop=True)
- nanAllWiseID = nanGaiaID[nanGaiaID['AllWISE___id'] == 0].reset_index(drop=True)
+ withAllWiseID = nanGaiaID[nanGaiaID["AllWISE___id"] != 0].reset_index(drop=True)
+ nanAllWiseID = nanGaiaID[nanGaiaID["AllWISE___id"] == 0].reset_index(drop=True)
- withPS1ID = nanAllWiseID[nanAllWiseID['PS1_DR1___id'] != 0].reset_index(
+ withPS1ID = nanAllWiseID[nanAllWiseID["PS1_DR1___id"] != 0].reset_index(
drop=True
)
# Define columns for each subset that should not be averaged or otherwise aggregated
- skipList = ['Gaia_EDR3___id', 'AllWISE___id', 'PS1_DR1___id', '_id']
+ skipList = ["Gaia_EDR3___id", "AllWISE___id", "PS1_DR1___id", "_id"]
skip_mean_cols_Gaia = withGaiaID[skipList]
skip_mean_cols_AllWise = withAllWiseID[skipList]
skip_mean_cols_PS1 = withPS1ID[skipList]
if statistic in [
- 'mean',
- 'Mean',
- 'MEAN',
- 'average',
- 'AVERAGE',
- 'Average',
- 'avg',
- 'AVG',
+ "mean",
+ "Mean",
+ "MEAN",
+ "average",
+ "AVERAGE",
+ "Average",
+ "avg",
+ "AVG",
]:
groupedMeans_Gaia = (
- withGaiaID.groupby('Gaia_EDR3___id')
+ withGaiaID.groupby("Gaia_EDR3___id")
.mean()
- .drop(['_id', 'AllWISE___id', 'PS1_DR1___id'], axis=1)
+ .drop(["_id", "AllWISE___id", "PS1_DR1___id"], axis=1)
.reset_index()
)
groupedMeans_AllWise = (
- withAllWiseID.groupby('AllWISE___id')
+ withAllWiseID.groupby("AllWISE___id")
.mean()
- .drop(['_id', 'Gaia_EDR3___id', 'PS1_DR1___id'], axis=1)
+ .drop(["_id", "Gaia_EDR3___id", "PS1_DR1___id"], axis=1)
.reset_index()
)
groupedMeans_PS1 = (
- withPS1ID.groupby('PS1_DR1___id')
+ withPS1ID.groupby("PS1_DR1___id")
.mean()
- .drop(['_id', 'Gaia_EDR3___id', 'AllWISE___id'], axis=1)
+ .drop(["_id", "Gaia_EDR3___id", "AllWISE___id"], axis=1)
.reset_index()
)
- elif statistic in ['max', 'Max', 'MAX', 'maximum', 'Maximum', 'MAXIMUM']:
+ elif statistic in ["max", "Max", "MAX", "maximum", "Maximum", "MAXIMUM"]:
groupedMeans_Gaia = (
- withGaiaID.groupby('Gaia_EDR3___id')
+ withGaiaID.groupby("Gaia_EDR3___id")
.max()
- .drop(['_id', 'AllWISE___id', 'PS1_DR1___id'], axis=1)
+ .drop(["_id", "AllWISE___id", "PS1_DR1___id"], axis=1)
.reset_index()
)
groupedMeans_AllWise = (
- withAllWiseID.groupby('AllWISE___id')
+ withAllWiseID.groupby("AllWISE___id")
.max()
- .drop(['_id', 'Gaia_EDR3___id', 'PS1_DR1___id'], axis=1)
+ .drop(["_id", "Gaia_EDR3___id", "PS1_DR1___id"], axis=1)
.reset_index()
)
groupedMeans_PS1 = (
- withPS1ID.groupby('PS1_DR1___id')
+ withPS1ID.groupby("PS1_DR1___id")
.max()
- .drop(['_id', 'Gaia_EDR3___id', 'AllWISE___id'], axis=1)
+ .drop(["_id", "Gaia_EDR3___id", "AllWISE___id"], axis=1)
.reset_index()
)
- elif statistic in ['median', 'Median', 'MEDIAN', 'med', 'MED']:
+ elif statistic in ["median", "Median", "MEDIAN", "med", "MED"]:
groupedMeans_Gaia = (
- withGaiaID.groupby('Gaia_EDR3___id')
+ withGaiaID.groupby("Gaia_EDR3___id")
.median()
- .drop(['_id', 'AllWISE___id', 'PS1_DR1___id'], axis=1)
+ .drop(["_id", "AllWISE___id", "PS1_DR1___id"], axis=1)
.reset_index()
)
groupedMeans_AllWise = (
- withAllWiseID.groupby('AllWISE___id')
+ withAllWiseID.groupby("AllWISE___id")
.median()
- .drop(['_id', 'Gaia_EDR3___id', 'PS1_DR1___id'], axis=1)
+ .drop(["_id", "Gaia_EDR3___id", "PS1_DR1___id"], axis=1)
.reset_index()
)
groupedMeans_PS1 = (
- withPS1ID.groupby('PS1_DR1___id')
+ withPS1ID.groupby("PS1_DR1___id")
.median()
- .drop(['_id', 'Gaia_EDR3___id', 'AllWISE___id'], axis=1)
+ .drop(["_id", "Gaia_EDR3___id", "AllWISE___id"], axis=1)
.reset_index()
)
else:
raise ValueError(
- 'Mean, median and max are the currently supported statistics.'
+ "Mean, median and max are the currently supported statistics."
)
# Construct new survey_id column that contains the ID used to add grouped source to the list
- string_ids_Gaia = groupedMeans_Gaia['Gaia_EDR3___id'].astype(str)
- groupedMeans_Gaia['survey_id'] = ["Gaia_EDR3___" + s for s in string_ids_Gaia]
+ string_ids_Gaia = groupedMeans_Gaia["Gaia_EDR3___id"].astype(str)
+ groupedMeans_Gaia["survey_id"] = ["Gaia_EDR3___" + s for s in string_ids_Gaia]
- string_ids_AllWise = groupedMeans_AllWise['AllWISE___id'].astype(str)
- groupedMeans_AllWise['survey_id'] = [
+ string_ids_AllWise = groupedMeans_AllWise["AllWISE___id"].astype(str)
+ groupedMeans_AllWise["survey_id"] = [
"AllWISE___" + s for s in string_ids_AllWise
]
- string_ids_PS1 = groupedMeans_PS1['PS1_DR1___id'].astype(str)
- groupedMeans_PS1['survey_id'] = ["PS1_DR1___" + s for s in string_ids_PS1]
+ string_ids_PS1 = groupedMeans_PS1["PS1_DR1___id"].astype(str)
+ groupedMeans_PS1["survey_id"] = ["PS1_DR1___" + s for s in string_ids_PS1]
# Merge averaged, non-averaged columns on obj_id
allRows_Gaia = pd.merge(
- groupedMeans_Gaia, skip_mean_cols_Gaia, on=['Gaia_EDR3___id']
+ groupedMeans_Gaia, skip_mean_cols_Gaia, on=["Gaia_EDR3___id"]
)
- noDup_ids_Gaia = allRows_Gaia.drop_duplicates('Gaia_EDR3___id')[
- ['Gaia_EDR3___id', '_id']
+ noDup_ids_Gaia = allRows_Gaia.drop_duplicates("Gaia_EDR3___id")[
+ ["Gaia_EDR3___id", "_id"]
]
groupedMeans_Gaia = pd.merge(
- groupedMeans_Gaia, noDup_ids_Gaia, on='Gaia_EDR3___id'
+ groupedMeans_Gaia, noDup_ids_Gaia, on="Gaia_EDR3___id"
)
- groupedMeans_Gaia.drop('Gaia_EDR3___id', axis=1, inplace=True)
+ groupedMeans_Gaia.drop("Gaia_EDR3___id", axis=1, inplace=True)
allRows_AllWise = pd.merge(
- groupedMeans_AllWise, skip_mean_cols_AllWise, on=['AllWISE___id']
+ groupedMeans_AllWise, skip_mean_cols_AllWise, on=["AllWISE___id"]
)
- noDup_ids_AllWise = allRows_AllWise.drop_duplicates('AllWISE___id')[
- ['AllWISE___id', '_id']
+ noDup_ids_AllWise = allRows_AllWise.drop_duplicates("AllWISE___id")[
+ ["AllWISE___id", "_id"]
]
groupedMeans_AllWise = pd.merge(
- groupedMeans_AllWise, noDup_ids_AllWise, on='AllWISE___id'
+ groupedMeans_AllWise, noDup_ids_AllWise, on="AllWISE___id"
)
- groupedMeans_AllWise.drop('AllWISE___id', axis=1, inplace=True)
+ groupedMeans_AllWise.drop("AllWISE___id", axis=1, inplace=True)
allRows_PS1 = pd.merge(
- groupedMeans_PS1, skip_mean_cols_PS1, on=['PS1_DR1___id']
+ groupedMeans_PS1, skip_mean_cols_PS1, on=["PS1_DR1___id"]
)
- noDup_ids_PS1 = allRows_PS1.drop_duplicates('PS1_DR1___id')[
- ['PS1_DR1___id', '_id']
+ noDup_ids_PS1 = allRows_PS1.drop_duplicates("PS1_DR1___id")[
+ ["PS1_DR1___id", "_id"]
]
- groupedMeans_PS1 = pd.merge(groupedMeans_PS1, noDup_ids_PS1, on='PS1_DR1___id')
- groupedMeans_PS1.drop('PS1_DR1___id', axis=1, inplace=True)
+ groupedMeans_PS1 = pd.merge(groupedMeans_PS1, noDup_ids_PS1, on="PS1_DR1___id")
+ groupedMeans_PS1.drop("PS1_DR1___id", axis=1, inplace=True)
# Create dataframe with one row per source
consol_rows = pd.concat(
@@ -1490,44 +1915,149 @@ def consolidate_inference_results(
# Create dataframe containing all rows (including duplicates for multiple light curves)
all_rows = pd.concat([allRows_Gaia, allRows_AllWise, allRows_PS1])
all_rows.drop(
- ['Gaia_EDR3___id', 'AllWISE___id', 'PS1_DR1___id'], axis=1, inplace=True
+ ["Gaia_EDR3___id", "AllWISE___id", "PS1_DR1___id"], axis=1, inplace=True
)
# Reorder columns for better legibility
- consol_rows = consol_rows.set_index('survey_id').reset_index()
- all_rows = all_rows.set_index('survey_id').reset_index()
+ consol_rows = consol_rows.set_index("survey_id").reset_index()
+ all_rows = all_rows.set_index("survey_id").reset_index()
return consol_rows, all_rows
+ def parse_run_select_fritz_sample(self):
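+ """Parse command-line arguments for select-fritz-sample and run select_fritz_sample."""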
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--fields",
+ type=str,
+ nargs="+",
+ default=["all"],
+ help="list of field predictions (integers) to include, 'all' to use all available fields, or 'specific_ids' if running on e.g. GCN sources",
+ )
+ parser.add_argument(
+ "--group",
+ type=str,
+ default="experiment",
+ help="name of group containing trained models within models directory",
+ )
+ parser.add_argument(
+ "--min-class-examples",
+ type=int,
+ default=1000,
+ help="minimum number of examples to include for each class. Some classes may contain fewer than this if the sample is limited",
+ )
+ parser.add_argument(
+ "--select-top-n",
+ action="store_true",
+ help="if set, select top N probabilities above probability_threshold from each class",
+ )
+ parser.add_argument(
+ "--include-all-highprob-labels",
+ action="store_true",
+ help="if select_top_n is set, setting this keyword includes any classification above the probability_threshold for all top N sources. Otherwise, literally only the top N probabilities for each classification will be included, which may artificially exclude relevant labels.",
+ )
+ parser.add_argument(
+ "--probability-threshold",
+ type=float,
+ default=0.9,
+ help="minimum probability to select for Fritz",
+ )
+ parser.add_argument(
+ "--al-directory",
+ type=str,
+ default="AL_datasets",
+ help="name of directory to create/populate with Fritz sample",
+ )
+ parser.add_argument(
+ "--al-filename",
+ type=str,
+ default="active_learning_set",
+ help="name of file (no extension) to store Fritz sample",
+ )
+ parser.add_argument(
+ "--algorithm",
+ type=str,
+ default="dnn",
+ help="ML algorithm (dnn or xgb)",
+ )
+ parser.add_argument(
+ "--exclude-training-sources",
+ action="store_true",
+ help="if set, exclude sources in current training set from AL sample",
+ )
+ parser.add_argument(
+ "--write-csv",
+ action="store_true",
+ help="if set, write CSV file in addition to parquet",
+ )
+ parser.add_argument(
+ "--verbose",
+ action="store_true",
+ help="if set, print additional information",
+ )
+ parser.add_argument(
+ "--consolidation-statistic",
+ type=str,
+ default="mean",
+ help="method to combine multiple classification probabilities for a single source ('mean', 'median' or 'max' currently supported)",
+ )
+ parser.add_argument(
+ "--read-consolidation-results",
+ action="store_true",
+ help="if set, search for and read an existing consolidated file having _consol.parquet suffix",
+ )
+ parser.add_argument(
+ "--write-consolidation-results",
+ action="store_true",
+ help="if set, save two files: consolidated inference results [1 row per source] and full results [≥ 1 row per source]",
+ )
+ parser.add_argument(
+ "--consol-filename",
+ type=str,
+ default="inference_results",
+ help="name of file (no extension) to store consolidated and full results",
+ )
+ parser.add_argument(
+ "--doNotSave",
+ action="store_true",
+ help="if set, do not write results",
+ )
+ parser.add_argument(
+ "--doAllSources",
+ action="store_true",
+ help="if set, ignore min_class_examples and run for all sources",
+ )
+
+ args, _ = parser.parse_known_args()
+ self.select_fritz_sample(**vars(args))
+
def select_fritz_sample(
self,
- fields: Union[list, str] = 'all',
- group: str = 'experiment',
+ fields: list = ["all"],
+ group: str = "experiment",
min_class_examples: int = 1000,
select_top_n: bool = False,
include_all_highprob_labels: bool = False,
probability_threshold: float = 0.9,
- al_directory: str = 'AL_datasets',
- al_filename: str = 'active_learning_set',
- algorithm: str = 'dnn',
+ al_directory: str = "AL_datasets",
+ al_filename: str = "active_learning_set",
+ algorithm: str = "dnn",
exclude_training_sources: bool = False,
write_csv: bool = True,
verbose: bool = False,
- consolidation_statistic: str = 'mean',
+ consolidation_statistic: str = "mean",
read_consolidation_results: bool = False,
write_consolidation_results: bool = False,
- consol_filename: str = 'inference_results',
+ consol_filename: str = "inference_results",
doNotSave: bool = False,
doAllSources: bool = False,
):
"""
Select subset of predictions to use for posting to Fritz (active learning, GCN source classifications).
- :param fields: list of field predictions (integers) to include, 'all' to use all available fields, or 'specific_ids' if running on e.g. GCN sources (list or str)
- note: do not use spaces if providing a list of comma-separated integers to this argument.
+ :param fields: list of field predictions (integers) to include, 'all' to use all available fields, or 'specific_ids' if running on e.g. GCN sources (list)
:param group: name of group containing trained models within models directory (str)
:param min_class_examples: minimum number of examples to include for each class. Some classes may contain fewer than this if the sample is limited (int)
- :param select_top_n: if True, select top N probabilities above probability_threshold from each class (bool)
+ :param select_top_n: if set, select top N probabilities above probability_threshold from each class (bool)
:param include_all_highprob_labels: if select_top_n is set, setting this keyword includes any classification above the probability_threshold for all top N sources.
Otherwise, literally only the top N probabilities for each classification will be included, which may artifically exclude relevant labels.
:param probability_threshold: minimum probability to select for Fritz (float)
@@ -1535,11 +2065,11 @@ def select_fritz_sample(
:param al_filename: name of file (no extension) to store Fritz sample (str)
:param algorithm: algorithm [dnn or xgb] (str)
:param exclude_training_sources: if True, exclude sources in current training set from AL sample (bool)
- :param write_csv: if True, write CSV file in addition to parquet (bool)
- :param verbose: if True, print additional information (bool)
+ :param write_csv: if set, write CSV file in addition to parquet (bool)
+ :param verbose: if set, print additional information (bool)
:param consolidation_statistic: method to combine multiple classification probabilities for a single source [mean, median or max currently supported] (str)
- :param read_consolidation_results: if True, search for and read an existing consolidated file having _consol.parquet suffix (bool)
- :param write_consolidation_results: if True, save two files: consolidated inference results [1 row per source] and full results [≥ 1 row per source] (bool)
+ :param read_consolidation_results: if set, search for and read an existing consolidated file having _consol.parquet suffix (bool)
+ :param write_consolidation_results: if set, save two files: consolidated inference results [1 row per source] and full results [≥ 1 row per source] (bool)
:param consol_filename: name of file (no extension) to store consolidated and full results (str)
:param doNotSave: if set, do not write results (bool)
:param doAllSources: if set, ignore min_class_examples and run for all sources (bool)
@@ -1547,62 +2077,62 @@ def select_fritz_sample(
:return:
final_toPost: DataFrame containing sources with high-confidence classifications to post
- :examples: ./scope.py select_fritz_sample --fields=[296,297] --group='experiment' --min_class_examples=1000 --probability_threshold=0.9 --exclude_training_sources --write_consolidation_results
- ./scope.py select_fritz_sample --fields=[296,297] --group='experiment' --min_class_examples=500 --select_top_n --include_all_highprob_labels --probability_threshold=0.7 --exclude_training_sources --read_consolidation_results
- ./scope.py select_fritz_sample --fields='specific_ids' --group='DR16' --algorithm='xgb' --probability_threshold=0.9 --consol_filename='inference_results_specific_ids' --al_directory='GCN' --al_filename='GCN_sources' --write_consolidation_results --select_top_n --doAllSources --write_csv
+ :examples: select-fritz-sample --fields 296 297 --group experiment --min-class-examples 1000 --probability-threshold 0.9 --exclude-training-sources --write-consolidation-results
+ select-fritz-sample --fields 296 297 --group experiment --min-class-examples 500 --select-top-n --include-all-highprob-labels --probability-threshold 0.7 --exclude-training-sources --read-consolidation-results
+ select-fritz-sample --fields specific_ids --group DR16 --algorithm xgb --probability-threshold 0.9 --consol-filename inference_results_specific_ids --al-directory=GCN --al-filename GCN_sources --write-consolidation-results --select-top-n --doAllSources --write-csv
"""
base_path = self.base_path
- if algorithm in ['DNN', 'NN', 'dnn', 'nn']:
- algorithm = 'dnn'
- elif algorithm in ['XGB', 'xgb', 'XGBoost', 'xgboost', 'XGBOOST']:
- algorithm = 'xgb'
+ if algorithm in ["DNN", "NN", "dnn", "nn"]:
+ algorithm = "dnn"
+ elif algorithm in ["XGB", "xgb", "XGBoost", "xgboost", "XGBOOST"]:
+ algorithm = "xgb"
else:
- raise ValueError('Algorithm must be either dnn or xgb.')
+ raise ValueError("Algorithm must be either dnn or xgb.")
- preds_path = base_path / f'preds_{algorithm}'
+ preds_path = base_path / f"preds_{algorithm}"
# Strip extension from filename if provided
- al_filename = al_filename.split('.')[0]
- AL_directory_path = str(base_path / f'{al_directory}_{algorithm}' / al_filename)
+ al_filename = al_filename.split(".")[0]
+ AL_directory_path = str(base_path / f"{al_directory}_{algorithm}" / al_filename)
os.makedirs(AL_directory_path, exist_ok=True)
df_coll = []
df_coll_allRows = []
- if fields in ['all', 'All', 'ALL']:
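+ # fields is a list of strings from argparse, e.g. ['296', '297'], ['all'], or ['specific_ids']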
+ if "all" in fields:
gen_fields = os.walk(preds_path)
fields = [x for x in gen_fields][0][1]
- print(f'Generating Fritz sample from {len(fields)} fields:')
- elif 'specific_ids' in fields:
- fields = [f'field_{fields}']
- print('Generating Fritz sample from specific ids across multiple fields:')
+ print(f"Generating Fritz sample from {len(fields)} fields:")
+ elif "specific_ids" in fields:
+ fields = ["field_specific_ids"]
+ print("Generating Fritz sample from specific ids across multiple fields:")
else:
- fields = [f'field_{f}' for f in fields]
- print(f'Generating Fritz sample from {len(fields)} fields:')
+ fields = [f"field_{f}" for f in fields]
+ print(f"Generating Fritz sample from {len(fields)} fields:")
column_nums = []
AL_directory_PL = pathlib.Path(AL_directory_path)
- gen = AL_directory_PL.glob(f'{consol_filename}_consol.parquet')
+ gen = AL_directory_PL.glob(f"{consol_filename}_consol.parquet")
existing_consol_files = [str(x) for x in gen]
if (read_consolidation_results) & (len(existing_consol_files) > 0):
- print('Loading existing consolidated results...')
+ print("Loading existing consolidated results...")
preds_df = read_parquet(existing_consol_files[0])
else:
- print('Consolidating classification probabilities to one per source...')
+ print("Consolidating classification probabilities to one per source...")
for field in fields:
print(field)
- h = read_parquet(str(preds_path / field / f'{field}.parquet'))
+ h = read_parquet(str(preds_path / field / f"{field}.parquet"))
has_obj_id = False
- if 'obj_id' in h.columns:
+ if "obj_id" in h.columns:
has_obj_id = True
id_mapper = (
- h[['_id', 'obj_id']].set_index('_id').to_dict(orient='index')
+ h[["_id", "obj_id"]].set_index("_id").to_dict(orient="index")
)
- h.drop('obj_id', axis=1, inplace=True)
+ h.drop("obj_id", axis=1, inplace=True)
consolidated_df, all_rows_df = self.consolidate_inference_results(
h, statistic=consolidation_statistic
@@ -1619,20 +2149,20 @@ def select_fritz_sample(
if len(np.unique(column_nums)) > 1:
raise ValueError(
- 'Not all predictions have the same number of columns.'
+ "Not all predictions have the same number of columns."
)
# Create consolidated dataframe (one row per source)
preds_df = pd.concat(df_coll, axis=0)
cols = [x for x in preds_df.columns]
- cols.remove('_id')
- cols.remove('survey_id')
- agg_dct = {c: 'mean' for c in cols}
+ cols.remove("_id")
+ cols.remove("survey_id")
+ agg_dct = {c: "mean" for c in cols}
# One more groupby to combine sources across multiple fields
preds_df = (
- preds_df.groupby(['survey_id', '_id']).agg(agg_dct).reset_index()
+ preds_df.groupby(["survey_id", "_id"]).agg(agg_dct).reset_index()
)
# Create dataframe including all light curves (multiple rows per source)
@@ -1640,28 +2170,28 @@ def select_fritz_sample(
if not has_obj_id:
# Generate position-based obj_ids for Fritz
- raArr = [ra for ra in preds_df['ra']]
- decArr = [dec for dec in preds_df['dec']]
+ raArr = [ra for ra in preds_df["ra"]]
+ decArr = [dec for dec in preds_df["dec"]]
obj_ids = [radec_to_iau_name(x, y) for x, y in zip(raArr, decArr)]
else:
obj_ids = []
- for ID in preds_df['_id']:
- obj_ids += [id_mapper[ID]['obj_id']]
+ for ID in preds_df["_id"]:
+ obj_ids += [id_mapper[ID]["obj_id"]]
- preds_df['obj_id'] = obj_ids
+ preds_df["obj_id"] = obj_ids
# Assign obj_ids to all rows
preds_df_allRows = pd.merge(
- preds_df_allRows, preds_df[['obj_id', 'survey_id']], on='survey_id'
+ preds_df_allRows, preds_df[["obj_id", "survey_id"]], on="survey_id"
)
# Drop sources which are so close that they cannot be resolved by our position-based ID (~0.0004 of sources)
preds_df_allRows = (
- preds_df_allRows.set_index('obj_id')
- .drop(preds_df[preds_df.duplicated('obj_id')]['obj_id'])
+ preds_df_allRows.set_index("obj_id")
+ .drop(preds_df[preds_df.duplicated("obj_id")]["obj_id"])
.reset_index()
)
- preds_df = preds_df.drop_duplicates('obj_id', keep=False).reset_index(
+ preds_df = preds_df.drop_duplicates("obj_id", keep=False).reset_index(
drop=True
)
@@ -1669,40 +2199,40 @@ def select_fritz_sample(
if write_consolidation_results:
write_parquet(
preds_df,
- f'{AL_directory_path}/{consol_filename}_consol.parquet',
+ f"{AL_directory_path}/{consol_filename}_consol.parquet",
)
write_parquet(
preds_df_allRows,
- f'{AL_directory_path}/{consol_filename}_full.parquet',
+ f"{AL_directory_path}/{consol_filename}_full.parquet",
)
if write_csv:
preds_df.to_csv(
- f'{AL_directory_path}/{consol_filename}_consol.csv',
+ f"{AL_directory_path}/{consol_filename}_consol.csv",
index=False,
)
preds_df_allRows.to_csv(
- f'{AL_directory_path}/{consol_filename}_full.csv',
+ f"{AL_directory_path}/{consol_filename}_full.csv",
index=False,
)
# Define non-variable class as 1 - variable
include_nonvar = False
- if f'vnv_{algorithm}' in preds_df.columns:
+ if f"vnv_{algorithm}" in preds_df.columns:
include_nonvar = True
- preds_df[f'nonvar_{algorithm}'] = np.round(
- 1 - preds_df[f'vnv_{algorithm}'], 2
+ preds_df[f"nonvar_{algorithm}"] = np.round(
+ 1 - preds_df[f"vnv_{algorithm}"], 2
)
if exclude_training_sources:
# Get training set from config file
- training_set_config = self.config['training']['dataset']
+ training_set_config = self.config["training"]["dataset"]
training_set_path = str(base_path / training_set_config)
- if training_set_path.endswith('.parquet'):
+ if training_set_path.endswith(".parquet"):
training_set = read_parquet(training_set_path)
- elif training_set_path.endswith('.h5'):
+ elif training_set_path.endswith(".h5"):
training_set = read_hdf(training_set_path)
- elif training_set_path.endswith('.csv'):
+ elif training_set_path.endswith(".csv"):
training_set = pd.read_csv(training_set_path)
else:
raise ValueError(
@@ -1710,25 +2240,25 @@ def select_fritz_sample(
)
intersec = set.intersection(
- set(preds_df['obj_id'].values), set(training_set['obj_id'].values)
+ set(preds_df["obj_id"].values), set(training_set["obj_id"].values)
)
- print(f'Dropping {len(intersec)} sources already in training set...')
- preds_df = preds_df.set_index('obj_id').drop(list(intersec)).reset_index()
+ print(f"Dropping {len(intersec)} sources already in training set...")
+ preds_df = preds_df.set_index("obj_id").drop(list(intersec)).reset_index()
# Use trained model names to establish classes to train
- gen = os.walk(base_path / f'models_{algorithm}' / group)
+ gen = os.walk(base_path / f"models_{algorithm}" / group)
model_tags = [tag[1] for tag in gen]
model_tags = model_tags[0]
model_tags = np.array(model_tags)
if include_nonvar:
- model_tags = np.concatenate([model_tags, ['nonvar']])
+ model_tags = np.concatenate([model_tags, ["nonvar"]])
- print(f'Selecting AL sample for {len(model_tags)} classes...')
+ print(f"Selecting AL sample for {len(model_tags)} classes...")
toPost_df = pd.DataFrame(columns=preds_df.columns)
completed_dict = {}
- preds_df.set_index('obj_id', inplace=True)
- toPost_df.set_index('obj_id', inplace=True)
+ preds_df.set_index("obj_id", inplace=True)
+ toPost_df.set_index("obj_id", inplace=True)
# Fix random state to allow reproducible results
rng = np.random.RandomState(9)
@@ -1736,17 +2266,17 @@ def select_fritz_sample(
# Reset min_class_examples if doAllSources is set
if doAllSources:
min_class_examples = len(preds_df)
- print(f'Selecting sample from all sources ({min_class_examples})')
+ print(f"Selecting sample from all sources ({min_class_examples})")
if not select_top_n:
for tag in model_tags:
# Identify all sources above probability threshold
highprob_preds = preds_df[
- preds_df[f'{tag}_{algorithm}'].values >= probability_threshold
+ preds_df[f"{tag}_{algorithm}"].values >= probability_threshold
]
# Find existing sources in AL sample above probability threshold
existing_df = toPost_df[
- toPost_df[f'{tag}_{algorithm}'].values >= probability_threshold
+ toPost_df[f"{tag}_{algorithm}"].values >= probability_threshold
]
existing_count = len(existing_df)
@@ -1767,21 +2297,21 @@ def select_fritz_sample(
concat_toPost_df = highprob_preds
toPost_df = pd.concat([toPost_df, concat_toPost_df], axis=0)
- toPost_df.drop_duplicates(keep='first', inplace=True)
+ toPost_df.drop_duplicates(keep="first", inplace=True)
else:
# Select top N classifications above probability threshold for all classes
print(
- f'Selecting top {min_class_examples} classifications above P = {probability_threshold}...'
+ f"Selecting top {min_class_examples} classifications above P = {probability_threshold}..."
)
preds_df.reset_index(inplace=True)
topN_df = pd.DataFrame()
- class_list = [f'{t}_{algorithm}' for t in model_tags]
+ class_list = [f"{t}_{algorithm}" for t in model_tags]
for tag in model_tags:
goodprob_preds = preds_df[
- preds_df[f'{tag}_{algorithm}'].values >= probability_threshold
+ preds_df[f"{tag}_{algorithm}"].values >= probability_threshold
]
if not include_all_highprob_labels:
@@ -1789,15 +2319,15 @@ def select_fritz_sample(
topN_preds = (
goodprob_preds[
[
- 'obj_id',
- 'survey_id',
- 'ra',
- 'dec',
- 'period',
- f'{tag}_{algorithm}',
+ "obj_id",
+ "survey_id",
+ "ra",
+ "dec",
+ "period",
+ f"{tag}_{algorithm}",
]
]
- .sort_values(by=f'{tag}_{algorithm}', ascending=False)
+ .sort_values(by=f"{tag}_{algorithm}", ascending=False)
.iloc[:min_class_examples]
.reset_index(drop=True)
)
@@ -1806,7 +2336,7 @@ def select_fritz_sample(
# Include not only the top N probabilities for each class but also any other classifications above probability_threshold for these sources
topN_preds = (
goodprob_preds.sort_values(
- by=f'{tag}_{algorithm}', ascending=False
+ by=f"{tag}_{algorithm}", ascending=False
)
.iloc[:min_class_examples]
.reset_index(drop=True)
@@ -1820,26 +2350,26 @@ def select_fritz_sample(
topN_df = pd.concat([topN_df, topN_preds]).reset_index(drop=True)
- toPost_df = topN_df.fillna(0.0).groupby('obj_id').max().reset_index()
+ toPost_df = topN_df.fillna(0.0).groupby("obj_id").max().reset_index()
for tag in model_tags:
# Make metadata dictionary of example count per class
- completed_dict[f'{tag}_{algorithm}'] = int(
- np.sum(toPost_df[f'{tag}_{algorithm}'].values >= probability_threshold)
+ completed_dict[f"{tag}_{algorithm}"] = int(
+ np.sum(toPost_df[f"{tag}_{algorithm}"].values >= probability_threshold)
)
final_toPost = toPost_df.reset_index(drop=True)
if not doNotSave:
# Write parquet and csv files
- write_parquet(final_toPost, f'{AL_directory_path}/{al_filename}.parquet')
+ write_parquet(final_toPost, f"{AL_directory_path}/{al_filename}.parquet")
if write_csv:
final_toPost.to_csv(
- f'{AL_directory_path}/{al_filename}.csv', index=False
+ f"{AL_directory_path}/{al_filename}.csv", index=False
)
# Write metadata
- meta_filepath = f'{AL_directory_path}/meta.json'
+ meta_filepath = f"{AL_directory_path}/meta.json"
with open(meta_filepath, "w") as f:
try:
json.dump(completed_dict, f) # dump dictionary to a json file
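
For reference, the per-class top-N selections concatenated into `topN_df` above are collapsed to one row per source with `fillna(0.0).groupby("obj_id").max()`. A toy sketch of that consolidation (the `*_xgb` column names are illustrative):

```python
import pandas as pd

# Two per-class selections that share a source ("b")
topN_a = pd.DataFrame({"obj_id": ["a", "b"], "rrlyr_xgb": [0.95, 0.80]})
topN_b = pd.DataFrame({"obj_id": ["b", "c"], "ea_xgb": [0.90, 0.85]})

combined = pd.concat([topN_a, topN_b]).reset_index(drop=True)

# fillna(0.0) fills the classes missing from each selection, and groupby().max()
# keeps the highest probability per class for every source
consolidated = combined.fillna(0.0).groupby("obj_id").max().reset_index()
print(consolidated)
```
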
@@ -1858,14 +2388,14 @@ def test_limited(self):
# create a mock dataset and check that the training pipeline works
dataset = f"{uuid.uuid4().hex}_orig.csv"
- path_mock = pathlib.Path(__file__).parent.absolute() / "data" / "training"
- group_mock = 'scope_test_limited'
+ path_mock = self.base_path / "data" / "training"
+ group_mock = "scope_test_limited"
try:
- with status('Test training'):
+ with status("Test training"):
print()
- period_suffix_config = self.config['features']['info']['period_suffix']
+ period_suffix_config = self.config["features"]["info"]["period_suffix"]
if not path_mock.exists():
path_mock.mkdir(parents=True, exist_ok=True)
@@ -1874,19 +2404,19 @@ def test_limited(self):
feature_names_orig = [
key
for key in all_feature_names
- if forgiving_true(all_feature_names[key]['include'])
+ if forgiving_true(all_feature_names[key]["include"])
]
feature_names = feature_names_orig.copy()
if not (
- (period_suffix_config is None) | (period_suffix_config == 'None')
+ (period_suffix_config is None) | (period_suffix_config == "None")
):
periodic_bool = [
- all_feature_names[x]['periodic'] for x in feature_names
+ all_feature_names[x]["periodic"] for x in feature_names
]
for j, name in enumerate(feature_names):
if periodic_bool[j]:
- feature_names[j] = f'{name}_{period_suffix_config}'
+ feature_names[j] = f"{name}_{period_suffix_config}"
class_names = [
self.config["training"]["classes"][class_name]["label"]
@@ -1912,16 +2442,16 @@ def test_limited(self):
df_mock_orig = pd.DataFrame.from_records(entries)
df_mock_orig.to_csv(path_mock / dataset, index=False)
- algorithms = ['xgb', 'dnn']
+ algorithms = ["xgb", "dnn"]
model_paths = []
# Train twice: once on Kowalski features, once on generated features with different periodic feature names
for algorithm in algorithms:
tag = "vnv"
- if algorithm == 'xgb':
- extension = 'json'
- elif algorithm == 'dnn':
- extension = 'h5'
+ if algorithm == "xgb":
+ extension = "json"
+ elif algorithm == "dnn":
+ extension = "h5"
time_tag = self.train(
tag=tag,
path_dataset=path_mock / dataset,
@@ -1935,7 +2465,7 @@ def test_limited(self):
group=group_mock,
)
path_model = (
- pathlib.Path(__file__).parent.absolute()
+ self.base_path
/ f"models_{algorithm}"
/ group_mock
/ tag
@@ -1943,7 +2473,7 @@ def test_limited(self):
)
model_paths += [path_model]
- print('model_paths', model_paths)
+ print("model_paths", model_paths)
finally:
# clean up after thyself
@@ -1953,6 +2483,16 @@ def test_limited(self):
for path in model_paths:
shutil.rmtree(path.parent.parent)
+ def parse_run_test(self):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--doGPU",
+ action="store_true",
+ help="if set, use GPU-accelerated period algorithm",
+ )
+ args, _ = parser.parse_known_args()
+ self.test(**vars(args))
+
def test(self, doGPU=False):
"""
Test different workflows
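
The new `parse_run_test` entry point uses `argparse.ArgumentParser.parse_known_args`, which silently ignores flags intended for other parsers (such as the `--config-path` flag handled in `scope/utils.py`) instead of erroring out. A standalone sketch of that behavior:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--doGPU", action="store_true", help="use GPU-accelerated period algorithms"
)

# parse_known_args returns (namespace, leftover_args); unknown flags are left
# for another parser rather than raising SystemExit
args, leftover = parser.parse_known_args(["--doGPU", "--config-path", "my_config.yaml"])
print(vars(args))   # {'doGPU': True}
print(leftover)     # ['--config-path', 'my_config.yaml']
```
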
@@ -1967,14 +2507,14 @@ def test(self, doGPU=False):
inference,
combine_preds,
)
- from scope.fritz import get_lightcurves_via_coords
+ from .fritz import get_lightcurves_via_coords
# Test feature generation
with status("Test generate_features"):
print()
test_field, test_ccd, test_quad = 297, 2, 2
- test_feature_directory = 'generated_features'
- test_feature_filename = 'testFeatures'
+ test_feature_directory = "generated_features"
+ test_feature_filename = "testFeatures"
n_sources = 3
_ = generate_features.generate_features(
@@ -1994,12 +2534,23 @@ def test(self, doGPU=False):
doScaleMinPeriod=True,
)
- path_gen_features = (
- pathlib.Path(__file__).parent.absolute()
- / test_feature_directory
- / f"field_{test_field}"
- / f"{test_feature_filename}_field_{test_field}_ccd_{test_ccd}_quad_{test_quad}.parquet"
+ path_to_features = self.config.get("feature_generation").get(
+ "path_to_features"
)
+ if path_to_features is None:
+ path_gen_features = (
+ self.base_path
+ / test_feature_directory
+ / f"field_{test_field}"
+ / f"{test_feature_filename}_field_{test_field}_ccd_{test_ccd}_quad_{test_quad}.parquet"
+ )
+ else:
+ path_gen_features = (
+ pathlib.Path(path_to_features)
+ / test_feature_directory
+ / f"field_{test_field}"
+ / f"{test_feature_filename}_field_{test_field}_ccd_{test_ccd}_quad_{test_quad}.parquet"
+ )
with status("Test get_lightcurves_via_coords"):
print()
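
The branch above resolves the generated-features directory from `feature_generation.path_to_features` when it is set, and otherwise falls back to `self.base_path`. The same lookup pattern, sketched with a hypothetical config dict:

```python
import pathlib

config = {"feature_generation": {"path_to_features": None}}  # hypothetical snippet

path_to_features = config.get("feature_generation", {}).get("path_to_features")
root = pathlib.Path.cwd() if path_to_features is None else pathlib.Path(path_to_features)

path_gen_features = (
    root / "generated_features" / "field_297" / "testFeatures_field_297_ccd_2_quad_2.parquet"
)
print(path_gen_features)
```
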
@@ -2010,12 +2561,12 @@ def test(self, doGPU=False):
with status("Test get_cone_ids"):
print()
_ = get_quad_ids.get_cone_ids(
- obj_id_list=['obj1', 'obj2', 'obj3'],
+ obj_id_list=["obj1", "obj2", "obj3"],
ra_list=[40.0, 41.0, 42.0],
dec_list=[50.0, 51.0, 52.0],
)
- src_catalog = self.config['kowalski']['collections']['sources']
+ src_catalog = self.config["kowalski"]["collections"]["sources"]
with status("Test get_ids_loop and get_field_ids"):
print()
_, lst = get_quad_ids.get_ids_loop(
@@ -2034,34 +2585,53 @@ def test(self, doGPU=False):
test_ftrs, outfile = get_features.get_features_loop(
get_features.get_features,
source_ids=lst[0],
- features_catalog=self.config['kowalski']['collections']['features'],
+ features_catalog=self.config["kowalski"]["collections"]["features"],
field=297,
limit_per_query=5,
max_sources=10,
save=False,
)
- testpath = pathlib.Path(outfile)
- testpath = testpath.parent.parent
+ if path_to_features is None:
+ testpath = pathlib.Path(outfile)
+ testpath = testpath.parent.parent
+ else:
+ testpath = pathlib.Path(path_to_features) / "features"
# Use 'field_0' as test directory to avoid removing any existing data locally
- testpath_features = testpath / 'field_0'
+ testpath_features = testpath / "field_0"
if not testpath_features.exists():
testpath_features.mkdir(parents=True, exist_ok=True)
- write_parquet(test_ftrs, str(testpath_features / 'field_0_iter_0.parquet'))
+ write_parquet(test_ftrs, str(testpath_features / "field_0_iter_0.parquet"))
# create a mock dataset and check that the training pipeline works
dataset_orig = f"{uuid.uuid4().hex}_orig.csv"
dataset = f"{uuid.uuid4().hex}.csv"
- path_mock = pathlib.Path(__file__).parent.absolute() / "data" / "training"
- group_mock = 'scope_test'
+ path_mock = self.base_path / "data" / "training"
+ group_mock = "scope_test"
try:
- with status('Test training'):
+ with status("Test training"):
print()
- period_suffix_config = self.config['features']['info']['period_suffix']
- period_suffix_2 = 'LS'
+ period_suffix_config = (
+ self.config.get("features").get("info").get("period_suffix")
+ )
+ if doGPU:
+ if period_suffix_config not in [
+ "ELS",
+ "ECE",
+ "EAOV",
+ "ELS_ECE_EAOV",
+ ]:
+ period_suffix_test = "ELS_ECE_EAOV"
+ else:
+ period_suffix_test = period_suffix_config
+ else:
+ if period_suffix_config not in ["LS", "CE", "AOV", "LS_CE_AOV"]:
+ period_suffix_test = "LS"
+ else:
+ period_suffix_test = period_suffix_config
if not path_mock.exists():
path_mock.mkdir(parents=True, exist_ok=True)
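
The conditional block above normalizes the configured period suffix so the GPU test uses the ELS/ECE/EAOV features and the CPU test uses LS/CE/AOV. The same logic, pulled into a small helper here purely for illustration (this function is not part of the codebase):

```python
def pick_period_suffix(period_suffix_config: str, doGPU: bool = False) -> str:
    """Return a period suffix compatible with the requested hardware path."""
    gpu_suffixes = ["ELS", "ECE", "EAOV", "ELS_ECE_EAOV"]
    cpu_suffixes = ["LS", "CE", "AOV", "LS_CE_AOV"]
    if doGPU:
        return period_suffix_config if period_suffix_config in gpu_suffixes else "ELS_ECE_EAOV"
    return period_suffix_config if period_suffix_config in cpu_suffixes else "LS"


print(pick_period_suffix("LS", doGPU=True))   # ELS_ECE_EAOV
print(pick_period_suffix("CE", doGPU=False))  # CE
```
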
@@ -2070,28 +2640,28 @@ def test(self, doGPU=False):
feature_names_orig = [
key
for key in all_feature_names
- if forgiving_true(all_feature_names[key]['include'])
+ if forgiving_true(all_feature_names[key]["include"])
]
feature_names_new = feature_names_orig.copy()
if not (
- (period_suffix_config is None) | (period_suffix_config == 'None')
+ (period_suffix_config is None) | (period_suffix_config == "None")
):
periodic_bool = [
- all_feature_names[x]['periodic'] for x in feature_names_new
+ all_feature_names[x]["periodic"] for x in feature_names_new
]
for j, name in enumerate(feature_names_new):
if periodic_bool[j]:
- feature_names_new[j] = f'{name}_{period_suffix_config}'
+ feature_names_new[j] = f"{name}_{period_suffix_config}"
feature_names = feature_names_orig.copy()
- if not ((period_suffix_2 is None) | (period_suffix_2 == 'None')):
+ if not ((period_suffix_test is None) | (period_suffix_test == "None")):
periodic_bool = [
- all_feature_names[x]['periodic'] for x in feature_names
+ all_feature_names[x]["periodic"] for x in feature_names
]
for j, name in enumerate(feature_names):
if periodic_bool[j]:
- feature_names[j] = f'{name}_{period_suffix_2}'
+ feature_names[j] = f"{name}_{period_suffix_test}"
class_names = [
self.config["training"]["classes"][class_name]["label"]
@@ -2136,16 +2706,16 @@ def test(self, doGPU=False):
df_mock = pd.DataFrame.from_records(entries)
df_mock.to_csv(path_mock / dataset, index=False)
- algorithms = ['xgb', 'dnn']
+ algorithms = ["xgb", "dnn"]
model_paths_orig = []
# Train twice: once on Kowalski features, once on generated features with different periodic feature names
for algorithm in algorithms:
tag = "vnv"
- if algorithm == 'xgb':
- extension = 'json'
- elif algorithm == 'dnn':
- extension = 'h5'
+ if algorithm == "xgb":
+ extension = "json"
+ elif algorithm == "dnn":
+ extension = "h5"
time_tag = self.train(
tag=tag,
path_dataset=path_mock / dataset_orig,
@@ -2159,7 +2729,7 @@ def test(self, doGPU=False):
group=group_mock,
)
path_model = (
- pathlib.Path(__file__).parent.absolute()
+ self.base_path
/ f"models_{algorithm}"
/ group_mock
/ tag
@@ -2170,10 +2740,10 @@ def test(self, doGPU=False):
model_paths = []
for algorithm in algorithms:
tag = "vnv"
- if algorithm == 'xgb':
- extension = 'json'
- elif algorithm == 'dnn':
- extension = 'h5'
+ if algorithm == "xgb":
+ extension = "json"
+ elif algorithm == "dnn":
+ extension = "h5"
time_tag = self.train(
tag=tag,
path_dataset=path_mock / dataset,
@@ -2184,11 +2754,11 @@ def test(self, doGPU=False):
test=True,
algorithm=algorithm,
skip_cv=True,
- period_suffix=period_suffix_2,
+ period_suffix=period_suffix_test,
group=group_mock,
)
path_model = (
- pathlib.Path(__file__).parent.absolute()
+ self.base_path
/ f"models_{algorithm}"
/ group_mock
/ tag
@@ -2196,8 +2766,8 @@ def test(self, doGPU=False):
)
model_paths += [path_model]
- print('model_paths_orig', model_paths_orig)
- print('model_paths', model_paths)
+ print("model_paths_orig", model_paths_orig)
+ print("model_paths", model_paths)
with status("Test inference (queried features)"):
print()
@@ -2230,7 +2800,7 @@ def test(self, doGPU=False):
trainingSet=df_mock,
feature_directory=test_feature_directory,
feature_file_prefix=test_feature_filename,
- period_suffix=period_suffix_2,
+ period_suffix=period_suffix_test,
no_write_metadata=True,
)
print()
@@ -2244,7 +2814,7 @@ def test(self, doGPU=False):
xgb_model=True,
feature_directory=test_feature_directory,
feature_file_prefix=test_feature_filename,
- period_suffix=period_suffix_2,
+ period_suffix=period_suffix_test,
no_write_metadata=True,
)
@@ -2270,7 +2840,7 @@ def test(self, doGPU=False):
[0],
probability_threshold=0.0,
doNotSave=True,
- algorithm='xgb',
+ algorithm="xgb",
)
_ = self.select_fritz_sample(
[0],
@@ -2279,7 +2849,7 @@ def test(self, doGPU=False):
min_class_examples=3,
probability_threshold=0.0,
doNotSave=True,
- algorithm='xgb',
+ algorithm="xgb",
)
finally:
@@ -2287,21 +2857,17 @@ def test(self, doGPU=False):
(path_mock / dataset_orig).unlink()
(path_mock / dataset).unlink()
os.remove(path_gen_features)
- (testpath_features / 'field_0_iter_0.parquet').unlink()
+ (testpath_features / "field_0_iter_0.parquet").unlink()
os.rmdir(testpath_features)
(preds_filename_dnn_orig).unlink()
(preds_filename_xgb_orig).unlink()
(preds_filename_dnn).unlink()
(preds_filename_xgb).unlink()
- (preds_filename_dnn_orig.parent / 'meta.json').unlink()
- (preds_filename_xgb_orig.parent / 'meta.json').unlink()
+ (preds_filename_dnn_orig.parent / "meta.json").unlink()
+ (preds_filename_xgb_orig.parent / "meta.json").unlink()
os.rmdir(preds_filename_dnn_orig.parent)
os.rmdir(preds_filename_xgb_orig.parent)
# Remove trained model artifacts, but keep models_xgb and models_dnn directories
for path in model_paths:
shutil.rmtree(path.parent.parent)
-
-
-if __name__ == "__main__":
- fire.Fire(Scope)
diff --git a/scope/utils.py b/scope/utils.py
index 4a4bd144..5d480ac5 100644
--- a/scope/utils.py
+++ b/scope/utils.py
@@ -39,11 +39,15 @@
import json as JSON
from sklearn.impute import KNNImputer
import seaborn as sns
+import argparse
+import os
+from deepdiff import DeepDiff
+from pprint import pprint
-BASE_DIR = pathlib.Path(__file__).parent.parent.absolute()
+BASE_DIR = pathlib.Path.cwd()
-def load_config(config_path: Union[str, pathlib.Path]):
+def load_config(config_path: Union[str, pathlib.Path] = "config.yaml"):
"""
Load config and secrets
"""
@@ -53,6 +57,66 @@ def load_config(config_path: Union[str, pathlib.Path]):
return config
+def parse_load_config():
+ """
+ Load config from user-specified --config-path argument
+ """
+ config_parser = argparse.ArgumentParser()
+ config_parser.add_argument(
+ "--config-path",
+ type=str,
+ help="path to config file",
+ )
+ config_parser.add_argument(
+ "--check-configs",
+ action="store_true",
+ help="if set, check config against default file in same directory",
+ )
+ config_parser.add_argument(
+ "--default-config-name",
+ type=str,
+ default="config.defaults.yaml",
+ help="name of default config file",
+ )
+
+ config_args, _ = config_parser.parse_known_args()
+ config_path = config_args.config_path
+
+ if config_path is None:
+ print(f"No --config-path specified. Loading '{BASE_DIR}/config.yaml'.")
+ config_path = str(BASE_DIR / "config.yaml")
+ else:
+ print(f"Loading config file from '{config_path}'.")
+
+ config = load_config(config_path)
+
+ if config_args.check_configs:
+ print("Checking configuration versus defaults...")
+ config_dirname = os.path.dirname(config_path)
+ default_config_path = os.path.join(
+ config_dirname, config_args.default_config_name
+ )
+
+ try:
+ default_config = load_config(default_config_path)
+ except Exception:
+ print(
+ f"Could not load {default_config_path}. To compare configs, place the latest version of config.defaults.yaml in the same directory as your customized config file ({config_path})."
+ )
+ # Re-raise: the comparison below cannot run without the defaults file
+ raise
+
+ deep_diff = DeepDiff(default_config, config, ignore_order=True)
+ difference = {
+ k: v for k, v in deep_diff.items() if k in ("dictionary_item_removed",)
+ }
+ if len(difference) > 0:
+ print("config structure differs from defaults")
+ pprint(difference)
+ raise KeyError("Fix config before proceeding")
+ print("Configuration check finished.")
+
+ return config
+
+
def time_stamp():
"""
@@ -90,7 +154,7 @@ def make_tdtax_taxonomy(taxonomy: Mapping):
def write_hdf(
- dataframe: pd.DataFrame, filepath: str, key: str = 'df', overwrite: bool = True
+ dataframe: pd.DataFrame, filepath: str, key: str = "df", overwrite: bool = True
):
"""
Write HDF5 file and attach metadata
@@ -100,14 +164,14 @@ def write_hdf(
:param key: key associated with DataFrame (str)
:param overwrite: if True, overwrite file, else append. (bool)
"""
- mode = 'w' if overwrite else 'a'
+ mode = "w" if overwrite else "a"
with pd.HDFStore(filepath, mode=mode) as store:
store.put(key, dataframe)
store.get_storer(key).attrs.metadata = dataframe.attrs
-def read_hdf(filepath: str, key: str = 'df'):
+def read_hdf(filepath: str, key: str = "df"):
"""
Read HDF5 file and metadata (if available). Currently supports accessing one key of the file at a time.
@@ -116,17 +180,17 @@ def read_hdf(filepath: str, key: str = 'df'):
:return: pandas.DataFrame
"""
- with pd.HDFStore(filepath, mode='r') as store:
+ with pd.HDFStore(filepath, mode="r") as store:
dataframe = store[key]
try:
dataframe.attrs = store.get_storer(key).attrs.metadata
except AttributeError:
- warnings.warn('Did not read metadata from HDF5 file.')
+ warnings.warn("Did not read metadata from HDF5 file.")
return dataframe
-def write_parquet(dataframe: pd.DataFrame, filepath: str, meta_key: str = 'scope'):
+def write_parquet(dataframe: pd.DataFrame, filepath: str, meta_key: str = "scope"):
"""
Write Apache Parquet file and attach Metadata
@@ -154,7 +218,7 @@ def write_parquet(dataframe: pd.DataFrame, filepath: str, meta_key: str = 'scope
pq.write_table(table, filepath)
-def read_parquet(filepath: str, meta_key: str = 'scope'):
+def read_parquet(filepath: str, meta_key: str = "scope"):
"""
Read Apache Parquet file and metadata (if available)
@@ -173,7 +237,7 @@ def read_parquet(filepath: str, meta_key: str = 'scope'):
restored_meta = JSON.loads(meta_json)
dataframe.attrs = restored_meta
except KeyError:
- warnings.warn('Did not read metadata from parquet file.')
+ warnings.warn("Did not read metadata from parquet file.")
return dataframe
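
`write_parquet` and `read_parquet` persist `DataFrame.attrs` as custom Parquet schema metadata under a dedicated key. A simplified pyarrow sketch of that round trip (error handling omitted; the `scope` key mirrors the default `meta_key`):

```python
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"ra": [40.0], "dec": [50.0]})
df.attrs = {"scope_code_version": "example"}

# Merge the attrs dict into the schema metadata alongside pandas' own entry
table = pa.Table.from_pandas(df)
merged_meta = {**(table.schema.metadata or {}), b"scope": json.dumps(df.attrs).encode()}
pq.write_table(table.replace_schema_metadata(merged_meta), "example.parquet")

# Round trip: restore attrs from the schema metadata
restored = pq.read_table("example.parquet")
df2 = restored.to_pandas()
df2.attrs = json.loads(restored.schema.metadata[b"scope"])
print(df2.attrs)
```
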
@@ -268,8 +332,8 @@ def plot_periods(
"""Plot a histogram of periods for the sample"""
# Determine the period column name, accounting for any period suffix
- period_colname = 'period'
- if not ((period_suffix is None) | (period_suffix == 'None')):
+ period_colname = "period"
+ if not ((period_suffix is None) | (period_suffix == "None")):
period_colname = f"{period_colname}_{period_suffix}"
plt.rc("text", usetex=True)
@@ -518,51 +582,51 @@ def impute_features(
features_df: pd.DataFrame,
n_neighbors: int = 5,
self_impute: bool = False,
- **kwargs,
+ period_suffix: str = None,
):
# Load config file
config = load_config(BASE_DIR / "config.yaml")
- period_suffix_config = config['features']['info']['period_suffix']
- period_suffix = kwargs.get('period_suffix', period_suffix_config)
+ if period_suffix is None:
+ period_suffix = config["features"]["info"]["period_suffix"]
if self_impute:
referenceSet = features_df.copy()
else:
# Load training set
- trainingSetPath = str(BASE_DIR / config['training']['dataset'])
- if trainingSetPath.endswith('.parquet'):
+ trainingSetPath = str(BASE_DIR / config["training"]["dataset"])
+ if trainingSetPath.endswith(".parquet"):
trainingSet = read_parquet(trainingSetPath)
- elif trainingSetPath.endswith('.h5'):
+ elif trainingSetPath.endswith(".h5"):
trainingSet = read_hdf(trainingSetPath)
- elif trainingSetPath.endswith('.csv'):
+ elif trainingSetPath.endswith(".csv"):
trainingSet = pd.read_csv(trainingSetPath)
else:
raise ValueError(
- 'Training set must have one of .parquet, .h5 or .csv file formats.'
+ "Training set must have one of .parquet, .h5 or .csv file formats."
)
referenceSet = trainingSet
- all_features = config['features']['ontological']
+ all_features = config["features"]["ontological"]
# Impute zero where specified
feature_list_impute_zero = [
x
for x in all_features
if (
- all_features[x]['include']
- and all_features[x]['impute_strategy'] in ['zero', 'Zero', 'ZERO']
+ all_features[x]["include"]
+ and all_features[x]["impute_strategy"] in ["zero", "Zero", "ZERO"]
)
]
- if not ((period_suffix is None) | (period_suffix == 'None')):
- periodic_bool = [all_features[x]['periodic'] for x in feature_list_impute_zero]
+ if not ((period_suffix is None) | (period_suffix == "None")):
+ periodic_bool = [all_features[x]["periodic"] for x in feature_list_impute_zero]
for j, name in enumerate(feature_list_impute_zero):
if periodic_bool[j]:
- feature_list_impute_zero[j] = f'{name}_{period_suffix}'
+ feature_list_impute_zero[j] = f"{name}_{period_suffix}"
- print('Imputing zero for the following features: ', feature_list_impute_zero)
+ print("Imputing zero for the following features: ", feature_list_impute_zero)
print()
for feat in feature_list_impute_zero:
features_df[feat] = features_df[feat].fillna(0.0)
@@ -572,20 +636,20 @@ def impute_features(
x
for x in all_features
if (
- all_features[x]['include']
- and all_features[x]['impute_strategy'] in ['median', 'Median', 'MEDIAN']
+ all_features[x]["include"]
+ and all_features[x]["impute_strategy"] in ["median", "Median", "MEDIAN"]
)
]
- if not ((period_suffix is None) | (period_suffix == 'None')):
+ if not ((period_suffix is None) | (period_suffix == "None")):
periodic_bool = [
- all_features[x]['periodic'] for x in feature_list_impute_median
+ all_features[x]["periodic"] for x in feature_list_impute_median
]
for j, name in enumerate(feature_list_impute_median):
if periodic_bool[j]:
- feature_list_impute_median[j] = f'{name}_{period_suffix}'
+ feature_list_impute_median[j] = f"{name}_{period_suffix}"
- print('Imputing median for the following features: ', feature_list_impute_median)
+ print("Imputing median for the following features: ", feature_list_impute_median)
print()
for feat in feature_list_impute_median:
features_df[feat] = features_df[feat].fillna(np.nanmedian(referenceSet[feat]))
@@ -595,18 +659,18 @@ def impute_features(
x
for x in all_features
if (
- all_features[x]['include']
- and all_features[x]['impute_strategy'] in ['mean', 'Mean', 'MEAN']
+ all_features[x]["include"]
+ and all_features[x]["impute_strategy"] in ["mean", "Mean", "MEAN"]
)
]
- if not ((period_suffix is None) | (period_suffix == 'None')):
- periodic_bool = [all_features[x]['periodic'] for x in feature_list_impute_mean]
+ if not ((period_suffix is None) | (period_suffix == "None")):
+ periodic_bool = [all_features[x]["periodic"] for x in feature_list_impute_mean]
for j, name in enumerate(feature_list_impute_mean):
if periodic_bool[j]:
- feature_list_impute_mean[j] = f'{name}_{period_suffix}'
+ feature_list_impute_mean[j] = f"{name}_{period_suffix}"
- print('Imputing mean for the following features: ', feature_list_impute_mean)
+ print("Imputing mean for the following features: ", feature_list_impute_mean)
print()
for feat in feature_list_impute_mean:
features_df[feat] = features_df[feat].fillna(np.nanmean(referenceSet[feat]))
@@ -616,23 +680,23 @@ def impute_features(
x
for x in all_features
if (
- all_features[x]['include']
- and all_features[x]['impute_strategy'] in ['regress', 'Regress', 'REGRESS']
+ all_features[x]["include"]
+ and all_features[x]["impute_strategy"] in ["regress", "Regress", "REGRESS"]
)
]
- if not ((period_suffix is None) | (period_suffix == 'None')):
- periodic_bool = [all_features[x]['periodic'] for x in feature_list_regression]
+ if not ((period_suffix is None) | (period_suffix == "None")):
+ periodic_bool = [all_features[x]["periodic"] for x in feature_list_regression]
for j, name in enumerate(feature_list_regression):
if periodic_bool[j]:
- feature_list_regression[j] = f'{name}_{period_suffix}'
+ feature_list_regression[j] = f"{name}_{period_suffix}"
- print('Imputing by regression on the following features: ', feature_list_regression)
+ print("Imputing by regression on the following features: ", feature_list_regression)
print()
# Fit KNNImputer to training set
imp = KNNImputer(n_neighbors=n_neighbors)
- imp.set_output(transform='pandas')
+ imp.set_output(transform="pandas")
fit_feats = imp.fit(referenceSet[feature_list_regression])
imputed_feats = fit_feats.transform(features_df[feature_list_regression])
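
The regression strategy above fits a `KNNImputer` on the reference (training) set and applies it to the new features; `set_output(transform="pandas")` keeps column names attached (available in scikit-learn >= 1.2). A compact sketch with toy data:

```python
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

reference = pd.DataFrame({"period": [0.5, 1.2, 2.0, 3.1], "amplitude": [0.1, 0.3, 0.2, 0.4]})
new_feats = pd.DataFrame({"period": [1.0, np.nan], "amplitude": [np.nan, 0.25]})

imp = KNNImputer(n_neighbors=2)
imp.set_output(transform="pandas")  # return DataFrames instead of bare numpy arrays

# Fit on the reference set, then fill the gaps in the new features
imputed = imp.fit(reference).transform(new_feats)
print(imputed)
```
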
@@ -646,16 +710,16 @@ def impute_features(
x
for x in all_features
if (
- all_features[x]['include']
- and all_features[x]['impute_strategy'] in ['none', 'None', 'NONE']
+ all_features[x]["include"]
+ and all_features[x]["impute_strategy"] in ["none", "None", "NONE"]
)
]
- if not ((period_suffix is None) | (period_suffix == 'None')):
- periodic_bool = [all_features[x]['periodic'] for x in feature_list_impute_none]
+ if not ((period_suffix is None) | (period_suffix == "None")):
+ periodic_bool = [all_features[x]["periodic"] for x in feature_list_impute_none]
for j, name in enumerate(feature_list_impute_none):
if periodic_bool[j]:
- feature_list_impute_none[j] = f'{name}_{period_suffix}'
+ feature_list_impute_none[j] = f"{name}_{period_suffix}"
orig_len = len(features_df)
features_df = features_df.dropna(subset=feature_list_impute_none).reset_index(
@@ -664,7 +728,7 @@ def impute_features(
new_len = len(features_df)
print()
print(
- f'Dropped {orig_len - new_len} rows containing missing features with no imputation strategy.'
+ f"Dropped {orig_len - new_len} rows containing missing features with no imputation strategy."
)
return features_df
@@ -693,12 +757,12 @@ def overlapping_histogram(a, bins):
sa = np.sort(a[i : i + block])
n += (
np.r_[
- sa.searchsorted(bins[:-1, 1], 'left'),
- sa.searchsorted(bins[-1, 1], 'right'),
+ sa.searchsorted(bins[:-1, 1], "left"),
+ sa.searchsorted(bins[-1, 1], "right"),
]
- np.r_[
- sa.searchsorted(bins[:-1, 0], 'left'),
- sa.searchsorted(bins[-1, 0], 'right'),
+ sa.searchsorted(bins[:-1, 0], "left"),
+ sa.searchsorted(bins[-1, 0], "right"),
]
)
return n, (bins[:, 0] + bins[:, 1]) / 2.0
@@ -784,7 +848,7 @@ def sort_lightcurve(t, m, e):
def make_confusion_matrix(
cf,
group_names=None,
- categories='auto',
+ categories="auto",
count=True,
percent=True,
cbar=True,
@@ -792,11 +856,11 @@ def make_confusion_matrix(
xyplotlabels=True,
sum_stats=True,
figsize=None,
- cmap='Blues',
+ cmap="Blues",
title=None,
annotate_scores=False,
):
- '''
+ """
CONFUSION MATRIX CODE ADAPTED FROM https://github.com/DTrimarchi10/confusion_matrix (Dennis Trimarchi)
This function makes a pretty plot of an sklearn confusion matrix `cf` using a Seaborn heatmap visualization.
@@ -829,10 +893,10 @@ def make_confusion_matrix(
title: Title for the heatmap. Default is None.
- '''
+ """
# CODE TO GENERATE TEXT INSIDE EACH SQUARE
- blanks = ['' for i in range(cf.size)]
+ blanks = ["" for i in range(cf.size)]
if group_names and len(group_names) == cf.size:
group_labels = ["{}\n".format(value) for value in group_names]
@@ -877,7 +941,7 @@ def make_confusion_matrix(
# SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
if figsize is None:
# Get default figure size if not set
- figsize = plt.rcParams.get('figure.figsize')
+ figsize = plt.rcParams.get("figure.figsize")
if xyticks is False:
# Do not show categories if xyticks is False
@@ -896,8 +960,8 @@ def make_confusion_matrix(
)
if xyplotlabels:
- plt.ylabel('True label')
- plt.xlabel('Predicted label' + stats_text)
+ plt.ylabel("True label")
+ plt.xlabel("Predicted label" + stats_text)
else:
plt.xlabel(stats_text)
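
For reference, `make_confusion_matrix` takes a raw scikit-learn confusion matrix plus display options; a hedged usage sketch (class names are illustrative, and only keyword arguments visible in the definition above are used):

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from scope.utils import make_confusion_matrix  # defined in this module

y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred = np.array([0, 1, 1, 1, 0, 0])
cf = confusion_matrix(y_true, y_pred)

make_confusion_matrix(
    cf,
    categories=["non-variable", "variable"],  # illustrative labels
    count=True,
    percent=True,
    cmap="Blues",
    title="vnv classifier",
)
plt.show()
```
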
@@ -909,22 +973,22 @@ def make_confusion_matrix(
def plot_roc(fpr, tpr, roc_auc):
plt.plot(fpr, tpr)
- plt.plot([0, 1], [0, 1], 'k--')
+ plt.plot([0, 1], [0, 1], "k--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
- plt.xlabel('False Positive Rate')
- plt.ylabel('True Positive Rate')
- plt.title('ROC curve (area = %0.6f)' % roc_auc)
+ plt.xlabel("False Positive Rate")
+ plt.ylabel("True Positive Rate")
+ plt.title("ROC curve (area = %0.6f)" % roc_auc)
def plot_pr(recall, precision):
- plt.step(recall, precision, color='b', alpha=0.2, where='post')
- plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
- plt.xlabel('Recall')
- plt.ylabel('Precision')
+ plt.step(recall, precision, color="b", alpha=0.2, where="post")
+ plt.fill_between(recall, precision, step="post", alpha=0.2, color="b")
+ plt.xlabel("Recall")
+ plt.ylabel("Precision")
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
- plt.title('Precision-Recall')
+ plt.title("Precision-Recall")
""" Datasets """
@@ -937,15 +1001,17 @@ def __init__(
path_dataset: Union[str, pathlib.Path],
features: tuple,
verbose: bool = False,
- algorithm: str = 'dnn',
- **kwargs,
+ algorithm: str = "dnn",
+ period_suffix: str = None,
):
"""Load parquet, hdf5 or csv file with the dataset containing both data and labels
- :param tag:
- :param path_dataset:
- :param features:
- :param verbose:
+ :param tag: classifier designation, refers to "class" in config.taxonomy (str)
+ :param path_dataset: local path to .parquet, .h5 or .csv file with the dataset (str)
+ :param features: tuple of input feature names (tuple)
+ :param verbose: if set, print additional outputs (bool)
+ :param algorithm: name of ML algorithm to use (str)
+ :param period_suffix: suffix of period/Fourier features to use for training (str)
"""
self.tag = tag
self.path_dataset = str(path_dataset)
@@ -955,37 +1021,37 @@ def __init__(
# Load config file
self.config = load_config(BASE_DIR / "config.yaml")
- self.period_suffix_config = self.config['features']['info']['period_suffix']
+ self.period_suffix_config = self.config["features"]["info"]["period_suffix"]
- period_suffix = kwargs.get('period_suffix', self.period_suffix_config)
+ if period_suffix is None:
+ period_suffix = self.period_suffix_config
- if algorithm in ['DNN', 'NN', 'dnn', 'nn']:
- self.algorithm = 'dnn'
- elif algorithm in ['XGB', 'xgb', 'XGBoost', 'xgboost', 'XGBOOST']:
- self.algorithm = 'xgb'
+ if algorithm in ["DNN", "NN", "dnn", "nn"]:
+ self.algorithm = "dnn"
+ elif algorithm in ["XGB", "xgb", "XGBoost", "xgboost", "XGBOOST"]:
+ self.algorithm = "xgb"
else:
- raise ValueError('Current supported algorithms are DNN and XGB.')
+ raise ValueError("Current supported algorithms are DNN and XGB.")
if self.verbose:
log(f"Loading {self.path_dataset}...")
- nrows = kwargs.get("nrows", None)
csv = False
- if self.path_dataset.endswith('.csv'):
+ if self.path_dataset.endswith(".csv"):
csv = True
- self.df_ds = pd.read_csv(self.path_dataset, nrows=nrows)
- elif self.path_dataset.endswith('.h5'):
+ self.df_ds = pd.read_csv(self.path_dataset)
+ elif self.path_dataset.endswith(".h5"):
self.df_ds = read_hdf(self.path_dataset)
- for key in ['coordinates', 'dmdt']:
+ for key in ["coordinates", "dmdt"]:
df_temp = read_hdf(self.path_dataset, key=key)
self.df_ds[key] = df_temp
del df_temp
- self.dmdt = self.df_ds['dmdt']
- elif self.path_dataset.endswith('.parquet'):
+ self.dmdt = self.df_ds["dmdt"]
+ elif self.path_dataset.endswith(".parquet"):
self.df_ds = read_parquet(self.path_dataset)
- self.dmdt = self.df_ds['dmdt']
+ self.dmdt = self.df_ds["dmdt"]
else:
- raise ValueError('Dataset must have .parquet, .h5 or .csv extension.')
+ raise ValueError("Dataset must have .parquet, .h5 or .csv extension.")
if self.verbose:
log(self.df_ds[list(features)].describe())
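
With `period_suffix` now an explicit keyword argument, the dataset loader can be constructed without `**kwargs`. A hedged instantiation sketch (the class name `Dataset` and all paths/feature names are assumptions for illustration; the loader also expects a `config.yaml` in the working directory):

```python
from scope.utils import Dataset  # class name assumed from the "Datasets" section above

ds = Dataset(
    tag="vnv",                                   # classifier designation
    path_dataset="data/training/train.parquet",  # placeholder path
    features=("period", "amplitude"),            # placeholder feature names
    verbose=True,
    algorithm="xgb",
    period_suffix="LS",
)
```
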
@@ -997,7 +1063,7 @@ def __init__(
)
dmdt = []
- if (self.verbose) & (self.algorithm == 'dnn'):
+ if (self.verbose) & (self.algorithm == "dnn"):
print("Moving dmdt's to a dedicated numpy array...")
iterator = tqdm(self.df_ds.itertuples(), total=len(self.df_ds))
else:
@@ -1039,28 +1105,28 @@ def make(
batch_size: int = 256,
shuffle_buffer_size: int = 256,
epochs: int = 300,
- **kwargs,
+ float_convert_types: list = [64, 32],
):
"""Make datasets for target_label
- :param target_label: corresponds to training.classes.