diff --git a/cylc-src/bioreactor-workflow/bin/get-instrument b/cylc-src/bioreactor-workflow/bin/get-instrument index f286f32..b6a1a44 100755 --- a/cylc-src/bioreactor-workflow/bin/get-instrument +++ b/cylc-src/bioreactor-workflow/bin/get-instrument @@ -1,6 +1,7 @@ #!/usr/bin/env python -import os, sys +import os +import sys from pathlib import Path from pyopenms import MzMLFile, MSExperiment diff --git a/cylc-src/bioreactor-workflow/flow.cylc b/cylc-src/bioreactor-workflow/flow.cylc index 7c7ebe0..7a48ee3 100644 --- a/cylc-src/bioreactor-workflow/flow.cylc +++ b/cylc-src/bioreactor-workflow/flow.cylc @@ -8,7 +8,7 @@ URL = https://github.com/MetaboHUB-MetaToul-FluxoMet/RTMet # Create task families for conda environments. - %include 'envs/conda.cylc' +%include 'envs/conda.cylc' [scheduling] cycling mode = integer @@ -35,10 +35,6 @@ annotate => upload_features quantify => upload_concentrations """ - # +P3/P1 = """ - # compute_fluxes => upload_fluxes - # compute_fluxes => upload_metadata - # """ {% endif %} [[queues]] [[[default]]] @@ -151,12 +147,13 @@ [[[meta]]] title = Trim Spectra description = """ - Remove the first and last `n_start` and `n_end` spectra from the mzML file. + Remove the first `n_start` and last `n_end` scans from the mzML file. This is useful + if the shape of the flowgram is not stable at the beginning or end of the run. """ categories = bioinformatics [[get_timestamp]] - # Regex should be replaced with XML parsing. + # Regex could be replaced by pyOpenMS. script = """ RUN_TIMESTAMP=$(grep ' 1: + sys.stderr.write(main.__doc__) + elif not MZML: + sys.stderr.write("$mzml environment variable not set.\n") + sys.exit() + elif not Path(MZML).exists(): + sys.stderr.write(f"mzML file not found: {MZML}\n") + sys.exit() + + main() + +Make the script executable: + +.. 
code-block:: console + + $ chmod +x get-scans-number + +Creating a new task in the [runtime] section +================================================ + +Open :file:`cylc-src/bioreactor-workflow/flow.cylc` and add the following task definition at the end: + +.. code-block:: cylc + :caption: :file:`flow.cylc` + :emphasize-lines: 3- + + [runtime] + # ... + [[get_scans_number]] + # The task will run in the wf-openms conda environment + # Adding None makes the task appear at the root in the TUI/GUI + inherit = None, CONDA_OPENMS + script = """ + echo "The script lauched by this task will extract the number of scans from the mzML file." + + get-scans-number > ${output_file} + + echo "The number of scans has been saved to ${output_file}" + echo "Number of scans: $(cat ${output_file})" + """ + [[[environment]]] + # The python script will use the $mzml environment + # variable to get the path of the file. + mzml = ${MAIN_RESULTS_DIR}/${RAWFILE_STEM}.mzML + output_file = ${MAIN_RESULTS_DIR}/scans_number.txt + +This task will run the :file:`get-scans-number` script and save the output to a file named +:file:`scans_number.txt` in the main results directory. This directory +(:file:`share/cycle/n/dataflow/`) is specific to each cyclepoint ``n``. + +Adding the task to the graph +============================ + +Add a new graph string to the :strong:`+P1/P1` recurrence, inside the :strong:`[graph]` section +of the workflow definition: + +.. code-block:: cylc + :caption: :file:`flow.cylc` + :emphasize-lines: 8 + + [[graph]] + R1/^ = validate_cfg => validate_compounds_db & validate_met_model => is_setup + R1/+P1 = convert_raw => get_instrument => extract_features + +P1/P1 = """ + is_setup[^] => _catch_raw + @catch_raw => _catch_raw => convert_raw => get_timestamp & + trim_spectra => extract_features => annotate => quantify + convert_raw => get_scans_number + """ + +The task will be executed for each cyclepoint (/P1) starting from the second one (+P1). 
It will run after the +:strong:`convert_raw` task as it depends on the mzML file generated by it. No other task depends on +the one we just added. + +You can check that the task has been added correctly by running: + +.. code-block:: console + + $ cylc graph bioreactor-workflow 0 1 + +.. figure:: /_static/graphs/added-task-graph.png + :alt: Graph with the new task added + :scale: 50% + :align: center + +Testing the new task +==================== + +Install and start a new run of the workflow, and add a mzML file to the :file:`raws/` directory. The task should +start immediately after the :strong:`convert_raw` task and generate a :file:`scans_number.txt` file +in the :file:`cylc-run/your_run_name/share/cycle/1/dataflow/` directory. + +.. code-block:: output + :caption: :file:`job.out` in logs + + Workflow : bioreactor-workflow/task-added + Job : 1/get_scans_number/01 (try 1) + User@Host: elliotfontaine@MBP-Elliot.local + + 2024-07-22T14:18:50+02:00 INFO - started + The script lauched by this task will extract the number of scans from the mzML file. + The number of scans has been saved to /Users/elliotfontaine/cylc-run/bioreactor-workflow/task-added/share/cycle/1/dataflow/scans_number.txt + Number of scans: 35 + 2024-07-22T14:18:52+02:00 INFO - succeeded + + diff --git a/docs/source/development/coding_style.rst b/docs/source/development/coding_style.rst new file mode 100644 index 0000000..324197b --- /dev/null +++ b/docs/source/development/coding_style.rst @@ -0,0 +1,77 @@ +.. _development.coding-style: + +============ +Coding style +============ + +:file:`bin/` scripts: environment variables or command line arguments? +====================================================================== + +When writing scripts (Python, R, Bash) for the workflow, you have the choice between loading +environment variables from inside the script, or parsing command line arguments. 
+ +As a rule of thumb, use environment variables when you don't expect the script to be reused outside +the workflow, and command line arguments with strong input validation when you want to make the script +more portable. + +Cylc +==== + +In general, follow Cylc :doc:`cylc:workflow-design-guide/style-guide`. When creating tasks, +set the :strong:`[meta]` title and description fields to describe what the task does. You can also +add custom fields like :strong:`categories` if you want. + +Use uppercase for: + * family tasks (notably the conda ones, e.g. :strong:`CONDA_OPENMS`), + * global environment variables set in :strong:`[runtime][root]` and broadcasted ones (e.g. + :strong:`RAWFILE_STEM`). + +Use lowercase for: + * local environment variables set in :strong:`[environment]` blocks inside tasks. + * task names. + +Add :strong:`None` before the name of inherited family tasks to make the task in question appear at +the root when using the TUI or GUI. Otherwise, the task will be nested under the family task. The +exceptions are InfluxDB tasks, which are always nested under the :strong:`INFLUXDB` family task. + + +When using global environment variables or Jinja2 template variables to build CLI arguments, +do it in the :strong:`[environment]` block of the task, not in the script itself: + +.. code-block:: cylc + :caption: :file:`flow.cylc` + :emphasize-lines: 4, 7-9 + + [[trim_spectra]] + inherit = None, CONDA_OPENMS + script = """ + trimms ${mzml} ${n_start} ${n_end} + """ + [[[environment]]] + mzml = ${MAIN_RESULTS_DIR}/${RAWFILE_STEM}.mzML + n_start = {{ cfg__trim_values[0] }} + n_end = {{ cfg__trim_values[1] }} + [[[meta]]] + title = Trim Spectra + description = """ + Remove the first `n_start` and last `n_end` scans from the mzML file. This is useful + if the shape of the flowgram is not stable at the beginning or end of the run. + """ + categories = bioinformatics + +Python +====== + +Python code should follow the `PEP 8`_ style guide. 
The `Black`_ code formatter should be used to +automatically format the code. + +You should also use a linter / static code analyser like `Pylint`_ to catch potential bugs, commented +out code, code smells, etc. + +Bash +==== +[TODO] + +R += +[TODO] diff --git a/docs/source/development/index.rst b/docs/source/development/index.rst new file mode 100644 index 0000000..1fdec00 --- /dev/null +++ b/docs/source/development/index.rst @@ -0,0 +1,28 @@ +.. _development: + +=========== +Development +=========== + +This section discusses some of the choices made during the development of the project (coding styles for +different languages, patterns used in Cylc, etc.). + +You'll also find some guidelines on how to add a new task or configuration option to the workflow. + +.. note:: + It is assumed that you have a basic understanding of: + * Cylc, + * Python, R and Bash. + + For further information on Cylc, please consult their :ref:`cylc:user guide`. + +.. toctree:: + :maxdepth: 2 + + workflow_design + coding_style + add_task + add_config_option + + + \ No newline at end of file diff --git a/docs/source/development/workflow_design.rst b/docs/source/development/workflow_design.rst new file mode 100644 index 0000000..9981ad7 --- /dev/null +++ b/docs/source/development/workflow_design.rst @@ -0,0 +1,240 @@ +.. _development.workflow-design: + +======================= +Workflow design choices +======================= + +The choices described here are the ones currently implemented in the RTMet workflow. They are subject +to change, and could be brought up for discussion. + +Following Cylc's best practices +=============================== + +Our workflow generally follows Cylc's :ref:`cylc:workflow design guide`. + +Some notable exceptions are: + * :ref:`cylc:self-contained workflows`: RTMet relies on a user-wide (or system-wide) conda + installation to handle most of its dependencies. This means they are vulnerable to external + changes. 
+ * :ref:`cylc:workflow housekeeping`: Not implemented yet. + * :doc:`Automating Failure Recovery `: Not + implemented yet. + +Jinja2 templating +================= + +Jinja2 templating is used extensively in the workflow definition file, :file:`flow.cylc`. It allows +text to be generated dynamically, based on the values of variables passed to the template. + +Since the workflow source code contained in the :file:`flow.cylc` is basically text, Jinja2 templating +is a way for Cylc's devs to add logic without having to write a full-fledged programming language. + +User configuration options are passed down from the :rose:file:`rose-suite.conf` file to the workflow as +Jinja2 variables. Some of these variables are used for branching logic: + + +.. code-block:: jinja + :caption: Switching between input strategies + :emphasize-lines: 5-9 + + [scheduling] + cycling mode = integer + initial cycle point = 0 + [[xtriggers]] + {% if cfg__input_strategy == 'internal' %} + catch_raw = catch_raw_internal('%(point)s', '%(workflow_run_dir)s') + {% elif cfg__input_strategy == 'local' %} + catch_raw = catch_raw_local('%(point)s', '%(workflow_run_dir)s', {{ cfg__local_runs_dir }}) + {% endif %} + +Whole parts of the workflow can be enabled or disabled based on the value of a variable: + +.. code-block:: jinja + :caption: Enabling InfluxDB support + :emphasize-lines: 6-12 + + [scheduling] + [[graph]] + ... + R1/+P3 = quantify => compute_fluxes + +P4/P1 = quantify & compute_fluxes[-P1] => compute_fluxes + {% if cfg__toggle_influxdb %} + R1/^ = validate_cfg => create_bucket => is_setup + +P1/P1 = """ + annotate => upload_features + quantify => upload_concentrations + """ + {% endif %} + +Other Jinja2 variables are used to define environment variables for tasks: + +.. 
code-block:: jinja + :caption: Allowing the user to set the number of scans to trim + :emphasize-lines: 9-10 + + [runtime] + [[trim_spectra]] + inherit = None, CONDA_OPENMS + script = """ + trimms ${mzml} ${n_start} ${n_end} + """ + [[[environment]]] + mzml = ${MAIN_RESULTS_DIR}/${RAWFILE_STEM}.mzML + n_start = {{ cfg__trim_values[0] }} + n_end = {{ cfg__trim_values[1] }} + [[[meta]]] + title = Trim Spectra + description = """ + Remove the first `n_start` and last `n_end` scans from the mzML file. This is useful + if the shape of the flowgram is not stable at the beginning or end of the run. + """ + categories = bioinformatics + +.. seealso:: + :ref:`cylc:user guide jinja2` in Cylc's documentation. + +Rose for configuration management +================================= + +Rose is used for its :ref:`rose:rose suites` capabilities. It interfaces with our workflow using the +:ref:`cylc:cylc rose` plugin. Just think of it as workflow configuration being outsourced to another package, since Cylc doesn't +have it built-in (yet?) + +User configuration options are stored in the :rose:file:`rose-suite.conf` file at the root of the +workflow directory. They are in the :strong:`[template variables]` section, which means they are passed +down to the workflow as Jinja2 variables. + +The chosen naming convention for configuration items is *cfg__*. This is both to avoid +conflicts with other environment variables and to make it clear that these are configuration items. + +.. seealso:: + * :ref:`tutorial.user-config` + * :ref:`reference.user-config` + +Task inheritance to avoid code duplication +========================================== + +Workflow tasks can inherit from other tasks, which means script blocks (:strong:`[script]`, +:strong:`[pre-script]` and :strong:`[post-script]`) but also :strong:`[environment]` variables are taken +from the parent task. 
Our workflow uses this feature for: + +* Conda environment activation (see :ref:`below <development.conda-envs>`) +* Sharing InfluxDB configuration (URL, token, organization, etc.) +* Format some of the intermediary tables in a :strong:`[post-script]` block (adding *datetime*, + *cycle* and *instrument_id* columns). + + +.. seealso:: + :ref:`cylc:sharing by inheritance` in Cylc's documentation. + +Run setup is done at the first cyclepoint +========================================= + +This includes user configuration validation, input data validation, and other tasks that need to be +done before the main workflow starts: + +* :strong:`[validate_cfg]` +* :strong:`[validate_compounds_db]` +* :strong:`[validate_met_model]` (to be implemented) +* :strong:`[[INFLUXDB][create_bucket]]` + +Cyclepoint 0 is reserved for setup tasks. Processing of .raw files starts at cyclepoint 1. + +.. _development.conda-envs: + +Tasks can run in specific conda environments +============================================ + +Conda environments activation is handled by a `pre-script`_. :file:`envs/conda.cylc` defines +family tasks, one for each conda environment: + +.. code-block:: cylc + :lineno-start: 10 + :caption: ``flow.cylc`` + + # Create task families for conda environments. + %include 'envs/conda.cylc' + +.. code-block:: jinja + :caption: ``conda.cylc`` + + {% set conda_envs = { + 'CONDA_TRFP': 'wf-trfp', + 'CONDA_BINNER': 'wf-binner', + 'CONDA_DATAMUNGING': 'wf-datamunging', + 'CONDA_INFLUX': 'wf-influx', + 'CONDA_OPENMS': 'wf-pyopenms', + } %} + + [runtime] + {% for env, conda_env_name in conda_envs.items() %} + [[{{env}}]] + pre-script = """ + set +eu + conda activate {{ conda_env_name }} + set -eu + """ + {% endfor %} + +Individual tasks in the workflow can then inherit from these families to run in the desired conda +environment: + +.. 
code-block:: cylc + :caption: ``flow.cylc`` + :emphasize-lines: 3 + + [runtime] + [[trim_spectra]] + inherit = None, CONDA_OPENMS + script = """ + trimms ${mzml} ${n_start} ${n_end} + """ + [[[environment]]] + mzml = ${MAIN_RESULTS_DIR}/${RAWFILE_STEM}.mzML + n_start = {{ cfg__trim_values[0] }} + n_end = {{ cfg__trim_values[1] }} + +.. warning:: + If you override the `pre-script`_ in a task while inheriting from a conda family task, you will + lose the conda environment activation. + +.. _pre-script: https://cylc.github.io/cylc-doc/8.3.0/html/reference/config/workflow.html#flow.cylc[runtime][%3Cnamespace%3E]pre-script + +:file:`dataflow/` and :file:`qc/` directories for results +========================================================= + +Our workflow follows the convention described in :ref:`cylc:shared task io paths`. In addition, +the :file:`share/cycle/{{n}}` directories are further divided into :file:`dataflow/` and :file:`qc/`. + +* :file:`dataflow/` contains the results of the main workflow tasks. It is used to pass data between + tasks. +* :file:`qc/` contains quality control results to be analyzed by the user: plots, statistics, etc. + +Data tables are stored in plain text CSV files +======================================================= + +Intermediary results in :file:`dataflow/` are stored in a delimiter-separated format, using semicolons +as separators. It allows for easy inspection and debugging, as well as compatibility with most +spreadsheet software. + +Furthermore, they can easily be edited using :command:`awk`/:command:`sed`/:command:`grep` +or :command:`csvkit` without the need to load them as dataframes in Python or R. + +Libraries/packages to be favored +================================ + +* Data wrangling: :bdg-link-success:`csvtk ` (CLI), + :bdg-link-success:`pandas ` (Python) and + :bdg-link-success:`tidyverse ` (R). 
+* Data validation: :bdg-link-success:`frictionless ` +* Editing/Querying mzML files: :bdg-link-success:`pyopenms ` + +InfluxDB is an optional dependency +================================== + +InfluxDB is used for real-time visualization of the results. It is not a strict requirement for the +workflow to run. It can be enabled by setting +:rose:conf:`rose-suite.conf[template variables]cfg__toggle_influxdb` to :strong:`True`. + +Data is uploaded to InfluxDB using its Python API. :file:`influx_utils.py` contains functions to +convert our CSV files into the correct upload format. \ No newline at end of file diff --git a/docs/source/getting_started/tutorial.rst b/docs/source/getting_started/basic_tutorial.rst similarity index 100% rename from docs/source/getting_started/tutorial.rst rename to docs/source/getting_started/basic_tutorial.rst diff --git a/docs/source/getting_started/index.rst b/docs/source/getting_started/index.rst index 2fe4380..302fb36 100644 --- a/docs/source/getting_started/index.rst +++ b/docs/source/getting_started/index.rst @@ -8,5 +8,5 @@ Getting Started :maxdepth: 2 installation - tutorial + basic_tutorial user_config \ No newline at end of file diff --git a/docs/source/getting_started/user_config.rst b/docs/source/getting_started/user_config.rst index 280ff75..9c5cd15 100644 --- a/docs/source/getting_started/user_config.rst +++ b/docs/source/getting_started/user_config.rst @@ -26,7 +26,7 @@ Inside, you'll find :file:`rose-suite.conf`, which is a global configuration fil # ... Actually, you may find a tolerance of 10 ppm to be a bit too high. Open the file in a text editor, -and reduce the value to 1 ppm: +and reduce the value of :rose:conf:`rose-suite.conf[template variables]cfg__ppm_tol` to 1. .. 
code-block:: diff diff --git a/docs/source/index.rst b/docs/source/index.rst index 6872053..161cd8a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -57,6 +57,7 @@ If you want to use RTMet for real-time monitoring, there are a few additional re getting_started/index user_guide/index reference/index + development/index glossary contributing license diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index f3691c4..7a4e34c 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -7,6 +7,6 @@ Reference .. toctree:: :maxdepth: 2 - config_options + user_config results data_processing \ No newline at end of file diff --git a/docs/source/reference/config_options.rst b/docs/source/reference/user_config.rst similarity index 97% rename from docs/source/reference/config_options.rst rename to docs/source/reference/user_config.rst index e6753a8..3fdeaff 100644 --- a/docs/source/reference/config_options.rst +++ b/docs/source/reference/user_config.rst @@ -1,3 +1,5 @@ +.. _reference.user-config: + =========================== Workflow User Configuration =========================== diff --git a/docs/source/substitutions.rst.include b/docs/source/substitutions.rst.include index 45a2f39..742a675 100644 --- a/docs/source/substitutions.rst.include +++ b/docs/source/substitutions.rst.include @@ -2,10 +2,11 @@ .. Hyperlinks -.. _Cylc User Guide: https://cylc.github.io/cylc-doc/latest/html/index.html -.. _Cylc Workflow Design Guide: https://cylc.github.io/cylc-doc/latest/html/workflow-design-guide/index.html .. _Cylc: https://cylc.github.io/ .. _fia: https://en.wikipedia.org/wiki/Flow_injection_analysis +.. _PEP 8: https://peps.python.org/pep-0008/ +.. _Black: https://black.readthedocs.io/en/stable/ +.. _Pylint: https://pylint.pycqa.org/en/latest/ .. Substitutions