diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index 7fe70d7..0000000
--- a/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-*.json filter=lfs diff=lfs merge=lfs -text
diff --git a/bkbit/cli.py b/bkbit/cli.py
index 3c8eff2..26b4020 100644
--- a/bkbit/cli.py
+++ b/bkbit/cli.py
@@ -5,6 +5,7 @@
 from bkbit.data_translators.file_manifest_translator import filemanifest2jsonld
 from bkbit.data_translators.genome_annotation_translator import gff2jsonld
 from bkbit.utils.get_ncbi_taxonomy import download_ncbi_taxonomy
+from bkbit.model_editors.linkml_trimmer import linkml_trimmer
 
 @click.group()
 def cli():
@@ -18,6 +19,7 @@ def cli():
 cli.add_command(filemanifest2jsonld)
 cli.add_command(gff2jsonld)
 cli.add_command(download_ncbi_taxonomy)
+cli.add_command(linkml_trimmer)
 
 if __name__ == '__main__':
     cli()
diff --git a/bkbit/model_editors/linkml_trimmer.py b/bkbit/model_editors/linkml_trimmer.py
index 8835326..c2ef7dc 100644
--- a/bkbit/model_editors/linkml_trimmer.py
+++ b/bkbit/model_editors/linkml_trimmer.py
@@ -1,15 +1,65 @@
+"""
+This script provides a utility for trimming a LinkML schema by retaining specified classes, slots, and enums, along with their dependencies.
+
+It defines a `YamlTrimmer` class for schema manipulation and offers a command-line interface using Click for easy usage from the terminal.
+
+Usage:
+    python script.py [OPTIONS] SCHEMA
+
+Options:
+    --classes, -c TEXT   Comma-separated list of classes to include in the trimmed schema (required).
+    --slots, -s TEXT     Comma-separated list of slots to include in the trimmed schema.
+    --enums, -e TEXT     Comma-separated list of enums to include in the trimmed schema.
+
+Example:
+    python script.py schema.yaml -c Person,Organization -s name,age -e StatusEnum
+
+The script performs the following steps:
+1. Loads the specified LinkML schema.
+2. Trims the schema by keeping only the specified classes, slots, and enums, along with their dependencies.
+3. Serializes and prints the trimmed schema in YAML format.
+
+Dependencies:
+    - click
+    - linkml-runtime
+    - linkml
+
+"""
+
 from dataclasses import dataclass
 from typing import Union
 from pathlib import Path
 from linkml_runtime.linkml_model.meta import SchemaDefinition
 from linkml_runtime.utils.schemaview import SchemaView
-
 from linkml._version import __version__
 from linkml.generators.yamlgen import YAMLGenerator
-
+import click
 
 @dataclass
 class YamlTrimmer:
+    """
+    A utility class for trimming a LinkML schema by retaining specified classes, slots, and enums, along with their dependencies.
+
+    This class helps in generating a simplified version of a LinkML schema by removing all elements that are not reachable from the specified classes, slots, and enums to keep.
+
+    Args:
+        schema (Union[str, Path, SchemaDefinition]): The LinkML schema to be trimmed. It can be a file path, URL, or a `SchemaDefinition` object.
+
+    Attributes:
+        schemaview (SchemaView): An object representing the loaded schema, used for manipulation and traversal.
+
+    Methods:
+        trim_model(keep_classes: list[str], keep_slots: list[str] = [], keep_enums: list[str] = []):
+            Trims the schema by keeping only the specified classes, slots, and enums, and their dependencies.
+
+        serialize():
+            Serializes and prints the trimmed schema in YAML format.
+
+    Example:
+        >>> yt = YamlTrimmer('path/to/schema.yaml')
+        >>> yt.trim_model(['Person', 'Organization'], keep_slots=['name'], keep_enums=['StatusEnum'])
+        >>> yt.serialize()
+    """
 
     def __init__(self, schema: Union[str, Path, SchemaDefinition]):
         self.schemaview = SchemaView(schema)
@@ -113,5 +163,30 @@ def serialize(self):
         print(YAMLGenerator(self.schemaview.schema).serialize())
 
 
+@click.command()
+## ARGUMENTS ##
+# Argument #1: Schema file
+@click.argument("schema", type=click.Path(exists=True))
+
+## OPTIONS ##
+# Option #1: Classes
+@click.option('--classes', '-c', required=True, help='Comma-separated list of classes to include in trimmed schema')
+# Option #2: Slots
+@click.option('--slots', '-s', help='Comma-separated list of slots to include in trimmed schema')
+# Option #3: Enums
+@click.option('--enums', '-e', help='Comma-separated list of enums to include in trimmed schema')
+
+def linkml_trimmer(schema, classes, slots, enums):
+    """
+    Trim a LinkML schema based on a list of classes, slots, and enums to keep.
+    """
+    classes = classes.split(',')
+    slots = slots.split(',') if slots else []
+    enums = enums.split(',') if enums else []
+
+    yt = YamlTrimmer(schema)
+    yt.trim_model(classes, slots, enums)
+    yt.serialize()
+
 if __name__ == "__main__":
-    pass
+    linkml_trimmer()
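The hunk above adds the module docstring and the Click wrapper, but the body of ``trim_model`` lies outside the diff context. As a reading aid, here is a minimal, self-contained sketch of the reachability pruning the class docstring describes ("removing all elements that are not reachable from the specified classes, slots, and enums to keep"). The toy schema, the ``trim`` helper, and every name in it are illustrative assumptions, not the code under review.

.. code-block:: python

    from collections import deque

    # Toy stand-in for a LinkML schema: each element lists the elements it refers to
    # (parent classes, slot ranges, enums used by slots, and so on).
    toy_schema = {
        "Person": ["NamedThing", "status"],
        "Organization": ["NamedThing"],
        "NamedThing": ["name"],
        "name": [],
        "status": ["StatusEnum"],
        "StatusEnum": [],
        "UnrelatedClass": ["UnusedSlot"],
        "UnusedSlot": [],
    }

    def trim(schema, keep):
        """Keep only the elements reachable from `keep`, mirroring trim_model's intent."""
        reachable, queue = set(keep), deque(keep)
        while queue:
            for dep in schema.get(queue.popleft(), []):
                if dep not in reachable:
                    reachable.add(dep)
                    queue.append(dep)
        return {name: deps for name, deps in schema.items() if name in reachable}

    # Keeping only Person drags in NamedThing, name, status and StatusEnum;
    # UnrelatedClass and UnusedSlot are dropped, which is what the trimmer does at schema scale.
    print(trim(toy_schema, ["Person"]))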
diff --git a/docs/conf.py b/docs/conf.py
index 1826540..3770055 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -37,4 +37,12 @@
 # other themes = 'sphinx_rtd_theme', 'classic', 'furo'
 html_theme = 'sphinx_rtd_theme'
 html_static_path = ['_static']
-source_suffix = ['.rst', '.md']
+html_show_sourcelink = False
+html_context = {
+    "display_github": True,       # Integrate GitHub
+    "github_user": "brain-bican", # Username
+    "github_repo": "bkbit",       # Repo name
+    "github_version": "main",     # Version
+    "conf_py_path": "/docs/",     # Path in the checkout to the docs root
+}
+# source_suffix = ['.rst', '.md']
diff --git a/docs/genome_annotation.rst b/docs/genome_annotation.rst
index 979c4fc..7f2ab5d 100644
--- a/docs/genome_annotation.rst
+++ b/docs/genome_annotation.rst
@@ -29,14 +29,16 @@ Command Line
 Options
 ,,,,,,,,
 
-    ``-a, --assembly_accession``
+    ``-a, --assembly_accession ``
         ID assigned to the genomic assembly used in the GFF3 file.
-        **Note: Must be provided when using ENSEMBL GFF3 files**
 
-    ``-s, --assembly_strain``
+        .. note::
+            Must be provided when using ENSEMBL GFF3 files
+
+    ``-s, --assembly_strain ``
         Specific strain of the organism associated with the GFF3 file.
 
-    ``-l, --log_level``
+    ``-l, --log_level ``
         Logging level.
 
         Default:
@@ -48,13 +50,13 @@ Options
         Log to a file instead of the console.
 
         Default:
-            FALSE
+            False
 
 Arguments
 ,,,,,,,,,,,
 
     ``GFF3_URL``
-        URL to the GFF3 file.
+        Required argument
 
 Examples
 .........
diff --git a/docs/index.rst b/docs/index.rst
index 5dfa853..338c482 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -21,6 +21,18 @@ This package contains tools to use the BICAN Knowledgebase Data Models.
    specimen_metadata
    genome_annotation
 
+.. toctree::
+   :maxdepth: 1
+   :caption: MODEL CONVERTERS
+
+   spreadsheet_converter
+
+.. toctree::
+   :maxdepth: 1
+   :caption: MODEL EDITORS
+
+   linkml_trimmer
+
 .. toctree::
    :maxdepth: 1
    :caption: REFERENCE
diff --git a/docs/linkml_trimmer.rst b/docs/linkml_trimmer.rst
new file mode 100644
index 0000000..48a3f8e
--- /dev/null
+++ b/docs/linkml_trimmer.rst
@@ -0,0 +1,55 @@
+.. _linkml_trimmer:
+
+LinkML Schema Trimmer
+----------------------
+
+Overview
+.........
+Generate a trimmed version of a LinkML schema by only including a specific subset of classes, slots, and enums.
+
+
+Command Line
+.............
+
+``bkbit linkml-trimmer``
+,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    $ bkbit linkml-trimmer [OPTIONS] SCHEMA
+
+Options
+,,,,,,,
+
+    ``-c, --classes ``
+        **Required option**
+
+        List of 'classes' to include in the trimmed schema.
+
+        **Note**: Classes must be separated by commas and enclosed in quotes.
+
+    ``-s, --slots ``
+        List of 'slots' to include in the trimmed schema.
+
+        **Note**: Slots must be separated by commas and enclosed in quotes.
+
+    ``-e, --enums ``
+        List of 'enums' to include in the trimmed schema.
+
+        **Note**: Enums must be separated by commas and enclosed in quotes.
+
+Arguments
+,,,,,,,,,
+
+    ``SCHEMA``
+        Required argument
+
+
+Examples
+.........
+
+Example 1: Trim `Biolink Schema `_
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    $ bkbit linkml-trimmer --classes "gene, genome, organism taxon, thing with taxon, material sample, procedure, entity, activity, named thing" biolink.yaml > bican-biolink.yaml
\ No newline at end of file
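For readers who prefer the Python API to the CLI, the documented Biolink example can also be driven through the ``YamlTrimmer`` class added in this PR. This is a sketch under the assumption that the package is installed and ``biolink.yaml`` sits in the working directory; redirecting stdout plays the role of the ``> bican-biolink.yaml`` shell redirection.

.. code-block:: python

    from bkbit.model_editors.linkml_trimmer import YamlTrimmer

    # Same class list as the documented CLI example.
    keep_classes = [
        "gene", "genome", "organism taxon", "thing with taxon", "material sample",
        "procedure", "entity", "activity", "named thing",
    ]

    trimmer = YamlTrimmer("biolink.yaml")  # loads the schema into a SchemaView
    trimmer.trim_model(keep_classes)       # keep_slots / keep_enums default to empty lists
    trimmer.serialize()                    # prints the trimmed schema as YAML to stdout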
diff --git a/docs/specimen_file_manifest.rst b/docs/specimen_file_manifest.rst
index 2a6cffa..a4888f2 100644
--- a/docs/specimen_file_manifest.rst
+++ b/docs/specimen_file_manifest.rst
@@ -8,6 +8,16 @@ Overview
 
 Generates a JSON-LD file containing specimen file data using the BICAN Library Generation Schema.
 
+The input file manifest must be in CSV format and contain the following columns:
+
+    - Project ID
+    - Specimen ID
+    - File Name
+    - Checksum
+    - File Type
+    - Archive
+    - Archive URI
+
 Command Line
 .............
 
@@ -27,16 +37,7 @@ Command Line
 **Arguments**
 
     ``FILE_MANIFEST_CSV``
-        Required argument.
-        FILE_MANIFEST_CSV can be optained from Brain Knowledge Platform and **must** contains the following columns:
-
-        - Project ID
-        - Specimen ID
-        - File Name
-        - Checksum
-        - File Type
-        - Archive
-        - Archive URI
+        Required argument
 
 Examples
 .........
diff --git a/docs/specimen_metadata.rst b/docs/specimen_metadata.rst
index 1d642d9..210dfb2 100644
--- a/docs/specimen_metadata.rst
+++ b/docs/specimen_metadata.rst
@@ -36,12 +36,14 @@ Environment Variables
 jwt_token
 ,,,,,,,,,
 
-    You **must** set the SpecimenPortal Personal API Token as an environment variable before running ``bkbit specimen2jsonld``. Once set, the token will be used to authenticate with the Specimen Portal API and retrieve the specimen metadata.
+The token is used to authenticate with the Specimen Portal API and retrieve the specimen metadata.
 
-    .. code-block:: bash
+.. note::
+    You **must** set the Specimen Portal Personal API Token as an environment variable **before** running ``bkbit specimen2jsonld``.
 
-        $ export jwt_token=specimen_portal_personal_api_token
+.. code-block:: bash
 
+    $ export jwt_token=specimen_portal_personal_api_token
 
 Examples
 .........
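The file-manifest overview above now lists the columns the input CSV must contain. A minimal sketch of such a manifest follows; the single data row and its file names are entirely hypothetical, and only the header reflects the documented column names.

.. code-block:: python

    import csv

    # Header matches the documented columns; the data row is made up for illustration.
    rows = [
        ["Project ID", "Specimen ID", "File Name", "Checksum", "File Type", "Archive", "Archive URI"],
        ["PRJ-001", "SPC-0001", "sample_R1.fastq.gz", "d41d8cd98f00b204e9800998ecf8427e",
         "fastq", "NeMO", "https://example.org/archive/sample_R1.fastq.gz"],
    ]

    with open("file_manifest.csv", "w", newline="") as handle:
        csv.writer(handle).writerows(rows)

    # The resulting CSV would then be supplied as the FILE_MANIFEST_CSV argument
    # of the file-manifest command documented above.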
diff --git a/docs/spreadsheet_converter.rst b/docs/spreadsheet_converter.rst
new file mode 100644
index 0000000..581e960
--- /dev/null
+++ b/docs/spreadsheet_converter.rst
@@ -0,0 +1,89 @@
+.. _spreadsheet_converter:
+
+Spreadsheet to LinkML Schema
+=============================
+
+Overview
+.........
+Create a YAML LinkML model from a set of spreadsheets. The converter accepts either TSV files or a Google Sheet as input.
+
+The default behavior is to run the converter starting with TSV files, specifying their paths as arguments, for example, model_spreadsheets/*tsv.
+
+If the ``--gsheet`` option is used, the converter starts by downloading the spreadsheets from Google Sheets.
+The argument must then be a YAML file that has a ``gsheet_id`` and a list of ``sheets`` with a ``gid`` (a unique identifier for each individual sheet)
+and, optionally, a ``name`` that will be used as the name of the downloaded TSV file (if not available, the ``gid`` will be used).
+
+Command Line
+.............
+
+``bkbit schema2model``
+,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    $ bkbit schema2model [OPTIONS] SPREADSHEETS
+
+**Options**
+
+    ``-o, --output ``
+        Path for the YAML output file.
+
+    ``-t, --template
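The ``--gsheet`` behaviour described in the overview expects a YAML configuration file identifying the Google Sheet and its tabs. A minimal sketch of such a file is shown below; ``gsheet_id``, ``sheets``, ``gid`` and ``name`` are the keys named in the prose, while every value is a made-up placeholder.

.. code-block:: yaml

    # Hypothetical gsheet configuration; replace the values with your own sheet.
    gsheet_id: 1AbCdEfGhIjKlMnOpQrStUvWxYz0123456789
    sheets:
      - gid: 0
        name: classes       # optional; the downloaded TSV falls back to the gid
      - gid: 987654321
        name: slots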