Skip to content

Commit

Permalink
Merge pull request #4 from datatractor/add_entrypoint
Browse files Browse the repository at this point in the history
Add `beam` command line interface
  • Loading branch information
PeterKraus authored Oct 17, 2024
2 parents 1a8cc30 + eb35ce0 commit 7dbbe10
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 6 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

Repository containing the reference implementation of the Datatractor API, published at [![Datatractor Yard](https://badgen.net/static/%F0%9F%9A%9Cdatatractor/yard)](https://yard.datatractor.org/).

## `datatractor_beam` package
## `datatractor-beam` package

This repository contains a draft Python 3.10 package, located under the `./beam` directory.
The package can be used to:
Expand Down Expand Up @@ -62,6 +62,12 @@ ret = extract("example.mpr", "biologic-mpr", output_path="output.nc", preferred_

In this case, the `ret` will be empty bytes, and the output of the extractor should appear in the `output.nc` file.

Finally, `beam` can also be executed from the command line, implying `preferred_mode="cli"`. The command line invocation equivalent to the above Python syntax is:

```bash
beam biologic-mpr example.mpr --outfile output.nc
```


### Plans

Expand All @@ -77,7 +83,7 @@ In this case, the `ret` will be empty bytes, and the output of the extractor sho
across subprocesses without any extractor specific classes,
e.g., raw JSON/Python dicts, pandas dataframes or xarray datasets (as
optional requirements, by demand).
- [ ] A command-line for quickly running e.g., `beam <filename>`
- [x] A command-line interface for quickly running an extractor, e.g., `beam <filetype> <filename>`
- [ ] Extractor scaffold/template/plugin
- If it can be kept similarly low-dependency, this package could also
implement an extractor scaffold for those who want to modify existing
Expand Down
54 changes: 50 additions & 4 deletions beam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
"""

import argparse
import importlib.metadata
import json
import multiprocessing.managers
import multiprocessing.shared_memory
Expand All @@ -30,6 +32,7 @@
from typing import Any, Callable, Optional

__all__ = ("extract", "Extractor")
__version__ = importlib.metadata.version("datatractor-beam")

REGISTRY_BASE_URL = "https://yard.datatractor.org/api/v0.1.0"
BIN = "Scripts" if platform.system() == "Windows" else "bin"
Expand All @@ -46,6 +49,47 @@ class SupportedInstallationMethod(Enum):
CONDA = "conda"


def run_beam(argv: Optional[list[str]] = None) -> None:
    """Entry point of the ``beam`` command line interface.

    Parses the command line arguments and passes them on to :func:`extract`,
    always requesting the CLI execution method.

    Parameters:
        argv: The list of arguments to parse, without the program name.
            If not provided, argparse reads the arguments from ``sys.argv``,
            which is the behaviour needed when this function is invoked
            without arguments as a console script.

    """
    # NOTE(review): the help text below points at registry.datatractor.org,
    # while the module-level REGISTRY_BASE_URL uses yard.datatractor.org --
    # confirm which host is current before changing this user-facing string.
    argparser = argparse.ArgumentParser(
        prog="beam",
        description="""CLI for datatractor extractors that takes a filename and a filetype, then installs and runs an appropriate extractor, if available, from the chosen registry (default: https://registry.datatractor.org/). Filetype IDs can be found in the registry API at e.g., https://registry.datatractor.org/api/filetypes. If a matching extractor is found at https://registry.datatractor.org/api/extractors, it will be installed into a virtual environment local to the beam installation. The results of the extractor will be written out to a file at --outfile, or in the default location for that output file type.""",
    )

    argparser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s version {__version__}",
    )

    # Positional arguments are always required by argparse, so a ``default``
    # would never be used and is deliberately not passed.
    argparser.add_argument(
        "filetype",
        help="FileType.ID of the input file",
    )

    argparser.add_argument(
        "infile",
        help="Path of the input file",
    )

    argparser.add_argument(
        "--outfile",
        "-o",
        help="Optional path of the output file",
        default=None,
    )

    # With ``argv=None`` argparse falls back to ``sys.argv[1:]``.
    args = argparser.parse_args(argv)

    extract(
        input_path=args.infile,
        input_type=args.filetype,
        output_path=args.outfile,
        preferred_mode=SupportedExecutionMethod.CLI,
    )


def extract(
input_path: Path | str,
input_type: str,
Expand All @@ -61,15 +105,16 @@ def extract(
Parameters:
input_path: The path or URL of the file to parse.
input_type: The ID of the `FileType` in the registry.
input_type: The ID of the ``FileType`` in the registry.
output_path: The path to write the output to.
If not provided, the output will be requested to be written
to a file with the same name as the input file, but with a .json extension.
to a file with the same name as the input file, but with its extension replaced
by the ``output_type``, or by ``.out`` if no ``output_type`` is provided.
output_type: A string specifying the desired output type.
preferred_mode: The preferred execution method.
If the extractor supports both Python and CLI, this will be used to determine
which to use. If the extractor only supports one method, this will be ignored.
Accepts the `SupportedExecutionMethod` values of "cli" or "python".
Accepts the ``SupportedExecutionMethod`` values of "cli" or "python".
install: Whether to install the extractor package before running it. Defaults to True.
extractor_definition: A dictionary containing the extractor definition to use instead
of a registry lookup.
Expand Down Expand Up @@ -265,7 +310,8 @@ def execute(
)

if output_path is None:
output_path = input_path.with_suffix(".json")
suffix = ".out" if output_type is None else f".{output_type}"
output_path = input_path.with_suffix(suffix)

command = self.apply_template_args(
command,
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ dynamic = ["version"]
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Development Status :: 2 - Pre-Alpha",
"Intended Audience :: Science/Research",
"Intended Audience :: System Administrators",
Expand Down Expand Up @@ -54,6 +56,9 @@ dev = [
[project.urls]
repository = "https://github.com/datatractor/beam"

[project.scripts]
beam = "beam:run_beam"

[tool.ruff]
extend-exclude = [
"providers",
Expand Down
11 changes: 11 additions & 0 deletions tests/test_mpr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import subprocess
import urllib.request
from pathlib import Path

Expand Down Expand Up @@ -146,3 +147,13 @@ def test_extractorplan_python_method():
function, args, kwargs = ExtractorPlan._prepare_python(
'extract(filename="example.txt", type={"test": "example", "dictionary": "example"})'
)


def test_biologic_beam(tmp_path, test_mprs):
    """Run the ``beam`` command line on one example file and check its output.

    Only the first entry of the ``test_mprs`` fixture is processed. The test
    passes when the ``beam`` process exits successfully and the requested
    output file exists afterwards.
    """
    for test_mpr in test_mprs:
        input_path = tmp_path / test_mpr
        output_path = tmp_path / test_mpr.name.replace(".mpr", ".nc")
        task = ["beam", "biologic-mpr", str(input_path), "--outfile", str(output_path)]
        # check=True makes a non-zero exit status of ``beam`` fail the test
        # directly, instead of only showing up as a missing output file.
        subprocess.run(task, check=True)
        assert output_path.exists()
        # One file is enough to exercise the command line entry point.
        break

0 comments on commit 7dbbe10

Please sign in to comment.