From 083de12bcf9e68e51032500106dc4aff771445a4 Mon Sep 17 00:00:00 2001 From: Josh Bradley Date: Fri, 25 Oct 2024 19:00:24 -0400 Subject: [PATCH] Auto-generate CLI doc pages (#1325) --- .../patch-20241025215416188681.json | 4 ++ dictionary.txt | 1 + docs/cli.md | 9 ++++ docs/get_started.md | 2 + docs/index/cli.md | 23 ---------- docs/query/cli.md | 44 ------------------- mkdocs.yaml | 4 +- poetry.lock | 17 ++++++- pyproject.toml | 1 + tests/unit/config/test_default_config.py | 6 +-- 10 files changed, 36 insertions(+), 75 deletions(-) create mode 100644 .semversioner/next-release/patch-20241025215416188681.json create mode 100644 docs/cli.md delete mode 100644 docs/index/cli.md delete mode 100644 docs/query/cli.md diff --git a/.semversioner/next-release/patch-20241025215416188681.json b/.semversioner/next-release/patch-20241025215416188681.json new file mode 100644 index 0000000000..9b45ed4ee8 --- /dev/null +++ b/.semversioner/next-release/patch-20241025215416188681.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "add-autogenerated-cli-docs" +} diff --git a/dictionary.txt b/dictionary.txt index 4c5d81f762..7ea41bd295 100644 --- a/dictionary.txt +++ b/dictionary.txt @@ -67,6 +67,7 @@ pypi nbformat semversioner mkdocs +typer # Library Methods iterrows diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000000..fb8c7e38db --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,9 @@ +# CLI Reference + +This page documents the command-line interface of the graphrag library. + +::: mkdocs-typer + :module: graphrag.cli.main + :prog_name: graphrag + :command: app + :depth: 0 diff --git a/docs/get_started.md b/docs/get_started.md index 3db4a82cc1..46fec3feb4 100644 --- a/docs/get_started.md +++ b/docs/get_started.md @@ -30,6 +30,8 @@ It shows how to use the system to index some text, and then use the indexed data pip install graphrag ``` +The graphrag library includes a CLI for a no-code approach to getting started. Please review the full [CLI documentation](cli.md) for further detail. + # Running the Indexer Now we need to set up a data project and some initial configuration. Let's set that up. We're using the [default configuration mode](config/overview.md), which you can customize as needed using a [config file](config/json_yaml.md), which we recommend, or [environment variables](config/env_vars.md). diff --git a/docs/index/cli.md b/docs/index/cli.md deleted file mode 100644 index 9f479771ca..0000000000 --- a/docs/index/cli.md +++ /dev/null @@ -1,23 +0,0 @@ -# Indexer CLI - -The GraphRAG indexer CLI allows for no-code usage of the GraphRAG Indexer. - -```bash -graphrag index --verbose --root \ ---config --resume \ ---reporter --emit json,csv,parquet \ ---no-cache -``` - -## CLI Arguments - -- `--verbose` - Adds extra logging information during the run. -- `--root ` - the data root directory. This should contain an `input` directory with the input data, and an `.env` file with environment variables. These are described below. -- `--resume ` - if specified, the pipeline will attempt to resume a prior run. The parquet files from the prior run will be loaded into the system as inputs, and the workflows that generated those files will be skipped. The input value should be the timestamped output folder, e.g. "20240105-143721". -- `--config ` - This will opt-out of the Default Configuration mode and execute a custom configuration. If this is used, then none of the environment-variables below will apply. -- `--reporter ` - This will specify the progress reporter to use. The default is `rich`. Valid values are `rich`, `print`, and `none`. -- `--dry-run` - Runs the indexing pipeline without executing any steps in order to inspect and validate the configuration file. -- `--emit ` - This specifies the table output formats the pipeline should emit. The default is `parquet`. Valid values are `parquet`, `csv`, and `json`, comma-separated. -- `--no-cache` - This will disable the caching mechanism. This is useful for debugging and development, but should not be used in production. -- `--output ` - Specify the output directory for pipeline artifacts. -- `--reports ` - Specify the output directory for reporting. diff --git a/docs/query/cli.md b/docs/query/cli.md deleted file mode 100644 index 10d3a92e2d..0000000000 --- a/docs/query/cli.md +++ /dev/null @@ -1,44 +0,0 @@ -# Query CLI - -The GraphRAG query CLI allows for no-code usage of the GraphRAG Query engine. - -```bash -graphrag query --config --data --community-level --response-type --method <"local"|"global"> -``` - -## CLI Arguments - -- `--config ` - The configuration yaml file to use when running the query. If this is used, then none of the environment-variables below will apply. -- `--data ` - Folder containing the `.parquet` output files from running the Indexer. -- `--community-level ` - Community level in the Leiden community hierarchy from which we will load the community reports higher value means we use reports on smaller communities. Default: 2 -- `--response-type ` - Free form text describing the response type and format, can be anything, e.g. `Multiple Paragraphs`, `Single Paragraph`, `Single Sentence`, `List of 3-7 Points`, `Single Page`, `Multi-Page Report`. Default: `Multiple Paragraphs`. -- `--method <"local"|"global">` - Method to use to answer the query, one of local or global. For more information check [Overview](overview.md) -- `--streaming` - Stream back the LLM response - -## Env Variables - -Required environment variables to execute: -- `GRAPHRAG_API_KEY` - API Key for executing the model, will fallback to `OPENAI_API_KEY` if one is not provided. -- `GRAPHRAG_LLM_MODEL` - Model to use for Chat Completions. -- `GRAPHRAG_EMBEDDING_MODEL` - Model to use for Embeddings. - -You can further customize the execution by providing these environment variables: - -- `GRAPHRAG_LLM_API_BASE` - The API Base URL. Default: `None` -- `GRAPHRAG_LLM_TYPE` - The LLM operation type. Either `openai_chat` or `azure_openai_chat`. Default: `openai_chat` -- `GRAPHRAG_LLM_MAX_RETRIES` - The maximum number of retries to attempt when a request fails. Default: `20` -- `GRAPHRAG_EMBEDDING_API_BASE` - The API Base URL. Default: `None` -- `GRAPHRAG_EMBEDDING_TYPE` - The embedding client to use. Either `openai_embedding` or `azure_openai_embedding`. Default: `openai_embedding` -- `GRAPHRAG_EMBEDDING_MAX_RETRIES` - The maximum number of retries to attempt when a request fails. Default: `20` -- `GRAPHRAG_LOCAL_SEARCH_TEXT_UNIT_PROP` - Proportion of context window dedicated to related text units. Default: `0.5` -- `GRAPHRAG_LOCAL_SEARCH_COMMUNITY_PROP` - Proportion of context window dedicated to community reports. Default: `0.1` -- `GRAPHRAG_LOCAL_SEARCH_CONVERSATION_HISTORY_MAX_TURNS` - Maximum number of turns to include in the conversation history. Default: `5` -- `GRAPHRAG_LOCAL_SEARCH_TOP_K_ENTITIES` - Number of related entities to retrieve from the entity description embedding store. Default: `10` -- `GRAPHRAG_LOCAL_SEARCH_TOP_K_RELATIONSHIPS` - Control the number of out-of-network relationships to pull into the context window. Default: `10` -- `GRAPHRAG_LOCAL_SEARCH_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000). Default: `12000` -- `GRAPHRAG_LOCAL_SEARCH_LLM_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500). Default: `2000` -- `GRAPHRAG_GLOBAL_SEARCH_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000). Default: `12000` -- `GRAPHRAG_GLOBAL_SEARCH_DATA_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000). Default: `12000` -- `GRAPHRAG_GLOBAL_SEARCH_MAP_MAX_TOKENS` - Default: `500` -- `GRAPHRAG_GLOBAL_SEARCH_REDUCE_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500). Default: `2000` -- `GRAPHRAG_GLOBAL_SEARCH_CONCURRENCY` - Default: `32` diff --git a/mkdocs.yaml b/mkdocs.yaml index 30acf884dd..06cfa5e6e9 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -29,7 +29,6 @@ nav: - Overview: "index/overview.md" - Architecture: "index/architecture.md" - Dataflow: "index/default_dataflow.md" - - CLI: "index/cli.md" - Configuration: - Overview: "config/overview.md" - Init Command: "config/init.md" @@ -46,13 +45,13 @@ nav: - Local Search: "query/local_search.md" - Question Generation: "query/question_generation.md" - Global Search: "query/global_search.md" - - CLI: "query/cli.md" - Notebooks: - Overview: "query/notebooks/overview.md" - Global Search: "examples_notebooks/global_search.ipynb" - Local Search: "examples_notebooks/local_search.ipynb" - Microsoft Research Blog: "blog_posts.md" - Extras: + - CLI: "cli.md" - Operation Dulce: - About: "data/operation_dulce/ABOUT.md" - Document: "data/operation_dulce/Operation Dulce v2 1 1.md" @@ -104,3 +103,4 @@ markdown_extensions: slugify: !!python/object/apply:pymdownx.slugs.slugify kwds: case: lower + - mkdocs-typer diff --git a/poetry.lock b/poetry.lock index 105b511d7e..98c7427e74 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2545,6 +2545,21 @@ files = [ {file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"}, ] +[[package]] +name = "mkdocs-typer" +version = "0.0.3" +description = "An MkDocs extension to generate documentation for Typer command line applications" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs_typer-0.0.3-py3-none-any.whl", hash = "sha256:b2a9a44da590a7100114fde4de9123fedfea692d229379984db20ee3b3f12d7c"}, + {file = "mkdocs_typer-0.0.3.tar.gz", hash = "sha256:4dd37f024190a82aaf0f6c984faafb15167d34eab7e29a6a85e61362423a4eb7"}, +] + +[package.dependencies] +markdown = "==3.*" +typer = "==0.*" + [[package]] name = "msal" version = "1.31.0" @@ -5200,4 +5215,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "0bcb3b8ebe38153edddd48f8077ddf58e4628e7b714731a9fa48785288d206b9" +content-hash = "7f78e10fa0099c66763c74fd0846581bfd760fb466bc3479c166a613e4881a3a" diff --git a/pyproject.toml b/pyproject.toml index c290b5ae2f..d454078b6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,7 @@ json-repair = "^0.30.0" future = "^1.0.0" # Needed until graspologic fixes their dependency typer = "^0.12.5" +mkdocs-typer = "^0.0.3" [tool.poetry.group.dev.dependencies] coverage = "^7.6.0" ipykernel = "^6.29.4" diff --git a/tests/unit/config/test_default_config.py b/tests/unit/config/test_default_config.py index fc348c1dbd..ccdf7908c9 100644 --- a/tests/unit/config/test_default_config.py +++ b/tests/unit/config/test_default_config.py @@ -482,10 +482,8 @@ def test_can_set_no_chunk_by_columns(self): def test_all_env_vars_is_accurate(self): env_var_docs_path = Path("docs/config/env_vars.md") - query_docs_path = Path("docs/query/cli.md") env_var_docs = env_var_docs_path.read_text(encoding="utf-8") - query_docs = query_docs_path.read_text(encoding="utf-8") def find_envvar_names(text) -> set[str]: pattern = r"`(GRAPHRAG_[^`]+)`" @@ -493,9 +491,7 @@ def find_envvar_names(text) -> set[str]: found = {f for f in found if not f.endswith("_")} return {*found} - graphrag_strings = find_envvar_names(env_var_docs) | find_envvar_names( - query_docs - ) + graphrag_strings = find_envvar_names(env_var_docs) missing = {s for s in graphrag_strings if s not in ALL_ENV_VARS} - { # Remove configs covered by the base LLM connection configs