Skip to content

Commit

Permalink
Index API (#953)
Browse files Browse the repository at this point in the history
* Initial Index API

- Implement main API entry point: build_index
- Rely on GraphRagConfig instead of PipelineConfig
    - This unifies the API signature with the
    prompt_tune and query API entry points
- Derive cache settings, config, and resuming from
    the config and other arguments to
    simplify/reduce arguments to build_index
- Add preflight config file validations
- Add semver change

* fix smoke tests

* fix smoke tests

* Use asyncio

* Add e2e artifacts in GH actions

* Remove unnecessary E2E test, and add skip_validations flag to cli

* Nicer imports

* Reorganize API functions.

* Add license headers and module docstrings

* Fix ignored ruff rule

---------

Co-authored-by: Alonso Guevara <[email protected]>
  • Loading branch information
dworthen and AlonsoGuevara authored Aug 20, 2024
1 parent 5a781dd commit 6b4de3d
Show file tree
Hide file tree
Showing 10 changed files with 593 additions and 202 deletions.
5 changes: 0 additions & 5 deletions .github/workflows/python-smoke-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,3 @@ jobs:
with:
name: smoke-test-artifacts-${{ matrix.python-version }}-${{ matrix.poetry-version }}-${{ runner.os }}
path: tests/fixtures/*/output

- name: E2E Test
if: steps.changes.outputs.python == 'true'
run: |
./scripts/e2e-test.sh
4 changes: 4 additions & 0 deletions .semversioner/next-release/minor-20240819154736579383.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "minor",
"description": "Implement Index API"
}
184 changes: 184 additions & 0 deletions graphrag/config/config_file_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Load a GraphRagConfiguration from a file."""

import json
from abc import ABC, abstractmethod
from pathlib import Path

import yaml

from . import create_graphrag_config
from .models.graph_rag_config import GraphRagConfig

_default_config_files = ["settings.yaml", "settings.yml", "settings.json"]


def resolve_config_path_with_root(root: str | Path) -> Path:
"""Resolve the config path from the given root directory.
Parameters
----------
root : str | Path
The path to the root directory containing the config file.
Searches for a default config file (settings.{yaml,yml,json}).
Returns
-------
Path
The resolved config file path.
Raises
------
FileNotFoundError
If the config file is not found or cannot be resolved for the directory.
"""
root = Path(root)

if not root.is_dir():
msg = f"Invalid config path: {root} is not a directory"
raise FileNotFoundError(msg)

for file in _default_config_files:
if (root / file).is_file():
return root / file

msg = f"Unable to resolve config file for parent directory: {root}"
raise FileNotFoundError(msg)


class ConfigFileLoader(ABC):
    """Abstract interface for loading a configuration from a file.

    Subclasses implement ``load_config`` for one concrete file format.
    """

    @abstractmethod
    def load_config(self, config_path: str | Path) -> GraphRagConfig:
        """Load the configuration stored at ``config_path``.

        Parameters
        ----------
        config_path : str | Path
            The path to the configuration file.

        Returns
        -------
        GraphRagConfig
            The loaded configuration.
        """
        raise NotImplementedError


class ConfigYamlLoader(ConfigFileLoader):
    """Config file loader for YAML files (``.yaml`` / ``.yml``)."""

    def load_config(self, config_path: str | Path) -> GraphRagConfig:
        """Parse a YAML file and build a configuration from its contents.

        The file's parent directory is passed to ``create_graphrag_config``
        as the root directory.

        Parameters
        ----------
        config_path : str | Path
            The path to the yaml file to load.

        Returns
        -------
        GraphRagConfig
            The loaded configuration.

        Raises
        ------
        ValueError
            If the file extension is not .yaml or .yml.
        FileNotFoundError
            If the config file is not found.
        """
        yaml_path = Path(config_path)
        # The extension is validated before the filesystem is touched.
        if yaml_path.suffix not in (".yaml", ".yml"):
            msg = f"Invalid file extension for loading yaml config from: {yaml_path!s}. Expected .yaml or .yml"
            raise ValueError(msg)
        if not yaml_path.is_file():
            msg = f"Config file not found: {yaml_path}"
            raise FileNotFoundError(msg)

        # Read raw bytes and decode strictly so invalid UTF-8 fails loudly.
        raw_text = yaml_path.read_bytes().decode(encoding="utf-8", errors="strict")
        values = yaml.safe_load(raw_text)
        return create_graphrag_config(values, str(yaml_path.parent))


class ConfigJsonLoader(ConfigFileLoader):
    """Config file loader for JSON files (``.json``)."""

    def load_config(self, config_path: str | Path) -> GraphRagConfig:
        """Parse a JSON file and build a configuration from its contents.

        The file's parent directory is passed to ``create_graphrag_config``
        as the root directory.

        Parameters
        ----------
        config_path : str | Path
            The path to the json file to load.

        Returns
        -------
        GraphRagConfig
            The loaded configuration.

        Raises
        ------
        ValueError
            If the file extension is not .json.
        FileNotFoundError
            If the config file is not found.
        """
        json_path = Path(config_path)
        # The extension is validated before the filesystem is touched.
        if json_path.suffix != ".json":
            msg = f"Invalid file extension for loading json config from: {json_path!s}. Expected .json"
            raise ValueError(msg)
        if not json_path.is_file():
            msg = f"Config file not found: {json_path}"
            raise FileNotFoundError(msg)

        # Read raw bytes and decode strictly so invalid UTF-8 fails loudly.
        raw_text = json_path.read_bytes().decode(encoding="utf-8", errors="strict")
        values = json.loads(raw_text)
        return create_graphrag_config(values, str(json_path.parent))


def get_config_file_loader(config_path: str | Path) -> ConfigFileLoader:
    """Pick the loader that matches a config file's extension.

    Parameters
    ----------
    config_path : str | Path
        The path to the config file. Only its suffix is inspected.

    Returns
    -------
    ConfigFileLoader
        A ``ConfigYamlLoader`` for ``.yaml``/``.yml`` files or a
        ``ConfigJsonLoader`` for ``.json`` files.

    Raises
    ------
    ValueError
        If the config file extension is not supported.
    """
    suffix = Path(config_path).suffix
    if suffix in (".yaml", ".yml"):
        return ConfigYamlLoader()
    if suffix == ".json":
        return ConfigJsonLoader()
    msg = f"Unsupported config file extension: {suffix}"
    raise ValueError(msg)


def load_config_from_file(config_path: str | Path) -> GraphRagConfig:
    """Load a configuration from a file, choosing the loader by extension.

    Parameters
    ----------
    config_path : str | Path
        The path to the configuration file.
        Supports .yaml, .yml, and .json config files.

    Returns
    -------
    GraphRagConfig
        The loaded configuration.

    Raises
    ------
    ValueError
        If the file extension is not supported.
    FileNotFoundError
        If the config file is not found.
    """
    return get_config_file_loader(config_path).load_config(config_path)
65 changes: 65 additions & 0 deletions graphrag/config/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Logging utilities. A unified way for enabling logging."""

import logging
from pathlib import Path

from .enums import ReportingType
from .models.graph_rag_config import GraphRagConfig
from .resolve_timestamp_path import resolve_timestamp_path


def enable_logging(log_filepath: str | Path, verbose: bool = False) -> None:
"""Enable logging to a file.
Parameters
----------
log_filepath : str | Path
The path to the log file.
verbose : bool, default=False
Whether to log debug messages.
"""
log_filepath = Path(log_filepath)
log_filepath.parent.mkdir(parents=True, exist_ok=True)
log_filepath.touch(exist_ok=True)

logging.basicConfig(
filename=log_filepath,
filemode="a",
format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
datefmt="%H:%M:%S",
level=logging.DEBUG if verbose else logging.INFO,
)


def enable_logging_with_config(
    config: GraphRagConfig, timestamp_value: str, verbose: bool = False
) -> tuple[bool, str]:
    """Enable file logging when the config requests file-based reporting.

    Parameters
    ----------
    config : GraphRagConfig
        The configuration. Its ``reporting.type``, ``reporting.base_dir``
        and ``root_dir`` values are read.
    timestamp_value : str
        The timestamp value used to resolve the log file path.
    verbose : bool, default=False
        Whether to log debug messages.

    Returns
    -------
    tuple[bool, str]
        ``(True, log_path)`` if logging was enabled, where ``log_path`` is
        the resolved log file path as a string.
        ``(False, "")`` if the reporting type is not ``ReportingType.file``.
    """
    # Only file-based reporting produces a log file.
    if config.reporting.type != ReportingType.file:
        return (False, "")

    log_template = (
        Path(config.root_dir) / config.reporting.base_dir / "indexing-engine.log"
    )
    log_path = resolve_timestamp_path(log_template, timestamp_value)
    enable_logging(log_path, verbose)
    return (True, str(log_path))
115 changes: 115 additions & 0 deletions graphrag/config/resolve_timestamp_path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Resolve timestamp variables in a path."""

import re
from pathlib import Path
from string import Template


def _resolve_timestamp_path_with_value(path: str | Path, timestamp_value: str) -> Path:
"""Resolve the timestamp in the path with the given timestamp value.
Parameters
----------
path : str | Path
The path containing ${timestamp} variables to resolve.
timestamp_value : str
The timestamp value used to resolve the path.
Returns
-------
Path
The path with ${timestamp} variables resolved to the provided timestamp value.
"""
template = Template(str(path))
resolved_path = template.substitute(timestamp=timestamp_value)
return Path(resolved_path)


def _resolve_timestamp_path_with_dir(
path: str | Path, pattern: re.Pattern[str]
) -> Path:
"""Resolve the timestamp in the path with the latest available timestamp directory value.
Parameters
----------
path : str | Path
The path containing ${timestamp} variables to resolve.
pattern : re.Pattern[str]
The pattern to use to match the timestamp directories.
Returns
-------
Path
The path with ${timestamp} variables resolved to the latest available timestamp directory value.
Raises
------
ValueError
If the parent directory expecting to contain timestamp directories does not exist or is not a directory.
Or if no timestamp directories are found in the parent directory that match the pattern.
"""
path = Path(path)
path_parts = path.parts
parent_dir = Path(path_parts[0])
found_timestamp_pattern = False
for _, part in enumerate(path_parts[1:]):
if part.lower() == "${timestamp}":
found_timestamp_pattern = True
break
parent_dir = parent_dir / part

# Path not using timestamp layout.
if not found_timestamp_pattern:
return path

if not parent_dir.exists() or not parent_dir.is_dir():
msg = f"Parent directory {parent_dir} does not exist or is not a directory."
raise ValueError(msg)

timestamp_dirs = [
d for d in parent_dir.iterdir() if d.is_dir() and pattern.match(d.name)
]
timestamp_dirs.sort(key=lambda d: d.name, reverse=True)
if len(timestamp_dirs) == 0:
msg = f"No timestamp directories found in {parent_dir} that match {pattern.pattern}."
raise ValueError(msg)
return _resolve_timestamp_path_with_value(path, timestamp_dirs[0].name)


def resolve_timestamp_path(
    path: str | Path,
    pattern_or_timestamp_value: re.Pattern[str] | str = re.compile(r"^\d{8}-\d{6}$"),
) -> Path:
    r"""Resolve ``${timestamp}`` variables in a path.

    Dispatches on the type of ``pattern_or_timestamp_value``: a string is
    used directly as the timestamp value, anything else is treated as a
    pattern for finding the latest matching timestamp directory.

    Parameters
    ----------
    path : str | Path
        The path containing ${timestamp} variables to resolve.
    pattern_or_timestamp_value : re.Pattern[str] | str, default=re.compile(r"^\d{8}-\d{6}$")
        The pattern to use to match the timestamp directories or the timestamp value to use.
        If a string is provided, the path will be resolved with the given string value.
        Otherwise, the path will be resolved with the latest available timestamp directory
        that matches the given pattern.

    Returns
    -------
    Path
        The path with ${timestamp} variables resolved to the provided timestamp value or
        the latest available timestamp directory.

    Raises
    ------
    ValueError
        If the parent directory expecting to contain timestamp directories does not exist or is not a directory.
        Or if no timestamp directories are found in the parent directory that match the pattern.
    """
    # Anything that is not a string is handled as a directory-name pattern.
    if not isinstance(pattern_or_timestamp_value, str):
        return _resolve_timestamp_path_with_dir(path, pattern_or_timestamp_value)
    return _resolve_timestamp_path_with_value(path, pattern_or_timestamp_value)
7 changes: 6 additions & 1 deletion graphrag/index/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@
help="Overlay default configuration values on a provided configuration file (--config).",
action="store_true",
)
parser.add_argument(
"--skip-validations",
help="Skip any preflight validation. Useful when running no LLM steps.",
action="store_true",
)
args = parser.parse_args()

if args.overlay_defaults and not args.config:
Expand All @@ -85,5 +90,5 @@
dryrun=args.dryrun or False,
init=args.init or False,
overlay_defaults=args.overlay_defaults or False,
cli=True,
skip_validations=args.skip_validations or False,
)
Loading

0 comments on commit 6b4de3d

Please sign in to comment.