-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Initial Index API - Implement main API entry point: build_index - Rely on GraphRagConfig instead of PipelineConfig - This unifies the API signature with the prompt_tune and query API entry points - Derive cache settings, config, and resuming from the config and other arguments to simplify/reduce arguments to build_index - Add preflight config file validations - Add semver change * fix smoke tests * fix smoke tests * Use asyncio * Add e2e artifacts in GH actions * Remove unnecessary E2E test, and add skip_validations flag to cli * Nicer imports * Reorganize API functions. * Add license headers and module docstrings * Fix ignored ruff rule --------- Co-authored-by: Alonso Guevara <[email protected]>
- Loading branch information
1 parent
5a781dd
commit 6b4de3d
Showing
10 changed files
with
593 additions
and
202 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"type": "minor", | ||
"description": "Implement Index API" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License | ||
|
||
"""Load a GraphRagConfiguration from a file.""" | ||
|
||
import json | ||
from abc import ABC, abstractmethod | ||
from pathlib import Path | ||
|
||
import yaml | ||
|
||
from . import create_graphrag_config | ||
from .models.graph_rag_config import GraphRagConfig | ||
|
||
_default_config_files = ["settings.yaml", "settings.yml", "settings.json"] | ||
|
||
|
||
def resolve_config_path_with_root(root: str | Path) -> Path: | ||
"""Resolve the config path from the given root directory. | ||
Parameters | ||
---------- | ||
root : str | Path | ||
The path to the root directory containing the config file. | ||
Searches for a default config file (settings.{yaml,yml,json}). | ||
Returns | ||
------- | ||
Path | ||
The resolved config file path. | ||
Raises | ||
------ | ||
FileNotFoundError | ||
If the config file is not found or cannot be resolved for the directory. | ||
""" | ||
root = Path(root) | ||
|
||
if not root.is_dir(): | ||
msg = f"Invalid config path: {root} is not a directory" | ||
raise FileNotFoundError(msg) | ||
|
||
for file in _default_config_files: | ||
if (root / file).is_file(): | ||
return root / file | ||
|
||
msg = f"Unable to resolve config file for parent directory: {root}" | ||
raise FileNotFoundError(msg) | ||
|
||
|
||
class ConfigFileLoader(ABC):
    """Base class for loading a configuration from a file.

    Concrete loaders implement ``load_config`` for one file format.
    """

    @abstractmethod
    def load_config(self, config_path: str | Path) -> GraphRagConfig:
        """Load configuration from a file.

        Parameters
        ----------
        config_path : str | Path
            The path to the config file to load.

        Returns
        -------
        GraphRagConfig
            The loaded configuration.
        """
        raise NotImplementedError
|
||
|
||
class ConfigYamlLoader(ConfigFileLoader):
    """Load a configuration from a yaml file."""

    def load_config(self, config_path: str | Path) -> GraphRagConfig:
        """Load a configuration from a yaml file.

        Parameters
        ----------
        config_path : str | Path
            The path to the yaml file to load.

        Returns
        -------
        GraphRagConfig
            The loaded configuration.

        Raises
        ------
        ValueError
            If the file extension is not .yaml or .yml.
        FileNotFoundError
            If the config file is not found.
        UnicodeDecodeError
            If the file content is not valid UTF-8.
        """
        config_path = Path(config_path)
        if config_path.suffix not in [".yaml", ".yml"]:
            msg = f"Invalid file extension for loading yaml config from: {config_path!s}. Expected .yaml or .yml"
            raise ValueError(msg)
        if not config_path.is_file():
            msg = f"Config file not found: {config_path}"
            raise FileNotFoundError(msg)

        # The directory holding the config file is handed to
        # create_graphrag_config as the root directory.
        root_dir = str(config_path.parent)

        # Read raw bytes and decode strictly so a mis-encoded file fails
        # loudly instead of being decoded with a platform default codec.
        text = config_path.read_bytes().decode(encoding="utf-8", errors="strict")
        data = yaml.safe_load(text)
        return create_graphrag_config(data, root_dir)
|
||
|
||
class ConfigJsonLoader(ConfigFileLoader):
    """Load a configuration from a json file."""

    def load_config(self, config_path: str | Path) -> GraphRagConfig:
        """Load a configuration from a json file.

        Parameters
        ----------
        config_path : str | Path
            The path to the json file to load.

        Returns
        -------
        GraphRagConfig
            The loaded configuration.

        Raises
        ------
        ValueError
            If the file extension is not .json.
        FileNotFoundError
            If the config file is not found.
        UnicodeDecodeError
            If the file content is not valid UTF-8.
        """
        config_path = Path(config_path)
        if config_path.suffix != ".json":
            msg = f"Invalid file extension for loading json config from: {config_path!s}. Expected .json"
            raise ValueError(msg)
        if not config_path.is_file():
            msg = f"Config file not found: {config_path}"
            raise FileNotFoundError(msg)

        # The directory holding the config file is handed to
        # create_graphrag_config as the root directory.
        root_dir = str(config_path.parent)

        # Read raw bytes and decode strictly so a mis-encoded file fails
        # loudly instead of being decoded with a platform default codec.
        text = config_path.read_bytes().decode(encoding="utf-8", errors="strict")
        data = json.loads(text)
        return create_graphrag_config(data, root_dir)
|
||
|
||
def get_config_file_loader(config_path: str | Path) -> ConfigFileLoader: | ||
"""Config File Loader Factory. | ||
Parameters | ||
---------- | ||
config_path : str | Path | ||
The path to the config file. | ||
Returns | ||
------- | ||
ConfigFileLoader | ||
The config file loader for the provided config file. | ||
Raises | ||
------ | ||
ValueError | ||
If the config file extension is not supported. | ||
""" | ||
config_path = Path(config_path) | ||
ext = config_path.suffix | ||
match ext: | ||
case ".yaml" | ".yml": | ||
return ConfigYamlLoader() | ||
case ".json": | ||
return ConfigJsonLoader() | ||
case _: | ||
msg = f"Unsupported config file extension: {ext}" | ||
raise ValueError(msg) | ||
|
||
|
||
def load_config_from_file(config_path: str | Path) -> GraphRagConfig:
    """Load a configuration from a file.

    Parameters
    ----------
    config_path : str | Path
        The path to the configuration file.
        Supports .yaml, .yml, and .json config files.

    Returns
    -------
    GraphRagConfig
        The loaded configuration.

    Raises
    ------
    ValueError
        If the file extension is not supported.
    FileNotFoundError
        If the config file is not found.
    """
    # The factory picks the loader from the extension; the loader then
    # validates the path and parses the file.
    loader = get_config_file_loader(config_path)
    return loader.load_config(config_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License | ||
|
||
"""Logging utilities. A unified way for enabling logging.""" | ||
|
||
import logging | ||
from pathlib import Path | ||
|
||
from .enums import ReportingType | ||
from .models.graph_rag_config import GraphRagConfig | ||
from .resolve_timestamp_path import resolve_timestamp_path | ||
|
||
|
||
def enable_logging(log_filepath: str | Path, verbose: bool = False) -> None: | ||
"""Enable logging to a file. | ||
Parameters | ||
---------- | ||
log_filepath : str | Path | ||
The path to the log file. | ||
verbose : bool, default=False | ||
Whether to log debug messages. | ||
""" | ||
log_filepath = Path(log_filepath) | ||
log_filepath.parent.mkdir(parents=True, exist_ok=True) | ||
log_filepath.touch(exist_ok=True) | ||
|
||
logging.basicConfig( | ||
filename=log_filepath, | ||
filemode="a", | ||
format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s", | ||
datefmt="%H:%M:%S", | ||
level=logging.DEBUG if verbose else logging.INFO, | ||
) | ||
|
||
|
||
def enable_logging_with_config(
    config: GraphRagConfig, timestamp_value: str, verbose: bool = False
) -> tuple[bool, str]:
    """Enable logging to a file based on the config.

    Parameters
    ----------
    config : GraphRagConfig
        The configuration.
    timestamp_value : str
        The timestamp value representing the directory to place the log files.
    verbose : bool, default=False
        Whether to log debug messages.

    Returns
    -------
    tuple[bool, str]
        A tuple of a boolean indicating if logging was enabled and the path to the log file.
        (False, "") if logging was not enabled.
        (True, str) if logging was enabled.
    """
    # File logging is only set up when reporting is configured to use files;
    # any other reporting type leaves logging untouched.
    if config.reporting.type == ReportingType.file:
        # The log lives under <root_dir>/<reporting.base_dir>; the path is
        # passed through resolve_timestamp_path together with timestamp_value
        # to fill in any ${timestamp} placeholder.
        log_path = resolve_timestamp_path(
            Path(config.root_dir) / config.reporting.base_dir / "indexing-engine.log",
            timestamp_value,
        )
        enable_logging(log_path, verbose)
        return (True, str(log_path))
    return (False, "")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License | ||
|
||
"""Resolve timestamp variables in a path.""" | ||
|
||
import re | ||
from pathlib import Path | ||
from string import Template | ||
|
||
|
||
def _resolve_timestamp_path_with_value(path: str | Path, timestamp_value: str) -> Path: | ||
"""Resolve the timestamp in the path with the given timestamp value. | ||
Parameters | ||
---------- | ||
path : str | Path | ||
The path containing ${timestamp} variables to resolve. | ||
timestamp_value : str | ||
The timestamp value used to resolve the path. | ||
Returns | ||
------- | ||
Path | ||
The path with ${timestamp} variables resolved to the provided timestamp value. | ||
""" | ||
template = Template(str(path)) | ||
resolved_path = template.substitute(timestamp=timestamp_value) | ||
return Path(resolved_path) | ||
|
||
|
||
def _resolve_timestamp_path_with_dir( | ||
path: str | Path, pattern: re.Pattern[str] | ||
) -> Path: | ||
"""Resolve the timestamp in the path with the latest available timestamp directory value. | ||
Parameters | ||
---------- | ||
path : str | Path | ||
The path containing ${timestamp} variables to resolve. | ||
pattern : re.Pattern[str] | ||
The pattern to use to match the timestamp directories. | ||
Returns | ||
------- | ||
Path | ||
The path with ${timestamp} variables resolved to the latest available timestamp directory value. | ||
Raises | ||
------ | ||
ValueError | ||
If the parent directory expecting to contain timestamp directories does not exist or is not a directory. | ||
Or if no timestamp directories are found in the parent directory that match the pattern. | ||
""" | ||
path = Path(path) | ||
path_parts = path.parts | ||
parent_dir = Path(path_parts[0]) | ||
found_timestamp_pattern = False | ||
for _, part in enumerate(path_parts[1:]): | ||
if part.lower() == "${timestamp}": | ||
found_timestamp_pattern = True | ||
break | ||
parent_dir = parent_dir / part | ||
|
||
# Path not using timestamp layout. | ||
if not found_timestamp_pattern: | ||
return path | ||
|
||
if not parent_dir.exists() or not parent_dir.is_dir(): | ||
msg = f"Parent directory {parent_dir} does not exist or is not a directory." | ||
raise ValueError(msg) | ||
|
||
timestamp_dirs = [ | ||
d for d in parent_dir.iterdir() if d.is_dir() and pattern.match(d.name) | ||
] | ||
timestamp_dirs.sort(key=lambda d: d.name, reverse=True) | ||
if len(timestamp_dirs) == 0: | ||
msg = f"No timestamp directories found in {parent_dir} that match {pattern.pattern}." | ||
raise ValueError(msg) | ||
return _resolve_timestamp_path_with_value(path, timestamp_dirs[0].name) | ||
|
||
|
||
def resolve_timestamp_path(
    path: str | Path,
    pattern_or_timestamp_value: re.Pattern[str] | str = re.compile(r"^\d{8}-\d{6}$"),
) -> Path:
    r"""Timestamp path resolver.

    Resolve the timestamp in the path with the given timestamp value or
    with the latest available timestamp directory matching the given pattern.

    Parameters
    ----------
    path : str | Path
        The path containing ${timestamp} variables to resolve.
    pattern_or_timestamp_value : re.Pattern[str] | str, default=re.compile(r"^\d{8}-\d{6}$")
        The pattern to use to match the timestamp directories or the timestamp value to use.
        If a string is provided, the path will be resolved with the given string value.
        Otherwise, the path will be resolved with the latest available timestamp directory
        that matches the given pattern.

    Returns
    -------
    Path
        The path with ${timestamp} variables resolved to the provided timestamp value or
        the latest available timestamp directory.

    Raises
    ------
    ValueError
        If the parent directory expecting to contain timestamp directories does not exist or is not a directory.
        Or if no timestamp directories are found in the parent directory that match the pattern.
    """
    # A plain string is an explicit timestamp value; anything else is treated
    # as a compiled pattern used to discover the latest timestamp directory.
    if isinstance(pattern_or_timestamp_value, str):
        return _resolve_timestamp_path_with_value(path, pattern_or_timestamp_value)
    return _resolve_timestamp_path_with_dir(path, pattern_or_timestamp_value)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.