Skip to content

Commit

Permalink
Add CSV fixtures based on the test novel (#7)
Browse files Browse the repository at this point in the history
* update test novel

* test data update

* update test path name

* add more test fixtures

* Unit test updates

* only emit full configs in debug mode

* update fixture paths

* update argument order

* update fixture paths

* remove settings.yaml from text example

* update skip print

* only inspect the latest output

* add json_parsing_llm function for handling cached outputs that need json parsing

* streamline top-level run usage

* json-parsing llm build updates

* streamline run_pipeline_with_config call pattern in examples

* skip data assertions on loaded_csv until the null-filtering for vectors lands

* fixture update
  • Loading branch information
darthtrevino authored Apr 2, 2024
1 parent b08f3d7 commit 2b162b0
Show file tree
Hide file tree
Showing 15 changed files with 2,271 additions and 126 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,5 +89,5 @@ jobs:
name: Smoke Tests
env:
GRAPHRAG_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GRAPHRAG_LLM_MODEL: ${{ secrets.OPENAI_LLM_MODEL }}
GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.OPENAI_EMBEDDING_MODEL }}
GRAPHRAG_LLM_MODEL: gpt-3.5-turbo
GRAPHRAG_EMBEDDING_MODEL: text-embedding-3-small
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ coverage/
licenses.txt
python/graphrag/examples_notebooks/*/lancedb
python/graphrag/examples_notebooks/*/data
python/graphrag/tests/fixtures/cache
python/graphrag/tests/fixtures/*/cache
python/graphrag/tests/fixtures/*/output

Expand Down

Large diffs are not rendered by default.

27 changes: 21 additions & 6 deletions python/graphrag/graphrag/index/storage/blob_pipeline_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,31 @@ def __init__(
self._container_name,
self._path_prefix,
)
self.create_container()

def create_container(self) -> None:
    """Create the storage container if it does not exist.

    The existence check is delegated to ``self.container_exists()``. When
    it reports the container as missing, ``self._blob_service_client`` is
    asked to create a container named ``self._container_name``.

    Returns:
        None. The only effect is the possible creation of the container.
    """
    # container_exists() already performs the name lookup against the
    # service, so the container list is not fetched and scanned a second
    # time here.
    if not self.container_exists():
        self._blob_service_client.create_container(self._container_name)

def delete_container(self) -> None:
    """Remove the storage container, doing nothing when it is absent.

    ``self.container_exists()`` is consulted first; only when it returns a
    truthy value is ``self._blob_service_client`` asked to delete the
    container named ``self._container_name``.
    """
    if not self.container_exists():
        # Nothing to delete.
        return
    self._blob_service_client.delete_container(self._container_name)

def container_exists(self) -> bool:
    """Check if the container exists.

    Iterates the containers returned by
    ``self._blob_service_client.list_containers()`` and compares each
    one's ``name`` with ``self._container_name``.

    Returns:
        True when a container with that name is listed, False otherwise.
    """
    # This is a pure predicate: it must not create or delete anything, so
    # callers can use it as a guard before either operation.
    target = self._container_name
    # any() stops at the first match instead of materialising every name.
    return any(
        container.name == target
        for container in self._blob_service_client.list_containers()
    )

def find(
self,
Expand Down
72 changes: 36 additions & 36 deletions python/graphrag/tests/fixtures/azure/input/dulce.txt

Large diffs are not rendered by default.

183 changes: 183 additions & 0 deletions python/graphrag/tests/fixtures/min-csv/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
{
"input_path": "./tests/fixtures/min-csv",
"input_type": "text",
"workflow_config": {
"create_base_text_units": {
"row_range": [
1,
2000
],
"subworkflows": 11,
"max_runtime": 10
},
"create_base_extracted_entities": {
"row_range": [
1,
2000
],
"subworkflows": 2,
"max_runtime": 300
},
"create_final_covariates": {
"row_range": [
1,
2000
],
"nan_allowed_columns": [
"type",
"description",
"subject_type",
"object_id",
"object_type",
"status",
"start_date",
"end_date",
"source_text"
],
"subworkflows": 5,
"max_runtime": 300
},
"create_summarized_entities": {
"row_range": [
1,
2000
],
"subworkflows": 1,
"max_runtime": 300
},
"join_text_units_to_covariate_ids": {
"row_range": [
1,
2000
],
"subworkflows": 2,
"max_runtime": 10
},
"create_base_entity_graph": {
"row_range": [
1,
2000
],
"subworkflows": 2,
"max_runtime": 10
},
"create_final_entities": {
"row_range": [
1,
2000
],
"nan_allowed_columns": [
"type",
"description",
"graph_embedding"
],
"subworkflows": 10,
"max_runtime": 300
},
"create_final_relationships": {
"row_range": [
1,
2000
],
"subworkflows": 4,
"max_runtime": 100
},
"create_final_nodes": {
"row_range": [
1,
2000
],
"nan_allowed_columns": [
"entity_type",
"description",
"graph_embedding",
"community",
"_raw_level_"
],
"subworkflows": 11,
"max_runtime": 10
},
"create_final_communities": {
"row_range": [
1,
2000
],
"subworkflows": 14,
"max_runtime": 10
},
"create_final_community_reports": {
"row_range": [
1,
2000
],
"nan_allowed_columns": [
"community_id",
"title",
"summary",
"full_content",
"full_content_json",
"rank",
"rank_explanation",
"findings"
],
"subworkflows": 6,
"max_runtime": 300
},
"join_text_units_to_entity_ids": {
"row_range": [
1,
2000
],
"subworkflows": 3,
"max_runtime": 10
},
"join_text_units_to_relationship_ids": {
"row_range": [
1,
2000
],
"subworkflows": 4,
"max_runtime": 10
},
"create_final_text_units": {
"row_range": [
1,
2000
],
"nan_allowed_columns": [
"relationship_ids",
"entity_ids"
],
"subworkflows": 8,
"max_runtime": 100
},
"create_base_documents": {
"row_range": [
1,
2000
],
"subworkflows": 8,
"max_runtime": 10
},
"create_final_documents": {
"row_range": [
1,
2000
],
"subworkflows": 1,
"max_runtime": 100
}
},
"query_config": [
{
"query": "Who is Agent Alex Mercer and what are his goals?",
"method": "local"
},
{
"query": "What is the major conflict in this story and who are the protagonist and antagonist?",
"method": "global"
}

],
"slow": false
}
Loading

0 comments on commit 2b162b0

Please sign in to comment.