Add CSV fixtures based on the test novel (#7)

* update test novel
* test data update
* update test path name
* add more test fixtures
* Unit test updates
* only emit full configs in debug mode
* update fixture paths
* update argument order
* update fixture paths
* remove settings.yaml from text example
* update skip print
* only inspect the latest output
* add json_parsing_llm function for handling cached outputs that need json parsing
* streamline top-level run usage
* json-parsing llm build updates
* streamline run_pipeline_with_config call pattern in examples
* skip data assertions on loaded_csv until the null-filtering for vectors lands
* fixture update
microsoft · Apr 2, 2024 · 2b162b0 · 2b162b0
1 parent b08f3d7
commit 2b162b0
Show file tree

Hide file tree

Showing 15 changed files with 2,271 additions and 126 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -89,5 +89,5 @@ jobs:
         name: Smoke Tests
         env:
           GRAPHRAG_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          GRAPHRAG_LLM_MODEL: ${{ secrets.OPENAI_LLM_MODEL }}
-          GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.OPENAI_EMBEDDING_MODEL }}
+          GRAPHRAG_LLM_MODEL: gpt-3.5-turbo
+          GRAPHRAG_EMBEDDING_MODEL: text-embedding-3-small
diff --git a/.gitignore b/.gitignore
@@ -15,6 +15,7 @@ coverage/
 licenses.txt
 python/graphrag/examples_notebooks/*/lancedb
 python/graphrag/examples_notebooks/*/data
+python/graphrag/tests/fixtures/cache
 python/graphrag/tests/fixtures/*/cache
 python/graphrag/tests/fixtures/*/output
 

diff --git a/...ocsite/_posts/_query/notebooks/inputs/operation dulce/Operation Dulce v2 1 1.md b/...ocsite/_posts/_query/notebooks/inputs/operation dulce/Operation Dulce v2 1 1.md
diff --git a/python/graphrag/graphrag/index/storage/blob_pipeline_storage.py b/python/graphrag/graphrag/index/storage/blob_pipeline_storage.py
@@ -47,16 +47,31 @@ def __init__(
             self._container_name,
             self._path_prefix,
         )
+        self.create_container()
+
+    def create_container(self) -> None:
+        """Create the container if it does not exist."""
+        if not self.container_exists():
+            container_name = self._container_name
+            container_names = [
+                container.name
+                for container in self._blob_service_client.list_containers()
+            ]
+            if container_name not in container_names:
+                self._blob_service_client.create_container(container_name)
 
+    def delete_container(self) -> None:
+        """Delete the container."""
+        if self.container_exists():
+            self._blob_service_client.delete_container(self._container_name)
+
+    def container_exists(self) -> bool:
+        """Check if the container exists."""
+        container_name = self._container_name
         container_names = [
             container.name for container in self._blob_service_client.list_containers()
         ]
-        if container_name not in container_names:
-            self._blob_service_client.create_container(container_name)
-
-    def delete_container(self) -> None:
-        """Delete the container."""
-        self._blob_service_client.delete_container(self._container_name)
+        return container_name in container_names
 
     def find(
         self,

diff --git a/python/graphrag/tests/fixtures/azure/input/dulce.txt b/python/graphrag/tests/fixtures/azure/input/dulce.txt
diff --git a/python/graphrag/tests/fixtures/min-csv/config.json b/python/graphrag/tests/fixtures/min-csv/config.json
@@ -0,0 +1,183 @@
+{
+    "input_path": "./tests/fixtures/min-csv",
+    "input_type": "text",
+    "workflow_config": {
+        "create_base_text_units": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 11,
+            "max_runtime": 10
+        },
+        "create_base_extracted_entities": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 2,
+            "max_runtime": 300
+        },
+        "create_final_covariates": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "nan_allowed_columns": [
+                "type",
+                "description",
+                "subject_type",
+                "object_id",
+                "object_type",
+                "status",
+                "start_date",
+                "end_date",
+                "source_text"
+            ],
+            "subworkflows": 5,
+            "max_runtime": 300
+        },
+        "create_summarized_entities": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 1,
+            "max_runtime": 300
+        },
+        "join_text_units_to_covariate_ids": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 2,
+            "max_runtime": 10
+        },
+        "create_base_entity_graph": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 2,
+            "max_runtime": 10
+        },
+        "create_final_entities": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "nan_allowed_columns": [
+                "type",
+                "description",
+                "graph_embedding"
+            ],
+            "subworkflows": 10,
+            "max_runtime": 300
+        },
+        "create_final_relationships": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 4,
+            "max_runtime": 100
+        },
+        "create_final_nodes": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "nan_allowed_columns": [
+                "entity_type",
+                "description",
+                "graph_embedding",
+                "community",
+                "_raw_level_"
+            ],
+            "subworkflows": 11,
+            "max_runtime": 10
+        },
+        "create_final_communities": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 14,
+            "max_runtime": 10
+        },
+        "create_final_community_reports": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "nan_allowed_columns": [
+                "community_id",
+                "title",
+                "summary",
+                "full_content",
+                "full_content_json",
+                "rank",
+                "rank_explanation",
+                "findings"
+            ],
+            "subworkflows": 6,
+            "max_runtime": 300
+        },
+        "join_text_units_to_entity_ids": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 3,
+            "max_runtime": 10
+        },
+        "join_text_units_to_relationship_ids": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 4,
+            "max_runtime": 10
+        },
+        "create_final_text_units": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "nan_allowed_columns": [
+                "relationship_ids",
+                "entity_ids"
+            ],
+            "subworkflows": 8,
+            "max_runtime": 100
+        },
+        "create_base_documents": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 8,
+            "max_runtime": 10
+        },
+        "create_final_documents": {
+            "row_range": [
+                1,
+                2000
+            ],
+            "subworkflows": 1,
+            "max_runtime": 100
+        }
+    },
+    "query_config": [
+        {
+            "query": "Who is Agent Alex Mercer and what are his goals?",
+            "method": "local"
+        },
+        {
+            "query": "What is the major conflict in this story and who are the protagonist and antagonist?",
+            "method": "global"
+        }
+
+    ],
+    "slow": false
+}
diff --git a/...phrag/tests/fixtures/dulce/input/ABOUT.md → ...rag/tests/fixtures/min-csv/input/ABOUT.md b/...phrag/tests/fixtures/dulce/input/ABOUT.md → ...rag/tests/fixtures/min-csv/input/ABOUT.md