diff --git a/.semversioner/next-release/patch-20240806230321384817.json b/.semversioner/next-release/patch-20240806230321384817.json
new file mode 100644
index 0000000000..0e78ac354f
--- /dev/null
+++ b/.semversioner/next-release/patch-20240806230321384817.json
@@ -0,0 +1,4 @@
+{
+ "type": "patch",
+ "description": "Docs updates"
+}
diff --git a/README.md b/README.md
index e60cb56fa4..0b936058ce 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@
The GraphRAG project is a data pipeline and transformation suite that is designed to extract meaningful, structured data from unstructured text using the power of LLMs.
-To learn more about GraphRAG and how it can be used to enhance your LLMs ability to reason about your private data, please visit the Microsoft Research Blog Post.
+To learn more about GraphRAG and how it can be used to enhance your LLM's ability to reason about your private data, please visit the Microsoft Research Blog Post.
## Quickstart
diff --git a/docsite/posts/config/env_vars.md b/docsite/posts/config/env_vars.md
index 7a8d8f65ed..3c579d9411 100644
--- a/docsite/posts/config/env_vars.md
+++ b/docsite/posts/config/env_vars.md
@@ -93,7 +93,6 @@ These settings control the text embedding model used by the pipeline. Any settin
| `GRAPHRAG_EMBEDDING_REQUESTS_PER_MINUTE` | | The number of requests per minute to allow for the embedding client. 0 = Bypass | `int` | 0 |
| `GRAPHRAG_EMBEDDING_MAX_RETRIES` | | The maximum number of retries to attempt when a request fails. | `int` | 10 |
| `GRAPHRAG_EMBEDDING_MAX_RETRY_WAIT` | | The maximum number of seconds to wait between retries. | `int` | 10 |
-| `GRAPHRAG_EMBEDDING_TARGET` | | The target fields to embed. Either `required` or `all`. | `str` | `required` |
| `GRAPHRAG_EMBEDDING_SLEEP_ON_RATE_LIMIT_RECOMMENDATION` | | Whether to sleep on rate limit recommendation. (Azure Only) | `bool` | `True` |
## Input Settings
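For reference, a minimal `.env` sketch that exercises the embedding client settings documented in the table above; the values simply mirror the listed defaults and are illustrative, not recommendations:

```bash
# Embedding client throttling/retry settings (values mirror the documented defaults)
GRAPHRAG_EMBEDDING_REQUESTS_PER_MINUTE=0        # 0 = bypass the rate limiter
GRAPHRAG_EMBEDDING_MAX_RETRIES=10               # retries per failed request
GRAPHRAG_EMBEDDING_MAX_RETRY_WAIT=10            # max seconds to wait between retries
GRAPHRAG_EMBEDDING_SLEEP_ON_RATE_LIMIT_RECOMMENDATION=True   # Azure only
```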
diff --git a/docsite/posts/config/template.md b/docsite/posts/config/template.md
index d3ff14d52f..a9a5ebdb38 100644
--- a/docsite/posts/config/template.md
+++ b/docsite/posts/config/template.md
@@ -168,7 +168,6 @@ GRAPHRAG_EMBEDDING_API_VERSION="api_version" # For Azure OpenAI Users and if GRA
# GRAPHRAG_ASYNC_MODE=asyncio
# GRAPHRAG_ENCODING_MODEL=cl100k_base
# GRAPHRAG_MAX_CLUSTER_SIZE=10
-# GRAPHRAG_ENTITY_RESOLUTION_ENABLED=False
# GRAPHRAG_SKIP_WORKFLOWS=None
# GRAPHRAG_UMAP_ENABLED=False
```
diff --git a/docsite/posts/get_started.md b/docsite/posts/get_started.md
index f2c0e7fbf0..b0ea2664c6 100644
--- a/docsite/posts/get_started.md
+++ b/docsite/posts/get_started.md
@@ -100,7 +100,7 @@ python -m graphrag.index --root ./ragtest
![pipeline executing from the CLI](/img/pipeline-running.png)
-This process will take some time to run. This depends on the size of your input data, what model you're using, and the text chunk size being used (these can be configured in your `.env` file).
+This process will take some time to run, depending on the size of your input data, what model you're using, and the text chunk size being used (these can be configured in your `settings.yml` file).
Once the pipeline is complete, you should see a new folder called `./ragtest/output/<timestamp>/artifacts` with a series of parquet files.
# Using the Query Engine
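Since the chunking controls now live in `settings.yml` rather than `.env`, a minimal sketch of the relevant section is shown below. The `chunks` key and field names are assumed from the default configuration template and may differ between versions; the values are illustrative only:

```yaml
# settings.yml (excerpt) - chunking controls referenced above (illustrative)
chunks:
  size: 1200              # tokens per text chunk
  overlap: 100            # token overlap between adjacent chunks
  group_by_columns: [id]  # chunk within each source document
```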
diff --git a/docsite/posts/index/0-architecture.md b/docsite/posts/index/0-architecture.md
index 4826bdabb2..c00e6e8802 100644
--- a/docsite/posts/index/0-architecture.md
+++ b/docsite/posts/index/0-architecture.md
@@ -52,10 +52,8 @@ stateDiagram-v2
Chunk --> ExtractGraph
Chunk --> EmbedDocuments
ExtractGraph --> GenerateReports
+ ExtractGraph --> EmbedEntities
ExtractGraph --> EmbedGraph
- EntityResolution --> EmbedGraph
- EntityResolution --> GenerateReports
- ExtractGraph --> EntityResolution
```
### Dataframe Message Format
diff --git a/docsite/posts/index/1-default_dataflow.md b/docsite/posts/index/1-default_dataflow.md
index 1b8d135dab..f2f9e22a0d 100644
--- a/docsite/posts/index/1-default_dataflow.md
+++ b/docsite/posts/index/1-default_dataflow.md
@@ -34,8 +34,7 @@ flowchart TB
subgraph phase2[Phase 2: Graph Extraction]
textUnits --> graph_extract[Entity & Relationship Extraction]
graph_extract --> graph_summarize[Entity & Relationship Summarization]
- graph_summarize --> entity_resolve[Entity Resolution]
- entity_resolve --> claim_extraction[Claim Extraction]
+ graph_summarize --> claim_extraction[Claim Extraction]
claim_extraction --> graph_outputs[Graph Tables]
end
subgraph phase3[Phase 3: Graph Augmentation]
@@ -95,7 +94,7 @@ Entities and Relationships are extracted at once in our _entity_extract_ verb, a
title: Graph Extraction
---
flowchart LR
- tu[TextUnit] --> ge[Graph Extraction] --> gs[Graph Summarization] --> er[Entity Resolution]
+ tu[TextUnit] --> ge[Graph Extraction] --> gs[Graph Summarization]
tu --> ce[Claim Extraction]
```
@@ -109,18 +108,12 @@ These subgraphs are merged together - any entities with the same _name_ and _typ
Now that we have a graph of entities and relationships, each with a list of descriptions, we can summarize these lists into a single description per entity and relationship. This is done by asking the LLM for a short summary that captures all of the distinct information from each description. This allows all of our entities and relationships to have a single concise description.
-### Entity Resolution (Not Enabled by Default)
-
-The final step of graph extraction is to resolve any entities that represent the same real-world entity but but have different names. Since this is done via LLM, and we don't want to lose information, we want to take a conservative, non-destructive approach to this.
-
-Our current implementation of Entity Resolution, however, is destructive. It will provide the LLM with a series of entities and ask it to determine which ones should be merged. Those entities are then merged together into a single entity and their relationships are updated.
-
-We are currently exploring other entity resolution techniques. In the near future, entity resolution will be executed by creating an edge between entity variants indicating that the entities have been resolved by the indexing engine. This will allow for end-users to undo indexing-side resolutions, and add their own non-destructive resolutions using a similar process.
-
### Claim Extraction & Emission
Finally, as an independent workflow, we extract claims from the source TextUnits. These claims represent positive factual statements with an evaluated status and time-bounds. These are emitted as a primary artifact called **Covariates**.
+Note: claim extraction is _optional_ and turned off by default. This is because claim extraction generally needs prompt tuning to be useful.
+
## Phase 3: Graph Augmentation
Now that we have a usable graph of entities and relationships, we want to understand their community structure and augment the graph with additional information. This is done in two steps: _Community Detection_ and _Graph Embedding_. These give us explicit (communities) and implicit (embeddings) ways of understanding the topological structure of our graph.
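Because claim extraction is opt-in, enabling it is purely a configuration change. A minimal sketch, assuming the `GRAPHRAG_CLAIM_EXTRACTION_ENABLED` flag from the `GRAPHRAG_CLAIM_*` settings referenced elsewhere in this change (the exact flag name may vary by version):

```bash
# Opt in to claim extraction (off by default); tune the claim prompt first for best results.
GRAPHRAG_CLAIM_EXTRACTION_ENABLED=True
```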
diff --git a/examples_notebooks/local_search.ipynb b/examples_notebooks/local_search.ipynb
index 3f1633373c..0c692f02a9 100644
--- a/examples_notebooks/local_search.ipynb
+++ b/examples_notebooks/local_search.ipynb
@@ -145,6 +145,8 @@
"metadata": {},
"outputs": [],
"source": [
+ "# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable\n",
+ "# Please see the GRAPHRAG_CLAIM_* settings\n",
"covariate_df = pd.read_parquet(f\"{INPUT_DIR}/{COVARIATE_TABLE}.parquet\")\n",
"\n",
"claims = read_indexer_covariates(covariate_df)\n",
@@ -240,6 +242,7 @@
" text_units=text_units,\n",
" entities=entities,\n",
" relationships=relationships,\n",
+ " # if you did not run covariates during indexing, set this to None\n",
" covariates=covariates,\n",
" entity_text_embeddings=description_embedding_store,\n",
" embedding_vectorstore_key=EntityVectorStoreKey.ID, # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE\n",
diff --git a/graphrag/index/verbs/entities/summarize/__init__.py b/graphrag/index/verbs/entities/summarize/__init__.py
index 9ba401295e..d7e9a5d93a 100644
--- a/graphrag/index/verbs/entities/summarize/__init__.py
+++ b/graphrag/index/verbs/entities/summarize/__init__.py
@@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
-"""Root package for resolution entities."""
+"""Root package for entity summarization."""
from .description_summarize import SummarizeStrategyType, summarize_descriptions
diff --git a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py
index 665600d0fb..a98d9406cb 100644
--- a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py
+++ b/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py
@@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
-"""The Entity Resolution graph intelligence package root."""
+"""The Entity summarization graph intelligence package root."""
from .run_graph_intelligence import run
diff --git a/graphrag/index/verbs/entities/summarize/strategies/typing.py b/graphrag/index/verbs/entities/summarize/strategies/typing.py
index e950cedc10..398295031b 100644
--- a/graphrag/index/verbs/entities/summarize/strategies/typing.py
+++ b/graphrag/index/verbs/entities/summarize/strategies/typing.py
@@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
-"""A module containing 'ResolvedEntity' and 'EntityResolutionResult' models."""
+"""A module containing 'SummarizedDescriptionResult' model."""
from collections.abc import Awaitable, Callable
from dataclasses import dataclass