From 0b2120ca453715cf71d3576f01c646142bb94cf4 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 27 Nov 2024 09:56:48 -0800 Subject: [PATCH 1/2] Docs and notebooks update (#1451) * Fix local question gen and example notebook * Update global search notebook * Add lazy blog post * Update breaking changes doc for migration notes * Simplify Getting Started page * Semver * Spellcheck * Fix types * Add comments on cache-free migration * Update wording * Spelling --------- Co-authored-by: Alonso Guevara --- .../patch-20241126215650769602.json | 4 + dictionary.txt | 2 + docs/blog_posts.md | 6 + docs/examples_notebooks/global_search.ipynb | 30 +- docs/examples_notebooks/local_search.ipynb | 962 +++++++++++++++++- docs/get_started.md | 15 +- graphrag/query/question_gen/local_gen.py | 41 +- v1-breaking-changes.md | 48 +- 8 files changed, 1037 insertions(+), 71 deletions(-) create mode 100644 .semversioner/next-release/patch-20241126215650769602.json diff --git a/.semversioner/next-release/patch-20241126215650769602.json b/.semversioner/next-release/patch-20241126215650769602.json new file mode 100644 index 000000000..bd384d0f4 --- /dev/null +++ b/.semversioner/next-release/patch-20241126215650769602.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Fix question gen." +} diff --git a/dictionary.txt b/dictionary.txt index 0288faac9..e2ea99f02 100644 --- a/dictionary.txt +++ b/dictionary.txt @@ -24,6 +24,7 @@ getcwd fillna noqa dtypes +ints # Azure abfs @@ -167,6 +168,7 @@ FIRUZABAD Krohaara KROHAARA POKRALLY +René Tazbah TIRUZIA Tiruzia diff --git a/docs/blog_posts.md b/docs/blog_posts.md index b2467a137..2ff64abce 100644 --- a/docs/blog_posts.md +++ b/docs/blog_posts.md @@ -38,4 +38,10 @@ By Bryan Li, Research Intern; [Ha Trinh](https://www.microsoft.com/en-us/research/people/trinhha/), Senior Data Scientist; [Darren Edge](https://www.microsoft.com/en-us/research/people/daedge/), Senior Director; [Jonathan Larson](https://www.microsoft.com/en-us/research/people/jolarso/), Senior Principal Data Architect +- [:octicons-arrow-right-24: __LazyGraphRAG: Setting a new standard for quality and cost__](https://www.microsoft.com/en-us/research/blog/lazygraphrag-setting-a-new-standard-for-quality-and-cost/) + + --- +
Published November 25, 2024 + + By [Darren Edge](https://www.microsoft.com/en-us/research/people/daedge/), Senior Director; [Ha Trinh](https://www.microsoft.com/en-us/research/people/trinhha/), Senior Data Scientist; [Jonathan Larson](https://www.microsoft.com/en-us/research/people/jolarso/), Senior Principal Data Architect
\ No newline at end of file diff --git a/docs/examples_notebooks/global_search.ipynb b/docs/examples_notebooks/global_search.ipynb index 02f8ba63d..14a188e7c 100644 --- a/docs/examples_notebooks/global_search.ipynb +++ b/docs/examples_notebooks/global_search.ipynb @@ -75,9 +75,9 @@ "source": [ "### Load community reports as context for global search\n", "\n", - "- Load all community reports in the `create_final_community_reports` table from the ire-indexing engine, to be used as context data for global search.\n", - "- Load entities from the `create_final_nodes` and `create_final_entities` tables from the ire-indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n", - "- Load all communities in the `create_final_communites` table from the ire-indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection." + "- Load all community reports in the `create_final_community_reports` table from the GraphRAG, to be used as context data for global search.\n", + "- Load entities from the `create_final_nodes` and `create_final_entities` tables from the GraphRAG, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n", + "- Load all communities in the `create_final_communites` table from the GraphRAG, to be used to reconstruct the community graph hierarchy for dynamic community selection." ] }, { @@ -379,21 +379,23 @@ "text": [ "### Overview of Cosmic Vocalization\n", "\n", - "Cosmic Vocalization is a phenomenon that has garnered significant attention from various individuals and groups. It is perceived as a cosmic event with potential implications for security and interstellar communication. The Paranormal Military Squad is actively engaged with Cosmic Vocalization, indicating its strategic importance in security measures [Data: Reports (6)].\n", + "Cosmic Vocalization is a phenomenon that has garnered significant attention within the community, involving various individuals and groups. It is perceived as an interstellar event with potential implications for both communication and security.\n", "\n", - "### Key Perspectives and Concerns\n", + "### Key Perspectives\n", "\n", - "1. **Strategic Engagement**: The Paranormal Military Squad's involvement suggests that Cosmic Vocalization is not only a subject of interest but also a matter of strategic importance. This engagement highlights the potential security implications of these cosmic phenomena [Data: Reports (6)].\n", + "**Alex Mercer's Viewpoint** \n", + "Alex Mercer perceives Cosmic Vocalization as part of an interstellar duet, suggesting that it may be a responsive or communicative event. This perspective highlights the potential for Cosmic Vocalization to be part of a larger cosmic interaction or dialogue [Data: Reports (6)].\n", "\n", - "2. **Community Interest**: Within the community, Cosmic Vocalization is a focal point of interest. 
Alex Mercer, for instance, perceives it as part of an interstellar duet, which suggests a responsive and perhaps communicative approach to these cosmic events [Data: Reports (6)].\n", + "**Taylor Cruz's Concerns** \n", + "Taylor Cruz raises concerns about the nature of Cosmic Vocalization, fearing it might be a homing tune. This adds a layer of urgency and potential threat, as it suggests that the vocalization could be attracting attention from unknown entities or forces [Data: Reports (6)].\n", "\n", - "3. **Potential Threats**: Concerns have been raised by individuals like Taylor Cruz, who fears that Cosmic Vocalization might be a homing tune. This perspective adds a layer of urgency and suggests that there may be potential threats associated with these cosmic sounds [Data: Reports (6)].\n", + "### Involvement of the Paranormal Military Squad\n", "\n", - "### Metaphorical Interpretation\n", + "The Paranormal Military Squad is actively engaged with Cosmic Vocalization, indicating its significance in security measures. Their involvement suggests that the phenomenon is not only of scientific interest but also of strategic importance, potentially impacting national or global security [Data: Reports (6)].\n", "\n", - "The Universe is metaphorically treated as a concert hall by the Paranormal Military Squad, which suggests a broader perspective on how cosmic events are interpreted and responded to by human entities. This metaphorical view may influence how strategies and responses are formulated in relation to Cosmic Vocalization [Data: Reports (6)].\n", + "### Conclusion\n", "\n", - "In summary, Cosmic Vocalization is a complex phenomenon involving strategic, communicative, and potentially threatening elements. The involvement of the Paranormal Military Squad and the concerns raised by community members underscore its significance and the need for careful consideration of its implications.\n" + "Cosmic Vocalization is a complex and multifaceted phenomenon that involves various stakeholders, each with their own perspectives and concerns. The involvement of both individuals like Alex Mercer and Taylor Cruz, as well as organized groups like the Paranormal Military Squad, underscores its importance and the need for further investigation and understanding.\n" ] } ], @@ -638,7 +640,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "LLM calls: 2. Prompt tokens: 11292. Output tokens: 606.\n" + "LLM calls: 2. Prompt tokens: 11237. 
Output tokens: 483.\n" ] } ], @@ -652,7 +654,7 @@ ], "metadata": { "kernelspec": { - "display_name": "graphrag", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -666,7 +668,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.5" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/docs/examples_notebooks/local_search.ipynb b/docs/examples_notebooks/local_search.ipynb index 8f9afb350..8d0e85277 100644 --- a/docs/examples_notebooks/local_search.ipynb +++ b/docs/examples_notebooks/local_search.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -95,9 +95,185 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Entity count: 178\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " level title type \\\n", + "0 0 ALEX MERCER PERSON \n", + "1 0 TAYLOR CRUZ PERSON \n", + "2 0 JORDAN HAYES PERSON \n", + "3 0 SAM RIVERA PERSON \n", + "4 0 PARANORMAL MILITARY SQUAD ORGANIZATION \n", + "\n", + " description \\\n", + "0 Alex Mercer is a commanding and strategic figu... \n", + "1 Taylor Cruz is a central and commanding figure... \n", + "2 Dr. Jordan Hayes is a central figure at Dulce ... \n", + "3 Sam Rivera is a key operator and technologist ... \n", + "4 The Paranormal Military Squad is a specialized... \n", + "\n", + " source_id community degree \\\n", + "0 06bdff339c02ab35c80fa49320d5da66,1f2c6c263f168... 3 57 \n", + "1 06bdff339c02ab35c80fa49320d5da66,1f2c6c263f168... 3 52 \n", + "2 06bdff339c02ab35c80fa49320d5da66,1f2c6c263f168... 3 48 \n", + "3 06bdff339c02ab35c80fa49320d5da66,1b603cdb97651... 3 48 \n", + "4 06bdff339c02ab35c80fa49320d5da66,2db9206de77cf... 3 48 \n", + "\n", + " human_readable_id id size graph_embedding \\\n", + "0 0 b45241d70f0e43fca764df95b2b81f77 57 None \n", + "1 1 4119fd06010c494caa07f439b333f4c5 52 None \n", + "2 2 d3835bf3dda84ead99deadbeac5d0d7d 48 None \n", + "3 3 077d2820ae1845bcbb1803379a3d1eae 48 None \n", + "4 4 3671ea0dd4e84c1a9b02c5ab2c8f4bac 48 None \n", + "\n", + " top_level_node_id x y \n", + "0 b45241d70f0e43fca764df95b2b81f77 0 0 \n", + "1 4119fd06010c494caa07f439b333f4c5 0 0 \n", + "2 d3835bf3dda84ead99deadbeac5d0d7d 0 0 \n", + "3 077d2820ae1845bcbb1803379a3d1eae 0 0 \n", + "4 3671ea0dd4e84c1a9b02c5ab2c8f4bac 0 0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# read nodes table to get community and degree data\n", "entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n", @@ -128,9 +304,161 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Relationship count: 373\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " source target weight \\\n", + "0 ALEX MERCER TAYLOR CRUZ 21.0 \n", + "1 ALEX MERCER JORDAN HAYES 25.0 \n", + "2 ALEX MERCER SAM RIVERA 20.0 \n", + "3 ALEX MERCER OPERATION: DULCE BRIEFING ROOM 1.0 \n", + "4 ALEX MERCER PARANORMAL MILITARY SQUAD 20.0 \n", + "\n", + " description \\\n", + "0 Alex Mercer and Taylor Cruz are integral membe... \n", + "1 Alex Mercer and Jordan Hayes are integral team... \n", + "2 Alex Mercer and Sam Rivera are integral member... \n", + "3 Alex Mercer was present in the Operation: Dulc... \n", + "4 Alex Mercer is a prominent and influential lea... \n", + "\n", + " text_unit_ids \\\n", + "0 [06bdff339c02ab35c80fa49320d5da66, 1f2c6c263f1... \n", + "1 [06bdff339c02ab35c80fa49320d5da66, 1f2c6c263f1... \n", + "2 [06bdff339c02ab35c80fa49320d5da66, 1f2c6c263f1... \n", + "3 [06bdff339c02ab35c80fa49320d5da66] \n", + "4 [06bdff339c02ab35c80fa49320d5da66, 2db9206de77... \n", + "\n", + " id human_readable_id source_degree \\\n", + "0 148fffeb994541b2b4b6dcefda7001a8 0 57 \n", + "1 89c08e793298442686292454a1abff31 1 57 \n", + "2 0467928aa65e4a4fba62bdb1467e3a54 2 57 \n", + "3 43c3390303c6476cb65f584e37c3e81c 3 57 \n", + "4 fa14b16c17e3417dba5a4b473ea5b18d 4 57 \n", + "\n", + " target_degree rank \n", + "0 52 109 \n", + "1 48 105 \n", + "2 48 105 \n", + "3 4 61 \n", + "4 48 105 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "relationship_df = pd.read_parquet(f\"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet\")\n", "relationships = read_indexer_relationships(relationship_df)\n", @@ -141,9 +469,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Claim records: 156\n" + ] + } + ], "source": [ "# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable\n", "# Please see the GRAPHRAG_CLAIM_* settings\n", @@ -164,9 +500,175 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Report records: 20\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " community full_content level rank \\\n", + "0 10 # Paranormal Military Squad at Dulce Base: Dec... 1 8.5 \n", + "1 11 # Dulce and Paranormal Military Squad Operatio... 1 8.5 \n", + "2 12 # Paranormal Military Squad and Dulce Base Ope... 1 7.5 \n", + "3 13 # Mission Dynamics and Leadership: Cruz and Wa... 1 7.5 \n", + "4 14 # Dulce Base and Paranormal Military Squad: Br... 1 8.5 \n", + "\n", + " title \\\n", + "0 Paranormal Military Squad at Dulce Base: Decod... \n", + "1 Dulce and Paranormal Military Squad Operations \n", + "2 Paranormal Military Squad and Dulce Base Opera... \n", + "3 Mission Dynamics and Leadership: Cruz and Wash... \n", + "4 Dulce Base and Paranormal Military Squad: Brid... \n", + "\n", + " rank_explanation \\\n", + "0 The impact severity rating is high due to the ... \n", + "1 The impact severity rating is high due to the ... \n", + "2 The impact severity rating is relatively high ... \n", + "3 The impact severity rating is relatively high ... \n", + "4 The impact severity rating is high due to the ... \n", + "\n", + " summary \\\n", + "0 The Paranormal Military Squad, stationed at Du... \n", + "1 The community centers around Dulce, a secretiv... \n", + "2 The community centers around the Paranormal Mi... \n", + "3 This report explores the intricate dynamics of... \n", + "4 The community centers around the Dulce Base, a... \n", + "\n", + " findings \\\n", + "0 [{'explanation': 'Jordan is a central figure i... \n", + "1 [{'explanation': 'Dulce is described as a top-... \n", + "2 [{'explanation': 'Taylor is a central figure w... \n", + "3 [{'explanation': 'Cruz is a central figure in ... \n", + "4 [{'explanation': 'Sam Rivera, a member of the ... \n", + "\n", + " full_content_json \\\n", + "0 {\\n \"title\": \"Paranormal Military Squad at ... \n", + "1 {\\n \"title\": \"Dulce and Paranormal Military... \n", + "2 {\\n \"title\": \"Paranormal Military Squad and... \n", + "3 {\\n \"title\": \"Mission Dynamics and Leadersh... \n", + "4 {\\n \"title\": \"Dulce Base and Paranormal Mil... \n", + "\n", + " id \n", + "0 1ba2d200-dd26-4693-affe-a5539d0a0e0d \n", + "1 a8a530b0-ae6b-44ea-b11c-9f70d138298d \n", + "2 0478975b-c805-4cc1-b746-82f3e689e2f3 \n", + "3 b56f6e68-3951-4f07-8760-63700944a375 \n", + "4 736e7006-d050-4abb-a122-00febf3f540f " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "report_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet\")\n", "reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)\n", @@ -184,9 +686,150 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text unit records: 50\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " id \\\n", + "0 06bdff339c02ab35c80fa49320d5da66 \n", + "1 28dc4aa41a3e99deb7f354682f3e9904 \n", + "2 813db3138ef511c34be86f841f68aa8f \n", + "3 d0afd106abf3a2966ff88a99eef710db \n", + "4 f4c7c95d7bf621c6eb73d331cbf0b608 \n", + "\n", + " text n_tokens \\\n", + "0 # Operation: Dulce\\n\\n## Chapter 1\\n\\nThe thru... 600 \n", + "1 's authoritarian performance. _Protocols_, Jor... 600 \n", + "2 , rehearsing the speech for their subordinates... 600 \n", + "3 , weighing his words carefully. \"Our tech is t... 600 \n", + "4 stepping into their exchange. The authority i... 600 \n", + "\n", + " document_ids \\\n", + "0 [958fdd043f17ade63cb13570b59df295] \n", + "1 [958fdd043f17ade63cb13570b59df295] \n", + "2 [958fdd043f17ade63cb13570b59df295] \n", + "3 [958fdd043f17ade63cb13570b59df295] \n", + "4 [958fdd043f17ade63cb13570b59df295] \n", + "\n", + " entity_ids \\\n", + "0 [b45241d70f0e43fca764df95b2b81f77, 4119fd06010... \n", + "1 [4119fd06010c494caa07f439b333f4c5, 077d2820ae1... \n", + "2 [b45241d70f0e43fca764df95b2b81f77, 077d2820ae1... \n", + "3 [b45241d70f0e43fca764df95b2b81f77, 4119fd06010... \n", + "4 [077d2820ae1845bcbb1803379a3d1eae, 1fd3fa8bb5a... \n", + "\n", + " relationship_ids \\\n", + "0 [148fffeb994541b2b4b6dcefda7001a8, 89c08e79329... \n", + "1 [9a6f414210e14841a5b0e661aedc898d, db541b72609... \n", + "2 [0467928aa65e4a4fba62bdb1467e3a54, 7cc3356d38d... \n", + "3 [148fffeb994541b2b4b6dcefda7001a8, 89c08e79329... \n", + "4 [478e4c72d8fb46dd8cc9f0691c9878fd, 82b0446e7c9... \n", + "\n", + " covariate_ids \n", + "0 [439081a3-bfeb-4693-968a-0f6189d8fa50, 3399e3d... \n", + "1 [b09594cb-d4b7-4de4-a1af-97778300eb1b, f7c5ea4... \n", + "2 [7642fe21-7cb9-4428-848b-d2e3f5ab10ca, 6fe3d6c... \n", + "3 [956e4c22-e343-4b5d-ad3c-d44ca3ce5fb5] \n", + "4 [a54eda4d-b5bf-471d-989f-370eb9aff961, 9f85274... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "text_unit_df = pd.read_parquet(f\"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet\")\n", "text_units = read_indexer_text_units(text_unit_df)\n", @@ -197,7 +840,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -233,7 +876,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -260,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -303,7 +946,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -326,9 +969,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Overview of Agent Mercer\n", + "\n", + "Agent Alex Mercer is a prominent figure within the Paranormal Military Squad, playing a crucial role in Operation: Dulce at the Dulce Base. He is recognized for his commanding and strategic presence, particularly in overseeing operations related to communication with extraterrestrial intelligence. 
His responsibilities include initiating broadcasts to communicate with extraterrestrial beings, decoding alien messages, and leading the team in understanding and responding to these communications [Data: Entities (0)].\n", + "\n", + "### Role and Responsibilities\n", + "\n", + "Mercer is deeply involved in the philosophical and strategic aspects of interstellar communication, viewing these interactions as a form of cosmic dialogue. His leadership style is characterized by a blend of determination, compliance with mission protocols, and a protective approach towards his team. He collaborates with team members like Jordan Hayes, exploring secured areas and engaging in high-stakes, secretive operations. Mercer emphasizes the importance of intuition and trust beyond protocol, particularly in his mentorship of Sam Rivera [Data: Entities (0)].\n", + "\n", + "### Relationships and Influence\n", + "\n", + "Agent Mercer is known for his intellectual curiosity and deep involvement in the mission's strategic aspects. He is depicted as a thoughtful mentor, particularly to Sam Rivera, and his experiences during encounters with alien signals have led to profound changes, reinforcing his role as a key decision-maker and guardian in missions that transcend traditional boundaries [Data: Entities (0); Relationships (167)].\n", + "\n", + "### Involvement in Operation: Dulce\n", + "\n", + "Operation: Dulce was a significant mission undertaken by the Paranormal Military Squad, with Mercer being a key participant. The operation involved exploring paranormal anomalies and assessing potential threats, marking the beginning of an interstellar odyssey and leading to the first contact with an alien race. Mercer's involvement in this operation highlights his critical role in expanding the boundaries of human understanding of the universe [Data: Entities (5); Relationships (245)].\n", + "\n", + "In summary, Agent Alex Mercer is a pivotal figure in the Paranormal Military Squad, known for his strategic leadership and deep involvement in extraterrestrial communication efforts. His role in Operation: Dulce underscores his importance in the mission to uncover the secrets of the Dulce Base and engage with alien intelligence.\n" + ] + } + ], "source": [ "result = await search_engine.asearch(\"Tell me about Agent Mercer\")\n", "print(result.response)" @@ -336,9 +1003,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Overview of Dr. Jordan Hayes\n", + "\n", + "Dr. Jordan Hayes is a prominent scientist at Dulce Base, playing a crucial role in the Paranormal Military Squad's efforts to understand and communicate with extraterrestrial entities. Dr. Hayes's work is primarily focused on decoding and analyzing alien signals and codes, which are essential for interstellar communication and potentially interspecies interaction. This involves decrypting algorithms, interpreting cosmic signals, and analyzing the implications of alien society [Data: Entities (2, 17, 47); Relationships (201, 322, 283)].\n", + "\n", + "## Key Contributions and Expertise\n", + "\n", + "Dr. Hayes is noted for their adaptability and skepticism, qualities that are essential given the uncertainties and unknown challenges of their mission. They are deeply involved in the scientific exploration aspects of Operation: Dulce, where their efforts are on the verge of a significant scientific breakthrough. Dr. 
Hayes leads efforts in isolating and understanding complex alien signals that resemble human cognition, suggesting that these signals are artificial and patterned, indicating a tandem evolution with humanity [Data: Entities (2, 17); Claims (60, 83, 91, 134)].\n", + "\n", + "## Collaborative Efforts and Leadership\n", + "\n", + "Dr. Hayes works closely with colleagues like Alex Mercer and Taylor Cruz, engaging in thoughtful dialogue and showing analytical thinking about the mission's uncertainties. Despite some tension with Taylor Cruz due to differing leadership styles, Dr. Hayes continues to collaborate effectively with the team, contributing to the strategic discussions and decisions made in the briefing room. Their role in the command center at Dulce Base involves setting up lab stations, operating the mainframe, and playing a crucial role in the command center [Data: Entities (2, 17); Relationships (26, 82, 175, 322); Claims (2, 13)].\n", + "\n", + "## Scientific Breakthroughs and Challenges\n", + "\n", + "Dr. Hayes's work suggests that the alien signals they are studying are not just random but are structured and intentional, potentially indicating a form of extraterrestrial communication. This has led to the identification of new inviting signal patterns suggesting an intelligent confluence. Dr. Hayes's efforts are crucial in crafting humanity's responses to cosmic alignments with stars and responsive galactic signals, navigating the dark corridors of Dulce with a focus on the unknown variables and challenges beyond established protocols [Data: Entities (2, 17); Claims (60, 83, 91, 134, 153)].\n", + "\n", + "In summary, Dr. Jordan Hayes is a central figure in the efforts to understand and communicate with extraterrestrial entities at Dulce Base. Their work is characterized by a blend of scientific rigor, adaptability, and collaboration, making them a vital asset to the Paranormal Military Squad's mission.\n" + ] + } + ], "source": [ "question = \"Tell me about Dr. Jordan Hayes\"\n", "result = await search_engine.asearch(question)\n", @@ -354,18 +1045,218 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " id entity description \\\n", + "0 17 DR. JORDAN HAYES Dr. Jordan Hayes is a key scientist at Dulce B... \n", + "1 7 JORDAN Jordan Hayes is a key member of the Paranormal... \n", + "2 2 JORDAN HAYES Dr. Jordan Hayes is a central figure at Dulce ... \n", + "3 47 DR. HAYES Dr. Hayes is a scientist working on understand... \n", + "4 10 AGENT HAYES Agent Hayes is known for emphasizing empowerme... \n", + "\n", + " number of relationships in_context \n", + "0 18 True \n", + "1 22 True \n", + "2 48 True \n", + "3 1 True \n", + "4 4 True " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "result.context_data[\"entities\"].head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " id source target \\\n", + "0 260 JORDAN DR. JORDAN HAYES \n", + "1 50 ALEX MERCER JORDAN \n", + "2 26 ALEX MERCER DR. JORDAN HAYES \n", + "3 77 TAYLOR CRUZ JORDAN \n", + "4 82 TAYLOR CRUZ DR. JORDAN HAYES \n", + "\n", + " description weight links in_context \n", + "0 Dr. Jordan Hayes and Jordan refer to the same ... 1.0 1 True \n", + "1 Alex Mercer values Jordan's analytical approac... 1.0 2 True \n", + "2 Alex Mercer and Dr. Jordan Hayes are collabora... 5.0 2 True \n", + "3 Jordan and Taylor Cruz are team members workin... 4.0 2 True \n", + "4 Dr. Jordan Hayes and Taylor Cruz are colleague... 4.0 2 True " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "result.context_data[\"relationships\"].head()" ] @@ -374,9 +1265,22 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "KeyError", + "evalue": "'reports'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mresult\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontext_data\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreports\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mhead()\n", + "\u001b[0;31mKeyError\u001b[0m: 'reports'" + ] + } + ], "source": [ - "result.context_data[\"reports\"].head()" + "if \"reports\" in result.context_data:\n", + " result.context_data[\"reports\"].head()" ] }, { @@ -446,7 +1350,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -460,7 +1364,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/docs/get_started.md b/docs/get_started.md index 4eb2f2887..ef6d14407 100644 --- a/docs/get_started.md +++ b/docs/get_started.md @@ -14,11 +14,6 @@ To get started with the GraphRAG system, you have a few options: To get started with the GraphRAG system we recommend trying the [Solution Accelerator](https://github.com/Azure-Samples/graphrag-accelerator) package. This provides a user-friendly end-to-end experience with Azure resources. -# Top-Level Modules - -* [Indexing Pipeline Overview](index/overview.md) -* [Query Engine Overview](query/overview.md) - # Overview The following is a simple end-to-end example for using the GraphRAG system. @@ -34,26 +29,20 @@ The graphrag library includes a CLI for a no-code approach to getting started. P # Running the Indexer -Now we need to set up a data project and some initial configuration. Let's set that up. We're using the [default configuration mode](config/overview.md), which you can customize as needed using a [config file](config/yaml.md), which we recommend, or [environment variables](config/env_vars.md). - -First let's get a sample dataset ready: +We need to set up a data project and some initial configuration. 
First let's get a sample dataset ready: ```sh mkdir -p ./ragtest/input ``` -Now let's get a copy of A Christmas Carol by Charles Dickens from a trusted source +Get a copy of A Christmas Carol by Charles Dickens from a trusted source: ```sh curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt -o ./ragtest/input/book.txt ``` -Next we'll inject some required config variables: - ## Set Up Your Workspace Variables -First let's make sure to setup the required environment variables. For details on these environment variables, and what environment variables are available, see the [variables documentation](config/overview.md). - To initialize your workspace, first run the `graphrag init` command. Since we have already configured a directory named `./ragtest` in the previous step, run the following command: diff --git a/graphrag/query/question_gen/local_gen.py b/graphrag/query/question_gen/local_gen.py index 5372d4cf4..7ba7bb1f0 100644 --- a/graphrag/query/question_gen/local_gen.py +++ b/graphrag/query/question_gen/local_gen.py @@ -5,12 +5,15 @@ import logging import time -from typing import Any +from typing import Any, cast import tiktoken from graphrag.prompts.query.question_gen_system_prompt import QUESTION_SYSTEM_PROMPT -from graphrag.query.context_builder.builders import LocalContextBuilder +from graphrag.query.context_builder.builders import ( + ContextBuilderResult, + LocalContextBuilder, +) from graphrag.query.context_builder.conversation_history import ( ConversationHistory, ) @@ -71,12 +74,17 @@ async def agenerate( if context_data is None: # generate context data based on the question history - context_data, context_records = self.context_builder.build_context( - query=question_text, - conversation_history=conversation_history, - **kwargs, - **self.context_builder_params, - ) # type: ignore + result = cast( + ContextBuilderResult, + self.context_builder.build_context( + query=question_text, + conversation_history=conversation_history, + **kwargs, + **self.context_builder_params, + ), + ) + context_data = cast(str, result.context_chunks) + context_records = result.context_records else: context_records = {"context_data": context_data} log.info("GENERATE QUESTION: %s. LAST QUESTION: %s", start_time, question_text) @@ -144,12 +152,17 @@ def generate( if context_data is None: # generate context data based on the question history - context_data, context_records = self.context_builder.build_context( - query=question_text, - conversation_history=conversation_history, - **kwargs, - **self.context_builder_params, - ) # type: ignore + result = cast( + ContextBuilderResult, + self.context_builder.build_context( + query=question_text, + conversation_history=conversation_history, + **kwargs, + **self.context_builder_params, + ), + ) + context_data = cast(str, result.context_chunks) + context_records = result.context_records else: context_records = {"context_data": context_data} log.info( diff --git a/v1-breaking-changes.md b/v1-breaking-changes.md index a2ec67a5c..1905980cc 100644 --- a/v1-breaking-changes.md +++ b/v1-breaking-changes.md @@ -1,4 +1,50 @@ -# Config Breaking Changes +# GraphRAG Data Model and Config Breaking Changes + +As we worked toward a cleaner codebase, data model, and configuration for the v1 release, we made a few changes that can break older indexes. During the development process we left shims in place to account for these changes, so that all old indexes will work up until v1.0. 
However, with the release of 1.0 we are removing these shims to allow the codebase to move forward without the legacy code elements. This should be a fairly painless process for most users: because we aggressively use a cache for LLM calls, re-running an index over the top of a previous one should be very low (or no) cost. Therefore, our standard migration recommendation is as follows: + +1. Rename or move your settings.yml file to back it up. +2. Re-run `graphrag init` to generate a new default settings.yml. +3. Open your old settings.yml and copy any critical settings that you changed. For most people this is likely only the LLM and embedding config. +4. Re-run `graphrag index`. This will re-execute the standard pipeline, using the cache for any LLM calls that it can. The output parquet tables will be in the latest format. + +Note that one of the new requirements is that we write embeddings to a vector store during indexing. By default, this uses a local lancedb instance. When you re-generate the default config, a block will be added to reflect this. If you need to write to Azure AI Search instead, we recommend updating these settings before you index, so you don't need to do a separate vector ingest. + +All of the breaking changes listed below are accounted for in the four steps above. + +## What if I don't have a cache available? + +If you no longer have your original GraphRAG cache, you can manually update your index. The most important aspect is ensuring that you have the required embeddings stored. If you already have your embeddings in a vector store, much of this can be avoided. + +Parquet changes: +- The `create_final_entities.name` field has been renamed to `create_final_entities.title` for consistency with the other tables. Use your parquet editor of choice to fix this. +- The `create_final_communities.id` field has been renamed to `create_final_communities.community` so that `id` can be repurposed for a UUID like the other tables. Use your parquet editor of choice to copy and rename this. You can copy it to leave the `id` field in place, or use a tool such as pandas to give each community a new UUID in the `id` field. (We join on the `community` field internally, so `id` can be effectively ignored). + +Embeddings changes: +- For Local Search, you need to have the entity.description embeddings in a vector store +- For DRIFT Search, you need the community.full_content embeddings in a vector store +- If you are only using Global search, you do not need any embeddings + +The easiest way to get both of those is to run the pipeline with all workflows skipped except for `generate_embeddings`, which will embed those fields and write them to a vector store directly. Using a newer config file that has the embeddings.vector_store block: + +- Set the `skip_workflows` value to [create_base_entity_graph, create_base_text_units, create_final_text_units, create_final_community_reports, create_final_nodes, create_final_relationships, create_final_documents, create_final_covariates, create_final_entities, create_final_communities] +- Re-run `graphrag index` + +What this does is run the pipeline, but skip over all of the usual artifact generation - the only workflow that is not skipped is the one that generates all default (or otherwise configured) embeddings. + +## Updated data model + +- We have streamlined the data model of the index in a few small ways to align tables more consistently and remove redundant content. 
Notably: + - Consistent use of `id` and `human_readable_id` across all tables; this also insures all int IDs are actually saved as ints and never strings + - Alignment of fields from `create_final_entities` (such as name -> title) with `create_final_nodes`, and removal of redundant content across these tables + - Rename of `document.raw_content` to `document.text` + - Rename of `entity.name` to `entity.title` + - Rename `rank` to `combined_degree` in `create_final_relationships` and removal of `source_degree` and `target_degree`fields + - Fixed community tables to use a proper UUID for the `id` field, and retain `community` and `human_readable_id` for the short IDs + - Removal of all embeddings columns from parquet files in favor of direct vector store writes + +### Migration + +- Run a new index, leveraging existing cache. ## New required Embeddings From dad2176b3ca63847fefc6442ce11ea06e5982e60 Mon Sep 17 00:00:00 2001 From: Josh Bradley Date: Wed, 27 Nov 2024 13:27:43 -0500 Subject: [PATCH 2/2] Miscellaneous code cleanup procedures (#1452) --- .../patch-20241127084633163555.json | 4 ++ docs/config/env_vars.md | 4 +- docs/config/yaml.md | 14 ++-- docs/index/default_dataflow.md | 12 ++-- examples/use_built_in_workflows/run.py | 4 +- graphrag/api/index.py | 12 +--- graphrag/api/query.py | 2 +- graphrag/{index/storage => cache}/__init__.py | 2 +- .../cache/load_cache.py => cache/factory.py} | 21 +++--- .../{index => }/cache/json_pipeline_cache.py | 4 +- .../cache/memory_pipeline_cache.py | 7 +- .../{index => }/cache/noop_pipeline_cache.py | 2 +- graphrag/{index => }/cache/pipeline_cache.py | 0 graphrag/callbacks/__init__.py | 2 +- .../callbacks/{factories.py => factory.py} | 0 graphrag/cli/index.py | 9 +-- graphrag/cli/initialize.py | 2 +- graphrag/cli/main.py | 9 --- graphrag/cli/query.py | 9 +-- graphrag/config/__init__.py | 2 +- graphrag/index/context.py | 4 +- graphrag/index/emit/__init__.py | 4 -- graphrag/index/emit/csv_table_emitter.py | 32 --------- graphrag/index/emit/factories.py | 40 ----------- graphrag/index/emit/json_table_emitter.py | 33 --------- graphrag/index/emit/table_emitter.py | 15 ----- graphrag/index/emit/types.py | 18 ----- .../parquet_table_emitter.py => exporter.py} | 24 ++++--- .../index/flows/create_base_entity_graph.py | 4 +- .../index/flows/create_base_text_units.py | 2 +- .../flows/create_final_community_reports.py | 2 +- .../index/flows/create_final_covariates.py | 2 +- graphrag/index/flows/create_final_nodes.py | 2 +- .../index/flows/generate_text_embeddings.py | 4 +- graphrag/index/input/csv.py | 2 +- .../index/input/{load_input.py => factory.py} | 10 +-- graphrag/index/input/text.py | 2 +- graphrag/index/llm/load_llm.py | 2 +- graphrag/index/operations/cluster_graph.py | 4 +- .../index/operations/embed_text/embed_text.py | 2 +- .../operations/embed_text/strategies/mock.py | 2 +- .../embed_text/strategies/openai.py | 2 +- .../embed_text/strategies/typing.py | 2 +- .../extract_covariates/extract_covariates.py | 2 +- .../extract_covariates/strategies.py | 2 +- .../operations/extract_covariates/typing.py | 2 +- .../extract_entities/extract_entities.py | 2 +- .../strategies/graph_intelligence.py | 2 +- .../extract_entities/strategies/nltk.py | 2 +- .../extract_entities/strategies/typing.py | 2 +- graphrag/index/operations/snapshot.py | 2 +- graphrag/index/operations/snapshot_graphml.py | 2 +- graphrag/index/operations/snapshot_rows.py | 2 +- .../summarize_communities/strategies.py | 2 +- .../summarize_communities.py | 2 +- .../summarize_communities/typing.py | 
2 +- .../summarize_descriptions/strategies.py | 2 +- .../summarize_descriptions.py | 2 +- .../summarize_descriptions/typing.py | 2 +- graphrag/index/run/cache.py | 17 ----- graphrag/index/run/profiling.py | 2 +- graphrag/index/run/run.py | 67 +++++++++---------- graphrag/index/run/utils.py | 35 ++-------- graphrag/index/run/workflow.py | 30 ++++----- graphrag/index/update/entities.py | 2 +- graphrag/index/update/incremental_index.py | 4 +- .../v1/subflows/create_base_entity_graph.py | 4 +- .../v1/subflows/create_base_text_units.py | 2 +- .../v1/subflows/create_final_communities.py | 2 +- .../create_final_community_reports.py | 2 +- .../v1/subflows/create_final_covariates.py | 4 +- .../v1/subflows/create_final_documents.py | 2 +- .../v1/subflows/create_final_entities.py | 2 +- .../v1/subflows/create_final_nodes.py | 2 +- .../v1/subflows/create_final_relationships.py | 2 +- .../v1/subflows/create_final_text_units.py | 2 +- .../v1/subflows/generate_text_embeddings.py | 4 +- graphrag/logging/{factories.py => factory.py} | 0 graphrag/prompt_tune/loader/input.py | 4 +- graphrag/prompts/__init__.py | 2 +- graphrag/query/{factories.py => factory.py} | 0 graphrag/query/indexer_adapters.py | 2 +- graphrag/{index/cache => storage}/__init__.py | 2 +- .../storage/blob_pipeline_storage.py | 6 +- .../load_storage.py => storage/factory.py} | 14 ++-- .../storage/file_pipeline_storage.py | 2 +- .../storage/memory_pipeline_storage.py | 13 ++-- .../{index => }/storage/pipeline_storage.py | 0 graphrag/utils/__init__.py | 2 +- graphrag/utils/storage.py | 30 +-------- graphrag/vector_stores/__init__.py | 2 +- .../storage/test_blob_pipeline_storage.py | 2 +- .../storage/test_file_pipeline_storage.py | 2 +- tests/smoke/test_fixtures.py | 2 +- .../cache/test_file_pipeline_cache.py | 4 +- .../{test_emit.py => test_export.py} | 12 ++-- 96 files changed, 201 insertions(+), 452 deletions(-) create mode 100644 .semversioner/next-release/patch-20241127084633163555.json rename graphrag/{index/storage => cache}/__init__.py (60%) rename graphrag/{index/cache/load_cache.py => cache/factory.py} (66%) rename graphrag/{index => }/cache/json_pipeline_cache.py (93%) rename graphrag/{index => }/cache/memory_pipeline_cache.py (92%) rename graphrag/{index => }/cache/noop_pipeline_cache.py (96%) rename graphrag/{index => }/cache/pipeline_cache.py (100%) rename graphrag/callbacks/{factories.py => factory.py} (100%) delete mode 100644 graphrag/index/emit/__init__.py delete mode 100644 graphrag/index/emit/csv_table_emitter.py delete mode 100644 graphrag/index/emit/factories.py delete mode 100644 graphrag/index/emit/json_table_emitter.py delete mode 100644 graphrag/index/emit/table_emitter.py delete mode 100644 graphrag/index/emit/types.py rename graphrag/index/{emit/parquet_table_emitter.py => exporter.py} (61%) rename graphrag/index/input/{load_input.py => factory.py} (91%) delete mode 100644 graphrag/index/run/cache.py rename graphrag/logging/{factories.py => factory.py} (100%) rename graphrag/query/{factories.py => factory.py} (100%) rename graphrag/{index/cache => storage}/__init__.py (62%) rename graphrag/{index => }/storage/blob_pipeline_storage.py (98%) rename graphrag/{index/storage/load_storage.py => storage/factory.py} (68%) rename graphrag/{index => }/storage/file_pipeline_storage.py (98%) rename graphrag/{index => }/storage/memory_pipeline_storage.py (87%) rename graphrag/{index => }/storage/pipeline_storage.py (100%) rename tests/unit/indexing/workflows/{test_emit.py => test_export.py} (90%) diff --git 
a/.semversioner/next-release/patch-20241127084633163555.json b/.semversioner/next-release/patch-20241127084633163555.json new file mode 100644 index 000000000..01699f7e0 --- /dev/null +++ b/.semversioner/next-release/patch-20241127084633163555.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "miscellaneous code cleanup and minor changes for better alignment of style across the codebase." +} diff --git a/docs/config/env_vars.md b/docs/config/env_vars.md index 6632ace9e..8474d6f46 100644 --- a/docs/config/env_vars.md +++ b/docs/config/env_vars.md @@ -2,7 +2,7 @@ ## Text-Embeddings Customization -By default, the GraphRAG indexer will only emit embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be generated by setting the `GRAPHRAG_EMBEDDING_TARGET` environment variable to `all`. +By default, the GraphRAG indexer will only export embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be generated by setting the `GRAPHRAG_EMBEDDING_TARGET` environment variable to `all`. If the embedding target is `all`, and you want to only embed a subset of these fields, you may specify which embeddings to skip using the `GRAPHRAG_EMBEDDING_SKIP` argument described below. @@ -152,7 +152,7 @@ These settings control the data input used by the pipeline. Any settings with a ## Storage -This section controls the storage mechanism used by the pipeline used for emitting output tables. +This section controls the storage mechanism used by the pipeline used for exporting output tables. | Parameter | Description | Type | Required or Optional | Default | | ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | -------------------- | ------- | diff --git a/docs/config/yaml.md b/docs/config/yaml.md index 5b4012570..11b803ece 100644 --- a/docs/config/yaml.md +++ b/docs/config/yaml.md @@ -67,7 +67,7 @@ This is the base LLM configuration section. Other steps may override this config - `async_mode` (see Async Mode top-level config) - `batch_size` **int** - The maximum batch size to use. - `batch_max_tokens` **int** - The maximum batch # of tokens. -- `target` **required|all|none** - Determines which set of embeddings to emit. +- `target` **required|all|none** - Determines which set of embeddings to export. - `skip` **list[str]** - Which embeddings to skip. Only useful if target=all to customize the list. - `vector_store` **dict** - The vector store to use. Configured for lancedb by default. - `type` **str** - `lancedb` or `azure_ai_search`. Default=`lancedb` @@ -203,7 +203,7 @@ This is the base LLM configuration section. Other steps may override this config #### Fields -- `max_cluster_size` **int** - The maximum cluster size to emit. +- `max_cluster_size` **int** - The maximum cluster size to export. - `strategy` **dict** - Fully override the cluster_graph strategy. ### embed_graph @@ -228,11 +228,11 @@ This is the base LLM configuration section. Other steps may override this config #### Fields -- `embeddings` **bool** - Emit embeddings snapshots to parquet. -- `graphml` **bool** - Emit graph snapshots to GraphML. -- `raw_entities` **bool** - Emit raw entity snapshots to JSON. -- `top_level_nodes` **bool** - Emit top-level-node snapshots to JSON. 
-- `transient` **bool** - Emit transient workflow tables snapshots to parquet. +- `embeddings` **bool** - Export embeddings snapshots to parquet. +- `graphml` **bool** - Export graph snapshots to GraphML. +- `raw_entities` **bool** - Export raw entity snapshots to JSON. +- `top_level_nodes` **bool** - Export top-level-node snapshots to JSON. +- `transient` **bool** - Export transient workflow tables snapshots to parquet. ### encoding_model diff --git a/docs/index/default_dataflow.md b/docs/index/default_dataflow.md index 1397d8d85..6f5595654 100644 --- a/docs/index/default_dataflow.md +++ b/docs/index/default_dataflow.md @@ -105,9 +105,9 @@ Now that we have a graph of entities and relationships, each with a list of desc ### Claim Extraction & Emission -Finally, as an independent workflow, we extract claims from the source TextUnits. These claims represent positive factual statements with an evaluated status and time-bounds. These are emitted as a primary artifact called **Covariates**. +Finally, as an independent workflow, we extract claims from the source TextUnits. These claims represent positive factual statements with an evaluated status and time-bounds. These get exported as a primary artifact called **Covariates**. -Note: claim extraction is _optional_ and turned off by default. This is because claim extraction generally needs prompt tuning to be useful. +Note: claim extraction is _optional_ and turned off by default. This is because claim extraction generally requires prompt tuning to be useful. ## Phase 3: Graph Augmentation @@ -131,7 +131,7 @@ In this step, we generate a vector representation of our graph using the Node2Ve ### Graph Tables Emission -Once our graph augmentation steps are complete, the final **Entities** and **Relationships** tables are emitted after their text fields are text-embedded. +Once our graph augmentation steps are complete, the final **Entities** and **Relationships** tables are exported after their text fields are text-embedded. ## Phase 4: Community Summarization @@ -161,7 +161,7 @@ In this step, we generate a vector representation of our communities by generati ### Community Tables Emission -At this point, some bookkeeping work is performed and we emit the **Communities** and **CommunityReports** tables. +At this point, some bookkeeping work is performed and we export the **Communities** and **CommunityReports** tables. ## Phase 5: Document Processing @@ -189,7 +189,7 @@ In this step, we generate a vector representation of our documents using an aver ### Documents Table Emission -At this point, we can emit the **Documents** table into the knowledge Model. +At this point, we can export the **Documents** table into the knowledge Model. ## Phase 6: Network Visualization @@ -203,4 +203,4 @@ flowchart LR nv[Umap Documents] --> ne[Umap Entities] --> ng[Nodes Table Emission] ``` -For each of the logical graphs, we perform a UMAP dimensionality reduction to generate a 2D representation of the graph. This will allow us to visualize the graph in a 2D space and understand the relationships between the nodes in the graph. The UMAP embeddings are then emitted as a table of _Nodes_. The rows of this table include a discriminator indicating whether the node is a document or an entity, and the UMAP coordinates. +For each of the logical graphs, we perform a UMAP dimensionality reduction to generate a 2D representation of the graph. This will allow us to visualize the graph in a 2D space and understand the relationships between the nodes in the graph. 
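To make this concrete, here is a rough sketch of such a reduction using the umap-learn package on synthetic embeddings (an illustration only; the pipeline's own implementation may differ):

```python
import numpy as np
import pandas as pd
import umap  # umap-learn; an illustrative choice, not necessarily what the pipeline uses

# Synthetic embeddings standing in for the document/entity node vectors.
rng = np.random.default_rng(42)
embeddings = rng.normal(size=(100, 64))
node_type = ["document"] * 20 + ["entity"] * 80

# Reduce to two dimensions so the graph can be laid out on a plane.
coords = umap.UMAP(n_components=2, random_state=42).fit_transform(embeddings)

# A Nodes-style table: type discriminator plus the UMAP coordinates.
nodes = pd.DataFrame({"type": node_type, "x": coords[:, 0], "y": coords[:, 1]})
print(nodes.head())
```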
The UMAP embeddings are then exported as a table of _Nodes_. The rows of this table include a discriminator indicating whether the node is a document or an entity, and the UMAP coordinates. diff --git a/examples/use_built_in_workflows/run.py b/examples/use_built_in_workflows/run.py index 7212126d0..adda7f6b4 100644 --- a/examples/use_built_in_workflows/run.py +++ b/examples/use_built_in_workflows/run.py @@ -5,7 +5,7 @@ from graphrag.index.config.input import PipelineCSVInputConfig from graphrag.index.config.workflow import PipelineWorkflowReference -from graphrag.index.input.load_input import load_input +from graphrag.index.input.factory import create_input from graphrag.index.run import run_pipeline, run_pipeline_with_config sample_data_dir = os.path.join( @@ -14,7 +14,7 @@ # Load our dataset once shared_dataset = asyncio.run( - load_input( + create_input( PipelineCSVInputConfig( file_pattern=".*\\.csv$", base_dir=sample_data_dir, diff --git a/graphrag/api/index.py b/graphrag/api/index.py index 90fb30694..4b2355fdf 100644 --- a/graphrag/api/index.py +++ b/graphrag/api/index.py @@ -10,11 +10,10 @@ from pathlib import Path +from graphrag.cache.noop_pipeline_cache import NoopPipelineCache from graphrag.config.enums import CacheType from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.cache.noop_pipeline_cache import NoopPipelineCache from graphrag.index.create_pipeline_config import create_pipeline_config -from graphrag.index.emit.types import TableEmitterType from graphrag.index.run import run_pipeline_with_config from graphrag.index.typing import PipelineRunResult from graphrag.logging.base import ProgressReporter @@ -27,7 +26,6 @@ async def build_index( is_resume_run: bool = False, memory_profile: bool = False, progress_reporter: ProgressReporter | None = None, - emit: list[TableEmitterType] = [TableEmitterType.Parquet], # noqa: B006 ) -> list[PipelineRunResult]: """Run the pipeline with the given configuration. @@ -45,9 +43,6 @@ async def build_index( Whether to enable memory profiling. progress_reporter : ProgressReporter | None default=None The progress reporter. - emit : list[str] - The list of emitter types to emit. - Accepted values {"parquet", "csv"}. Returns ------- @@ -60,10 +55,6 @@ async def build_index( msg = "Cannot resume and update a run at the same time." 
raise ValueError(msg) - # Ensure Parquet is part of the emitters - if TableEmitterType.Parquet not in emit: - emit.append(TableEmitterType.Parquet) - config = _patch_vector_config(config) pipeline_config = create_pipeline_config(config) @@ -77,7 +68,6 @@ async def build_index( memory_profile=memory_profile, cache=pipeline_cache, progress_reporter=progress_reporter, - emit=emit, is_resume_run=is_resume_run, is_update_run=is_update_run, ): diff --git a/graphrag/api/query.py b/graphrag/api/query.py index 7149211c8..2eab003cb 100644 --- a/graphrag/api/query.py +++ b/graphrag/api/query.py @@ -30,7 +30,7 @@ entity_description_embedding, ) from graphrag.logging.print_progress import PrintProgressReporter -from graphrag.query.factories import ( +from graphrag.query.factory import ( get_drift_search_engine, get_global_search_engine, get_local_search_engine, diff --git a/graphrag/index/storage/__init__.py b/graphrag/cache/__init__.py similarity index 60% rename from graphrag/index/storage/__init__.py rename to graphrag/cache/__init__.py index 51f34cbd3..9c4e8be3f 100644 --- a/graphrag/index/storage/__init__.py +++ b/graphrag/cache/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""The Indexing Engine storage package root.""" +"""A package containing cache implementations.""" diff --git a/graphrag/index/cache/load_cache.py b/graphrag/cache/factory.py similarity index 66% rename from graphrag/index/cache/load_cache.py rename to graphrag/cache/factory.py index 91f633367..7253fc9d6 100644 --- a/graphrag/index/cache/load_cache.py +++ b/graphrag/cache/factory.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""A module containing load_cache method definition.""" +"""A module containing create_cache method definition.""" from __future__ import annotations @@ -12,21 +12,24 @@ PipelineBlobCacheConfig, PipelineFileCacheConfig, ) -from graphrag.index.storage.blob_pipeline_storage import BlobPipelineStorage -from graphrag.index.storage.file_pipeline_storage import FilePipelineStorage +from graphrag.storage.blob_pipeline_storage import BlobPipelineStorage +from graphrag.storage.file_pipeline_storage import FilePipelineStorage if TYPE_CHECKING: + from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.config.cache import ( PipelineCacheConfig, ) -from graphrag.index.cache.json_pipeline_cache import JsonPipelineCache -from graphrag.index.cache.memory_pipeline_cache import create_memory_cache -from graphrag.index.cache.noop_pipeline_cache import NoopPipelineCache +from graphrag.cache.json_pipeline_cache import JsonPipelineCache +from graphrag.cache.memory_pipeline_cache import InMemoryCache +from graphrag.cache.noop_pipeline_cache import NoopPipelineCache -def load_cache(config: PipelineCacheConfig | None, root_dir: str | None): - """Load the cache from the given config.""" +def create_cache( + config: PipelineCacheConfig | None, root_dir: str | None +) -> PipelineCache: + """Create a cache from the given config.""" if config is None: return NoopPipelineCache() @@ -34,7 +37,7 @@ def load_cache(config: PipelineCacheConfig | None, root_dir: str | None): case CacheType.none: return NoopPipelineCache() case CacheType.memory: - return create_memory_cache() + return InMemoryCache() case CacheType.file: config = cast(PipelineFileCacheConfig, config) storage = FilePipelineStorage(root_dir).child(config.base_dir) diff --git a/graphrag/index/cache/json_pipeline_cache.py 
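For reference, a minimal sketch of the renamed cache factory in use. The async get/set calls and the cache key are illustrative assumptions; the `create_cache` signature and the config types follow the code above:

```python
import asyncio

from graphrag.cache.factory import create_cache
from graphrag.index.config.cache import PipelineMemoryCacheConfig

async def main() -> None:
    # None falls back to a no-op cache; a memory config yields InMemoryCache.
    noop_cache = create_cache(None, root_dir=None)
    memory_cache = create_cache(PipelineMemoryCacheConfig(), root_dir=None)

    # PipelineCache exposes async get/set (key and value here are purely illustrative).
    await memory_cache.set("llm:example-key", '{"result": "cached response"}')
    print(await memory_cache.get("llm:example-key"))
    print(type(noop_cache).__name__)  # NoopPipelineCache

asyncio.run(main())
```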
b/graphrag/cache/json_pipeline_cache.py similarity index 93% rename from graphrag/index/cache/json_pipeline_cache.py rename to graphrag/cache/json_pipeline_cache.py index 13d521239..8993e04f3 100644 --- a/graphrag/index/cache/json_pipeline_cache.py +++ b/graphrag/cache/json_pipeline_cache.py @@ -6,8 +6,8 @@ import json from typing import Any -from graphrag.index.cache.pipeline_cache import PipelineCache -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.storage.pipeline_storage import PipelineStorage class JsonPipelineCache(PipelineCache): diff --git a/graphrag/index/cache/memory_pipeline_cache.py b/graphrag/cache/memory_pipeline_cache.py similarity index 92% rename from graphrag/index/cache/memory_pipeline_cache.py rename to graphrag/cache/memory_pipeline_cache.py index 2a9e19c9c..62de552f9 100644 --- a/graphrag/index/cache/memory_pipeline_cache.py +++ b/graphrag/cache/memory_pipeline_cache.py @@ -5,7 +5,7 @@ from typing import Any -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache class InMemoryCache(PipelineCache): @@ -76,8 +76,3 @@ def child(self, name: str) -> PipelineCache: def _create_cache_key(self, key: str) -> str: """Create a cache key for the given key.""" return f"{self._name}{key}" - - -def create_memory_cache() -> PipelineCache: - """Create a memory cache.""" - return InMemoryCache() diff --git a/graphrag/index/cache/noop_pipeline_cache.py b/graphrag/cache/noop_pipeline_cache.py similarity index 96% rename from graphrag/index/cache/noop_pipeline_cache.py rename to graphrag/cache/noop_pipeline_cache.py index 738787ad3..227ef687b 100644 --- a/graphrag/index/cache/noop_pipeline_cache.py +++ b/graphrag/cache/noop_pipeline_cache.py @@ -5,7 +5,7 @@ from typing import Any -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache class NoopPipelineCache(PipelineCache): diff --git a/graphrag/index/cache/pipeline_cache.py b/graphrag/cache/pipeline_cache.py similarity index 100% rename from graphrag/index/cache/pipeline_cache.py rename to graphrag/cache/pipeline_cache.py diff --git a/graphrag/callbacks/__init__.py b/graphrag/callbacks/__init__.py index c6b2def2f..d62357135 100644 --- a/graphrag/callbacks/__init__.py +++ b/graphrag/callbacks/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""A module containing callback implementations.""" +"""A package containing callback implementations.""" diff --git a/graphrag/callbacks/factories.py b/graphrag/callbacks/factory.py similarity index 100% rename from graphrag/callbacks/factories.py rename to graphrag/callbacks/factory.py diff --git a/graphrag/cli/index.py b/graphrag/cli/index.py index 90a720fab..3e03ab5af 100644 --- a/graphrag/cli/index.py +++ b/graphrag/cli/index.py @@ -15,10 +15,9 @@ from graphrag.config.load_config import load_config from graphrag.config.logging import enable_logging_with_config from graphrag.config.resolve_path import resolve_paths -from graphrag.index.emit.types import TableEmitterType from graphrag.index.validate_config import validate_config_names from graphrag.logging.base import ProgressReporter -from graphrag.logging.factories import create_progress_reporter +from graphrag.logging.factory import create_progress_reporter from graphrag.logging.types import ReporterType from graphrag.utils.cli import redact @@ -73,7 +72,6 @@ def index_cli( cache: bool, reporter: ReporterType, config_filepath: Path | None, - emit: list[TableEmitterType], dry_run: bool, skip_validation: bool, output_dir: Path | None, @@ -88,7 +86,6 @@ def index_cli( memprofile=memprofile, cache=cache, reporter=reporter, - emit=emit, dry_run=dry_run, skip_validation=skip_validation, output_dir=output_dir, @@ -102,7 +99,6 @@ def update_cli( cache: bool, reporter: ReporterType, config_filepath: Path | None, - emit: list[TableEmitterType], skip_validation: bool, output_dir: Path | None, ): @@ -126,7 +122,6 @@ def update_cli( memprofile=memprofile, cache=cache, reporter=reporter, - emit=emit, dry_run=False, skip_validation=skip_validation, output_dir=output_dir, @@ -140,7 +135,6 @@ def _run_index( memprofile, cache, reporter, - emit, dry_run, skip_validation, output_dir, @@ -189,7 +183,6 @@ def _run_index( is_resume_run=bool(resume), memory_profile=memprofile, progress_reporter=progress_reporter, - emit=emit, ) ) encountered_errors = any( diff --git a/graphrag/cli/initialize.py b/graphrag/cli/initialize.py index 992d38f71..7d2bba7ef 100644 --- a/graphrag/cli/initialize.py +++ b/graphrag/cli/initialize.py @@ -6,7 +6,7 @@ from pathlib import Path from graphrag.config.init_content import INIT_DOTENV, INIT_YAML -from graphrag.logging.factories import create_progress_reporter +from graphrag.logging.factory import create_progress_reporter from graphrag.logging.types import ReporterType from graphrag.prompts.index.claim_extraction import CLAIM_EXTRACTION_PROMPT from graphrag.prompts.index.community_report import ( diff --git a/graphrag/cli/main.py b/graphrag/cli/main.py index 919015ae3..c084d0625 100644 --- a/graphrag/cli/main.py +++ b/graphrag/cli/main.py @@ -12,7 +12,6 @@ import typer -from graphrag.index.emit.types import TableEmitterType from graphrag.logging.types import ReporterType from graphrag.prompt_tune.defaults import ( MAX_TOKEN_COUNT, @@ -138,9 +137,6 @@ def _index_cli( reporter: Annotated[ ReporterType, typer.Option(help="The progress reporter to use.") ] = ReporterType.RICH, - emit: Annotated[ - str, typer.Option(help="The data formats to emit, comma-separated.") - ] = TableEmitterType.Parquet.value, dry_run: Annotated[ bool, typer.Option( @@ -175,7 +171,6 @@ def _index_cli( cache=cache, reporter=ReporterType(reporter), config_filepath=config, - emit=[TableEmitterType(value.strip()) for value in emit.split(",")], dry_run=dry_run, skip_validation=skip_validation, output_dir=output, @@ -209,9 +204,6 
@@ def _update_cli( reporter: Annotated[ ReporterType, typer.Option(help="The progress reporter to use.") ] = ReporterType.RICH, - emit: Annotated[ - str, typer.Option(help="The data formats to emit, comma-separated.") - ] = TableEmitterType.Parquet.value, cache: Annotated[bool, typer.Option(help="Use LLM cache.")] = True, skip_validation: Annotated[ bool, @@ -243,7 +235,6 @@ def _update_cli( cache=cache, reporter=ReporterType(reporter), config_filepath=config, - emit=[TableEmitterType(value.strip()) for value in emit.split(",")], skip_validation=skip_validation, output_dir=output, ) diff --git a/graphrag/cli/query.py b/graphrag/cli/query.py index ea9116c69..f11dfd60c 100644 --- a/graphrag/cli/query.py +++ b/graphrag/cli/query.py @@ -15,7 +15,8 @@ from graphrag.config.resolve_path import resolve_paths from graphrag.index.create_pipeline_config import create_pipeline_config from graphrag.logging.print_progress import PrintProgressReporter -from graphrag.utils.storage import _create_storage, _load_table_from_storage +from graphrag.storage.factory import create_storage +from graphrag.utils.storage import _load_table_from_storage reporter = PrintProgressReporter("") @@ -40,7 +41,6 @@ def run_global_search( resolve_paths(config) dataframe_dict = _resolve_parquet_files( - root_dir=root_dir, config=config, parquet_list=[ "create_final_nodes.parquet", @@ -126,7 +126,6 @@ def run_local_search( # TODO remove optional create_final_entities_description_embeddings.parquet to delete backwards compatibility dataframe_dict = _resolve_parquet_files( - root_dir=root_dir, config=config, parquet_list=[ "create_final_nodes.parquet", @@ -217,7 +216,6 @@ def run_drift_search( resolve_paths(config) dataframe_dict = _resolve_parquet_files( - root_dir=root_dir, config=config, parquet_list=[ "create_final_nodes.parquet", @@ -261,7 +259,6 @@ def run_drift_search( def _resolve_parquet_files( - root_dir: Path, config: GraphRagConfig, parquet_list: list[str], optional_list: list[str] | None = None, @@ -269,7 +266,7 @@ def _resolve_parquet_files( """Read parquet files to a dataframe dict.""" dataframe_dict = {} pipeline_config = create_pipeline_config(config) - storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage) + storage_obj = create_storage(pipeline_config.storage) # type: ignore for parquet_file in parquet_list: df_key = parquet_file.split(".")[0] df_value = asyncio.run( diff --git a/graphrag/config/__init__.py b/graphrag/config/__init__.py index 90e6e010a..926d8d97f 100644 --- a/graphrag/config/__init__.py +++ b/graphrag/config/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""The Indexing Engine default config package root.""" +"""The config package root.""" diff --git a/graphrag/index/context.py b/graphrag/index/context.py index fa4c9b272..c45decd17 100644 --- a/graphrag/index/context.py +++ b/graphrag/index/context.py @@ -7,8 +7,8 @@ from dataclasses import dataclass as dc_dataclass from dataclasses import field -from graphrag.index.cache.pipeline_cache import PipelineCache -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.storage.pipeline_storage import PipelineStorage @dc_dataclass diff --git a/graphrag/index/emit/__init__.py b/graphrag/index/emit/__init__.py deleted file mode 100644 index 7ae6eea9f..000000000 --- a/graphrag/index/emit/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Definitions for emitting pipeline artifacts to storage.""" diff --git a/graphrag/index/emit/csv_table_emitter.py b/graphrag/index/emit/csv_table_emitter.py deleted file mode 100644 index 3ba976b8d..000000000 --- a/graphrag/index/emit/csv_table_emitter.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""CSVTableEmitter module.""" - -import logging - -import pandas as pd - -from graphrag.index.emit.table_emitter import TableEmitter -from graphrag.index.storage.pipeline_storage import PipelineStorage - -log = logging.getLogger(__name__) - - -class CSVTableEmitter(TableEmitter): - """CSVTableEmitter class.""" - - _storage: PipelineStorage - - def __init__(self, storage: PipelineStorage): - """Create a new CSV Table Emitter.""" - self._storage = storage - - async def emit(self, name: str, data: pd.DataFrame) -> None: - """Emit a dataframe to storage.""" - filename = f"{name}.csv" - log.info("emitting CSV table %s", filename) - await self._storage.set( - filename, - data.to_csv(), - ) diff --git a/graphrag/index/emit/factories.py b/graphrag/index/emit/factories.py deleted file mode 100644 index 9a83e7185..000000000 --- a/graphrag/index/emit/factories.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Table Emitter Factories.""" - -from graphrag.index.emit.csv_table_emitter import CSVTableEmitter -from graphrag.index.emit.json_table_emitter import JsonTableEmitter -from graphrag.index.emit.parquet_table_emitter import ParquetTableEmitter -from graphrag.index.emit.table_emitter import TableEmitter -from graphrag.index.emit.types import TableEmitterType -from graphrag.index.storage.pipeline_storage import PipelineStorage -from graphrag.index.typing import ErrorHandlerFn - - -def create_table_emitter( - emitter_type: TableEmitterType, storage: PipelineStorage, on_error: ErrorHandlerFn -) -> TableEmitter: - """Create a table emitter based on the specified type.""" - match emitter_type: - case TableEmitterType.Json: - return JsonTableEmitter(storage) - case TableEmitterType.Parquet: - return ParquetTableEmitter(storage, on_error) - case TableEmitterType.CSV: - return CSVTableEmitter(storage) - case _: - msg = f"Unsupported table emitter type: {emitter_type}" - raise ValueError(msg) - - -def create_table_emitters( - emitter_types: list[TableEmitterType], - storage: PipelineStorage, - on_error: ErrorHandlerFn, -) -> list[TableEmitter]: - """Create a list of table emitters based on the specified types.""" - return [ - create_table_emitter(emitter_type, storage, on_error) - for emitter_type in emitter_types - ] diff --git a/graphrag/index/emit/json_table_emitter.py b/graphrag/index/emit/json_table_emitter.py deleted file mode 100644 index ceadc414d..000000000 --- a/graphrag/index/emit/json_table_emitter.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""JsonTableEmitter module.""" - -import logging - -import pandas as pd - -from graphrag.index.emit.table_emitter import TableEmitter -from graphrag.index.storage.pipeline_storage import PipelineStorage - -log = logging.getLogger(__name__) - - -class JsonTableEmitter(TableEmitter): - """JsonTableEmitter class.""" - - _storage: PipelineStorage - - def __init__(self, storage: PipelineStorage): - """Create a new Json Table Emitter.""" - self._storage = storage - - async def emit(self, name: str, data: pd.DataFrame) -> None: - """Emit a dataframe to storage.""" - filename = f"{name}.json" - - log.info("emitting JSON table %s", filename) - await self._storage.set( - filename, - data.to_json(orient="records", lines=True, force_ascii=False), - ) diff --git a/graphrag/index/emit/table_emitter.py b/graphrag/index/emit/table_emitter.py deleted file mode 100644 index 2161eeb52..000000000 --- a/graphrag/index/emit/table_emitter.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""TableEmitter protocol for emitting tables to a destination.""" - -from typing import Protocol - -import pandas as pd - - -class TableEmitter(Protocol): - """TableEmitter protocol for emitting tables to a destination.""" - - async def emit(self, name: str, data: pd.DataFrame) -> None: - """Emit a dataframe to storage.""" diff --git a/graphrag/index/emit/types.py b/graphrag/index/emit/types.py deleted file mode 100644 index 4c8e8f891..000000000 --- a/graphrag/index/emit/types.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Table Emitter Types.""" - -from enum import Enum - - -class TableEmitterType(str, Enum): - """Table Emitter Types.""" - - Json = "json" - Parquet = "parquet" - CSV = "csv" - - def __str__(self): - """Return the string representation of the enum value.""" - return self.value diff --git a/graphrag/index/emit/parquet_table_emitter.py b/graphrag/index/exporter.py similarity index 61% rename from graphrag/index/emit/parquet_table_emitter.py rename to graphrag/index/exporter.py index e649f283c..4910e8746 100644 --- a/graphrag/index/emit/parquet_table_emitter.py +++ b/graphrag/index/exporter.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""ParquetTableEmitter module.""" +"""ParquetExporter module.""" import logging import traceback @@ -9,15 +9,17 @@ import pandas as pd from pyarrow.lib import ArrowInvalid, ArrowTypeError -from graphrag.index.emit.table_emitter import TableEmitter -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.typing import ErrorHandlerFn +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) -class ParquetTableEmitter(TableEmitter): - """ParquetTableEmitter class.""" +class ParquetExporter: + """ParquetExporter class. + + A class that exports dataframe's to a storage destination in .parquet file format. 
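A minimal usage sketch of the exporter described above (the storage directory, table name, and sample dataframe are illustrative assumptions):

```python
import asyncio

import pandas as pd

from graphrag.index.exporter import ParquetExporter
from graphrag.storage.file_pipeline_storage import FilePipelineStorage

def on_error(error: BaseException | None, stack: str | None, details: dict | None) -> None:
    # Minimal error handler matching the (error, stack, details) callback shape used by the exporter.
    print(f"export failed: {error}")

async def main() -> None:
    storage = FilePipelineStorage("./output")  # hypothetical output directory
    exporter = ParquetExporter(storage, on_error)
    # Writes create_final_example.parquet into the storage location.
    await exporter.export("create_final_example", pd.DataFrame({"id": [1, 2], "title": ["a", "b"]}))

asyncio.run(main())
```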
+ """ _storage: PipelineStorage _on_error: ErrorHandlerFn @@ -27,25 +29,25 @@ def __init__( storage: PipelineStorage, on_error: ErrorHandlerFn, ): - """Create a new Parquet Table Emitter.""" + """Create a new Parquet Table TableExporter.""" self._storage = storage self._on_error = on_error - async def emit(self, name: str, data: pd.DataFrame) -> None: - """Emit a dataframe to storage.""" + async def export(self, name: str, data: pd.DataFrame) -> None: + """Export dataframe to storage.""" filename = f"{name}.parquet" - log.info("emitting parquet table %s", filename) + log.info("exporting parquet table %s", filename) try: await self._storage.set(filename, data.to_parquet()) except ArrowTypeError as e: - log.exception("Error while emitting parquet table") + log.exception("Error while exporting parquet table") self._on_error( e, traceback.format_exc(), None, ) except ArrowInvalid as e: - log.exception("Error while emitting parquet table") + log.exception("Error while exporting parquet table") self._on_error( e, traceback.format_exc(), diff --git a/graphrag/index/flows/create_base_entity_graph.py b/graphrag/index/flows/create_base_entity_graph.py index fe429d336..cc9bb3302 100644 --- a/graphrag/index/flows/create_base_entity_graph.py +++ b/graphrag/index/flows/create_base_entity_graph.py @@ -11,7 +11,7 @@ VerbCallbacks, ) -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.operations.cluster_graph import cluster_graph from graphrag.index.operations.embed_graph import embed_graph from graphrag.index.operations.extract_entities import extract_entities @@ -22,7 +22,7 @@ from graphrag.index.operations.summarize_descriptions import ( summarize_descriptions, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage async def create_base_entity_graph( diff --git a/graphrag/index/flows/create_base_text_units.py b/graphrag/index/flows/create_base_text_units.py index cca55d3a1..3603fad20 100644 --- a/graphrag/index/flows/create_base_text_units.py +++ b/graphrag/index/flows/create_base_text_units.py @@ -16,8 +16,8 @@ from graphrag.index.operations.chunk_text import chunk_text from graphrag.index.operations.snapshot import snapshot -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.utils.hashing import gen_md5_hash +from graphrag.storage.pipeline_storage import PipelineStorage async def create_base_text_units( diff --git a/graphrag/index/flows/create_final_community_reports.py b/graphrag/index/flows/create_final_community_reports.py index 754d66d6e..681cab7c9 100644 --- a/graphrag/index/flows/create_final_community_reports.py +++ b/graphrag/index/flows/create_final_community_reports.py @@ -11,7 +11,7 @@ VerbCallbacks, ) -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.graph.extractors.community_reports.schemas import ( CLAIM_DESCRIPTION, CLAIM_DETAILS, diff --git a/graphrag/index/flows/create_final_covariates.py b/graphrag/index/flows/create_final_covariates.py index ad25445bf..ba400be56 100644 --- a/graphrag/index/flows/create_final_covariates.py +++ b/graphrag/index/flows/create_final_covariates.py @@ -12,7 +12,7 @@ VerbCallbacks, ) -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.operations.extract_covariates import ( 
extract_covariates, ) diff --git a/graphrag/index/flows/create_final_nodes.py b/graphrag/index/flows/create_final_nodes.py index c966e673e..fd8a58316 100644 --- a/graphrag/index/flows/create_final_nodes.py +++ b/graphrag/index/flows/create_final_nodes.py @@ -13,7 +13,7 @@ from graphrag.index.operations.layout_graph import layout_graph from graphrag.index.operations.snapshot import snapshot from graphrag.index.operations.unpack_graph import unpack_graph -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage async def create_final_nodes( diff --git a/graphrag/index/flows/generate_text_embeddings.py b/graphrag/index/flows/generate_text_embeddings.py index 23fee842d..f701c464f 100644 --- a/graphrag/index/flows/generate_text_embeddings.py +++ b/graphrag/index/flows/generate_text_embeddings.py @@ -10,7 +10,7 @@ VerbCallbacks, ) -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.config.embeddings import ( community_full_content_embedding, community_summary_embedding, @@ -23,7 +23,7 @@ ) from graphrag.index.operations.embed_text import embed_text from graphrag.index.operations.snapshot import snapshot -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) diff --git a/graphrag/index/input/csv.py b/graphrag/index/input/csv.py index ceec80b50..f5bb022b1 100644 --- a/graphrag/index/input/csv.py +++ b/graphrag/index/input/csv.py @@ -11,9 +11,9 @@ import pandas as pd from graphrag.index.config.input import PipelineCSVInputConfig, PipelineInputConfig -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.utils.hashing import gen_md5_hash from graphrag.logging.base import ProgressReporter +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) diff --git a/graphrag/index/input/load_input.py b/graphrag/index/input/factory.py similarity index 91% rename from graphrag/index/input/load_input.py rename to graphrag/index/input/factory.py index 4bfc82cbb..703bde358 100644 --- a/graphrag/index/input/load_input.py +++ b/graphrag/index/input/factory.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""A module containing load_input method definition.""" +"""A module containing create_input method definition.""" import logging from collections.abc import Awaitable, Callable @@ -17,10 +17,10 @@ from graphrag.index.input.csv import load as load_csv from graphrag.index.input.text import input_type as text from graphrag.index.input.text import load as load_text -from graphrag.index.storage.blob_pipeline_storage import BlobPipelineStorage -from graphrag.index.storage.file_pipeline_storage import FilePipelineStorage from graphrag.logging.base import ProgressReporter from graphrag.logging.null_progress import NullProgressReporter +from graphrag.storage.blob_pipeline_storage import BlobPipelineStorage +from graphrag.storage.file_pipeline_storage import FilePipelineStorage log = logging.getLogger(__name__) loaders: dict[str, Callable[..., Awaitable[pd.DataFrame]]] = { @@ -29,12 +29,12 @@ } -async def load_input( +async def create_input( config: PipelineInputConfig | InputConfig, progress_reporter: ProgressReporter | None = None, root_dir: str | None = None, ) -> pd.DataFrame: - """Load the input data for a pipeline.""" + """Instantiate input data for a pipeline.""" root_dir = root_dir or "" log.info("loading input from root_dir=%s", config.base_dir) progress_reporter = progress_reporter or NullProgressReporter() diff --git a/graphrag/index/input/text.py b/graphrag/index/input/text.py index 45814ee3f..c4854ab65 100644 --- a/graphrag/index/input/text.py +++ b/graphrag/index/input/text.py @@ -11,9 +11,9 @@ import pandas as pd from graphrag.index.config.input import PipelineInputConfig -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.utils.hashing import gen_md5_hash from graphrag.logging.base import ProgressReporter +from graphrag.storage.pipeline_storage import PipelineStorage DEFAULT_FILE_PATTERN = re.compile( r".*[\\/](?P[^\\/]+)[\\/](?P\d{4})-(?P\d{2})-(?P\d{2})_(?P[^_]+)_\d+\.txt" diff --git a/graphrag/index/llm/load_llm.py b/graphrag/index/llm/load_llm.py index f06c89ad9..4354bd3f3 100644 --- a/graphrag/index/llm/load_llm.py +++ b/graphrag/index/llm/load_llm.py @@ -27,7 +27,7 @@ if TYPE_CHECKING: from datashaper import VerbCallbacks - from graphrag.index.cache.pipeline_cache import PipelineCache + from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.typing import ErrorHandlerFn log = logging.getLogger(__name__) diff --git a/graphrag/index/operations/cluster_graph.py b/graphrag/index/operations/cluster_graph.py index 295c78e6f..54598e48e 100644 --- a/graphrag/index/operations/cluster_graph.py +++ b/graphrag/index/operations/cluster_graph.py @@ -105,8 +105,8 @@ def apply_clustering( graph.nodes[node]["level"] = level # add node degree - for node_degree in graph.degree: - graph.nodes[str(node_degree[0])]["degree"] = int(node_degree[1]) + for node, degree in graph.degree: # type: ignore + graph.nodes[node]["degree"] = int(degree) # add node uuid and incremental record id (a human readable id used as reference in the final report) for index, node in enumerate(graph.nodes()): diff --git a/graphrag/index/operations/embed_text/embed_text.py b/graphrag/index/operations/embed_text/embed_text.py index 627d1728b..06964ebd3 100644 --- a/graphrag/index/operations/embed_text/embed_text.py +++ b/graphrag/index/operations/embed_text/embed_text.py @@ -11,7 +11,7 @@ import pandas as pd from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from 
graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingStrategy from graphrag.utils.embeddings import create_collection_name from graphrag.vector_stores.base import BaseVectorStore, VectorStoreDocument diff --git a/graphrag/index/operations/embed_text/strategies/mock.py b/graphrag/index/operations/embed_text/strategies/mock.py index a32eceb38..3ebb1de8a 100644 --- a/graphrag/index/operations/embed_text/strategies/mock.py +++ b/graphrag/index/operations/embed_text/strategies/mock.py @@ -9,7 +9,7 @@ from datashaper import ProgressTicker, VerbCallbacks, progress_ticker -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingResult diff --git a/graphrag/index/operations/embed_text/strategies/openai.py b/graphrag/index/operations/embed_text/strategies/openai.py index f0445d848..15ce0a1ff 100644 --- a/graphrag/index/operations/embed_text/strategies/openai.py +++ b/graphrag/index/operations/embed_text/strategies/openai.py @@ -11,7 +11,7 @@ from datashaper import ProgressTicker, VerbCallbacks, progress_ticker import graphrag.config.defaults as defs -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.llm.load_llm import load_llm_embeddings from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingResult from graphrag.index.text_splitting.text_splitting import TokenTextSplitter diff --git a/graphrag/index/operations/embed_text/strategies/typing.py b/graphrag/index/operations/embed_text/strategies/typing.py index 79998f72e..b53d710c0 100644 --- a/graphrag/index/operations/embed_text/strategies/typing.py +++ b/graphrag/index/operations/embed_text/strategies/typing.py @@ -8,7 +8,7 @@ from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache @dataclass diff --git a/graphrag/index/operations/extract_covariates/extract_covariates.py b/graphrag/index/operations/extract_covariates/extract_covariates.py index 48b3b04c3..ff78f949f 100644 --- a/graphrag/index/operations/extract_covariates/extract_covariates.py +++ b/graphrag/index/operations/extract_covariates/extract_covariates.py @@ -14,7 +14,7 @@ derive_from_rows, ) -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.operations.extract_covariates.typing import ( Covariate, CovariateExtractStrategy, diff --git a/graphrag/index/operations/extract_covariates/strategies.py b/graphrag/index/operations/extract_covariates/strategies.py index 4d7729961..46d0ca0c2 100644 --- a/graphrag/index/operations/extract_covariates/strategies.py +++ b/graphrag/index/operations/extract_covariates/strategies.py @@ -9,7 +9,7 @@ from datashaper import VerbCallbacks import graphrag.config.defaults as defs -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.graph.extractors.claims import ClaimExtractor from graphrag.index.llm.load_llm import load_llm from graphrag.index.operations.extract_covariates.typing import ( diff --git a/graphrag/index/operations/extract_covariates/typing.py b/graphrag/index/operations/extract_covariates/typing.py index a208ea3e9..1a0a857a6 100644 --- 
a/graphrag/index/operations/extract_covariates/typing.py +++ b/graphrag/index/operations/extract_covariates/typing.py @@ -10,7 +10,7 @@ from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache @dataclass diff --git a/graphrag/index/operations/extract_entities/extract_entities.py b/graphrag/index/operations/extract_entities/extract_entities.py index 522d4d98e..1e48237e2 100644 --- a/graphrag/index/operations/extract_entities/extract_entities.py +++ b/graphrag/index/operations/extract_entities/extract_entities.py @@ -15,8 +15,8 @@ derive_from_rows, ) +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.bootstrap import bootstrap -from graphrag.index.cache.pipeline_cache import PipelineCache from graphrag.index.operations.extract_entities.strategies.typing import ( Document, EntityExtractStrategy, diff --git a/graphrag/index/operations/extract_entities/strategies/graph_intelligence.py b/graphrag/index/operations/extract_entities/strategies/graph_intelligence.py index 18fcc9744..adab8e37e 100644 --- a/graphrag/index/operations/extract_entities/strategies/graph_intelligence.py +++ b/graphrag/index/operations/extract_entities/strategies/graph_intelligence.py @@ -6,7 +6,7 @@ from datashaper import VerbCallbacks import graphrag.config.defaults as defs -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.graph.extractors import GraphExtractor from graphrag.index.llm.load_llm import load_llm from graphrag.index.operations.extract_entities.strategies.typing import ( diff --git a/graphrag/index/operations/extract_entities/strategies/nltk.py b/graphrag/index/operations/extract_entities/strategies/nltk.py index 08f447004..e2a9feb3a 100644 --- a/graphrag/index/operations/extract_entities/strategies/nltk.py +++ b/graphrag/index/operations/extract_entities/strategies/nltk.py @@ -8,7 +8,7 @@ from datashaper import VerbCallbacks from nltk.corpus import words -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.operations.extract_entities.strategies.typing import ( Document, EntityExtractionResult, diff --git a/graphrag/index/operations/extract_entities/strategies/typing.py b/graphrag/index/operations/extract_entities/strategies/typing.py index 57df220d9..a3b473faa 100644 --- a/graphrag/index/operations/extract_entities/strategies/typing.py +++ b/graphrag/index/operations/extract_entities/strategies/typing.py @@ -10,7 +10,7 @@ import networkx as nx from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache ExtractedEntity = dict[str, Any] StrategyConfig = dict[str, Any] diff --git a/graphrag/index/operations/snapshot.py b/graphrag/index/operations/snapshot.py index 7ae7f0ca0..1a61fce1c 100644 --- a/graphrag/index/operations/snapshot.py +++ b/graphrag/index/operations/snapshot.py @@ -5,7 +5,7 @@ import pandas as pd -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage async def snapshot( diff --git a/graphrag/index/operations/snapshot_graphml.py b/graphrag/index/operations/snapshot_graphml.py index feda376f9..6d1d48849 100644 --- a/graphrag/index/operations/snapshot_graphml.py +++ b/graphrag/index/operations/snapshot_graphml.py @@ -5,7 +5,7 @@ 
import networkx as nx -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage async def snapshot_graphml( diff --git a/graphrag/index/operations/snapshot_rows.py b/graphrag/index/operations/snapshot_rows.py index 0050f9806..1140ee555 100644 --- a/graphrag/index/operations/snapshot_rows.py +++ b/graphrag/index/operations/snapshot_rows.py @@ -9,7 +9,7 @@ import pandas as pd -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage @dataclass diff --git a/graphrag/index/operations/summarize_communities/strategies.py b/graphrag/index/operations/summarize_communities/strategies.py index 33900a985..4c3d7a3fe 100644 --- a/graphrag/index/operations/summarize_communities/strategies.py +++ b/graphrag/index/operations/summarize_communities/strategies.py @@ -9,7 +9,7 @@ from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.graph.extractors.community_reports import ( CommunityReportsExtractor, ) diff --git a/graphrag/index/operations/summarize_communities/summarize_communities.py b/graphrag/index/operations/summarize_communities/summarize_communities.py index de35981d0..bab02ef5e 100644 --- a/graphrag/index/operations/summarize_communities/summarize_communities.py +++ b/graphrag/index/operations/summarize_communities/summarize_communities.py @@ -16,7 +16,7 @@ import graphrag.config.defaults as defaults import graphrag.index.graph.extractors.community_reports.schemas as schemas -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.graph.extractors.community_reports import ( get_levels, prep_community_report_context, diff --git a/graphrag/index/operations/summarize_communities/typing.py b/graphrag/index/operations/summarize_communities/typing.py index 46a1a0957..6c6b7e677 100644 --- a/graphrag/index/operations/summarize_communities/typing.py +++ b/graphrag/index/operations/summarize_communities/typing.py @@ -10,7 +10,7 @@ from datashaper import VerbCallbacks from typing_extensions import TypedDict -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache ExtractedEntity = dict[str, Any] StrategyConfig = dict[str, Any] diff --git a/graphrag/index/operations/summarize_descriptions/strategies.py b/graphrag/index/operations/summarize_descriptions/strategies.py index fd6ea5a84..7a574e081 100644 --- a/graphrag/index/operations/summarize_descriptions/strategies.py +++ b/graphrag/index/operations/summarize_descriptions/strategies.py @@ -5,7 +5,7 @@ from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.graph.extractors.summarize import SummarizeExtractor from graphrag.index.llm.load_llm import load_llm from graphrag.index.operations.summarize_descriptions.typing import ( diff --git a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py index 612f2c679..471ab9103 100644 --- a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py +++ b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py @@ -14,7 +14,7 @@ progress_ticker, ) -from 
graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.operations.summarize_descriptions.typing import ( SummarizationStrategy, SummarizeStrategyType, diff --git a/graphrag/index/operations/summarize_descriptions/typing.py b/graphrag/index/operations/summarize_descriptions/typing.py index c7ba9ceb7..dc10b3ebe 100644 --- a/graphrag/index/operations/summarize_descriptions/typing.py +++ b/graphrag/index/operations/summarize_descriptions/typing.py @@ -10,7 +10,7 @@ from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache StrategyConfig = dict[str, Any] diff --git a/graphrag/index/run/cache.py b/graphrag/index/run/cache.py deleted file mode 100644 index d1c57e4f8..000000000 --- a/graphrag/index/run/cache.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Cache functions for the GraphRAG update module.""" - -from graphrag.index.cache.load_cache import load_cache -from graphrag.index.cache.pipeline_cache import PipelineCache -from graphrag.index.config.cache import ( - PipelineCacheConfigTypes, - PipelineMemoryCacheConfig, -) - - -def _create_cache( - config: PipelineCacheConfigTypes | None, root_dir: str -) -> PipelineCache: - return load_cache(config or PipelineMemoryCacheConfig(), root_dir=root_dir) diff --git a/graphrag/index/run/profiling.py b/graphrag/index/run/profiling.py index 6121b9ea2..36efcde01 100644 --- a/graphrag/index/run/profiling.py +++ b/graphrag/index/run/profiling.py @@ -11,7 +11,7 @@ from datashaper import MemoryProfile, Workflow, WorkflowRunResult from graphrag.index.context import PipelineRunStats -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) diff --git a/graphrag/index/run/run.py b/graphrag/index/run/run.py index d2dfa19df..8af345f9c 100644 --- a/graphrag/index/run/run.py +++ b/graphrag/index/run/run.py @@ -14,17 +14,20 @@ import pandas as pd from datashaper import NoopVerbCallbacks, WorkflowCallbacks +from graphrag.cache.factory import create_cache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.callbacks.console_workflow_callbacks import ConsoleWorkflowCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.factory import create_pipeline_reporter +from graphrag.index.config.cache import PipelineMemoryCacheConfig from graphrag.index.config.pipeline import ( PipelineConfig, PipelineWorkflowReference, ) +from graphrag.index.config.storage import PipelineFileStorageConfig from graphrag.index.config.workflow import PipelineWorkflowStep -from graphrag.index.emit.factories import create_table_emitters -from graphrag.index.emit.types import TableEmitterType +from graphrag.index.exporter import ParquetExporter +from graphrag.index.input.factory import create_input from graphrag.index.load_pipeline_config import load_pipeline_config -from graphrag.index.run.cache import _create_cache from graphrag.index.run.postprocess import ( _create_postprocess_steps, _run_post_process_steps, @@ -32,8 +35,6 @@ from graphrag.index.run.profiling import _dump_stats from graphrag.index.run.utils import ( _apply_substitutions, - _create_input, - _create_reporter, _validate_dataset, create_run_context, ) @@ -41,7 +42,6 @@ _create_callback_chain, _process_workflow, ) -from 
graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.typing import PipelineRunResult from graphrag.index.update.incremental_index import ( get_delta_docs, @@ -54,7 +54,8 @@ ) from graphrag.logging.base import ProgressReporter from graphrag.logging.null_progress import NullProgressReporter -from graphrag.utils.storage import _create_storage +from graphrag.storage.factory import create_storage +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) @@ -71,7 +72,6 @@ async def run_pipeline_with_config( input_post_process_steps: list[PipelineWorkflowStep] | None = None, additional_verbs: VerbDefinitions | None = None, additional_workflows: WorkflowDefinitions | None = None, - emit: list[TableEmitterType] | None = None, memory_profile: bool = False, run_id: str | None = None, is_resume_run: bool = False, @@ -90,7 +90,6 @@ async def run_pipeline_with_config( - input_post_process_steps - The post process steps to run on the input data (this overrides the config) - additional_verbs - The custom verbs to use for the pipeline. - additional_workflows - The custom workflows to use for the pipeline. - - emit - The table emitters to use for the pipeline. - memory_profile - Whether or not to profile the memory. - run_id - The run id to start or resume from. """ @@ -105,19 +104,26 @@ async def run_pipeline_with_config( root_dir = config.root_dir or "" progress_reporter = progress_reporter or NullProgressReporter() - storage = storage or _create_storage(config.storage, root_dir=Path(root_dir)) + storage = storage = create_storage(config.storage) # type: ignore if is_update_run: - update_index_storage = update_index_storage or _create_storage( - config.update_index_storage, root_dir=Path(root_dir) + update_index_storage = update_index_storage or create_storage( + config.update_index_storage + or PipelineFileStorageConfig(base_dir=str(Path(root_dir) / "output")) ) - cache = cache or _create_cache(config.cache, root_dir) - callbacks = callbacks or _create_reporter(config.reporting, root_dir) + # TODO: remove the default choice (PipelineMemoryCacheConfig) when the new config system guarantees the existence of a cache config + cache = cache or create_cache(config.cache or PipelineMemoryCacheConfig(), root_dir) + callbacks = ( + create_pipeline_reporter(config.reporting, root_dir) + if config.reporting + else None + ) + # TODO: remove the type ignore when the new config system guarantees the existence of an input config dataset = ( dataset if dataset is not None - else await _create_input(config.input, progress_reporter, root_dir) + else await create_input(config.input, progress_reporter, root_dir) # type: ignore ) post_process_steps = input_post_process_steps or _create_postprocess_steps( @@ -152,7 +158,6 @@ async def run_pipeline_with_config( additional_verbs=additional_verbs, additional_workflows=additional_workflows, progress_reporter=progress_reporter, - emit=emit, is_resume_run=False, ): tables_dict[table.workflow] = table.result @@ -180,7 +185,6 @@ async def run_pipeline_with_config( additional_verbs=additional_verbs, additional_workflows=additional_workflows, progress_reporter=progress_reporter, - emit=emit, is_resume_run=is_resume_run, ): yield table @@ -196,7 +200,6 @@ async def run_pipeline( input_post_process_steps: list[PipelineWorkflowStep] | None = None, additional_verbs: VerbDefinitions | None = None, additional_workflows: WorkflowDefinitions | None = None, - emit: list[TableEmitterType] | None = None, memory_profile: bool = 
False, is_resume_run: bool = False, **_kwargs: dict, @@ -222,21 +225,18 @@ async def run_pipeline( """ start_time = time.time() - context = create_run_context(storage=storage, cache=cache, stats=None) - progress_reporter = progress_reporter or NullProgressReporter() callbacks = callbacks or ConsoleWorkflowCallbacks() callbacks = _create_callback_chain(callbacks, progress_reporter) - # TODO: This default behavior is already defined at the API level. Update tests - # of this function to pass in an emit type before removing this default setting. - emit = emit or [TableEmitterType.Parquet] - emitters = create_table_emitters( - emit, + + context = create_run_context(storage=storage, cache=cache, stats=None) + exporter = ParquetExporter( context.storage, lambda e, s, d: cast(WorkflowCallbacks, callbacks).on_error( - "Error emitting table", e, s, d + "Error exporting table", e, s, d ), ) + loaded_workflows = load_workflows( workflows, additional_verbs=additional_verbs, @@ -245,17 +245,11 @@ async def run_pipeline( ) workflows_to_run = loaded_workflows.workflows workflow_dependencies = loaded_workflows.dependencies - - if len(emitters) == 0: - log.info( - "No emitters provided. No table outputs will be generated. This is probably not correct." - ) - dataset = await _run_post_process_steps( input_post_process_steps, dataset, context, callbacks ) - # Make sure the incoming data is valid + # ensure the incoming data is valid _validate_dataset(dataset) log.info("Final # of rows loaded: %s", len(dataset)) @@ -266,15 +260,14 @@ async def run_pipeline( await _dump_stats(context.stats, context.storage) for workflow_to_run in workflows_to_run: - # Try to flush out any intermediate dataframes + # flush out any intermediate dataframes gc.collect() - last_workflow = workflow_to_run.workflow.name result = await _process_workflow( workflow_to_run.workflow, context, callbacks, - emitters, + exporter, workflow_dependencies, dataset, start_time, diff --git a/graphrag/index/run/utils.py b/graphrag/index/run/utils.py index ab5b4989f..e78ee1117 100644 --- a/graphrag/index/run/utils.py +++ b/graphrag/index/run/utils.py @@ -8,56 +8,29 @@ from typing import Any import pandas as pd -from datashaper import ( - WorkflowCallbacks, -) -from graphrag.callbacks.factories import create_pipeline_reporter -from graphrag.index.cache.memory_pipeline_cache import InMemoryCache -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.memory_pipeline_cache import InMemoryCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.config.cache import ( PipelineBlobCacheConfig, PipelineFileCacheConfig, ) -from graphrag.index.config.input import PipelineInputConfigTypes from graphrag.index.config.pipeline import PipelineConfig from graphrag.index.config.reporting import ( PipelineBlobReportingConfig, PipelineFileReportingConfig, - PipelineReportingConfigTypes, ) from graphrag.index.config.storage import ( PipelineBlobStorageConfig, PipelineFileStorageConfig, ) from graphrag.index.context import PipelineRunContext, PipelineRunStats -from graphrag.index.input.load_input import load_input -from graphrag.index.storage.memory_pipeline_storage import MemoryPipelineStorage -from graphrag.index.storage.pipeline_storage import PipelineStorage -from graphrag.logging.base import ProgressReporter +from graphrag.storage.memory_pipeline_storage import MemoryPipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) -def _create_reporter( - config: 
PipelineReportingConfigTypes | None, root_dir: str -) -> WorkflowCallbacks | None: - """Create the reporter for the pipeline.""" - return create_pipeline_reporter(config, root_dir) if config else None - - -async def _create_input( - config: PipelineInputConfigTypes | None, - progress_reporter: ProgressReporter | None, - root_dir: str, -) -> pd.DataFrame | None: - """Load the input for the pipeline.""" - if config is None: - return None - - return await load_input(config, progress_reporter, root_dir) - - def _validate_dataset(dataset: Any): """Validate the dataset for the pipeline. diff --git a/graphrag/index/run/workflow.py b/graphrag/index/run/workflow.py index 449791026..f6d813a4e 100644 --- a/graphrag/index/run/workflow.py +++ b/graphrag/index/run/workflow.py @@ -18,11 +18,11 @@ from graphrag.callbacks.progress_workflow_callbacks import ProgressWorkflowCallbacks from graphrag.index.config.pipeline import PipelineConfig from graphrag.index.context import PipelineRunContext -from graphrag.index.emit.table_emitter import TableEmitter +from graphrag.index.exporter import ParquetExporter from graphrag.index.run.profiling import _write_workflow_stats -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.typing import PipelineRunResult from graphrag.logging.base import ProgressReporter +from graphrag.storage.pipeline_storage import PipelineStorage from graphrag.utils.storage import _load_table_from_storage log = logging.getLogger(__name__) @@ -43,10 +43,10 @@ async def _inject_workflow_data_dependencies( try: table = await _load_table_from_storage(f"{id}.parquet", storage) except ValueError: - # our workflows now allow transient tables, and we avoid putting those in primary storage - # however, we need to keep the table in the dependency list for proper execution order - # this allows us to catch missing table errors but emit a warning for pipeline users who may genuinely have an error (which we expect to be very rare) - # todo: this issue will resolve itself if we remove DataShaper completely + # our workflows allow for transient tables, and we avoid putting those in storage + # however, we need to keep the table in the dependency list for proper execution order. + # this allows us to catch missing table errors and issue a warning for pipeline users who may genuinely have an error (which we expect to be very rare) + # todo: this issue will resolve itself once we remove DataShaper completely log.warning( "Dependency table %s not found in storage: it may be a runtime-only in-memory table. If you see further errors, this may be an actual problem.", id, @@ -55,17 +55,15 @@ async def _inject_workflow_data_dependencies( workflow.add_table(workflow_id, table) -async def _emit_workflow_output( - workflow: Workflow, emitters: list[TableEmitter] +async def _export_workflow_output( + workflow: Workflow, exporter: ParquetExporter ) -> pd.DataFrame: - """Emit the workflow output.""" + """Export the output from each step of the workflow.""" output = cast(pd.DataFrame, workflow.output()) - # only write the final output if it has content - # this is expressly designed to allow us to create - # workflows with side effects that don't produce a formal output to save + # only write final output that is not empty (i.e. 
has content) + # NOTE: this design is intentional - it accounts for workflow steps with "side effects" that don't produce a formal output to save if not output.empty: - for emitter in emitters: - await emitter.emit(workflow.name, output) + await exporter.export(workflow.name, output) return output @@ -85,7 +83,7 @@ async def _process_workflow( workflow: Workflow, context: PipelineRunContext, callbacks: WorkflowCallbacks, - emitters: list[TableEmitter], + exporter: ParquetExporter, workflow_dependencies: dict[str, list[str]], dataset: pd.DataFrame, start_time: float, @@ -113,7 +111,7 @@ async def _process_workflow( ) # Save the output from the workflow - output = await _emit_workflow_output(workflow, emitters) + output = await _export_workflow_output(workflow, exporter) workflow.dispose() return PipelineRunResult(workflow_name, output, None) diff --git a/graphrag/index/update/entities.py b/graphrag/index/update/entities.py index 6f59bb81e..538e39e9c 100644 --- a/graphrag/index/update/entities.py +++ b/graphrag/index/update/entities.py @@ -10,7 +10,7 @@ import pandas as pd from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.config.pipeline import PipelineConfig from graphrag.index.operations.summarize_descriptions.strategies import ( run_graph_intelligence as run_entity_summarization, diff --git a/graphrag/index/update/incremental_index.py b/graphrag/index/update/incremental_index.py index abebfd7e0..582807201 100644 --- a/graphrag/index/update/incremental_index.py +++ b/graphrag/index/update/incremental_index.py @@ -9,11 +9,10 @@ import pandas as pd from datashaper import VerbCallbacks -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.config.pipeline import PipelineConfig from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings from graphrag.index.run.workflow import _find_workflow_config -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.update.communities import ( _merge_and_resolve_nodes, _update_and_merge_communities, @@ -25,6 +24,7 @@ ) from graphrag.index.update.relationships import _update_and_merge_relationships from graphrag.logging.print_progress import ProgressReporter +from graphrag.storage.pipeline_storage import PipelineStorage from graphrag.utils.storage import _load_table_from_storage diff --git a/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py b/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py index 3f1199e21..a1c25d5d7 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py +++ b/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py @@ -14,11 +14,11 @@ ) from datashaper.table_store.types import VerbResult, create_verb_result -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.flows.create_base_entity_graph import ( create_base_entity_graph as create_base_entity_graph_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage @verb( diff --git a/graphrag/index/workflows/v1/subflows/create_base_text_units.py b/graphrag/index/workflows/v1/subflows/create_base_text_units.py index e9b3f4393..507a26585 100644 --- 
a/graphrag/index/workflows/v1/subflows/create_base_text_units.py +++ b/graphrag/index/workflows/v1/subflows/create_base_text_units.py @@ -17,7 +17,7 @@ from graphrag.index.flows.create_base_text_units import ( create_base_text_units as create_base_text_units_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage @verb(name="create_base_text_units", treats_input_tables_as_immutable=True) diff --git a/graphrag/index/workflows/v1/subflows/create_final_communities.py b/graphrag/index/workflows/v1/subflows/create_final_communities.py index d66a32759..546db5d45 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_communities.py +++ b/graphrag/index/workflows/v1/subflows/create_final_communities.py @@ -15,7 +15,7 @@ from graphrag.index.flows.create_final_communities import ( create_final_communities as create_final_communities_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage @verb(name="create_final_communities", treats_input_tables_as_immutable=True) diff --git a/graphrag/index/workflows/v1/subflows/create_final_community_reports.py b/graphrag/index/workflows/v1/subflows/create_final_community_reports.py index ff6d9ef8a..1b58a4f74 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_community_reports.py +++ b/graphrag/index/workflows/v1/subflows/create_final_community_reports.py @@ -15,7 +15,7 @@ ) from datashaper.table_store.types import VerbResult, create_verb_result -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.flows.create_final_community_reports import ( create_final_community_reports as create_final_community_reports_flow, ) diff --git a/graphrag/index/workflows/v1/subflows/create_final_covariates.py b/graphrag/index/workflows/v1/subflows/create_final_covariates.py index 0ab54b1d8..c4f1795dd 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_covariates.py +++ b/graphrag/index/workflows/v1/subflows/create_final_covariates.py @@ -13,11 +13,11 @@ ) from datashaper.table_store.types import VerbResult, create_verb_result -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.flows.create_final_covariates import ( create_final_covariates as create_final_covariates_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage @verb(name="create_final_covariates", treats_input_tables_as_immutable=True) diff --git a/graphrag/index/workflows/v1/subflows/create_final_documents.py b/graphrag/index/workflows/v1/subflows/create_final_documents.py index 4ac4e24dc..8ec3541cf 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_documents.py +++ b/graphrag/index/workflows/v1/subflows/create_final_documents.py @@ -16,7 +16,7 @@ from graphrag.index.flows.create_final_documents import ( create_final_documents as create_final_documents_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage @verb( diff --git a/graphrag/index/workflows/v1/subflows/create_final_entities.py b/graphrag/index/workflows/v1/subflows/create_final_entities.py index 968fa0d24..d3795b085 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_entities.py +++ 
b/graphrag/index/workflows/v1/subflows/create_final_entities.py @@ -15,7 +15,7 @@ from graphrag.index.flows.create_final_entities import ( create_final_entities as create_final_entities_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage @verb( diff --git a/graphrag/index/workflows/v1/subflows/create_final_nodes.py b/graphrag/index/workflows/v1/subflows/create_final_nodes.py index e8266b76b..24eb98b9b 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_nodes.py +++ b/graphrag/index/workflows/v1/subflows/create_final_nodes.py @@ -15,7 +15,7 @@ from graphrag.index.flows.create_final_nodes import ( create_final_nodes as create_final_nodes_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage @verb(name="create_final_nodes", treats_input_tables_as_immutable=True) diff --git a/graphrag/index/workflows/v1/subflows/create_final_relationships.py b/graphrag/index/workflows/v1/subflows/create_final_relationships.py index 995eeec83..f8553e998 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_relationships.py +++ b/graphrag/index/workflows/v1/subflows/create_final_relationships.py @@ -17,8 +17,8 @@ from graphrag.index.flows.create_final_relationships import ( create_final_relationships as create_final_relationships_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.utils.ds_util import get_required_input_table +from graphrag.storage.pipeline_storage import PipelineStorage @verb( diff --git a/graphrag/index/workflows/v1/subflows/create_final_text_units.py b/graphrag/index/workflows/v1/subflows/create_final_text_units.py index 8cc69839a..2810d060a 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_text_units.py +++ b/graphrag/index/workflows/v1/subflows/create_final_text_units.py @@ -17,8 +17,8 @@ from graphrag.index.flows.create_final_text_units import ( create_final_text_units as create_final_text_units_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.utils.ds_util import get_named_input_table, get_required_input_table +from graphrag.storage.pipeline_storage import PipelineStorage @verb(name="create_final_text_units", treats_input_tables_as_immutable=True) diff --git a/graphrag/index/workflows/v1/subflows/generate_text_embeddings.py b/graphrag/index/workflows/v1/subflows/generate_text_embeddings.py index 1ac256a90..b44513a67 100644 --- a/graphrag/index/workflows/v1/subflows/generate_text_embeddings.py +++ b/graphrag/index/workflows/v1/subflows/generate_text_embeddings.py @@ -16,12 +16,12 @@ verb, ) -from graphrag.index.cache.pipeline_cache import PipelineCache +from graphrag.cache.pipeline_cache import PipelineCache from graphrag.index.flows.generate_text_embeddings import ( generate_text_embeddings as generate_text_embeddings_flow, ) -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.index.utils.ds_util import get_required_input_table +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) diff --git a/graphrag/logging/factories.py b/graphrag/logging/factory.py similarity index 100% rename from graphrag/logging/factories.py rename to graphrag/logging/factory.py diff --git a/graphrag/prompt_tune/loader/input.py b/graphrag/prompt_tune/loader/input.py index eb8ad8f8f..f7e29ea42 100644 --- a/graphrag/prompt_tune/loader/input.py +++ 
b/graphrag/prompt_tune/loader/input.py @@ -9,7 +9,7 @@ import graphrag.config.defaults as defs from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.input.load_input import load_input +from graphrag.index.input.factory import create_input from graphrag.index.llm.load_llm import load_llm_embeddings from graphrag.index.operations.chunk_text import chunk_text from graphrag.llm.types.llm_types import EmbeddingLLM @@ -58,7 +58,7 @@ async def load_docs_in_chunks( k: int = K, ) -> list[str]: """Load docs into chunks for generating prompts.""" - dataset = await load_input(config.input, reporter, root) + dataset = await create_input(config.input, reporter, root) # covert to text units chunk_strategy = config.chunks.resolved_strategy(defs.ENCODING_MODEL) diff --git a/graphrag/prompts/__init__.py b/graphrag/prompts/__init__.py index 6a6123b80..3bb0594c3 100644 --- a/graphrag/prompts/__init__.py +++ b/graphrag/prompts/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""All prompts for the system.""" +"""All prompts for the GraphRAG system.""" diff --git a/graphrag/query/factories.py b/graphrag/query/factory.py similarity index 100% rename from graphrag/query/factories.py rename to graphrag/query/factory.py diff --git a/graphrag/query/indexer_adapters.py b/graphrag/query/indexer_adapters.py index 478fe385d..3bad1c7af 100644 --- a/graphrag/query/indexer_adapters.py +++ b/graphrag/query/indexer_adapters.py @@ -19,7 +19,7 @@ from graphrag.model.entity import Entity from graphrag.model.relationship import Relationship from graphrag.model.text_unit import TextUnit -from graphrag.query.factories import get_text_embedder +from graphrag.query.factory import get_text_embedder from graphrag.query.input.loaders.dfs import ( read_communities, read_community_reports, diff --git a/graphrag/index/cache/__init__.py b/graphrag/storage/__init__.py similarity index 62% rename from graphrag/index/cache/__init__.py rename to graphrag/storage/__init__.py index ece87659f..b21f077cb 100644 --- a/graphrag/index/cache/__init__.py +++ b/graphrag/storage/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""The Indexing Engine cache package root.""" +"""The storage package root.""" diff --git a/graphrag/index/storage/blob_pipeline_storage.py b/graphrag/storage/blob_pipeline_storage.py similarity index 98% rename from graphrag/index/storage/blob_pipeline_storage.py rename to graphrag/storage/blob_pipeline_storage.py index da75a1008..9bb39e309 100644 --- a/graphrag/index/storage/blob_pipeline_storage.py +++ b/graphrag/storage/blob_pipeline_storage.py @@ -13,8 +13,8 @@ from azure.storage.blob import BlobServiceClient from datashaper import Progress -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.logging.base import ProgressReporter +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) @@ -305,8 +305,8 @@ def create_blob_storage( msg = "No storage account blob url provided for blob storage." 
raise ValueError(msg) return BlobPipelineStorage( - connection_string, - container_name, + connection_string=connection_string, + container_name=container_name, path_prefix=base_dir, storage_account_blob_url=storage_account_blob_url, ) diff --git a/graphrag/index/storage/load_storage.py b/graphrag/storage/factory.py similarity index 68% rename from graphrag/index/storage/load_storage.py rename to graphrag/storage/factory.py index fc847e06f..b7b677808 100644 --- a/graphrag/index/storage/load_storage.py +++ b/graphrag/storage/factory.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""A module containing load_storage method definition.""" +"""Factory functions for creating storage.""" from __future__ import annotations @@ -13,16 +13,16 @@ PipelineFileStorageConfig, PipelineStorageConfig, ) -from graphrag.index.storage.blob_pipeline_storage import create_blob_storage -from graphrag.index.storage.file_pipeline_storage import create_file_storage -from graphrag.index.storage.memory_pipeline_storage import create_memory_storage +from graphrag.storage.blob_pipeline_storage import create_blob_storage +from graphrag.storage.file_pipeline_storage import create_file_storage +from graphrag.storage.memory_pipeline_storage import MemoryPipelineStorage -def load_storage(config: PipelineStorageConfig): - """Load the storage for a pipeline.""" +def create_storage(config: PipelineStorageConfig): + """Create a storage object based on the config.""" match config.type: case StorageType.memory: - return create_memory_storage() + return MemoryPipelineStorage() case StorageType.blob: config = cast(PipelineBlobStorageConfig, config) return create_blob_storage( diff --git a/graphrag/index/storage/file_pipeline_storage.py b/graphrag/storage/file_pipeline_storage.py similarity index 98% rename from graphrag/index/storage/file_pipeline_storage.py rename to graphrag/storage/file_pipeline_storage.py index dbbca4e18..082569d08 100644 --- a/graphrag/index/storage/file_pipeline_storage.py +++ b/graphrag/storage/file_pipeline_storage.py @@ -16,8 +16,8 @@ from aiofiles.ospath import exists from datashaper import Progress -from graphrag.index.storage.pipeline_storage import PipelineStorage from graphrag.logging.base import ProgressReporter +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) diff --git a/graphrag/index/storage/memory_pipeline_storage.py b/graphrag/storage/memory_pipeline_storage.py similarity index 87% rename from graphrag/index/storage/memory_pipeline_storage.py rename to graphrag/storage/memory_pipeline_storage.py index 80245c387..4f925781c 100644 --- a/graphrag/index/storage/memory_pipeline_storage.py +++ b/graphrag/storage/memory_pipeline_storage.py @@ -3,10 +3,12 @@ """A module containing 'InMemoryStorage' model.""" -from typing import Any +from typing import TYPE_CHECKING, Any -from graphrag.index.storage.file_pipeline_storage import FilePipelineStorage -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.file_pipeline_storage import FilePipelineStorage + +if TYPE_CHECKING: + from graphrag.storage.pipeline_storage import PipelineStorage class MemoryPipelineStorage(FilePipelineStorage): @@ -74,8 +76,3 @@ def child(self, name: str | None) -> "PipelineStorage": def keys(self) -> list[str]: """Return the keys in the storage.""" return list(self._storage.keys()) - - -def create_memory_storage() -> PipelineStorage: - """Create memory storage.""" - return MemoryPipelineStorage() 
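
As a quick reference for the storage factory change above, a minimal, hypothetical usage sketch (not part of the patch) of the new module layout; it assumes the file branch of create_storage still forwards to create_file_storage, as the retained import suggests:

    # Hypothetical sketch: creating storage after the move from
    # graphrag.index.storage.load_storage (load_storage) to graphrag.storage.factory (create_storage).
    from graphrag.index.config.storage import PipelineFileStorageConfig
    from graphrag.storage.factory import create_storage
    from graphrag.storage.memory_pipeline_storage import MemoryPipelineStorage

    # File-backed storage is still built through the factory from a config object.
    file_storage = create_storage(PipelineFileStorageConfig(base_dir="./output"))

    # In-memory storage is now instantiated directly, since the
    # create_memory_storage() helper is removed in this patch.
    memory_storage = MemoryPipelineStorage()
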
diff --git a/graphrag/index/storage/pipeline_storage.py b/graphrag/storage/pipeline_storage.py similarity index 100% rename from graphrag/index/storage/pipeline_storage.py rename to graphrag/storage/pipeline_storage.py diff --git a/graphrag/utils/__init__.py b/graphrag/utils/__init__.py index 4d2f3e2a2..42966138a 100644 --- a/graphrag/utils/__init__.py +++ b/graphrag/utils/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""Util functions for the GraphRAG run module.""" +"""Util functions for the GraphRAG package.""" diff --git a/graphrag/utils/storage.py b/graphrag/utils/storage.py index 8b2e0ce2e..ed2d5b1cb 100644 --- a/graphrag/utils/storage.py +++ b/graphrag/utils/storage.py @@ -5,42 +5,14 @@ import logging from io import BytesIO -from pathlib import Path import pandas as pd -from graphrag.index.config.storage import ( - PipelineFileStorageConfig, - PipelineStorageConfigTypes, -) -from graphrag.index.storage.load_storage import load_storage -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) -def _create_storage( - config: PipelineStorageConfigTypes | None, root_dir: Path -) -> PipelineStorage: - """Create the storage for the pipeline. - - Parameters - ---------- - config : PipelineStorageConfigTypes - The storage configuration. - root_dir : str - The root directory. - - Returns - ------- - PipelineStorage - The pipeline storage. - """ - return load_storage( - config or PipelineFileStorageConfig(base_dir=str(root_dir / "output")) - ) - - async def _load_table_from_storage(name: str, storage: PipelineStorage) -> pd.DataFrame: if not await storage.has(name): msg = f"Could not find {name} in storage!" diff --git a/graphrag/vector_stores/__init__.py b/graphrag/vector_stores/__init__.py index c1d54b741..4f137d07b 100644 --- a/graphrag/vector_stores/__init__.py +++ b/graphrag/vector_stores/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""A module containing vector storage implementations.""" +"""A package containing vector store implementations.""" diff --git a/tests/integration/storage/test_blob_pipeline_storage.py b/tests/integration/storage/test_blob_pipeline_storage.py index d2ea86834..af5b26e07 100644 --- a/tests/integration/storage/test_blob_pipeline_storage.py +++ b/tests/integration/storage/test_blob_pipeline_storage.py @@ -4,7 +4,7 @@ import re -from graphrag.index.storage.blob_pipeline_storage import BlobPipelineStorage +from graphrag.storage.blob_pipeline_storage import BlobPipelineStorage # cspell:disable-next-line well-known-key WELL_KNOWN_BLOB_STORAGE_KEY = "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" diff --git a/tests/integration/storage/test_file_pipeline_storage.py b/tests/integration/storage/test_file_pipeline_storage.py index 3a8b3eaed..03f2c824e 100644 --- a/tests/integration/storage/test_file_pipeline_storage.py +++ b/tests/integration/storage/test_file_pipeline_storage.py @@ -6,7 +6,7 @@ import re from pathlib import Path -from graphrag.index.storage.file_pipeline_storage import FilePipelineStorage +from graphrag.storage.file_pipeline_storage import FilePipelineStorage __dirname__ = os.path.dirname(__file__) diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py index 59ca28760..6e8471c03 100644 --- a/tests/smoke/test_fixtures.py +++ b/tests/smoke/test_fixtures.py @@ -15,10 +15,10 @@ import pandas as pd import pytest -from graphrag.index.storage.blob_pipeline_storage import BlobPipelineStorage from graphrag.query.context_builder.community_context import ( NO_COMMUNITY_RECORDS_WARNING, ) +from graphrag.storage.blob_pipeline_storage import BlobPipelineStorage log = logging.getLogger(__name__) diff --git a/tests/unit/indexing/cache/test_file_pipeline_cache.py b/tests/unit/indexing/cache/test_file_pipeline_cache.py index ff63056ed..045581ad4 100644 --- a/tests/unit/indexing/cache/test_file_pipeline_cache.py +++ b/tests/unit/indexing/cache/test_file_pipeline_cache.py @@ -4,8 +4,8 @@ import os import unittest -from graphrag.index.cache.json_pipeline_cache import JsonPipelineCache -from graphrag.index.storage.file_pipeline_storage import ( +from graphrag.cache.json_pipeline_cache import JsonPipelineCache +from graphrag.storage.file_pipeline_storage import ( FilePipelineStorage, ) diff --git a/tests/unit/indexing/workflows/test_emit.py b/tests/unit/indexing/workflows/test_export.py similarity index 90% rename from tests/unit/indexing/workflows/test_emit.py rename to tests/unit/indexing/workflows/test_export.py index 2d17bb199..59a62e9ec 100644 --- a/tests/unit/indexing/workflows/test_emit.py +++ b/tests/unit/indexing/workflows/test_export.py @@ -13,8 +13,8 @@ from graphrag.index.config.pipeline import PipelineWorkflowReference from graphrag.index.run import run_pipeline -from graphrag.index.storage.memory_pipeline_storage import MemoryPipelineStorage -from graphrag.index.storage.pipeline_storage import PipelineStorage +from graphrag.storage.memory_pipeline_storage import MemoryPipelineStorage +from graphrag.storage.pipeline_storage import PipelineStorage async def mock_verb( @@ -50,7 +50,7 @@ async def mock_no_return_verb( ) -async def test_normal_result_emits_parquet(): +async def test_normal_result_exports_parquet(): mock_verbs: Any = {"mock_verb": mock_verb} mock_workflows: Any = { 
"mock_workflow": lambda _x: [ @@ -84,10 +84,10 @@ async def test_normal_result_emits_parquet(): assert len(pipeline_result) == 1 assert ( storage.keys() == ["stats.json", "mock_write", "mock_workflow.parquet"] - ), "Mock workflow output should be written to storage by the emitter when there is a non-empty data frame" + ), "Mock workflow output should be written to storage by the exporter when there is a non-empty data frame" -async def test_empty_result_does_not_emit_parquet(): +async def test_empty_result_does_not_export_parquet(): mock_verbs: Any = {"mock_no_return_verb": mock_no_return_verb} mock_workflows: Any = { "mock_workflow": lambda _x: [ @@ -122,4 +122,4 @@ async def test_empty_result_does_not_emit_parquet(): assert storage.keys() == [ "stats.json", "empty_write", - ], "Mock workflow output should not be written to storage by the emitter" + ], "Mock workflow output should not be written to storage by the exporter"