From e1eefbcfa7afd3289b7e4ff683d701b81ebe5add Mon Sep 17 00:00:00 2001 From: Chris Trevino Date: Mon, 1 Apr 2024 17:47:39 -0700 Subject: [PATCH] Enable GitHub Actions CI (#3) * disable codeql for now * update gh-action names * reformat * update secrets in ci * mark secrets as required * split up check, build, test * update turbo * spit up testing strata * resolve pydantic deprecations * wire up openai env vars for test * use section.value in env-var reading * llvm install * start azurite before unit tests * skip azure smoke test for now to get CI in place * formatting * smoke test logging * print out len(key) in the fixture * use fragment type in factories * formatting * secret use update * remove is_clean check --- .github/workflows/ci.yml | 39 ++-- .github/workflows/codeql.yml | 48 ---- .github/workflows/python-publish.yml | 2 +- package.json | 2 +- .../graphrag/index/default_config/load.py | 4 +- .../default_config/parameters/factories.py | 23 +- .../graphrag/graphrag/vector_stores/qdrant.py | 216 +++++++++--------- python/graphrag/package.json | 8 +- python/graphrag/pyproject.toml | 1 + python/graphrag/tests/smoke/test_fixtures.py | 11 + .../tests/unit/indexing/config/helpers.py | 6 +- .../tests/unit/indexing/config/test_load.py | 8 +- .../default_config/test_default_config.py | 2 +- turbo.json | 2 +- yarn.lock | 60 ++--- 15 files changed, 209 insertions(+), 223 deletions(-) delete mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4a4012b389..f004a243b5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: JavaScript CI +name: CI on: push: branches: [main] @@ -33,6 +33,11 @@ jobs: with: poetry-version: '1.6.1' + + - run: | + sudo apt-get update + sudo apt-get install -y llvm-11 python3-dev + name: "LLVM install" - uses: actions/checkout@v3 @@ -61,20 +66,28 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} server-token: ${{ secrets.GITHUB_TOKEN }} + - run: yarn check + name: Static Checks + + - run: yarn build + name: Build + - run: yarn start:azurite& name: Start Azurite - - - run: yarn ci - name: Verify + + - run: yarn test:unit + name: Unit Tests env: - GRAPHRAG_API_KEY: $(openaiApiKey) - GRAPHRAG_LLM_MODEL: $(completionModel) - GRAPHRAG_EMBEDDING_MODEL: $(embeddingModel) + GRAPHRAG_API_KEY: ${{ secrets.OPENAI_API_KEY }} - - run: | - git add -A - git status - name: Git status + - run: yarn test:integration + name: Integration Tests + env: + GRAPHRAG_API_KEY: ${{ secrets.OPENAI_API_KEY }} - - run: yarn is_clean - name: Check if repo is clean + - run: yarn test:smoke + name: Smoke Tests + env: + GRAPHRAG_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GRAPHRAG_LLM_MODEL: ${{ secrets.OPENAI_LLM_MODEL }} + GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.OPENAI_EMBEDDING_MODEL }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml deleted file mode 100644 index a5ac3f1801..0000000000 --- a/.github/workflows/codeql.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: 'Code scanning - action' - -on: - push: - pull_request: - schedule: - - cron: '0 19 * * 0' - -jobs: - CodeQL-Build: - # CodeQL runs on ubuntu-latest and windows-latest - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - # We must fetch at least the immediate parents so that if this is - # a pull request then we can checkout the head. - fetch-depth: 2 - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - config-file: ./.github/codeql/codeql-config.yml - # Override language selection by uncommenting this and choosing your languages - # with: - # languages: go, javascript, csharp, python, cpp, java - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). - # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v3 - - # ℹī¸ Command-line programs to run using the OS shell. - # 📚 https://git.io/JvXDl - - # ✏ī¸ If the Autobuild fails above, remove it and uncomment the following three lines - # and modify them (or add more) to build your code if your project - # uses a compiled language - - #- run: | - # make bootstrap - # make release - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 6650a65d85..85565392a1 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -1,4 +1,4 @@ -name: Python-Publish-CI +name: Python Publish on: push: branches: [main] diff --git a/package.json b/package.json index 66e9ee7ade..d83381f617 100644 --- a/package.json +++ b/package.json @@ -32,6 +32,6 @@ "devDependencies": { "cspell": "^8.3.2", "npm-run-all": "^4.1.5", - "turbo": "^1.12.4" + "turbo": "^1.13.0" } } diff --git a/python/graphrag/graphrag/index/default_config/load.py b/python/graphrag/graphrag/index/default_config/load.py index bf49403214..204284b084 100644 --- a/python/graphrag/graphrag/index/default_config/load.py +++ b/python/graphrag/graphrag/index/default_config/load.py @@ -45,8 +45,8 @@ def load_pipeline_config(config_or_path: str | PipelineConfig) -> PipelineConfig for extended_config in config.extends: extended_config = load_pipeline_config(extended_config) merged_config = { - **json.loads(extended_config.json()), - **json.loads(config.json(exclude_unset=True)), + **json.loads(extended_config.model_dump_json()), + **json.loads(config.model_dump_json(exclude_unset=True)), } config = PipelineConfig.model_validate(merged_config) diff --git a/python/graphrag/graphrag/index/default_config/parameters/factories.py b/python/graphrag/graphrag/index/default_config/parameters/factories.py index 02c735ae33..e796183180 100644 --- a/python/graphrag/graphrag/index/default_config/parameters/factories.py +++ b/python/graphrag/graphrag/index/default_config/parameters/factories.py @@ -115,20 +115,23 @@ def default_config_parameters_from_env_vars( root_dir = root_dir or str(Path.cwd()) env = _make_env(root_dir) - def _str(key: str, default_value: str | None = None) -> str | None: - return env(key, default_value) + def _key(key: str | Fragment) -> str | None: + return key.value if isinstance(key, Fragment) else key - def _int(key: str, default_value: int | None = None) -> int | None: - return env.int(key, default_value) + def _str(key: str | Fragment, default_value: str | None = None) -> str | None: + return env(_key(key), default_value) - def _bool(key: str, default_value: bool | None = None) -> bool | None: - return env.bool(key, default_value) + def _int(key: str | Fragment, default_value: int | None = None) -> int | None: + return env.int(_key(key), default_value) - def _float(key: str, default_value: float | None = None) -> float | None: - return env.float(key, default_value) + def _bool(key: str | Fragment, default_value: bool | None = None) -> bool | None: + return env.bool(_key(key), default_value) - def section(key: str): - return env.prefixed(f"{key}_") + def _float(key: str | Fragment, default_value: float | None = None) -> float | None: + return env.float(_key(key), default_value) + + def section(key: Section): + return env.prefixed(f"{key.value}_") fallback_oai_key = _str("OPENAI_API_KEY", _str("AZURE_OPENAI_API_KEY")) fallback_oai_org = _str("OPENAI_ORG_ID") diff --git a/python/graphrag/graphrag/vector_stores/qdrant.py b/python/graphrag/graphrag/vector_stores/qdrant.py index 17f2c006c4..10d2f2e352 100644 --- a/python/graphrag/graphrag/vector_stores/qdrant.py +++ b/python/graphrag/graphrag/vector_stores/qdrant.py @@ -3,111 +3,111 @@ # Licensed under the MIT license. See LICENSE file in the project. # -"""A package containing the Qdrant vector store implementation.""" - -from typing import Any - -from qdrant_client import QdrantClient # type: ignore -from qdrant_client.http import models # type: ignore -from qdrant_client.models import Distance, VectorParams # type: ignore - -from graphrag.model.types import TextEmbedder - -from .base import BaseVectorStore, VectorStoreDocument, VectorStoreSearchResult - - -class Qdrant(BaseVectorStore): - """The Qdrant vector storage implementation.""" - - def connect(self, **kwargs: Any) -> Any: - """Connect to the Qdrant vector store.""" - url = kwargs.get("url", None) - port = kwargs.get("port", 6333) - - api_key = kwargs.get("api_key", None) - timeout = kwargs.get("timeout", 1000) - self.vector_size = kwargs.get("vector_size", 1536) - - if url: - https = kwargs.get("https", "https://" in url) - self.db_connection = QdrantClient( - url=url, port=port, api_key=api_key, https=https, timeout=timeout - ) - else: - # create in-memory db - self.db_connection = QdrantClient(":memory:") - - def load_documents( - self, documents: list[VectorStoreDocument], overwrite: bool = True - ) -> None: - """Load documents into the vector store.""" - if overwrite: - self.db_connection.recreate_collection( - collection_name=self.collection_name, - vectors_config=VectorParams( - size=( - len(documents[0].vector) - if len(documents) > 0 and documents[0].vector - else self.vector_size - ), - distance=Distance.COSINE, - ), - ) - - self.db_connection.upsert( - collection_name=self.collection_name, - points=models.Batch( - ids=[doc.id for doc in documents], - vectors=[doc.vector if doc.vector else [] for doc in documents], - payloads=[{"text": doc.text, **doc.attributes} for doc in documents], - ), - ) - - def filter_by_id(self, include_ids: list[str] | list[int]) -> Any: - """Build a query filter to filter documents by id.""" - self.query_filter = models.Filter( - must=[ - models.HasIdCondition(has_id=include_ids), # type: ignore - ], - ) - return self.query_filter - - def similarity_search_by_vector( - self, query_embedding: list[float], k: int = 10, **kwargs: Any - ) -> list[VectorStoreSearchResult]: - """Perform a vector-based similarity search.""" - docs = self.db_connection.search( - collection_name=self.collection_name, - query_filter=self.query_filter, - query_vector=query_embedding, - limit=k, - with_vectors=True, - ) - - return [ - VectorStoreSearchResult( - document=VectorStoreDocument( - id=doc.id, - text=doc.payload["text"] if doc.payload else "", - vector=doc.vector if doc.vector else [], # type: ignore - attributes=( - {k: v for k, v in doc.payload.items() if k != "text"} - if doc.payload - else {} - ), - ), - score=1 - abs(doc.score), - ) - for doc in docs - ] - - def similarity_search_by_text( - self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any - ) -> list[VectorStoreSearchResult]: - """Perform a text-based similarity search.""" - query_embedding = text_embedder(text) - if query_embedding: - return self.similarity_search_by_vector( - query_embedding=query_embedding, k=k - ) - return [] +"""A package containing the Qdrant vector store implementation.""" + +from typing import Any + +from qdrant_client import QdrantClient # type: ignore +from qdrant_client.http import models # type: ignore +from qdrant_client.models import Distance, VectorParams # type: ignore + +from graphrag.model.types import TextEmbedder + +from .base import BaseVectorStore, VectorStoreDocument, VectorStoreSearchResult + + +class Qdrant(BaseVectorStore): + """The Qdrant vector storage implementation.""" + + def connect(self, **kwargs: Any) -> Any: + """Connect to the Qdrant vector store.""" + url = kwargs.get("url", None) + port = kwargs.get("port", 6333) + + api_key = kwargs.get("api_key", None) + timeout = kwargs.get("timeout", 1000) + self.vector_size = kwargs.get("vector_size", 1536) + + if url: + https = kwargs.get("https", "https://" in url) + self.db_connection = QdrantClient( + url=url, port=port, api_key=api_key, https=https, timeout=timeout + ) + else: + # create in-memory db + self.db_connection = QdrantClient(":memory:") + + def load_documents( + self, documents: list[VectorStoreDocument], overwrite: bool = True + ) -> None: + """Load documents into the vector store.""" + if overwrite: + self.db_connection.recreate_collection( + collection_name=self.collection_name, + vectors_config=VectorParams( + size=( + len(documents[0].vector) + if len(documents) > 0 and documents[0].vector + else self.vector_size + ), + distance=Distance.COSINE, + ), + ) + + self.db_connection.upsert( + collection_name=self.collection_name, + points=models.Batch( + ids=[doc.id for doc in documents], + vectors=[doc.vector if doc.vector else [] for doc in documents], + payloads=[{"text": doc.text, **doc.attributes} for doc in documents], + ), + ) + + def filter_by_id(self, include_ids: list[str] | list[int]) -> Any: + """Build a query filter to filter documents by id.""" + self.query_filter = models.Filter( + must=[ + models.HasIdCondition(has_id=include_ids), # type: ignore + ], + ) + return self.query_filter + + def similarity_search_by_vector( + self, query_embedding: list[float], k: int = 10, **kwargs: Any + ) -> list[VectorStoreSearchResult]: + """Perform a vector-based similarity search.""" + docs = self.db_connection.search( + collection_name=self.collection_name, + query_filter=self.query_filter, + query_vector=query_embedding, + limit=k, + with_vectors=True, + ) + + return [ + VectorStoreSearchResult( + document=VectorStoreDocument( + id=doc.id, + text=doc.payload["text"] if doc.payload else "", + vector=doc.vector if doc.vector else [], # type: ignore + attributes=( + {k: v for k, v in doc.payload.items() if k != "text"} + if doc.payload + else {} + ), + ), + score=1 - abs(doc.score), + ) + for doc in docs + ] + + def similarity_search_by_text( + self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any + ) -> list[VectorStoreSearchResult]: + """Perform a text-based similarity search.""" + query_embedding = text_embedder(text) + if query_embedding: + return self.similarity_search_by_vector( + query_embedding=query_embedding, k=k + ) + return [] diff --git a/python/graphrag/package.json b/python/graphrag/package.json index 59ae9f9e3e..d1a6a12be0 100644 --- a/python/graphrag/package.json +++ b/python/graphrag/package.json @@ -4,7 +4,6 @@ "private": true, "scripts": { "start:azurite": "azurite -L -l ./temp_azurite -d ./temp_azurite/debug.log", - "_test_e2e": "./e2e-test.sh", "_poe_test": "poetry run poe test", "python_authenticate": "poetry config http-basic.resilience resilience $RESILIENCE_ARTIFACTS_PASSWORD", "python_install": "poetry install", @@ -13,7 +12,12 @@ "format": "poetry run poe format", "build": "poetry build && poetry run poe build_docs", "freeze": "poetry freeze-wheel -vvv", - "test": "run-s _poe_test _test_e2e", + "test:unit": "poetry run poe test_unit", + "test:integration": "poetry run poe test_integration", + "test:smoke": "poetry run poe test_smoke", + "test:e2e": "./e2e-test.sh", + "generate:coverage_report": "poetry run poe coverage_report", + "test": "run-s 'test:*' coverage_report", "run:index": "poetry run poe index", "run:query": "poetry run poe query", "upload": "poetry run poe upload", diff --git a/python/graphrag/pyproject.toml b/python/graphrag/pyproject.toml index 6aa4b9122d..fcbf7823de 100644 --- a/python/graphrag/pyproject.toml +++ b/python/graphrag/pyproject.toml @@ -101,6 +101,7 @@ fix_unsafe = "ruff check --fix --unsafe-fixes ." _test_all = "coverage run -m pytest ./tests" test_unit = "pytest ./tests/unit" test_integration = "pytest ./tests/integration" +test_smoke = "pytest ./tests/smoke" build_docs = "python -m graphrag.build_docs" index = "python -m graphrag.index" query = "python -m graphrag.query" diff --git a/python/graphrag/tests/smoke/test_fixtures.py b/python/graphrag/tests/smoke/test_fixtures.py index 88fd50dbd6..a5039a6605 100644 --- a/python/graphrag/tests/smoke/test_fixtures.py +++ b/python/graphrag/tests/smoke/test_fixtures.py @@ -239,20 +239,31 @@ def test_fixture( workflow_config: dict[str, dict[str, Any]], query_config: list[dict[str, str]], ): + print( + "Running smoke test, len(key)=", len(os.environ.get("GRAPHRAG_API_KEY", "")) + ) + if workflow_config.get("skip", False): + print("skipping smoke test :-()") + return + azure = workflow_config.get("azure") root = Path(input_path) dispose = None if azure is not None: dispose = asyncio.run(prepare_azurite_data(input_path, azure)) + print("running indexer") self.__run_indexer(root, input_type) + print("indexer complete") if dispose is not None: dispose() if not workflow_config.get("skip_assert", False): + print("performing dataset assertions") self.__assert_indexer_outputs(root, workflow_config) + print("running queries") for query in query_config: result = self.__run_query(root, query) diff --git a/python/graphrag/tests/unit/indexing/config/helpers.py b/python/graphrag/tests/unit/indexing/config/helpers.py index d785bc647b..939b16ffd1 100644 --- a/python/graphrag/tests/unit/indexing/config/helpers.py +++ b/python/graphrag/tests/unit/indexing/config/helpers.py @@ -22,10 +22,12 @@ def assert_contains_default_config( assert config is not None assert isinstance(config, PipelineConfig) - checked_config = json.loads(config.json(exclude_defaults=True, exclude_unset=True)) + checked_config = json.loads( + config.model_dump_json(exclude_defaults=True, exclude_unset=True) + ) actual_default_config = json.loads( - default_config(default_config_parameters_from_env_vars(".")).json( + default_config(default_config_parameters_from_env_vars(".")).model_dump_json( exclude_defaults=True, exclude_unset=True ) ) diff --git a/python/graphrag/tests/unit/indexing/config/test_load.py b/python/graphrag/tests/unit/indexing/config/test_load.py index 73ba0844b0..aaab622ce2 100644 --- a/python/graphrag/tests/unit/indexing/config/test_load.py +++ b/python/graphrag/tests/unit/indexing/config/test_load.py @@ -79,13 +79,13 @@ def assert_is_default_config( assert isinstance(config, PipelineConfig) checked_config = json.loads( - config.json(exclude_defaults=True, exclude_unset=True) + config.model_dump_json(exclude_defaults=True, exclude_unset=True) ) actual_default_config = json.loads( - default_config(default_config_parameters_from_env_vars(".")).json( - exclude_defaults=True, exclude_unset=True - ) + default_config( + default_config_parameters_from_env_vars(".") + ).model_dump_json(exclude_defaults=True, exclude_unset=True) ) props_to_ignore = ["root_dir", "extends"] diff --git a/python/graphrag/tests/unit/indexing/default_config/test_default_config.py b/python/graphrag/tests/unit/indexing/default_config/test_default_config.py index f8b2818236..1341fe574b 100644 --- a/python/graphrag/tests/unit/indexing/default_config/test_default_config.py +++ b/python/graphrag/tests/unit/indexing/default_config/test_default_config.py @@ -265,7 +265,7 @@ def test_yaml_load_e2e(): # generate the pipeline from the default parameters pipeline_config = default_config(parameters, True) - config_str = pipeline_config.json() + config_str = pipeline_config.model_dump_json() assert "${PIPELINE_LLM_API_KEY}" not in config_str assert "${PIPELINE_LLM_API_BASE}" not in config_str assert "${PIPELINE_LLM_API_VERSION}" not in config_str diff --git a/turbo.json b/turbo.json index 8c918eeec5..68b615537f 100644 --- a/turbo.json +++ b/turbo.json @@ -6,7 +6,7 @@ "outputs": ["dist/**", "docs/**"] }, "test": { - "dependsOn": ["build"], + "dependsOn": [], "outputs": [] }, "check": { diff --git a/yarn.lock b/yarn.lock index d1685cd32e..9bf3df2556 100644 --- a/yarn.lock +++ b/yarn.lock @@ -835,7 +835,7 @@ __metadata: dependencies: cspell: "npm:^8.3.2" npm-run-all: "npm:^4.1.5" - turbo: "npm:^1.12.4" + turbo: "npm:^1.13.0" languageName: unknown linkType: soft @@ -6174,58 +6174,58 @@ __metadata: languageName: node linkType: hard -"turbo-darwin-64@npm:1.12.4": - version: 1.12.4 - resolution: "turbo-darwin-64@npm:1.12.4" +"turbo-darwin-64@npm:1.13.1": + version: 1.13.1 + resolution: "turbo-darwin-64@npm:1.13.1" conditions: os=darwin & cpu=x64 languageName: node linkType: hard -"turbo-darwin-arm64@npm:1.12.4": - version: 1.12.4 - resolution: "turbo-darwin-arm64@npm:1.12.4" +"turbo-darwin-arm64@npm:1.13.1": + version: 1.13.1 + resolution: "turbo-darwin-arm64@npm:1.13.1" conditions: os=darwin & cpu=arm64 languageName: node linkType: hard -"turbo-linux-64@npm:1.12.4": - version: 1.12.4 - resolution: "turbo-linux-64@npm:1.12.4" +"turbo-linux-64@npm:1.13.1": + version: 1.13.1 + resolution: "turbo-linux-64@npm:1.13.1" conditions: os=linux & cpu=x64 languageName: node linkType: hard -"turbo-linux-arm64@npm:1.12.4": - version: 1.12.4 - resolution: "turbo-linux-arm64@npm:1.12.4" +"turbo-linux-arm64@npm:1.13.1": + version: 1.13.1 + resolution: "turbo-linux-arm64@npm:1.13.1" conditions: os=linux & cpu=arm64 languageName: node linkType: hard -"turbo-windows-64@npm:1.12.4": - version: 1.12.4 - resolution: "turbo-windows-64@npm:1.12.4" +"turbo-windows-64@npm:1.13.1": + version: 1.13.1 + resolution: "turbo-windows-64@npm:1.13.1" conditions: os=win32 & cpu=x64 languageName: node linkType: hard -"turbo-windows-arm64@npm:1.12.4": - version: 1.12.4 - resolution: "turbo-windows-arm64@npm:1.12.4" +"turbo-windows-arm64@npm:1.13.1": + version: 1.13.1 + resolution: "turbo-windows-arm64@npm:1.13.1" conditions: os=win32 & cpu=arm64 languageName: node linkType: hard -"turbo@npm:^1.12.4": - version: 1.12.4 - resolution: "turbo@npm:1.12.4" - dependencies: - turbo-darwin-64: "npm:1.12.4" - turbo-darwin-arm64: "npm:1.12.4" - turbo-linux-64: "npm:1.12.4" - turbo-linux-arm64: "npm:1.12.4" - turbo-windows-64: "npm:1.12.4" - turbo-windows-arm64: "npm:1.12.4" +"turbo@npm:^1.13.0": + version: 1.13.1 + resolution: "turbo@npm:1.13.1" + dependencies: + turbo-darwin-64: "npm:1.13.1" + turbo-darwin-arm64: "npm:1.13.1" + turbo-linux-64: "npm:1.13.1" + turbo-linux-arm64: "npm:1.13.1" + turbo-windows-64: "npm:1.13.1" + turbo-windows-arm64: "npm:1.13.1" dependenciesMeta: turbo-darwin-64: optional: true @@ -6241,7 +6241,7 @@ __metadata: optional: true bin: turbo: bin/turbo - checksum: c58920f24aed084c59813543bcbd7617977798611a59e791595e965097f86bfad1014a2442ae04de59e363e45e48b2e0cc881f90dab5f97c40cc2f7b7db8bdee + checksum: 2fe682b70b4e0e8d64dfef80ec3aae52acfae2ac99dd3e633d5bbf5958ac8c322ce563081f56852e2e6a1766b673141d65ec296661a6795c8e99a5f7c291d80f languageName: node linkType: hard