diff --git a/.github/workflows/bootstrap.yaml b/.github/workflows/bootstrap.yaml index 3e1df48..4886b3b 100644 --- a/.github/workflows/bootstrap.yaml +++ b/.github/workflows/bootstrap.yaml @@ -212,8 +212,8 @@ jobs: git branch -D "$AUTOMATION_BRANCH" || : git checkout -b "$AUTOMATION_BRANCH" else - # The -B flag swaps branch and creates it if NOT present - git checkout -B "$AUTOMATION_BRANCH" + git fetch origin "$AUTOMATION_BRANCH" + git switch -c "$AUTOMATION_BRANCH" "origin/$AUTOMATION_BRANCH" fi # Only if NOT running in GitHub diff --git a/.github/workflows/builds.yaml b/.github/workflows/builds.yaml index 1c7f171..a152862 100644 --- a/.github/workflows/builds.yaml +++ b/.github/workflows/builds.yaml @@ -10,22 +10,34 @@ on: - "*" - "!update-devops-tooling" +env: + package-path: "dist/" + jobs: - parse-project-metadata: - name: "Determine Python versions" - # yamllint disable-line rule:line-length - uses: os-climate/devops-reusable-workflows/.github/workflows/pyproject-toml-fetch-matrix.yaml@main + get-python-versions: + name: "Validate Python project" + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.parse-project-metadata.outputs.python-matrix-versions }} + + steps: + - uses: actions/checkout@v4 - test-builds: - name: "Build: Python" - needs: [parse-project-metadata] + - name: "Parse: pyproject.toml" + id: parse-project-metadata + # yamllint disable-line rule:line-length + uses: os-climate/devops-reusable-workflows/.github/actions/python-versions-matrix@main + + builds: + name: "Python builds" + needs: [get-python-versions] runs-on: "ubuntu-latest" continue-on-error: true # Don't run when pull request is merged if: github.event.pull_request.merged == false strategy: fail-fast: false - matrix: ${{ fromJson(needs.parse-project-metadata.outputs.matrix) }} + matrix: ${{ fromJson(needs.get-python-versions.outputs.matrix) }} steps: - name: "Populate environment variables" @@ -81,8 +93,8 @@ jobs: python -m build fi - - name: "Validating Artefacts with Twine" - run: | - echo "Validating artefacts with: twine check dist/*" - pip install --upgrade twine - twine check dist/* + - name: "Validate Artefacts with Twine" + id: twine-check-artefacts + env: + package-path: ${{ env.package-path }} + uses: os-climate/devops-reusable-workflows/.github/actions/twine-check-artefacts@main diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..876fb54 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,83 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "🔐 CodeQL" + +on: + push: + branches: [ "main", "gh-pages", "master" ] + pull_request: + branches: [ "main", "gh-pages", "master" ] + schedule: + - cron: '39 20 * * 6' + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. 
To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': + # 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + - if: matrix.build-mode == 'manual' + shell: bash + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index ac254c4..6fe13e0 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,17 +1,13 @@ --- -name: "🐍📦 Production build and release" +name: "🐍📦 Old Production build and release" # GitHub/PyPI trusted publisher documentation: # https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ # yamllint disable-line rule:truthy on: - # workflow_dispatch: push: # Only invoked on release tag pushes - branches: - - 'main' - - 'master' tags: - 'v*.*.*' @@ -28,8 +24,9 @@ jobs: if: startsWith(github.ref, 'refs/tags/') runs-on: ubuntu-latest permissions: - # IMPORTANT: mandatory for Sigstore + contents: write id-token: write + steps: ### BUILDING ### @@ -44,10 +41,20 @@ jobs: - name: "Setup PDM for build commands" uses: pdm-project/setup-pdm@v4 + - name: "Fetch current semantic tag" + id: fetch-tags + # yamllint disable-line rule:line-length + uses: os-climate/devops-reusable-workflows/.github/actions/latest-semantic-tag@main + - name: "Update version from tags for production release" run: | - echo "Github versioning: ${{ github.ref_name }}" - scripts/release-versioning.sh + echo "Github tag/versioning: ${{ github.ref_name }}" + if (grep 'dynamic = \[\"version\"\]' pyproject.toml > /dev/null); then + 
echo "Proceeding build with dynamic versioning" + else + echo "Using legacy script to bump release version" + scripts/release-versioning.sh + fi - name: "Build with PDM backend" run: | @@ -56,7 +63,8 @@ jobs: ### SIGNING ### - name: "Sign packages with Sigstore" - uses: sigstore/gh-action-sigstore-python@v2 + # Use new action + uses: sigstore/gh-action-sigstore-python@v3.0.0 with: inputs: >- ./dist/*.tar.gz @@ -72,8 +80,6 @@ jobs: github: name: "📦 Publish to GitHub" - # Only publish on tag pushes - if: startsWith(github.ref, 'refs/tags/') needs: - build runs-on: ubuntu-latest @@ -94,20 +100,17 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} prerelease: false tag_name: ${{ github.ref_name }} - name: "Test/Development Build \ - ${{ github.ref_name }}" + name: ${{ github.ref_name }}" # body_path: ${{ github.workspace }}/CHANGELOG.rst files: | dist/*.tar.gz dist/*.whl - dist/*.sigstore + dist/*.sigstore* ### PUBLISH PYPI TEST ### testpypi: - name: "📦 Publish to PyPi Test" - # Only publish on tag pushes - if: startsWith(github.ref, 'refs/tags/') + name: "📦 Test publishing to PyPI" needs: - build runs-on: ubuntu-latest @@ -128,9 +131,9 @@ jobs: if [ -f dist/buildvars.txt ]; then rm dist/buildvars.txt fi - rm dist/*.sigstore + rm dist/*.sigstore* - - name: Publish distribution to Test PyPI + - name: "Test publishing to PyPI" uses: pypa/gh-action-pypi-publish@release/v1 with: repository-url: https://test.pypi.org/legacy/ @@ -140,8 +143,6 @@ jobs: pypi: name: "📦 Publish to PyPi" - # Only publish on tag pushes - if: startsWith(github.ref, 'refs/tags/') needs: - testpypi runs-on: ubuntu-latest @@ -162,7 +163,7 @@ jobs: if [ -f dist/buildvars.txt ]; then rm dist/buildvars.txt fi - rm dist/*.sigstore + rm dist/*.sigstore* - name: "Setup PDM for build commands" uses: pdm-project/setup-pdm@v4 diff --git a/.github/workflows/security.yaml b/.github/workflows/security.yaml index 1561763..89c57f1 100644 --- a/.github/workflows/security.yaml +++ b/.github/workflows/security.yaml @@ -16,21 +16,29 @@ on: - "!update-devops-tooling" jobs: + get-python-versions: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.parse-project-metadata.outputs.python-matrix-versions }} + + steps: + - uses: actions/checkout@v4 - parse-project-metadata: - name: "Determine Python versions" - # yamllint disable-line rule:line-length - uses: os-climate/devops-reusable-workflows/.github/workflows/pyproject-toml-fetch-matrix.yaml@main + - name: "Populate environment variables" + id: parse-project-metadata + # yamllint disable-line rule:line-length + uses: os-climate/devops-reusable-workflows/.github/actions/python-versions-matrix@main - build: - name: "Audit Python dependencies" - needs: [parse-project-metadata] - runs-on: ubuntu-latest + builds: + name: "Python builds" + needs: [get-python-versions] + runs-on: "ubuntu-latest" + continue-on-error: true # Don't run when pull request is merged if: github.event.pull_request.merged == false strategy: fail-fast: false - matrix: ${{ fromJson(needs.parse-project-metadata.outputs.matrix) }} + matrix: ${{ fromJson(needs.get-python-versions.outputs.matrix) }} steps: - name: "Checkout repository" @@ -57,4 +65,4 @@ jobs: pdm list --graph - name: "Run: pip-audit" - uses: pypa/gh-action-pip-audit@v1.1.0 + uses: pypa/gh-action-pip-audit@v1.0.8 diff --git a/.github/workflows/testing.yaml b/.github/workflows/testing.yaml index aebd8a9..125e06e 100644 --- a/.github/workflows/testing.yaml +++ b/.github/workflows/testing.yaml @@ -11,21 +11,28 @@ on: - "!update-devops-tooling" jobs: + get-python-versions: + 
runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.parse-project-metadata.outputs.python-matrix-versions }} + + steps: + - uses: actions/checkout@v4 - parse-project-metadata: - name: "Determine Python versions" - # yamllint disable-line rule:line-length - uses: os-climate/devops-reusable-workflows/.github/workflows/pyproject-toml-fetch-matrix.yaml@main + - name: "Populate environment variables" + id: parse-project-metadata + # yamllint disable-line rule:line-length + uses: os-climate/devops-reusable-workflows/.github/actions/python-versions-matrix@main testing: name: "Run unit tests" - needs: [parse-project-metadata] + needs: [get-python-versions] runs-on: ubuntu-latest # Don't run when pull request is merged if: github.event.pull_request.merged == false strategy: fail-fast: false - matrix: ${{ fromJson(needs.parse-project-metadata.outputs.matrix) }} + matrix: ${{ fromJson(needs.get-python-versions.outputs.matrix) }} steps: - name: "Checkout repository" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d232f45..1e7725f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,6 @@ --- ci: autofix_commit_msg: "Chore: pre-commit autoupdate" - skip: - # pre-commit.ci cannot install WGET, so tomlint must be disabled - - tomllint exclude: | (?x)^( @@ -13,21 +10,10 @@ exclude: | repos: - - repo: local - hooks: - - id: tomllint - name: "Script: scripts/tomllint.sh" - language: script - # pass_filenames: false - files: \^*.toml - types: [file] - entry: scripts/tomllint.sh . - - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 hooks: - id: check-added-large-files - - id: check-ast - id: check-case-conflict - id: check-executables-have-shebangs - id: check-json @@ -38,17 +24,15 @@ repos: # - id: detect-aws-credentials - id: check-xml - id: check-yaml - - id: debug-statements - id: detect-private-key - id: end-of-file-fixer - - id: mixed-line-ending - args: ["--fix=lf"] + # - id: mixed-line-ending + # args: ["--fix=lf"] - id: name-tests-test args: ["--pytest-test-first"] - id: no-commit-to-branch # - id: pretty-format-json - id: requirements-txt-fixer - - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 @@ -78,39 +62,27 @@ repos: rev: v0.10.0.1 hooks: - id: shellcheck - - - repo: https://github.com/pycqa/pydocstyle.git - rev: 6.3.0 - hooks: - - id: pydocstyle - additional_dependencies: ["tomli"] + args: ["-x"] # Check external files - repo: https://github.com/Mateusz-Grzelinski/actionlint-py rev: v1.7.1.15 hooks: - id: actionlint - - repo: https://github.com/pycqa/flake8 - rev: "7.1.1" - hooks: - - id: flake8 - additional_dependencies: - - pep8-naming - - repo: https://github.com/adrienverge/yamllint.git rev: v1.35.1 hooks: - id: yamllint - args: [ "-d", "{rules: {line-length: {max: 120}}, ignore-from-file: [.gitignore],}", ] + args: + ["-d", "{rules: {line-length: {max: 120}}, + ignore-from-file: [.gitignore],}"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.6 + rev: v0.5.5 hooks: - id: ruff - files: ^(scripts|tests|custom_components)/.+\.py$ - args: [--fix, --exit-non-zero-on-fix] + args: [--fix, --exit-non-zero-on-fix, --config=pyproject.toml] - id: ruff-format - files: ^(scripts|tests|custom_components)/.+\.py$ - repo: local hooks: @@ -118,28 +90,23 @@ repos: name: "create mypy cache" language: system pass_filenames: false - entry: bash -c 'if [ ! -d .mypy_cache ]; then /bin/mkdir .mypy_cache; fi; exit 0' + entry: bash -c 'if [ ! 
-d .mypy_cache ]; + then /bin/mkdir .mypy_cache; fi; exit 0' - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.11.1" + rev: "v1.11.0" hooks: - id: mypy verbose: true args: ["--show-error-codes", "--install-types", "--non-interactive"] additional_dependencies: ["pytest", "types-requests"] +# yamllint disable rule:comments-indentation # Check for misspellings in documentation files # - repo: https://github.com/codespell-project/codespell # rev: v2.2.2 # hooks: - # - id: codespell - - # To embrace black styles, even in docs - # - repo: https://github.com/asottile/blacken-docs - # rev: v1.13.0 - # hooks: - # - id: blacken-docs - # additional_dependencies: [black] + # - id: codespell # Automatically upgrade Python syntax for newer versions # - repo: https://github.com/asottile/pyupgrade @@ -147,3 +114,4 @@ repos: # hooks: # - id: pyupgrade # args: ['--py37-plus'] +# yamllint enable rule:comments-indentation diff --git a/osc_extraction_utils/conftest.py b/osc_extraction_utils/conftest.py index 96ce88d..1c7d681 100644 --- a/osc_extraction_utils/conftest.py +++ b/osc_extraction_utils/conftest.py @@ -74,7 +74,9 @@ def create_multiple_xlsx_files(path_folder: Path) -> None: create_single_xlsx_file(path_folder, file_name=f"xlsx_file_{i}.xlsx") -def modify_project_settings(project_settings: typing.Dict, *args: typing.Tuple[str, str, bool]) -> typing.Dict: +def modify_project_settings( + project_settings: typing.Dict, *args: typing.Tuple[str, str, bool] +) -> typing.Dict: """Returns are modified project settings dict based on the input args :param project_settings: Project settings @@ -128,7 +130,9 @@ def s3_settings() -> S3Settings: # TODO add test mode paths? @pytest.fixture(scope="session") def project_paths(main_settings: MainSettings) -> ProjectPaths: - return ProjectPaths("test_project", main_settings, Path(__file__).parents[1].resolve()) + return ProjectPaths( + "test_project", main_settings, Path(__file__).parents[1].resolve() + ) @pytest.fixture(scope="session") @@ -155,8 +159,12 @@ def prerequisites_generate_text( write_to_file(path_current_file, f"That is a test {i}", "HEADER") with ( - patch.object(project_paths, "path_folder_relevance", Path(path_folder_relevance)), - patch.object(project_paths, "path_folder_text_3434", Path(path_folder_text_3434)), + patch.object( + project_paths, "path_folder_relevance", Path(path_folder_relevance) + ), + patch.object( + project_paths, "path_folder_text_3434", Path(path_folder_text_3434) + ), patch("osc_extraction_utils.merger.os.getenv", lambda *args: args[0]), ): yield diff --git a/osc_extraction_utils/converter.py b/osc_extraction_utils/converter.py index 0aea73d..fd5d3e1 100644 --- a/osc_extraction_utils/converter.py +++ b/osc_extraction_utils/converter.py @@ -11,7 +11,9 @@ def convert(self) -> None: class XlsToCsvConverter(Converter): - def __init__(self, path_folder_source: Path = Path(), path_folder_destination: Path = Path()): + def __init__( + self, path_folder_source: Path = Path(), path_folder_destination: Path = Path() + ): self.path_folder_source: Path = path_folder_source self.path_folder_destination: Path = path_folder_destination @@ -44,7 +46,9 @@ def convert(self) -> None: self._convert_single_file_to_csv(list_paths_xlsx_files[0]) def _find_xlsx_files_in_source_folder(self) -> list[Path]: - list_paths_xlsx_files: list[Path] = list(self._path_folder_source.glob("*.xlsx")) + list_paths_xlsx_files: list[Path] = list( + self._path_folder_source.glob("*.xlsx") + ) return list_paths_xlsx_files def _check_for_valid_paths(self) -> 
None: @@ -62,5 +66,7 @@ def _check_xlsx_files(self, list_paths_xlsx_files: list[Path]) -> None: def _convert_single_file_to_csv(self, path_file: Path) -> None: print(f"Converting {path_file} to csv-format") df_read_excel: pd.DataFrame = pd.read_excel(path_file, engine="openpyxl") - path_csv_file: Path = self._path_folder_destination / "aggregated_annotation.csv" + path_csv_file: Path = ( + self._path_folder_destination / "aggregated_annotation.csv" + ) df_read_excel.to_csv(path_csv_file, index=False, header=True) diff --git a/osc_extraction_utils/core_utils.py b/osc_extraction_utils/core_utils.py index 1eb7c5a..7df5d23 100644 --- a/osc_extraction_utils/core_utils.py +++ b/osc_extraction_utils/core_utils.py @@ -26,12 +26,16 @@ def _delete_file(path_file: Path) -> None: print("Failed to delete %s. Reason: %s" % (str(path_file), exception)) -def copy_file_without_overwrite(path_folder_source_as_str: str, path_folder_destination_as_str: str) -> bool: +def copy_file_without_overwrite( + path_folder_source_as_str: str, path_folder_destination_as_str: str +) -> bool: path_folder_source = Path(path_folder_source_as_str) path_folder_destination = Path(path_folder_destination_as_str) for path_file_current_source in path_folder_source.iterdir(): - path_file_current_destination = path_folder_destination / path_file_current_source.name + path_file_current_destination = ( + path_folder_destination / path_file_current_source.name + ) if not path_file_current_destination.exists(): shutil.copyfile(path_file_current_source, path_file_current_destination) return True @@ -62,14 +66,24 @@ def copy_file_without_overwrite(path_folder_source_as_str: str, path_folder_dest def download_data_from_s3_main_bucket_to_local_folder_if_required( - s3_bucket: S3Communication, path_s3_with_prefix_folder: Path, path_local_folder: Path, main_settings: MainSettings + s3_bucket: S3Communication, + path_s3_with_prefix_folder: Path, + path_local_folder: Path, + main_settings: MainSettings, ): if main_settings.general.s3_usage: - s3_bucket.download_files_in_prefix_to_dir(path_s3_with_prefix_folder, path_local_folder) + s3_bucket.download_files_in_prefix_to_dir( + path_s3_with_prefix_folder, path_local_folder + ) def upload_data_from_local_folder_to_s3_interim_bucket_if_required( - s3_bucket: S3Communication, path_local_folder: Path, path_s3_with_prefix_folder: Path, main_settings: MainSettings + s3_bucket: S3Communication, + path_local_folder: Path, + path_s3_with_prefix_folder: Path, + main_settings: MainSettings, ): if main_settings.general.s3_usage: - s3_bucket.upload_files_in_dir_to_prefix(path_local_folder, path_s3_with_prefix_folder) + s3_bucket.upload_files_in_dir_to_prefix( + path_local_folder, path_s3_with_prefix_folder + ) diff --git a/osc_extraction_utils/merger.py b/osc_extraction_utils/merger.py index b0a8899..2174d24 100644 --- a/osc_extraction_utils/merger.py +++ b/osc_extraction_utils/merger.py @@ -9,7 +9,12 @@ class Merger: # TODO finish Merger class - def __init__(self, main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths) -> None: + def __init__( + self, + main_settings: MainSettings, + s3_settings: S3Settings, + project_paths: ProjectPaths, + ) -> None: self.main_settings: MainSettings = main_settings self.s3_settings: S3Settings = s3_settings self.project_paths: ProjectPaths = project_paths @@ -33,7 +38,9 @@ def _return_s3_communication_interim(self) -> S3Communication: return S3Communication( s3_endpoint_url=os.getenv(self.s3_settings.interim_bucket.s3_endpoint), 
aws_access_key_id=os.getenv(self.s3_settings.interim_bucket.s3_access_key), - aws_secret_access_key=os.getenv(self.s3_settings.interim_bucket.s3_secret_key), + aws_secret_access_key=os.getenv( + self.s3_settings.interim_bucket.s3_secret_key + ), s3_bucket=os.getenv(self.s3_settings.interim_bucket.s3_bucket_name), ) @@ -47,7 +54,9 @@ def _download_inference_related_files_from_s3(self) -> None: / "Text" ) # TODO wrong type - self.s3_communication_main.download_files_in_prefix_to_dir(str(path_file_related_s3), str(self.project_paths.path_folder_relevance)) # type: ignore + self.s3_communication_main.download_files_in_prefix_to_dir( + str(path_file_related_s3), str(self.project_paths.path_folder_relevance) + ) # type: ignore def _upload_inference_related_files_to_s3(self) -> None: path_file_upload_to_s3: Path = ( @@ -59,12 +68,20 @@ def _upload_inference_related_files_to_s3(self) -> None: / "text_3434.csv" ) # TODO wrong type - self.s3_communication_interim.upload_file_to_s3(filepath=str(path_file_upload_to_s3), s3_prefix=str(path_file_upload_to_s3.parent), s3_key=str(path_file_upload_to_s3.name)) # type: ignore + self.s3_communication_interim.upload_file_to_s3( + filepath=str(path_file_upload_to_s3), + s3_prefix=str(path_file_upload_to_s3.parent), + s3_key=str(path_file_upload_to_s3.name), + ) # type: ignore def _weird_writing_stuff(self) -> bool: - with open(str(self.project_paths.path_folder_text_3434) + r"/text_3434.csv", "w") as file_out: + with open( + str(self.project_paths.path_folder_text_3434) + r"/text_3434.csv", "w" + ) as file_out: very_first = True - rel_inf_list = list(glob.iglob(str(self.project_paths.path_folder_relevance) + r"/*.csv")) + rel_inf_list = list( + glob.iglob(str(self.project_paths.path_folder_relevance) + r"/*.csv") + ) if len(rel_inf_list) == 0: print("No relevance inference results found.") return False @@ -84,7 +101,12 @@ def _weird_writing_stuff(self) -> bool: return False -def generate_text_3434(project_name: str, s3_usage: bool, s3_settings: S3Settings, project_paths: ProjectPaths): +def generate_text_3434( + project_name: str, + s3_usage: bool, + s3_settings: S3Settings, + project_paths: ProjectPaths, +): """ This function merges all infer relevance outputs into one large file, which is then used to train the kpi extraction model. 
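Aside: the merge that generate_text_3434 (and Merger._weird_writing_stuff) performs is a plain CSV concatenation — the first infer-relevance file is copied with its header, every later file is appended with its header row dropped. A minimal, hypothetical sketch of that pattern in plain Python (merge_relevance_csvs is not a name from this package):

import glob
from pathlib import Path


def merge_relevance_csvs(folder_relevance: Path, path_out: Path) -> bool:
    """Concatenate infer-relevance CSVs into one file with a single header."""
    csv_paths = sorted(glob.iglob(str(folder_relevance / "*.csv")))
    if not csv_paths:
        print("No relevance inference results found.")
        return False
    with open(path_out, "w") as file_out:
        for index, csv_path in enumerate(csv_paths):
            with open(csv_path) as file_in:
                lines = file_in.readlines()
            # Only the very first file keeps its header row.
            file_out.writelines(lines if index == 0 else lines[1:])
    return True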
@@ -102,12 +124,25 @@ def generate_text_3434(project_name: str, s3_usage: bool, s3_settings: S3Setting s3_bucket=os.getenv(s3_settings.main_bucket.s3_bucket_name), ) # Download infer relevance files - prefix_rel_infer = str(Path(s3_settings.prefix) / project_name / "data" / "output" / "RELEVANCE" / "Text") - s3c_main.download_files_in_prefix_to_dir(prefix_rel_infer, str(project_paths.path_folder_relevance)) + prefix_rel_infer = str( + Path(s3_settings.prefix) + / project_name + / "data" + / "output" + / "RELEVANCE" + / "Text" + ) + s3c_main.download_files_in_prefix_to_dir( + prefix_rel_infer, str(project_paths.path_folder_relevance) + ) - with open(str(project_paths.path_folder_text_3434) + r"/text_3434.csv", "w") as file_out: + with open( + str(project_paths.path_folder_text_3434) + r"/text_3434.csv", "w" + ) as file_out: very_first = True - rel_inf_list = list(glob.iglob(str(project_paths.path_folder_relevance) + r"/*.csv")) + rel_inf_list = list( + glob.iglob(str(project_paths.path_folder_relevance) + r"/*.csv") + ) if len(rel_inf_list) == 0: print("No relevance inference results found.") return False @@ -132,7 +167,9 @@ def generate_text_3434(project_name: str, s3_usage: bool, s3_settings: S3Setting aws_secret_access_key=os.getenv(s3_settings.interim_bucket.s3_secret_key), s3_bucket=os.getenv(s3_settings.interim_bucket.s3_bucket_name), ) - project_prefix_text3434 = str(Path(s3_settings.prefix) / project_name / "data" / "interim" / "ml") + project_prefix_text3434 = str( + Path(s3_settings.prefix) / project_name / "data" / "interim" / "ml" + ) s3c_interim.upload_file_to_s3( filepath=str(project_paths.path_folder_text_3434) + r"/text_3434.csv", s3_prefix=project_prefix_text3434, diff --git a/osc_extraction_utils/paths.py b/osc_extraction_utils/paths.py index bcbadb4..fcb785d 100644 --- a/osc_extraction_utils/paths.py +++ b/osc_extraction_utils/paths.py @@ -32,26 +32,44 @@ class ProjectPaths(BaseSettings): path_folder_source_annotation: Path = Field(default=Path("input/annotations")) path_folder_source_mapping: Path = Field(default=Path("input/kpi_mapping")) path_folder_destination_pdf: Path = Field(default=Path("interim/pdfs")) - path_folder_destination_annotation: Path = Field(default=Path("interim/ml/annotations")) + path_folder_destination_annotation: Path = Field( + default=Path("interim/ml/annotations") + ) path_folder_destination_mapping: Path = Field(default=Path("interim/kpi_mapping")) - path_folder_destination_extraction: Path = Field(default=Path("interim/ml/extraction")) + path_folder_destination_extraction: Path = Field( + default=Path("interim/ml/extraction") + ) path_folder_destination_curation: Path = Field(default=Path("interim/ml/curation")) path_folder_destination_training: Path = Field(default=Path("interim/ml/training")) - path_folder_destination_saved_models_relevance: Path = Field(default=Path("RELEVANCE/Text")) - path_folder_destination_saved_models_inference: Path = Field(default=Path("KPI_EXTRACTION/Text")) + path_folder_destination_saved_models_relevance: Path = Field( + default=Path("RELEVANCE/Text") + ) + path_folder_destination_saved_models_inference: Path = Field( + default=Path("KPI_EXTRACTION/Text") + ) path_folder_text_3434: Path = Field(default=Path("interim/ml")) path_folder_relevance: Path = Field(default=Path("output/RELEVANCE/Text")) - def __init__(self, string_project_name: str, main_settings: MainSettings, path_folder_root: Path, **kwargs): + def __init__( + self, + string_project_name: str, + main_settings: MainSettings, + path_folder_root: Path, + 
**kwargs, + ): super().__init__(**kwargs) if not isinstance(string_project_name, str): raise TypeError self._string_project_name: str = string_project_name self._PATH_FOLDER_ROOT = path_folder_root.resolve() - self._path_project_data_folder: Path = self._PATH_FOLDER_DATA / Path(string_project_name) - self._path_project_model_folder: Path = self._PATH_FOLDER_MODEL / Path(string_project_name) + self._path_project_data_folder: Path = self._PATH_FOLDER_DATA / Path( + string_project_name + ) + self._path_project_model_folder: Path = self._PATH_FOLDER_MODEL / Path( + string_project_name + ) self._main_settings: MainSettings = main_settings self._update_all_paths_depending_on_path_project_data_folder() self._update_all_paths_depending_on_path_project_model_folder() @@ -118,13 +136,19 @@ def PYTHON_EXECUTABLE(self) -> str: def _update_all_paths_depending_on_path_project_data_folder(self) -> None: list_paths_model_fields_filtered: list[str] = [ - path_model_field for path_model_field in self.model_fields.keys() if "saved_models" not in path_model_field + path_model_field + for path_model_field in self.model_fields.keys() + if "saved_models" not in path_model_field ] for path_field in list_paths_model_fields_filtered: path_field_default: Path = self.model_fields[path_field].default setattr( - self, f"{path_field}", self._PATH_FOLDER_DATA / Path(self._string_project_name) / path_field_default + self, + f"{path_field}", + self._PATH_FOLDER_DATA + / Path(self._string_project_name) + / path_field_default, ) def _update_all_root_related_paths(self) -> None: @@ -133,7 +157,9 @@ def _update_all_root_related_paths(self) -> None: self._PATH_FOLDER_DATA: Path = self.PATH_FOLDER_ROOT / "data" def _create_all_root_related_folders(self) -> None: - self._PATH_FOLDER_MODEL.mkdir(parents=True, exist_ok=True) # includes root folder + self._PATH_FOLDER_MODEL.mkdir( + parents=True, exist_ok=True + ) # includes root folder self._PATH_FOLDER_DATA.mkdir(exist_ok=True) def _update_all_paths_depending_on_path_project_model_folder(self) -> None: @@ -147,7 +173,8 @@ def _update_all_paths_depending_on_path_project_model_folder(self) -> None: ] for string_model_field, path_main_settings in zip( - list_string_paths_depending_on_path_project_model_folder, list_paths_main_settings + list_string_paths_depending_on_path_project_model_folder, + list_paths_main_settings, ): setattr( self, diff --git a/osc_extraction_utils/router.py b/osc_extraction_utils/router.py index 4bd4e2e..42642d8 100644 --- a/osc_extraction_utils/router.py +++ b/osc_extraction_utils/router.py @@ -9,7 +9,12 @@ class Router: - def __init__(self, main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths) -> None: + def __init__( + self, + main_settings: MainSettings, + s3_settings: S3Settings, + project_paths: ProjectPaths, + ) -> None: self._main_settings: MainSettings = main_settings self._s3_settings: S3Settings = s3_settings self._project_paths: ProjectPaths = project_paths @@ -29,8 +34,12 @@ def run_router(self): self._check_extraction_server_is_live() self._define_payload() - self._send_payload_to_server_address_with_node(self._extraction_server_address, "extract") - self._send_payload_to_server_address_with_node(self._extraction_server_address, "curate") + self._send_payload_to_server_address_with_node( + self._extraction_server_address, "extract" + ) + self._send_payload_to_server_address_with_node( + self._extraction_server_address, "curate" + ) self._check_inference_server_is_live() @@ -38,23 +47,25 @@ def run_router(self): 
self._check_for_kpi_training_and_send_request() def _set_extraction_server_string(self) -> None: - self._extraction_server_address = ( - f"http://{self._main_settings.general.ext_ip}:{self._main_settings.general.ext_port}" - ) + self._extraction_server_address = f"http://{self._main_settings.general.ext_ip}:{self._main_settings.general.ext_port}" def _set_inference_server_string(self) -> None: - self._inference_server_address = ( - f"http://{self._main_settings.general.infer_ip}:{self._main_settings.general.infer_port}" - ) + self._inference_server_address = f"http://{self._main_settings.general.infer_ip}:{self._main_settings.general.infer_port}" - def _send_payload_to_server_address_with_node(self, server_address: str, node: str) -> None: - response: requests.Response = requests.get(f"{server_address}/{node}", params=self._payload) + def _send_payload_to_server_address_with_node( + self, server_address: str, node: str + ) -> None: + response: requests.Response = requests.get( + f"{server_address}/{node}", params=self._payload + ) print(response.text) if response.status_code != 200: self._return_value = False def _check_extraction_server_is_live(self) -> None: - response: requests.Response = requests.get(f"{self._extraction_server_address}/liveness") + response: requests.Response = requests.get( + f"{self._extraction_server_address}/liveness" + ) if response.status_code == 200: print("Extraction server is up. Proceeding to extraction.") else: @@ -62,12 +73,17 @@ def _check_extraction_server_is_live(self) -> None: self._return_value = False def _define_payload(self) -> None: - self._payload = {"project_name": self._main_settings.general.project_name, "mode": "train"} + self._payload = { + "project_name": self._main_settings.general.project_name, + "mode": "train", + } self._payload.update(self._main_settings.model_dump()) self._payload = {"payload": json.dumps(self._payload)} def _check_inference_server_is_live(self) -> None: - response: requests.Response = requests.get(f"{self._inference_server_address}/liveness") + response: requests.Response = requests.get( + f"{self._inference_server_address}/liveness" + ) if response.status_code == 200: print("Inference server is up. Proceeding to Inference.") else: @@ -77,7 +93,9 @@ def _check_inference_server_is_live(self) -> None: def _check_for_train_relevance_training_and_send_request(self) -> None: print("Relevance training will be started.") if self._main_settings.train_relevance.train: - self._send_payload_to_server_address_with_node(self._inference_server_address, "train_relevance") + self._send_payload_to_server_address_with_node( + self._inference_server_address, "train_relevance" + ) else: print( "No relevance training done. If you want to have a relevance training please set variable " @@ -86,10 +104,16 @@ def _check_for_train_relevance_training_and_send_request(self) -> None: def _check_for_kpi_training_and_send_request(self) -> None: if self._main_settings.train_kpi.train: - self._send_payload_to_server_address_with_node(self._inference_server_address, "infer_relevance") + self._send_payload_to_server_address_with_node( + self._inference_server_address, "infer_relevance" + ) self._check_for_generate_text_3434() - print("Next we start the training of the inference model. This may take some time.") - self._send_payload_to_server_address_with_node(self._inference_server_address, "train_kpi") + print( + "Next we start the training of the inference model. This may take some time." 
+ ) + self._send_payload_to_server_address_with_node( + self._inference_server_address, "train_kpi" + ) else: print( "No kpi training done. If you want to have a kpi training please set variable" diff --git a/osc_extraction_utils/s3_communication.py b/osc_extraction_utils/s3_communication.py index b6c4028..7904fda 100644 --- a/osc_extraction_utils/s3_communication.py +++ b/osc_extraction_utils/s3_communication.py @@ -1,4 +1,5 @@ """S3 communication tools.""" + import os import os.path as osp import pathlib @@ -69,7 +70,9 @@ def download_file_from_s3(self, filepath: Path, s3_prefix: str, s3_key: str): with open(filepath, "wb") as f: f.write(buffer_bytes) - def upload_df_to_s3(self, df, s3_prefix, s3_key, filetype=S3FileType.PARQUET, **pd_to_ftype_args): + def upload_df_to_s3( + self, df, s3_prefix, s3_key, filetype=S3FileType.PARQUET, **pd_to_ftype_args + ): """ Take as input the data frame to be uploaded, and the output s3_key. @@ -83,12 +86,16 @@ def upload_df_to_s3(self, df, s3_prefix, s3_key, filetype=S3FileType.PARQUET, ** elif filetype == S3FileType.PARQUET: df.to_parquet(buffer, **pd_to_ftype_args) else: - raise ValueError(f"Received unexpected file type arg {filetype}. Can only be one of: {list(S3FileType)})") + raise ValueError( + f"Received unexpected file type arg {filetype}. Can only be one of: {list(S3FileType)})" + ) status = self._upload_bytes(buffer.getvalue(), s3_prefix, s3_key) return status - def download_df_from_s3(self, s3_prefix, s3_key, filetype=S3FileType.PARQUET, **pd_read_ftype_args): + def download_df_from_s3( + self, s3_prefix, s3_key, filetype=S3FileType.PARQUET, **pd_read_ftype_args + ): """Read from s3 and see if the saved data is correct.""" buffer_bytes = self._download_bytes(s3_prefix, s3_key) buffer = BytesIO(buffer_bytes) @@ -100,7 +107,9 @@ def download_df_from_s3(self, s3_prefix, s3_key, filetype=S3FileType.PARQUET, ** elif filetype == S3FileType.PARQUET: df = pd.read_parquet(buffer, **pd_read_ftype_args) else: - raise ValueError(f"Received unexpected file type arg {filetype}. Can only be one of: {list(S3FileType)})") + raise ValueError( + f"Received unexpected file type arg {filetype}. Can only be one of: {list(S3FileType)})" + ) return df def upload_files_in_dir_to_prefix(self, source_dir, s3_prefix): @@ -124,7 +133,9 @@ def download_files_in_prefix_to_dir(self, s3_prefix, destination_dir) -> None: Modified from original code here: https://stackoverflow.com/a/33350380 """ paginator = self.s3_resource.meta.client.get_paginator("list_objects") - for result in paginator.paginate(Bucket=self.bucket, Delimiter="/", Prefix=s3_prefix): + for result in paginator.paginate( + Bucket=self.bucket, Delimiter="/", Prefix=s3_prefix + ): # download all files in the sub "directory", if any if result.get("CommonPrefixes") is not None: for subdir in result.get("CommonPrefixes"): diff --git a/osc_extraction_utils/settings.py b/osc_extraction_utils/settings.py index 7f491e7..d14c54c 100644 --- a/osc_extraction_utils/settings.py +++ b/osc_extraction_utils/settings.py @@ -153,11 +153,15 @@ class TrainKpi(BaseSettings): class InferKpi(BaseSettings): - skip_processed_files: bool = False # If set to True, will skip inferring on already processed files + skip_processed_files: bool = ( + False # If set to True, will skip inferring on already processed files + ) top_k: int = 4 batch_size: int = 16 gpu: bool = True - num_processes: int | None = None # Set to value 1 (or 0) to disable multiprocessing. Set to None to let Inferencer use all CPU cores minus one. 
+ num_processes: int | None = ( + None # Set to value 1 (or 0) to disable multiprocessing. Set to None to let Inferencer use all CPU cores minus one. + ) no_ans_boost: int = -15 # If incr diff --git a/osc_extraction_utils/settings_handler.py b/osc_extraction_utils/settings_handler.py index 43cc5b6..eae1e40 100644 --- a/osc_extraction_utils/settings_handler.py +++ b/osc_extraction_utils/settings_handler.py @@ -11,17 +11,28 @@ class SettingsHandler: Class for reading and writing setting files """ - def __init__(self, main_settings: MainSettings = MainSettings(), s3_settings: S3Settings = S3Settings()): + def __init__( + self, + main_settings: MainSettings = MainSettings(), + s3_settings: S3Settings = S3Settings(), + ): self.main_settings: MainSettings = main_settings self.s3_settings: S3Settings = s3_settings def read_settings( self, - path_main_settings=Path(__file__).parents[1].resolve() / "data" / "TEST" / "settings.yaml", - path_s3_settings=Path(__file__).parents[1].resolve() / "data" / "s3_settings.yaml", + path_main_settings=Path(__file__).parents[1].resolve() + / "data" + / "TEST" + / "settings.yaml", + path_s3_settings=Path(__file__).parents[1].resolve() + / "data" + / "s3_settings.yaml", ): try: - with open(str(path_main_settings)) as file_main_settings, open(str(path_s3_settings)) as file_s3_settings: + with open(str(path_main_settings)) as file_main_settings, open( + str(path_s3_settings) + ) as file_s3_settings: self.main_settings = yaml.safe_load(file_main_settings) self.s3_settings = yaml.safe_load(file_s3_settings) except Exception as e: @@ -32,7 +43,9 @@ def read_settings( def _read_setting_file(path: Path) -> S3Settings | MainSettings: with open(path, mode="r") as file_settings: loaded_settings: dict = yaml.safe_load(file_settings) - settings: Type[S3Settings] | Type[MainSettings] = SettingsHandler._settings_factory(loaded_settings) + settings: Type[S3Settings] | Type[MainSettings] = ( + SettingsHandler._settings_factory(loaded_settings) + ) return settings(**loaded_settings) @staticmethod diff --git a/osc_extraction_utils/tests/test_converter.py b/osc_extraction_utils/tests/test_converter.py index efe3faf..10709f9 100644 --- a/osc_extraction_utils/tests/test_converter.py +++ b/osc_extraction_utils/tests/test_converter.py @@ -42,7 +42,9 @@ def test_convert_single_file_to_csv(converter) -> None: mocked_read_excel.assert_called_once_with(Path("file.xlsx"), engine="openpyxl") path_destination_file: Path = path_destination_folder / "aggregated_annotation.csv" - mocked_read_excel.return_value.to_csv.assert_called_once_with(path_destination_file, index=False, header=True) + mocked_read_excel.return_value.to_csv.assert_called_once_with( + path_destination_file, index=False, header=True + ) def test_find_xlsx_files_in_source_folder(converter) -> None: @@ -50,7 +52,9 @@ def test_find_xlsx_files_in_source_folder(converter) -> None: mocked_path_glob.return_value = [Path("file1.xlsx"), Path("file2.xlsx")] with patch("osc_extraction_utils.core_utils.Path.glob", mocked_path_glob): - list_paths_xlsx_files: list[Path] = converter._find_xlsx_files_in_source_folder() + list_paths_xlsx_files: list[Path] = ( + converter._find_xlsx_files_in_source_folder() + ) mocked_path_glob.assert_called_once_with("*.xlsx") assert list_paths_xlsx_files == [Path("file1.xlsx"), Path("file2.xlsx")] diff --git a/osc_extraction_utils/tests/test_core_utils.py b/osc_extraction_utils/tests/test_core_utils.py index 5f8f427..b16992f 100644 --- a/osc_extraction_utils/tests/test_core_utils.py +++ 
b/osc_extraction_utils/tests/test_core_utils.py @@ -14,7 +14,9 @@ @pytest.fixture() -def prerequisites_copy_file_without_overwrite(path_folder_temporary: Path) -> Generator[None, None, None]: +def prerequisites_copy_file_without_overwrite( + path_folder_temporary: Path, +) -> Generator[None, None, None]: """Defines a fixture for creating the source and destination folder :param path_folder_temporary: Requesting the path_folder_temporary fixture @@ -61,7 +63,9 @@ def test_create_folder_cleanup(path_folder_temporary: Path): def test_create_folder_already_exists(): with ( patch.object(Path, "mkdir") as mocked_path, - patch("osc_extraction_utils.core_utils._delete_files_in_folder") as mocked_mkdir, + patch( + "osc_extraction_utils.core_utils._delete_files_in_folder" + ) as mocked_mkdir, ): mocked_path.side_effect = OSError @@ -72,7 +76,9 @@ def test_create_folder_already_exists(): def test_create_folder_path_not_exists(): with ( patch.object(Path, "mkdir") as mocked_path, - patch("osc_extraction_utils.core_utils._delete_files_in_folder") as mocked_mkdir, + patch( + "osc_extraction_utils.core_utils._delete_files_in_folder" + ) as mocked_mkdir, ): mocked_path.side_effect = FileNotFoundError @@ -80,7 +86,9 @@ def test_create_folder_path_not_exists(): mocked_mkdir.assert_called_once() -def test_copy_file_without_overwrite_result(prerequisites_copy_file_without_overwrite, path_folder_temporary: Path): +def test_copy_file_without_overwrite_result( + prerequisites_copy_file_without_overwrite, path_folder_temporary: Path +): """Tests if copy_file_without_overwrite returns True if executed Requesting prerequisites_copy_file_without_overwrite automatically (autouse) @@ -92,7 +100,9 @@ def test_copy_file_without_overwrite_result(prerequisites_copy_file_without_over path_folder_source_file = path_folder_source / "test.txt" path_folder_source_file.touch() - result = copy_file_without_overwrite(str(path_folder_source), str(path_folder_destination)) + result = copy_file_without_overwrite( + str(path_folder_source), str(path_folder_destination) + ) assert result is True diff --git a/osc_extraction_utils/tests/test_generate_text.py b/osc_extraction_utils/tests/test_generate_text.py index 42821e9..c296970 100644 --- a/osc_extraction_utils/tests/test_generate_text.py +++ b/osc_extraction_utils/tests/test_generate_text.py @@ -9,7 +9,11 @@ from osc_extraction_utils.settings import S3Settings -def test_generate_text_with_s3(prerequisites_generate_text, path_folder_temporary: Path, project_paths: ProjectPaths): +def test_generate_text_with_s3( + prerequisites_generate_text, + path_folder_temporary: Path, + project_paths: ProjectPaths, +): """Tests if the s3 connection objects are created and their methods are called Requesting prerequisites_generate_text automatically (autouse) @@ -34,8 +38,15 @@ def test_generate_text_with_s3(prerequisites_generate_text, path_folder_temporar }, } - with patch("osc_extraction_utils.merger.S3Communication", Mock(spec=S3Communication)) as mocked_s3: - generate_text_3434(project_name, True, S3Settings(**mocked_s3_settings), project_paths=project_paths) + with patch( + "osc_extraction_utils.merger.S3Communication", Mock(spec=S3Communication) + ) as mocked_s3: + generate_text_3434( + project_name, + True, + S3Settings(**mocked_s3_settings), + project_paths=project_paths, + ) # check for calls mocked_s3.assert_any_call( @@ -52,12 +63,17 @@ def test_generate_text_with_s3(prerequisites_generate_text, path_folder_temporar ) call_list = [call[0] for call in mocked_s3.mock_calls] - assert 
any([call for call in call_list if "download_files_in_prefix_to_dir" in call]) + assert any( + [call for call in call_list if "download_files_in_prefix_to_dir" in call] + ) assert any([call for call in call_list if "upload_file_to_s3" in call]) def test_generate_text_no_s3( - prerequisites_generate_text, path_folder_temporary: Path, project_paths: ProjectPaths, s3_settings: S3Settings + prerequisites_generate_text, + path_folder_temporary: Path, + project_paths: ProjectPaths, + s3_settings: S3Settings, ): """Tests if files are taken from the folder relevance, then read in and putting the content into the file text_3434.csv. Note that @@ -94,7 +110,10 @@ def test_generate_text_no_s3( def test_generate_text_successful( - prerequisites_generate_text, path_folder_temporary: Path, project_paths: ProjectPaths, s3_settings: S3Settings + prerequisites_generate_text, + path_folder_temporary: Path, + project_paths: ProjectPaths, + s3_settings: S3Settings, ): """Tests if the function returns true Requesting prerequisites_generate_text automatically (autouse) @@ -105,7 +124,9 @@ def test_generate_text_successful( project_name = "test" s3_usage = False - return_value = generate_text_3434(project_name, s3_usage, s3_settings, project_paths=project_paths) + return_value = generate_text_3434( + project_name, s3_usage, s3_settings, project_paths=project_paths + ) assert return_value is True @@ -134,7 +155,9 @@ def test_generate_text_not_successful_empty_folder( file.unlink() # call the function - return_value = generate_text_3434(project_name, s3_usage, s3_settings, project_paths=project_paths) + return_value = generate_text_3434( + project_name, s3_usage, s3_settings, project_paths=project_paths + ) output_cmd, _ = capsys.readouterr() assert "No relevance inference results found." in output_cmd @@ -163,7 +186,9 @@ def test_generate_text_not_successful_exception( file.unlink() # patch glob.iglob to force an exception... 
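Aside: the reflowed call below relies on unittest.mock.patch accepting side_effect as a keyword, so the substituted mock invokes that lambda instead of the real glob.iglob and hands the merge loop a list containing None, which then raises inside the read. A self-contained illustration of the mechanism (the toy count_csv_files helper is hypothetical, not part of the package):

import glob
from unittest.mock import patch


def count_csv_files(folder: str) -> int:
    # Looks up glob.iglob at call time, so patching the module attribute works.
    return sum(1 for _ in glob.iglob(folder + "/*.csv"))


# side_effect is called in place of the real function for the duration of the
# with-block; the code under test sees exactly one fake entry.
with patch("glob.iglob", side_effect=lambda *args: ["fake.csv"]):
    assert count_csv_files("/tmp/does-not-matter") == 1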
- with patch("osc_extraction_utils.merger.glob.iglob", side_effect=lambda *args: [None]): + with patch( + "osc_extraction_utils.merger.glob.iglob", side_effect=lambda *args: [None] + ): return_value = generate_text_3434( project_name=project_name, s3_usage=s3_usage, diff --git a/osc_extraction_utils/tests/test_link_files.py b/osc_extraction_utils/tests/test_link_files.py index 9c9d043..e68d52b 100644 --- a/osc_extraction_utils/tests/test_link_files.py +++ b/osc_extraction_utils/tests/test_link_files.py @@ -8,7 +8,9 @@ @pytest.fixture(autouse=True) -def path_folders_required_linking(path_folder_temporary: Path) -> Generator[None, None, None]: +def path_folders_required_linking( + path_folder_temporary: Path, +) -> Generator[None, None, None]: """Defines a fixture for creating the source, source_pdf and destination folder :param path_folder_temporary: Requesting the path_folder_temporary fixture @@ -60,7 +62,11 @@ def test_link_extracted_files_result(path_folder_temporary: Path): path_folder_source_pdf = path_folder_temporary / "source_pdf" path_folder_destination = path_folder_temporary / "destination" - result = link_extracted_files(str(path_folder_source), str(path_folder_source_pdf), str(path_folder_destination)) + result = link_extracted_files( + str(path_folder_source), + str(path_folder_source_pdf), + str(path_folder_destination), + ) assert result is True @@ -86,9 +92,13 @@ def test_link_extracted_files_copy(path_folder_temporary: Path): for i in range(10): path_current_file = path_folder_destination / f"test_{i}.json" - assert not path_current_file.exists() is True + assert path_current_file.exists() is not True - link_extracted_files(str(path_folder_source), str(path_folder_source_pdf), str(path_folder_destination)) + link_extracted_files( + str(path_folder_source), + str(path_folder_source_pdf), + str(path_folder_destination), + ) for i in range(10): path_current_file = path_folder_destination / f"test_{i}.json" diff --git a/osc_extraction_utils/tests/test_merger.py b/osc_extraction_utils/tests/test_merger.py index 346607d..5fd81db 100644 --- a/osc_extraction_utils/tests/test_merger.py +++ b/osc_extraction_utils/tests/test_merger.py @@ -19,7 +19,9 @@ @pytest.fixture -def merger(main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths): +def merger( + main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths +): return Merger(main_settings, s3_settings, project_paths) @@ -28,7 +30,9 @@ def test_setup_s3_usage(merger: Merger, s3_usage: bool): with ( patch.object(merger.main_settings.general, "s3_usage", s3_usage), patch.object(merger.s3_settings.main_bucket, "s3_endpoint", "https://0.0.0.0"), - patch.object(merger.s3_settings.interim_bucket, "s3_endpoint", "https://0.0.0.0"), + patch.object( + merger.s3_settings.interim_bucket, "s3_endpoint", "https://0.0.0.0" + ), ): merger._setup_s3_usage() @@ -40,7 +44,9 @@ def test_setup_s3_usage(merger: Merger, s3_usage: bool): assert merger.s3_communication_interim is None -@pytest.mark.parametrize("bucket_settings_object", [MainBucketSettings, InterimBucketSettings]) +@pytest.mark.parametrize( + "bucket_settings_object", [MainBucketSettings, InterimBucketSettings] +) def test_return_s3_communication_main( merger: Merger, bucket_settings_object: MainBucketSettings | InterimBucketSettings ): @@ -54,7 +60,9 @@ def test_return_s3_communication_main( with ( patch("osc_extraction_utils.merger.S3Communication") as mocked_s3_communication, patch("osc_extraction_utils.merger.os.getenv", 
side_effect=lambda args: args), - patch.object(merger.s3_settings, "main_bucket", bucket_settings_object(**settings)), + patch.object( + merger.s3_settings, "main_bucket", bucket_settings_object(**settings) + ), ): merger._return_s3_communication_main() @@ -99,7 +107,9 @@ def test_upload_to_s3(merger: Merger): merger._upload_inference_related_files_to_s3() mocked_s3_communication.upload_file_to_s3.assert_called_with( - filepath=str(path_local_file), s3_prefix=str(path_local_file.parent), s3_key=str(path_local_file.name) + filepath=str(path_local_file), + s3_prefix=str(path_local_file.parent), + s3_key=str(path_local_file.name), ) @@ -112,7 +122,9 @@ def test_write_output(merger: Merger, path_folder_temporary: Path): path_folder_relevance: Path = path_folder_temporary / "folder_relevance" create_multiple_xlsx_files(path_folder_relevance) - with patch.object(merger.project_paths, "path_folder_text_3434", path_file_text_3434), patch.object( + with patch.object( + merger.project_paths, "path_folder_text_3434", path_file_text_3434 + ), patch.object( merger.project_paths, "path_folder_relevance", path_folder_relevance ): merger._weird_writing_stuff() diff --git a/osc_extraction_utils/tests/test_paths.py b/osc_extraction_utils/tests/test_paths.py index 8c82c65..ca327fb 100644 --- a/osc_extraction_utils/tests/test_paths.py +++ b/osc_extraction_utils/tests/test_paths.py @@ -43,7 +43,9 @@ def test_python_executable_set(paths_project: ProjectPaths): assert paths_project.PYTHON_EXECUTABLE == "python" -def test_check_that_all_required_paths_exist_in_project_path_object(main_settings: MainSettings): +def test_check_that_all_required_paths_exist_in_project_path_object( + main_settings: MainSettings, +): list_paths_expected = [ "input/pdfs/training", "input/annotations", @@ -60,7 +62,9 @@ def test_check_that_all_required_paths_exist_in_project_path_object(main_setting "output/RELEVANCE/Text", ] - with patch.object(ProjectPaths, "_update_all_paths_depending_on_path_project_data_folder"), patch.object( + with patch.object( + ProjectPaths, "_update_all_paths_depending_on_path_project_data_folder" + ), patch.object( ProjectPaths, "_update_all_paths_depending_on_path_project_model_folder" ): paths_project: ProjectPaths = ProjectPaths( @@ -74,10 +78,16 @@ def test_check_that_all_required_paths_exist_in_project_path_object(main_setting def test_project_paths_update_methods_are_called(main_settings: MainSettings): with ( - patch.object(ProjectPaths, "_update_all_paths_depending_on_path_project_data_folder") as mocked_update_data, - patch.object(ProjectPaths, "_update_all_paths_depending_on_path_project_model_folder") as mocked_update_model, + patch.object( + ProjectPaths, "_update_all_paths_depending_on_path_project_data_folder" + ) as mocked_update_data, + patch.object( + ProjectPaths, "_update_all_paths_depending_on_path_project_model_folder" + ) as mocked_update_model, ): - ProjectPaths("new_test_project", main_settings, Path(__file__).parents[1].resolve()) + ProjectPaths( + "new_test_project", main_settings, Path(__file__).parents[1].resolve() + ) mocked_update_data.assert_called_once() mocked_update_model.assert_called_once() @@ -95,12 +105,18 @@ def test_set_path_project_model_folder(paths_project: ProjectPaths): assert paths_project.path_project_model_folder.parts[-1] == string_test_project -def test_setting_new_project_name_results_in_call_of_update_methods(paths_project: ProjectPaths): +def test_setting_new_project_name_results_in_call_of_update_methods( + paths_project: ProjectPaths, +): 
     string_test_project: str = "test_project"
     with (
-        patch.object(paths_project, "_update_all_paths_depending_on_path_project_data_folder") as mocked_update_data,
-        patch.object(paths_project, "_update_all_paths_depending_on_path_project_model_folder") as mocked_update_model,
+        patch.object(
+            paths_project, "_update_all_paths_depending_on_path_project_data_folder"
+        ) as mocked_update_data,
+        patch.object(
+            paths_project, "_update_all_paths_depending_on_path_project_model_folder"
+        ) as mocked_update_model,
     ):
         paths_project.string_project_name = string_test_project
@@ -116,11 +132,15 @@ def test_set_main_settings(paths_project: ProjectPaths, main_settings: Settings)
     assert paths_project.main_settings != main_settings


-def test_update_all_root_related_paths(main_settings: MainSettings, path_folder_temporary: Path):
+def test_update_all_root_related_paths(
+    main_settings: MainSettings, path_folder_temporary: Path
+):
     string_test_project: str = "test_project"
     path_folder_project_root = path_folder_temporary / "test_project"

-    project_paths = ProjectPaths(string_test_project, main_settings, path_folder_project_root)
+    project_paths = ProjectPaths(
+        string_test_project, main_settings, path_folder_project_root
+    )

     assert project_paths._PATH_FOLDER_ROOT == path_folder_project_root
     assert project_paths._PATH_FOLDER_NLP == path_folder_project_root
@@ -128,7 +148,9 @@ def test_update_all_root_related_paths(main_settings: MainSettings, path_folder_
     assert project_paths._PATH_FOLDER_DATA == path_folder_project_root / "data"


-def test_create_all_root_related_paths(main_settings: MainSettings, path_folder_temporary: Path):
+def test_create_all_root_related_paths(
+    main_settings: MainSettings, path_folder_temporary: Path
+):
     string_test_project: str = "test_project"
     path_folder_project_root = path_folder_temporary / "test_project"
     list_paths_expected = [
@@ -143,7 +165,9 @@ def test_create_all_root_related_paths(main_settings: MainSettings, path_folder_
         assert path.exists()


-def test_update_all_paths_depending_on_path_project_data_folder(paths_project: ProjectPaths):
+def test_update_all_paths_depending_on_path_project_data_folder(
+    paths_project: ProjectPaths,
+):
     string_test_project: str = "test_project"

     paths_project.string_project_name = string_test_project
@@ -160,7 +184,9 @@ def test_update_all_paths_depending_on_path_project_data_folder(paths_project: P
         assert string_test_project in path_current_field.parts


-def test_update_all_paths_depending_on_path_project_model_folder(paths_project: ProjectPaths):
+def test_update_all_paths_depending_on_path_project_model_folder(
+    paths_project: ProjectPaths,
+):
     main_settings_changed: MainSettings = MainSettings()
     main_settings_changed.general.project_name = "TEST_NEW"
     paths_project._main_settings = main_settings_changed
@@ -170,13 +196,17 @@ def test_update_all_paths_depending_on_path_project_model_folder(paths_project:
     paths_project._update_all_paths_depending_on_path_project_model_folder()

     list_paths_model_fields_filtered: list[str] = [
-        path_model_field for path_model_field in paths_project.model_fields.keys() if "saved_models" in path_model_field
+        path_model_field
+        for path_model_field in paths_project.model_fields.keys()
+        if "saved_models" in path_model_field
     ]
     list_paths_main_settings: list[Path] = [
         Path(paths_project.main_settings.train_relevance.output_model_name),
         Path(paths_project.main_settings.train_kpi.output_model_name),
     ]
-    for path_model_field, path_main_settings in zip(list_paths_model_fields_filtered, list_paths_main_settings):
+    for path_model_field, path_main_settings in zip(
+        list_paths_model_fields_filtered, list_paths_main_settings
+    ):
         path_model_current_field: Path = getattr(paths_project, f"{path_model_field}")
         assert string_test_project in path_model_current_field.parts
         assert path_model_current_field.parts[-1] == path_main_settings.name
diff --git a/osc_extraction_utils/tests/test_run_router.py b/osc_extraction_utils/tests/test_run_router.py
index 31056f5..28ca35c 100644
--- a/osc_extraction_utils/tests/test_run_router.py
+++ b/osc_extraction_utils/tests/test_run_router.py
@@ -12,7 +12,9 @@


 @pytest.fixture
-def router(main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths):
+def router(
+    main_settings: MainSettings, s3_settings: S3Settings, project_paths: ProjectPaths
+):
     dict_general_settings = {
         "project_name": "TEST",
         "ext_ip": "0.0.0.0",
@@ -22,7 +24,11 @@ def router(main_settings: MainSettings, s3_settings: S3Settings, project_paths:
     }

     with patch.object(main_settings, "general", Mock(**dict_general_settings)):
-        router = Router(main_settings=main_settings, s3_settings=s3_settings, project_paths=project_paths)
+        router = Router(
+            main_settings=main_settings,
+            s3_settings=s3_settings,
+            project_paths=project_paths,
+        )
         router._set_extraction_server_string()
         router._set_inference_server_string()
         yield router
@@ -43,13 +49,19 @@ def server(prerequisites_generate_text) -> requests_mock.mocker.Mocker:
     server_address_extraction = f"http://{extraction_ip}:{extraction_port}"
     server_address_inference = f"http://{inference_ip}:{inference_port}"

-    with requests_mock.Mocker() as mocked_server, patch("osc_extraction_utils.router.json"):
+    with requests_mock.Mocker() as mocked_server, patch(
+        "osc_extraction_utils.router.json"
+    ):
         mocked_server.get(f"{server_address_extraction}/liveness", status_code=200)
         mocked_server.get(f"{server_address_extraction}/extract", status_code=200)
         mocked_server.get(f"{server_address_extraction}/curate", status_code=200)
         mocked_server.get(f"{server_address_inference}/liveness", status_code=200)
-        mocked_server.get(f"{server_address_inference}/train_relevance", status_code=200)
-        mocked_server.get(f"{server_address_inference}/infer_relevance", status_code=200)
+        mocked_server.get(
+            f"{server_address_inference}/train_relevance", status_code=200
+        )
+        mocked_server.get(
+            f"{server_address_inference}/infer_relevance", status_code=200
+        )
         mocked_server.get(f"{server_address_inference}/train_kpi", status_code=200)
         yield mocked_server

@@ -82,7 +94,9 @@ def test_run_router_extraction_liveness_up(


 def test_run_router_extraction_server_down(
-    router: Router, server: requests_mock.mocker.Mocker, capsys: typing.Generator[CaptureFixture[str], None, None]
+    router: Router,
+    server: requests_mock.mocker.Mocker,
+    capsys: typing.Generator[CaptureFixture[str], None, None],
 ):
     """Tests the return value if the extraction server is down

     :param server: Requesting the server fixture
@@ -94,12 +108,16 @@ def test_run_router_extraction_server_down(
     server_address_node = f"http://{extraction_ip}:{extraction_port}/extract"
     server.get(server_address_node, status_code=-1)

-    router._send_payload_to_server_address_with_node(f"http://{extraction_ip}:{extraction_port}", "extract")
+    router._send_payload_to_server_address_with_node(
+        f"http://{extraction_ip}:{extraction_port}", "extract"
+    )

     assert router.return_value is False


-def test_run_router_extraction_curation_server_down(router: Router, server: requests_mock.mocker.Mocker):
+def test_run_router_extraction_curation_server_down(
+    router: Router, server: requests_mock.mocker.Mocker
+):
     """Tests the return value of the curation of the extraction server

     :param server: Requesting the server fixture
@@ -216,7 +234,9 @@ def test_run_router_kpi_training(
 ):
     inference_ip = "0.0.0.1"
     inference_port = 8000
-    server_address_node_infer_relevance = f"http://{inference_ip}:{inference_port}/infer_relevance"
+    server_address_node_infer_relevance = (
+        f"http://{inference_ip}:{inference_port}/infer_relevance"
+    )
     server_address_node_train_kpi = f"http://{inference_ip}:{inference_port}/train_kpi"

     # force an exception of generate_text_3434 by removing the folder_text_3434
@@ -232,10 +252,12 @@ def test_run_router_kpi_training(
     else:
         mocked_generate_text.side_effect = Exception()

-    with patch("osc_extraction_utils.router.generate_text_3434", mocked_generate_text), patch.object(
-        main_settings, "train_kpi", Mock(train=train_kpi)
-    ):
-        server.get(server_address_node_infer_relevance, status_code=status_code_infer_relevance)
+    with patch(
+        "osc_extraction_utils.router.generate_text_3434", mocked_generate_text
+    ), patch.object(main_settings, "train_kpi", Mock(train=train_kpi)):
+        server.get(
+            server_address_node_infer_relevance, status_code=status_code_infer_relevance
+        )
         server.get(server_address_node_train_kpi, status_code=status_code_train_kpi)
         router.run_router()

@@ -254,7 +276,10 @@ def test_run_router_kpi_training(
     ],
 )
 def test_run_router_successful_run(
-    router: Router, server: requests_mock.mocker.Mocker, infer_relevance: bool, train_kpi: bool
+    router: Router,
+    server: requests_mock.mocker.Mocker,
+    infer_relevance: bool,
+    train_kpi: bool,
 ):
     with patch("osc_extraction_utils.merger.generate_text_3434", Mock()):
         router.run_router()
diff --git a/osc_extraction_utils/tests/test_running.py b/osc_extraction_utils/tests/test_running.py
index 11c1a8f..c963d5a 100644
--- a/osc_extraction_utils/tests/test_running.py
+++ b/osc_extraction_utils/tests/test_running.py
@@ -53,7 +53,9 @@ def test_set_running(path_folder_root_testing: Path, training_monitor: TrainingM
     assert path_file_running.exists()


-def test_checking_onging_run(path_folder_root_testing: Path, training_monitor: TrainingMonitor):
+def test_checking_onging_run(
+    path_folder_root_testing: Path, training_monitor: TrainingMonitor
+):
     """Tests the return value of check_running for ongoing runs

     :param prerequisite_running: Fixture for prerequisite of running funcions
@@ -66,7 +68,9 @@ def test_checking_onging_run(path_folder_root_testing: Path, training_monitor: T
     assert training_monitor.check_running() is True


-def test_checking_finished_run(path_folder_root_testing: Path, training_monitor: TrainingMonitor):
+def test_checking_finished_run(
+    path_folder_root_testing: Path, training_monitor: TrainingMonitor
+):
     """Tests the return value of check_running for finished runs

     :param prerequisite_running: Fixture for prerequisite of running funcions
@@ -79,7 +83,9 @@ def test_checking_finished_run(path_folder_root_testing: Path, training_monitor:
     assert training_monitor.check_running() is False


-def test_clear_running(path_folder_root_testing: Path, training_monitor: TrainingMonitor):
+def test_clear_running(
+    path_folder_root_testing: Path, training_monitor: TrainingMonitor
+):
     """Tests for clearing running file

     :param prerequisite_running: Fixture for prerequisite of running funcions
diff --git a/osc_extraction_utils/tests/test_s3_connection.py b/osc_extraction_utils/tests/test_s3_connection.py
index d4a0413..7d349eb 100644
--- a/osc_extraction_utils/tests/test_s3_connection.py
+++ b/osc_extraction_utils/tests/test_s3_connection.py
@@ -14,7 +14,9 @@


 @pytest.mark.parametrize("s3_usage", [True, False])
-def test_download_data_from_s3_main_bucket_to_local_folder_if_required(s3_usage: bool, main_settings: MainSettings):
+def test_download_data_from_s3_main_bucket_to_local_folder_if_required(
+    s3_usage: bool, main_settings: MainSettings
+):
     mocked_s3_bucket = Mock(spec=S3Communication)
     mocked_path_local = Mock(spec=Path("path_local"))
     mocked_path_s3 = Mock(spec=Path("path_s3"))
@@ -25,13 +27,17 @@ def test_download_data_from_s3_main_bucket_to_local_folder_if_required(s3_usage:
     )

     if s3_usage:
-        mocked_s3_bucket.download_files_in_prefix_to_dir.assert_called_with(mocked_path_s3, mocked_path_local)
+        mocked_s3_bucket.download_files_in_prefix_to_dir.assert_called_with(
+            mocked_path_s3, mocked_path_local
+        )
     else:
         mocked_s3_bucket.assert_not_called()


 @pytest.mark.parametrize("s3_usage", [True, False])
-def test_upload_data_from_local_folder_to_s3_interim_bucket_if_required(s3_usage: bool, main_settings: MainSettings):
+def test_upload_data_from_local_folder_to_s3_interim_bucket_if_required(
+    s3_usage: bool, main_settings: MainSettings
+):
     mocked_s3_bucket = Mock(spec=S3Communication)
     mocked_path_local = Mock(spec=Path("path_local"))
     mocked_path_s3 = Mock(spec=Path("path_s3"))
@@ -42,6 +48,8 @@ def test_upload_data_from_local_folder_to_s3_interim_bucket_if_required(s3_usage
     )

     if s3_usage:
-        mocked_s3_bucket.upload_files_in_dir_to_prefix.assert_called_with(mocked_path_local, mocked_path_s3)
+        mocked_s3_bucket.upload_files_in_dir_to_prefix.assert_called_with(
+            mocked_path_local, mocked_path_s3
+        )
     else:
         mocked_s3_bucket.assert_not_called()
diff --git a/osc_extraction_utils/tests/test_settings_handler.py b/osc_extraction_utils/tests/test_settings_handler.py
index 0c85f51..88cabb5 100644
--- a/osc_extraction_utils/tests/test_settings_handler.py
+++ b/osc_extraction_utils/tests/test_settings_handler.py
@@ -18,7 +18,9 @@ def test_read_settings_files(settings_handler: SettingsHandler):
     path_settings_main = path_root / "data" / "TEST" / "settings.yaml"
     path_settings_s3 = path_root / "data" / "s3_settings.yaml"

-    with patch("osc_extraction_utils.settings_handler.yaml"), patch("builtins.open") as mocked_open:
+    with patch("osc_extraction_utils.settings_handler.yaml"), patch(
+        "builtins.open"
+    ) as mocked_open:
         settings_handler.read_settings()

         mocked_open.assert_any_call(str(path_settings_main))
diff --git a/osc_extraction_utils/training_monitor.py b/osc_extraction_utils/training_monitor.py
index 3d311ed..19bb712 100644
--- a/osc_extraction_utils/training_monitor.py
+++ b/osc_extraction_utils/training_monitor.py
@@ -5,7 +5,11 @@ class TrainingMonitor:
     """Class for a simple monitoring of a training process"""

     def __init__(self, path_file_running=None) -> None:
-        self.path_file_running: Path = path_file_running if path_file_running is not None else self._set_default_path()
+        self.path_file_running: Path = (
+            path_file_running
+            if path_file_running is not None
+            else self._set_default_path()
+        )

     @staticmethod
     def _set_default_path() -> Path:
diff --git a/osc_extraction_utils/utils.py b/osc_extraction_utils/utils.py
index 21ee13f..532b386 100644
--- a/osc_extraction_utils/utils.py
+++ b/osc_extraction_utils/utils.py
@@ -35,13 +35,16 @@ def save_train_info(
     # s3_settings = project_settings["s3_settings"]
     project_prefix = s3_settings.prefix + "/" + project_name + "/data"
     s3c_main.download_files_in_prefix_to_dir(
-        project_prefix + "/input/kpi_mapping", str(project_paths.path_folder_source_mapping)
+        project_prefix + "/input/kpi_mapping",
+        str(project_paths.path_folder_source_mapping),
     )
     s3c_main.download_files_in_prefix_to_dir(
-        project_prefix + "/input/annotations", str(project_paths.path_folder_source_annotation)
+        project_prefix + "/input/annotations",
+        str(project_paths.path_folder_source_annotation),
     )
     s3c_main.download_files_in_prefix_to_dir(
-        project_prefix + "/input/pdfs/training", str(project_paths.path_folder_source_pdf)
+        project_prefix + "/input/pdfs/training",
+        str(project_paths.path_folder_source_pdf),
     )

     dir_train: dict[str, Any] = {}
@@ -57,12 +60,21 @@ def save_train_info(
             dir_train.update(
                 {
                     "annotations": pd.read_excel(
-                        str(project_paths.path_folder_source_annotation) + r"/" + filename, engine="openpyxl"
+                        str(project_paths.path_folder_source_annotation)
+                        + r"/"
+                        + filename,
+                        engine="openpyxl",
                     )
                 }
             )
             first = False
-    dir_train.update({"kpis": pd.read_csv(str(project_paths.path_folder_source_mapping) + "/kpi_mapping.csv")})
+    dir_train.update(
+        {
+            "kpis": pd.read_csv(
+                str(project_paths.path_folder_source_mapping) + "/kpi_mapping.csv"
+            )
+        }
+    )

     # relevance_model = project_settings['train_relevance']['output_model_name']
     relevance_model = main_settings.train_relevance.output_model_name
@@ -70,7 +82,9 @@ def save_train_info(
     kpi_model = main_settings.train_kpi.output_model_name

     name_out = str(project_paths.path_project_model_folder)
-    name_out = name_out + "/SUMMARY_REL_" + relevance_model + "_KPI_" + kpi_model + ".pickle"
+    name_out = (
+        name_out + "/SUMMARY_REL_" + relevance_model + "_KPI_" + kpi_model + ".pickle"
+    )

     with open(name_out, "wb") as handle:
         pickle.dump(dir_train, handle, protocol=pickle.HIGHEST_PROTOCOL)