diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 3d28e118b3..ce1315aca5 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -258,11 +258,6 @@ jobs:
         python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs
         daft-runner: [py, ray]
         micropartitions: [1, 0]
-    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
-    # This is used in the step "Assume GitHub Actions AWS Credentials"
-    permissions:
-      id-token: write
-      contents: read
     steps:
     - uses: actions/checkout@v4
       with:
@@ -295,20 +290,6 @@ jobs:
      run: |
        mkdir -p /tmp/daft-integration-testing/nginx
        chmod +rw /tmp/daft-integration-testing/nginx
-    - name: Assume GitHub Actions AWS Credentials
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: us-west-2
-        role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
-        role-session-name: DaftPythonPackageGitHubWorkflow
-    - name: Assume GitHub Actions GCloud Credentials
-      uses: google-github-actions/auth@v1
-      with:
-        credentials_json: ${{ secrets.ACTIONS_GCP_SERVICE_ACCOUNT_JSON }}
-        # NOTE: Workload Identity seems to be having problems with our Rust crate, so we use JSON instead
-        # See issue: https://github.com/yoshidan/google-cloud-rust/issues/171#issuecomment-1730511655
-        # workload_identity_provider: ${{ secrets.ACTIONS_GCP_WORKLOAD_IDENTITY_PROVIDER }}
-        # service_account: ${{ secrets.ACTIONS_GCP_SERVICE_ACCOUNT }}
     - name: Spin up IO services
       uses: isbang/compose-action@v1.5.1
       with:
diff --git a/tests/integration/io/conftest.py b/tests/integration/io/conftest.py
index e950cafda1..733ceea93b 100644
--- a/tests/integration/io/conftest.py
+++ b/tests/integration/io/conftest.py
@@ -41,6 +41,8 @@ def aws_public_s3_config() -> daft.io.IOConfig:
         s3=daft.io.S3Config(
             # NOTE: no keys or endpoints specified for an AWS public s3 bucket
             region_name="us-west-2",
+            # Use anonymous mode to avoid having to search for credentials in the Github Runner
+            anonymous=True,
         )
     )
 
diff --git a/tests/integration/io/test_list_files_gcs.py b/tests/integration/io/test_list_files_gcs.py
index 7549697833..6fcce3cfb8 100644
--- a/tests/integration/io/test_list_files_gcs.py
+++ b/tests/integration/io/test_list_files_gcs.py
@@ -6,7 +6,6 @@
 from daft.daft import GCSConfig, IOConfig, io_glob
 
 BUCKET = "daft-public-data-gs"
-DEFAULT_GCS_CONFIG = GCSConfig(project_id=None, anonymous=None)
 ANON_GCS_CONFIG = GCSConfig(project_id=None, anonymous=True)
 
 
@@ -52,29 +51,26 @@ def compare_gcs_result(daft_ls_result: list, fsspec_result: list):
 )
 @pytest.mark.parametrize("recursive", [False, True])
 @pytest.mark.parametrize("fanout_limit", [None, 1])
-@pytest.mark.parametrize("gcs_config", [DEFAULT_GCS_CONFIG, ANON_GCS_CONFIG])
-def test_gs_flat_directory_listing(path, recursive, gcs_config, fanout_limit):
+def test_gs_flat_directory_listing(path, recursive, fanout_limit):
     fs = gcsfs.GCSFileSystem()
     glob_path = path.rstrip("/") + "/**" if recursive else path
-    daft_ls_result = io_glob(glob_path, io_config=IOConfig(gcs=gcs_config), fanout_limit=fanout_limit)
+    daft_ls_result = io_glob(glob_path, io_config=IOConfig(gcs=ANON_GCS_CONFIG), fanout_limit=fanout_limit)
     fsspec_result = gcsfs_recursive_list(fs, path) if recursive else fs.ls(path, detail=True)
     compare_gcs_result(daft_ls_result, fsspec_result)
 
 
 @pytest.mark.integration()
 @pytest.mark.parametrize("recursive", [False, True])
-@pytest.mark.parametrize("gcs_config", [DEFAULT_GCS_CONFIG, ANON_GCS_CONFIG])
-def test_gs_single_file_listing(recursive, gcs_config):
+def test_gs_single_file_listing(recursive):
     path = f"gs://{BUCKET}/test_ls/file.txt"
     fs = gcsfs.GCSFileSystem()
-    daft_ls_result = io_glob(path, io_config=IOConfig(gcs=gcs_config))
+    daft_ls_result = io_glob(path, io_config=IOConfig(gcs=ANON_GCS_CONFIG))
     fsspec_result = gcsfs_recursive_list(fs, path) if recursive else fs.ls(path, detail=True)
     compare_gcs_result(daft_ls_result, fsspec_result)
 
 
 @pytest.mark.integration()
-@pytest.mark.parametrize("gcs_config", [DEFAULT_GCS_CONFIG, ANON_GCS_CONFIG])
-def test_gs_notfound(gcs_config):
+def test_gs_notfound():
     path = f"gs://{BUCKET}/test_"
     with pytest.raises(FileNotFoundError, match=path):
-        io_glob(path, io_config=IOConfig(gcs=gcs_config))
+        io_glob(path, io_config=IOConfig(gcs=ANON_GCS_CONFIG))
diff --git a/tests/integration/io/test_url_download_private_aws_s3.py b/tests/integration/io/test_url_download_private_aws_s3.py
deleted file mode 100644
index 1635ddfb16..0000000000
--- a/tests/integration/io/test_url_download_private_aws_s3.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from __future__ import annotations
-
-import pytest
-from botocore import session
-
-import daft
-from daft.io import IOConfig, S3Config
-
-
-@pytest.fixture(scope="session")
-def io_config() -> IOConfig:
-    """Create IOConfig with botocore's current session"""
-    sess = session.Session()
-    creds = sess.get_credentials()
-
-    return IOConfig(
-        s3=S3Config(
-            key_id=creds.access_key, access_key=creds.secret_key, session_token=creds.token, region_name="us-west-2"
-        )
-    )
-
-
-@pytest.mark.integration()
-def test_url_download_aws_s3_public_bucket_with_creds(small_images_s3_paths, io_config):
-    data = {"urls": small_images_s3_paths}
-    df = daft.from_pydict(data)
-    df = df.with_column("data", df["urls"].url.download(use_native_downloader=True, io_config=io_config))
-
-    data = df.to_pydict()
-    assert len(data["data"]) == 6
-    for img_bytes in data["data"]:
-        assert img_bytes is not None
-
-
-@pytest.mark.integration()
-def test_read_parquet_aws_s3_public_bucket_with_creds(io_config):
-    filename = "s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet"
-    df = daft.read_parquet(filename, io_config=io_config, use_native_downloader=True).collect()
-    assert len(df) == 100
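
For reference, a minimal sketch (not part of the patch) of the anonymous IO configs the updated tests rely on, using only the public buckets, config fields, and functions that appear in the diff above:

    # Sketch only: anonymous access to the public test buckets, mirroring the configs
    # used by the integration tests after this change.
    import daft
    from daft.daft import GCSConfig, IOConfig, io_glob

    # S3: anonymous mode skips the credential search on the GitHub runner.
    s3_conf = daft.io.IOConfig(s3=daft.io.S3Config(region_name="us-west-2", anonymous=True))
    df = daft.read_parquet(
        "s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet",
        io_config=s3_conf,
        use_native_downloader=True,
    ).collect()
    assert len(df) == 100

    # GCS: the tests now always pass the anonymous config instead of also
    # parametrizing over a default (credential-searching) one.
    anon_gcs = GCSConfig(project_id=None, anonymous=True)
    listing = io_glob("gs://daft-public-data-gs/test_ls/**", io_config=IOConfig(gcs=anon_gcs))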