coiled · douglasdavis · Mar 21, 2023 · Mar 21, 2023 · Mar 21, 2023 · Mar 21, 2023
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -90,6 +90,17 @@ jobs:
           python ci/create_runtime_meta.py
           source ci/scripts/install_coiled_runtime.sh coiled_software_environment.yaml
 
+      - name: Determine if workflows should be run
+        # Run workflows on PRs with `workflows` label and nightly cron job
+        if: |
+          github.event_name == 'schedule'
+          || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'workflows'))
+        run: |
+          # Put EXTRA_OPTIONS into $GITHUB_ENV so it can be used in subsequent workflow steps
+          export EXTRA_OPTIONS="--run-workflows"
+          echo $EXTRA_OPTIONS
+          echo EXTRA_OPTIONS=$EXTRA_OPTIONS >> $GITHUB_ENV
+
       - name: Run Coiled Runtime Tests
         id: test
         env:
@@ -100,7 +111,7 @@ jobs:
           DB_NAME: ${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db
           BENCHMARK: true
           CLUSTER_DUMP: always
-        run: bash ci/scripts/run_tests.sh -n 4 --dist loadscope ${{ matrix.pytest_args }}
+        run: bash ci/scripts/run_tests.sh -n 4 --dist loadscope ${{ env.EXTRA_OPTIONS }} ${{ matrix.pytest_args }}
 
       - name: Dump coiled.Cluster kwargs
         run: cat cluster_kwargs.merged.yaml

diff --git a/cluster_kwargs.yaml b/cluster_kwargs.yaml
@@ -31,6 +31,15 @@ parquet_cluster:
   n_workers: 15
   worker_vm_types: [m5.xlarge]  # 4 CPU, 16 GiB
 
+# For tests/workflows/test_embarrassingly_parallel.py
+embarrassingly_parallel_cluster:
+  n_workers: 100
+  # TODO: Remove the `m6i.xlarge` worker specification below
+  # once it's the default worker instance type
+  worker_vm_types: [m6i.xlarge]  # 4 CPU, 16 GiB
+  backend_options:
+    region: "us-east-1"  # Same region as dataset
+
 # For test_spill.py
 spill_cluster:
   n_workers: 5
@@ -49,3 +58,12 @@ test_work_stealing_on_straggling_worker:
 test_repeated_merge_spill:
   n_workers: 20
   worker_vm_types: [m6i.large]
+
+# For tests/workflows/test_from_csv_to_parquet.py
+from_csv_to_parquet_cluster:
+  n_workers: 5
+  # TODO: Remove the `m6i.xlarge` worker specification below
+  # once it's the default worker instance type
+  worker_vm_types: [m6i.xlarge]  # 4 CPU, 16 GiB
+  backend_options:
+    region: "us-east-1"  # Same region as dataset
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -58,16 +58,19 @@ def pytest_addoption(parser):
     parser.addoption(
         "--benchmark", action="store_true", help="Collect benchmarking data for tests"
     )
+    parser.addoption("--run-workflows", action="store_true", help="Run workflow tests")
 
 
 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--run-latest"):
-        # --run-latest given in cli: do not skip latest coiled-runtime tests
-        return
     skip_latest = pytest.mark.skip(reason="need --run-latest option to run")
+    skip_workflows = pytest.mark.skip(reason="need --run-workflows option to run")
     for item in items:
-        if "latest_runtime" in item.keywords:
+        if not config.getoption("--run-latest") and "latest_runtime" in item.keywords:
             item.add_marker(skip_latest)
+        if not config.getoption("--run-workflows") and (
+            (TEST_DIR / "workflows") in item.path.parents
+        ):
+            item.add_marker(skip_workflows)
 
 
 def get_coiled_runtime_version():
@@ -527,11 +530,17 @@ def s3_storage_options():
 
 
 @pytest.fixture(scope="session")
-def s3():
-    return s3fs.S3FileSystem(
-        key=os.environ.get("AWS_ACCESS_KEY_ID"),
-        secret=os.environ.get("AWS_SECRET_ACCESS_KEY"),
-    )
+def s3(s3_storage_options):
+    return s3fs.S3FileSystem(**s3_storage_options)
+
+
+@pytest.fixture
+def s3_factory(s3_storage_options):
+    def _(**exta_options):
+        kwargs = {**s3_storage_options, **exta_options}
+        return s3fs.S3FileSystem(**kwargs)
+
+    return _
 
 
 @pytest.fixture(scope="session")

diff --git a/tests/workflows/test_embarrassingly_parallel.py b/tests/workflows/test_embarrassingly_parallel.py
@@ -0,0 +1,103 @@
+import io
+import tarfile
+import uuid
+
+import coiled
+import pandas as pd
+import pytest
+from dask.distributed import Client, wait
+
+
+@pytest.fixture(scope="module")
+def embarrassingly_parallel_cluster(
+    dask_env_variables,
+    cluster_kwargs,
+    github_cluster_tags,
+):
+    with coiled.Cluster(
+        f"embarrassingly-parallel-{uuid.uuid4().hex[:8]}",
+        environ=dask_env_variables,
+        tags=github_cluster_tags,
+        **cluster_kwargs["embarrassingly_parallel_cluster"],
+    ) as cluster:
+        yield cluster
+
+
+@pytest.fixture
+def embarrassingly_parallel_client(
+    embarrassingly_parallel_cluster,
+    cluster_kwargs,
+    upload_cluster_dump,
+    benchmark_all,
+):
+    n_workers = cluster_kwargs["embarrassingly_parallel_cluster"]["n_workers"]
+    with Client(embarrassingly_parallel_cluster) as client:
+        embarrassingly_parallel_cluster.scale(n_workers)
+        client.wait_for_workers(n_workers)
+        client.restart()
+        with upload_cluster_dump(client), benchmark_all(client):
+            yield client
+
+
+def test_embarassingly_parallel(embarrassingly_parallel_client, s3_factory):
+    # How popular is matplotlib?
+    s3 = s3_factory(requester_pays=True)
+    directories = s3.ls("s3://arxiv/pdf")
+
+    # We only analyze files from 1991-2022 here in order to have a consistent data volume.
+    # This is benchmarking purposes only, as this dataset is updated monthly.
+    years = list(range(91, 100)) + list(range(23))
+    directories = [
+        d
+        for d in directories
+        if d.endswith(".tar") and int(d.split("_")[2][:2]) in years
+    ]
+
+    def extract(filename: str, fs):
+        """Extract and process one directory of arXiv data
+
+        Returns
+        -------
+        filename: str
+        contains_matplotlib: boolean
+        """
+        out = []
+        with fs.open(filename) as f:
+            bytes_ = f.read()
+            with io.BytesIO() as bio:
+                bio.write(bytes_)
+                bio.seek(0)
+                with tarfile.TarFile(fileobj=bio) as tf:
+                    for member in tf.getmembers():
+                        if member.isfile() and member.name.endswith(".pdf"):
+                            data = tf.extractfile(member).read()
+                            out.append((member.name, b"matplotlib" in data.lower()))
+                return out
+
+    futures = embarrassingly_parallel_client.map(extract, directories, fs=s3)
+    wait(futures)
+    # We had one error in one file.  Let's just ignore and move on.
+    good = [future for future in futures if future.status == "finished"]
+    data = embarrassingly_parallel_client.gather(good)
+
+    # Convert to Pandas
+    dfs = [pd.DataFrame(d, columns=["filename", "has_matplotlib"]) for d in data]
+    df = pd.concat(dfs)
+
+    def filename_to_date(filename):
+        year = int(filename.split("/")[0][:2])
+        month = int(filename.split("/")[0][2:4])
+        if year > 80:
+            year = 1900 + year
+        else:
+            year = 2000 + year
+
+        return pd.Timestamp(year=year, month=month, day=1)
+
+    df["date"] = df.filename.map(filename_to_date)
+    result = df.groupby("date").has_matplotlib.mean()
+    # Some light validation to ensure results are consistent.
+    # This is only for benchmarking.
+    assert result.idxmin() == pd.Timestamp("1991-07-01")  # Earliest timestamp
+    assert result.idxmax() == pd.Timestamp("2022-10-01")  # Row with maximum value
+    assert result.ne(0).idxmax() == pd.Timestamp("2005-06-01")  # First non-zero row
diff --git a/tests/workflows/test_from_csv_to_parquet.py b/tests/workflows/test_from_csv_to_parquet.py
@@ -0,0 +1,127 @@
+import os
+import uuid
+
+import coiled
+import dask.dataframe as dd
+import pytest
+from distributed import Client, LocalCluster, wait  # noqa
+
+LOCAL_RUN = os.environ.get("LOCAL_WORKFLOW_RUN")
+
+
+@pytest.fixture(scope="module")
+def from_csv_to_parquet_cluster(
+    dask_env_variables,
+    cluster_kwargs,
+    github_cluster_tags,
+):
+    if LOCAL_RUN is not None:
+        with LocalCluster() as cluster:
+            yield cluster
+    else:
+        with coiled.Cluster(
+            f"from-csv-to-parquet-{uuid.uuid4().hex[:8]}",
+            environ=dask_env_variables,
+            tags=github_cluster_tags,
+            **cluster_kwargs["from_csv_to_parquet_cluster"],
+        ) as cluster:
+            yield cluster
+
+
+@pytest.fixture
+def from_csv_to_parquet_client(
+    from_csv_to_parquet_cluster,
+    cluster_kwargs,
+    upload_cluster_dump,
+    benchmark_all,
+):
+    if LOCAL_RUN is not None:
+        with Client(from_csv_to_parquet_cluster) as client:
+            yield client
+    else:
+        n_workers = cluster_kwargs["from_csv_to_parquet_cluster"]["n_workers"]
+        with Client(from_csv_to_parquet_cluster) as client:
+            from_csv_to_parquet_cluster.scale(n_workers)
+            client.wait_for_workers(n_workers)
+            client.restart()
+            with upload_cluster_dump(client), benchmark_all(client):
+                yield client
+
+
+COLUMNSV1 = {
+    "GlobalEventID": "Int64",
+    "Day": "Int64",
+    "MonthYear": "Int64",
+    "Year": "Int64",
+    "FractionDate": "Float64",
+    "Actor1Code": "string[pyarrow]",
+    "Actor1Name": "string[pyarrow]",
+    "Actor1CountryCode": "string[pyarrow]",
+    "Actor1KnownGroupCode": "string[pyarrow]",
+    "Actor1EthnicCode": "string[pyarrow]",
+    "Actor1Religion1Code": "string[pyarrow]",
+    "Actor1Religion2Code": "string[pyarrow]",
+    "Actor1Type1Code": "string[pyarrow]",
+    "Actor1Type2Code": "string[pyarrow]",
+    "Actor1Type3Code": "string[pyarrow]",
+    "Actor2Code": "string[pyarrow]",
+    "Actor2Name": "string[pyarrow]",
+    "Actor2CountryCode": "string[pyarrow]",
+    "Actor2KnownGroupCode": "string[pyarrow]",
+    "Actor2EthnicCode": "string[pyarrow]",
+    "Actor2Religion1Code": "string[pyarrow]",
+    "Actor2Religion2Code": "string[pyarrow]",
+    "Actor2Type1Code": "string[pyarrow]",
+    "Actor2Type2Code": "string[pyarrow]",
+    "Actor2Type3Code": "string[pyarrow]",
+    "IsRootEvent": "Int64",
+    "EventCode": "string[pyarrow]",
+    "EventBaseCode": "string[pyarrow]",
+    "EventRootCode": "string[pyarrow]",
+    "QuadClass": "Int64",
+    "GoldsteinScale": "Float64",
+    "NumMentions": "Int64",
+    "NumSources": "Int64",
+    "NumArticles": "Int64",
+    "AvgTone": "Float64",
+    "Actor1Geo_Type": "Int64",
+    "Actor1Geo_Fullname": "string[pyarrow]",
+    "Actor1Geo_CountryCode": "string[pyarrow]",
+    "Actor1Geo_ADM1Code": "string[pyarrow]",
+    "Actor1Geo_Lat": "Float64",
+    "Actor1Geo_Long": "Float64",
+    "Actor1Geo_FeatureID": "string[pyarrow]",
+    "Actor2Geo_Type": "Int64",
+    "Actor2Geo_Fullname": "string[pyarrow]",
+    "Actor2Geo_CountryCode": "string[pyarrow]",
+    "Actor2Geo_ADM1Code": "string[pyarrow]",
+    "Actor2Geo_Lat": "Float64",
+    "Actor2Geo_Long": "Float64",
+    "Actor2Geo_FeatureID": "string[pyarrow]",
+    "ActionGeo_Type": "Int64",
+    "ActionGeo_Fullname": "string[pyarrow]",
+    "ActionGeo_CountryCode": "string[pyarrow]",
+    "ActionGeo_ADM1Code": "string[pyarrow]",
+    "ActionGeo_Lat": "Float64",
+    "ActionGeo_Long": "Float64",
+    "ActionGeo_FeatureID": "string[pyarrow]",
+    "DATEADDED": "Int64",
+    "SOURCEURL": "string[pyarrow]",
+}
+
+
+def test_from_csv_to_parquet(from_csv_to_parquet_client, s3_factory):
+    s3 = s3_factory(anon=True)
+    df = dd.read_csv(
+        "s3://gdelt-open-data/events/*.csv",
+        names=COLUMNSV1.keys(),
+        sep="\t",
+        dtype=COLUMNSV1,
+        storage_options=s3.storage_options,
+    )
+
+    df = df.partitions[-10:]
+
+    result = from_csv_to_parquet_client.compute(df.GoldsteinScale.mean())  # noqa
+    print(result)
+    assert df.GlobalEventID.dtype == "Int64"