From 0343b128b86251b8a786d21cbac4c83724b41f5a Mon Sep 17 00:00:00 2001
From: Raunak Bhagat
Date: Mon, 18 Nov 2024 11:35:31 -0800
Subject: [PATCH] Add workflows to build a commit and run TPC-H benchmarks -
 slightly different from the existing one

---
 .github/actions/install/action.yaml          |  29 ++++
 .github/assets/ray.yaml                      |  65 +++++++++
 .github/scripts/csv_to_md.py                 |  29 ++++
 .github/workflows/build-commit-run-tpch.yaml |  41 ++++++
 .github/workflows/build-commit.yaml          |  66 +++++++++
 .github/workflows/run-tpch.yaml              | 146 +++++++++++++++++++
 benchmarking/tpch/__main__.py                |  15 +-
 7 files changed, 388 insertions(+), 3 deletions(-)
 create mode 100644 .github/actions/install/action.yaml
 create mode 100644 .github/assets/ray.yaml
 create mode 100644 .github/scripts/csv_to_md.py
 create mode 100644 .github/workflows/build-commit-run-tpch.yaml
 create mode 100644 .github/workflows/build-commit.yaml
 create mode 100644 .github/workflows/run-tpch.yaml

diff --git a/.github/actions/install/action.yaml b/.github/actions/install/action.yaml
new file mode 100644
index 0000000000..543d9a3e86
--- /dev/null
+++ b/.github/actions/install/action.yaml
@@ -0,0 +1,29 @@
+name: Install uv, rust, and python
+description: Install uv, rust, and python
+inputs:
+  python_version:
+    description: The version of python to install
+    required: false
+    default: '3.9'
+runs:
+  using: composite
+  steps:
+  - shell: bash
+    run: |
+      curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+      CARGO_BIN="$HOME/.cargo/bin"
+      echo 'export PATH="$CARGO_BIN:$PATH"' >> $HOME/.bashrc
+      echo "$CARGO_BIN" >> $GITHUB_PATH
+  - shell: bash
+    run: |
+      curl -LsSf https://astral.sh/uv/install.sh | sh
+      UV_BIN="$HOME/.local/bin"
+      echo 'export PATH="$UV_BIN:$PATH"' >> $HOME/.bashrc
+      echo "$UV_BIN" >> $GITHUB_PATH
+  - shell: bash
+    run: |
+      source $HOME/.bashrc
+  - shell: bash
+    run: |
+      uv python install ${{ inputs.python_version }}
+      uv python pin ${{ inputs.python_version }}
diff --git a/.github/assets/ray.yaml b/.github/assets/ray.yaml
new file mode 100644
index 0000000000..486b72bad4
--- /dev/null
+++ b/.github/assets/ray.yaml
@@ -0,0 +1,65 @@
+cluster_name: performance-comparisons
+
+provider:
+  type: aws
+  region: us-west-2
+  cache_stopped_nodes: true
+  security_group:
+    GroupName: ray-autoscaler-c1
+
+auth:
+  ssh_user: ubuntu
+  ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem
+
+max_workers: 2
+available_node_types:
+  ray.head.default:
+    resources: {"CPU": 0}
+    node_config:
+      KeyName: ci-github-actions-ray-cluster-key
+      InstanceType: i3.2xlarge
+      ImageId: ami-04dd23e62ed049936
+      IamInstanceProfile:
+        Name: ray-autoscaler-v1
+
+  ray.worker.default:
+    min_workers: 2
+    max_workers: 2
+    resources: {}
+    node_config:
+      KeyName: ci-github-actions-ray-cluster-key
+      InstanceType: i3.2xlarge
+      ImageId: ami-04dd23e62ed049936
+      IamInstanceProfile:
+        Name: ray-autoscaler-v1
+
+setup_commands:
+# Mount drive
+- |
+  findmnt /tmp 1> /dev/null
+  code=$?
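+  # `findmnt` exits non-zero when /tmp is not already a mountpoint; only in
+  # that case do we format the instance's NVMe drive and mount it at /tmp.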
+
+  if [ $code -ne 0 ]; then
+    sudo mkfs.ext4 /dev/nvme0n1
+    sudo mount -t ext4 /dev/nvme0n1 /tmp
+    sudo chmod 777 /tmp
+  fi
+# Install dependencies
+- sudo snap install aws-cli --classic
+- curl -LsSf https://astral.sh/uv/install.sh | sh
+- echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
+- source ~/.bashrc
+# The GitHub Actions workflow replaces all parameters between `<<...>>` with
+# the actual values, as determined at runtime of the workflow.
+- uv python install <<PYTHON_VERSION>>
+- uv python pin <<PYTHON_VERSION>>
+- uv venv
+- echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc
+- source .venv/bin/activate
+- uv pip install pip ray[default] py-spy
+- uv pip install https://github-actions-artifacts-bucket.s3.us-west-2.amazonaws.com/builds/<<COMMIT_HASH>>/<<WHEEL_NAME>>
+# Download benchmarking fixtures
+- |
+  aws s3 sync \
+    s3://eventual-dev-benchmarking-fixtures/uncompressed/tpch-dbgen/<<SCALE_FACTOR>>/<<PARTITION_SIZE>>/parquet/ \
+    /tmp/data/<<SCALE_FACTOR>>/<<PARTITION_SIZE>>/parquet/ \
+    --quiet
diff --git a/.github/scripts/csv_to_md.py b/.github/scripts/csv_to_md.py
new file mode 100644
index 0000000000..b9a74f4f40
--- /dev/null
+++ b/.github/scripts/csv_to_md.py
@@ -0,0 +1,29 @@
+"""Convert a CSV file into a markdown table.
+
+Usage: python csv_to_md.py <input.csv> <output.md>
+"""
+
+import csv
+import sys
+from pathlib import Path
+
+input_path = Path(sys.argv[1])
+assert input_path.exists()
+
+output_path = Path(sys.argv[2])
+assert not output_path.exists()
+
+
+def make_md_row(row: list[str]) -> str:
+    return f'|{"|".join(row)}|\n'
+
+
+with open(input_path) as csv_file, open(output_path, "w") as md_file:
+    csv_reader = csv.reader(csv_file)
+    header = next(csv_reader)
+
+    md_file.write(make_md_row(header))
+
+    # Markdown tables require a separator row between the header and the body
+    md_file.write(make_md_row(["-"] * len(header)))
+
+    for row in csv_reader:
+        md_file.write(make_md_row(row))
diff --git a/.github/workflows/build-commit-run-tpch.yaml b/.github/workflows/build-commit-run-tpch.yaml
new file mode 100644
index 0000000000..52102c4196
--- /dev/null
+++ b/.github/workflows/build-commit-run-tpch.yaml
@@ -0,0 +1,41 @@
+name: Build commit and run TPC-H benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      wheel:
+        description: The wheel artifact to use
+        required: false
+        default: getdaft-0.3.0.dev0-cp38-abi3-manylinux_2_31_x86_64.whl
+      skip_questions:
+        description: The TPC-H questions to skip
+        required: false
+        default: ""
+      scale_factor:
+        description: Which scale factor to use
+        required: false
+        default: 2
+      partition_size:
+        description: The number of partitions to use
+        required: false
+        default: 2
+      python_version:
+        description: The version of python to use
+        required: false
+        default: '3.9'
+
+jobs:
+  build:
+    uses: ./.github/workflows/build-commit.yaml
+    secrets:
+      ACTIONS_AWS_ROLE_ARN: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
+
+  run:
+    needs: build
+    uses: ./.github/workflows/run-tpch.yaml
+    with:
+      wheel: ${{ needs.build.outputs.wheel }}
+      skip_questions: ${{ inputs.skip_questions }}
+      scale_factor: ${{ inputs.scale_factor }}
+      partition_size: ${{ inputs.partition_size }}
+      python_version: ${{ inputs.python_version }}
diff --git a/.github/workflows/build-commit.yaml b/.github/workflows/build-commit.yaml
new file mode 100644
index 0000000000..89186738d1
--- /dev/null
+++ b/.github/workflows/build-commit.yaml
@@ -0,0 +1,66 @@
+name: Build a Daft commit and store the resulting wheel in AWS S3
+
+on:
+  workflow_dispatch:
+  workflow_call:
+    secrets:
+      ACTIONS_AWS_ROLE_ARN:
+        description: The ARN of the AWS role to assume
+        required: true
+    outputs:
+      wheel:
+        description: The wheel file that was built
+        value: ${{ jobs.build-commit.outputs.wheel }}
+
+jobs:
+  build-commit:
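+    # ~/target is restored from the buildjet cache below; when a previously
+    # built wheel is already present there, the build step is skipped and
+    # that wheel is uploaded instead.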
+    runs-on: buildjet-8vcpu-ubuntu-2004
+    timeout-minutes: 15  # Remove for ssh debugging
+    permissions:
+      id-token: write
+      contents: read
+    outputs:
+      wheel: ${{ steps.build_and_upload.outputs.wheel }}
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 1
+    - uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: us-west-2
+        role-session-name: build-commit-workflow
+        role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
+    - uses: ./.github/actions/install
+    - uses: buildjet/cache@v4
+      with:
+        path: ~/target
+        key: ${{ runner.os }}-cargo-deps-${{ hashFiles('**/Cargo.lock') }}
+        restore-keys: ${{ runner.os }}-cargo-deps-
+    - id: build_and_upload
+      run: |
+        if ! ls ~/target/wheels/*.whl 1> /dev/null 2>&1; then
+          # Build wheel
+          export CARGO_TARGET_DIR=~/target
+          uv venv
+          source .venv/bin/activate
+          uv pip install pip maturin
+          maturin build --release
+        fi
+
+        count=$(ls ~/target/wheels/*.whl 2> /dev/null | wc -l)
+        if [ "$count" -gt 1 ]; then
+          echo "Found more than 1 wheel"
+          exit 1
+        elif [ "$count" -eq 0 ]; then
+          echo "Found no wheel files"
+          exit 1
+        fi
+
+        # Upload wheel
+        for file in ~/target/wheels/*.whl; do
+          aws s3 cp "$file" s3://github-actions-artifacts-bucket/builds/${{ github.sha }}/ --acl public-read --no-progress
+          file_basename=$(basename "$file")
+          echo "wheel=$file_basename" >> $GITHUB_OUTPUT
+          echo "Output wheel has been built and stored in S3 at the following location:" >> $GITHUB_STEP_SUMMARY
+          echo "https://us-west-2.console.aws.amazon.com/s3/buckets/github-actions-artifacts-bucket?prefix=builds/${{ github.sha }}/" >> $GITHUB_STEP_SUMMARY
+        done
diff --git a/.github/workflows/run-tpch.yaml b/.github/workflows/run-tpch.yaml
new file mode 100644
index 0000000000..73ee503cb9
--- /dev/null
+++ b/.github/workflows/run-tpch.yaml
@@ -0,0 +1,146 @@
+name: Run TPC-H benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      wheel:
+        description: The wheel artifact to use
+        required: false
+        default: getdaft-0.3.0.dev0-cp38-abi3-manylinux_2_31_x86_64.whl
+      skip_questions:
+        description: The TPC-H questions to skip
+        required: false
+        default: ""
+      scale_factor:
+        description: Which scale factor to use
+        required: false
+        default: 2
+      partition_size:
+        description: The number of partitions to use
+        required: false
+        default: 2
+      python_version:
+        description: The version of python to use
+        required: false
+        default: '3.9'
+  workflow_call:
+    inputs:
+      wheel:
+        type: string
+        description: The wheel artifact to use
+        required: false
+        default: getdaft-0.3.0.dev0-cp38-abi3-manylinux_2_31_x86_64.whl
+      skip_questions:
+        type: string
+        description: The TPC-H questions to skip
+        required: false
+        default: ""
+      scale_factor:
+        type: string
+        description: Which scale factor to use
+        required: false
+        default: 2
+      partition_size:
+        type: string
+        description: The number of partitions to use
+        required: false
+        default: 2
+      python_version:
+        type: string
+        description: The version of python to use
+        required: false
+        default: '3.9'
+
+jobs:
+  run-tpch:
+    runs-on: [self-hosted, linux, x64, ci-dev]
+    timeout-minutes: 15  # Remove for ssh debugging
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 1
+    - uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: us-west-2
+        role-session-name: run-tpch-workflow
+    - uses: ./.github/actions/install
+    - run: |
+        scale_factor_str="${{ inputs.scale_factor }}_0"
+
+        # Dynamically update ray config file
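+        # The sed invocations below substitute the `<<...>>` placeholders in
+        # .github/assets/ray.yaml (placeholder names mirror the values they
+        # receive):
+        #   <<COMMIT_HASH>>    <- github.sha
+        #   <<WHEEL_NAME>>     <- inputs.wheel
+        #   <<SCALE_FACTOR>>   <- inputs.scale_factor, suffixed with "_0"
+        #   <<PARTITION_SIZE>> <- inputs.partition_size
+        #   <<PYTHON_VERSION>> <- inputs.python_version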
+        sed -i 's|<<COMMIT_HASH>>|${{ github.sha }}|g' .github/assets/ray.yaml
+        sed -i 's|<<WHEEL_NAME>>|${{ inputs.wheel }}|g' .github/assets/ray.yaml
+        sed -i "s|<<SCALE_FACTOR>>|$scale_factor_str|g" .github/assets/ray.yaml
+        sed -i 's|<<PARTITION_SIZE>>|${{ inputs.partition_size }}|g' .github/assets/ray.yaml
+        sed -i 's|<<PYTHON_VERSION>>|${{ inputs.python_version }}|g' .github/assets/ray.yaml
+
+        # Download private ssh key
+        KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
+        echo "$KEY" > ~/.ssh/ci-github-actions-ray-cluster-key.pem
+        chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem
+
+        # Install dependencies
+        uv venv
+        source .venv/bin/activate
+        # Remove the local daft source tree so the wheel installed below is
+        # the one that gets imported
+        rm -rf daft
+        uv pip install ray[default] boto3 https://github-actions-artifacts-bucket.s3.us-west-2.amazonaws.com/builds/${{ github.sha }}/${{ inputs.wheel }}
+
+        # Boot up ray cluster and submit tpch benchmarking job
+        ray up .github/assets/ray.yaml -y
+        HEAD_NODE_IP=$(ray get-head-ip .github/assets/ray.yaml | tail -n 1)
+        # Forward the Ray dashboard (port 8265) from the head node
+        ssh -o StrictHostKeyChecking=no -fN -L 8265:localhost:8265 -i ~/.ssh/ci-github-actions-ray-cluster-key.pem ubuntu@$HEAD_NODE_IP
+        export DAFT_ENABLE_RAY_TRACING=1
+        export DAFT_RUNNER=ray
+        if [[ -n "${{ inputs.skip_questions }}" ]]; then
+          python -m benchmarking.tpch \
+            --scale_factor ${{ inputs.scale_factor }} \
+            --num_parts ${{ inputs.partition_size }} \
+            --parquet_file_cache /tmp/data \
+            --output_csv output.csv \
+            --ray_job_dashboard_url http://localhost:8265 \
+            --skip_warmup \
+            --no_pymodules \
+            --skip_questions="${{ inputs.skip_questions }}"
+        else
+          python -m benchmarking.tpch \
+            --scale_factor ${{ inputs.scale_factor }} \
+            --num_parts ${{ inputs.partition_size }} \
+            --parquet_file_cache /tmp/data \
+            --output_csv output.csv \
+            --ray_job_dashboard_url http://localhost:8265 \
+            --skip_warmup \
+            --no_pymodules
+        fi
+
+        # Download all logs
+        #
+        # We also need to rename any file whose name contains ':' to use '_'
+        # instead; GHA `actions/upload-artifact@v4` does *not* allow colons!
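+        # (For example, a hypothetical log file named `raylet.out:2024-11-18`
+        # would be renamed to `raylet.out_2024-11-18`.)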
+        ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs/daft ray-daft-logs
+        find ray-daft-logs -depth -name '*:*' -exec bash -c '
+          for filepath; do
+            dir=$(dirname "$filepath")
+            base=$(basename "$filepath")
+            new_base=${base//:/_}
+            mv "$filepath" "$dir/$new_base"
+          done
+        ' _ {} +
+
+        # Tear down ray cluster
+        ray down .github/assets/ray.yaml -y
+
+        # Convert csv to markdown and print to GHA Summary Page
+        python .github/scripts/csv_to_md.py output.csv output.md
+        echo "# Results" >> $GITHUB_STEP_SUMMARY
+        cat output.md >> $GITHUB_STEP_SUMMARY
+    - uses: actions/upload-artifact@v4
+      with:
+        name: output.csv
+        path: output.csv
+    - uses: actions/upload-artifact@v4
+      with:
+        name: ray-daft-logs
+        path: ray-daft-logs
diff --git a/benchmarking/tpch/__main__.py b/benchmarking/tpch/__main__.py
index 8ad131e08f..125712ef92 100644
--- a/benchmarking/tpch/__main__.py
+++ b/benchmarking/tpch/__main__.py
@@ -126,6 +126,7 @@ def run_all_benchmarks(
     csv_output_location: str | None,
     ray_job_dashboard_url: str | None = None,
     requirements: str | None = None,
+    no_pymodules: bool = False,
 ):
     get_df = get_df_with_parquet_folder(parquet_folder)
 
@@ -143,7 +144,7 @@ def run_all_benchmarks(
             tpch_qnum=i,
             working_dir=working_dir,
             entrypoint=entrypoint,
-            runtime_env=get_ray_runtime_env(requirements),
+            runtime_env=get_ray_runtime_env(requirements, no_pymodules=no_pymodules),
         )
 
         # Run once as a warmup step
@@ -202,11 +203,13 @@ def get_daft_benchmark_runner_name() -> Literal["ray"] | Literal["py"] | Literal
     return name
 
 
-def get_ray_runtime_env(requirements: str | None) -> dict:
+def get_ray_runtime_env(requirements: str | None, no_pymodules: bool = False) -> dict:
+    daft_env_variables = {key: value for key, value in os.environ.items() if key.startswith("DAFT")}  # forward DAFT_* env vars to the Ray workers
     runtime_env = {
-        "py_modules": [daft],
+        "py_modules": None if no_pymodules else [daft],
         "eager_install": True,
         "env_vars": {
+            **daft_env_variables,
             "DAFT_PROGRESS_BAR": "0",
             "DAFT_RUNNER": "ray",
         },
@@ -293,6 +296,11 @@ def warm_up_function():
         default=None,
         help="Ray Dashboard URL to submit jobs instead of using Ray client, most useful when running on a remote cluster",
     )
+    parser.add_argument(
+        "--no_pymodules",
+        action="store_true",
+        help="Avoid shipping the local daft package to the cluster via Ray's py_modules; useful in CI, where the wheel is already installed on the cluster",
+    )
 
     args = parser.parse_args()
     if args.output_csv_headers:
@@ -331,4 +339,5 @@
         csv_output_location=args.output_csv,
         ray_job_dashboard_url=args.ray_job_dashboard_url,
         requirements=args.requirements,
+        no_pymodules=args.no_pymodules,
     )