diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 432d20df3a5..21d4ca25642 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -182,9 +182,9 @@ jobs:
           export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           pytest -s -vv integration-tests
 
-  build-and-push-image-rocm:
+  build-and-push-image-intel:
     concurrency:
-      group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-build-and-push-image-intel-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
     needs:
       - start-runner
@@ -198,6 +198,9 @@ jobs:
       # with sigstore/fulcio when running outside of PRs.
       id-token: write
       security-events: write
+    outputs:
+      # env is not available in the later `container:`, but previous job outputs are.
+      short_sha: ${{ env.GITHUB_SHA_SHORT }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
@@ -240,7 +243,7 @@ jobs:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
           tags: |
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
       # If main, release or tag
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name != 'pull_request' }}
@@ -254,29 +257,55 @@ jobs:
             ghcr.io/huggingface/text-generation-inference
             db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
-            type=semver,pattern={{version}}-rocm
-            type=semver,pattern={{major}}.{{minor}}-rocm
-            type=raw,value=latest-rocm,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
+            type=semver,pattern={{version}}-intel
+            type=semver,pattern={{major}}.{{minor}}-intel
+            type=raw,value=latest-intel,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
       - name: Build and push Docker image
         id: build-and-push
         uses: docker/build-push-action@v4
         with:
           context: .
-          file: Dockerfile_amd
+          file: Dockerfile_intel
           push: true
           platforms: 'linux/amd64'
           build-args: |
             GIT_SHA=${{ env.GITHUB_SHA }}
-            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-intel
           tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
 
-  build-and-push-image-intel:
+  stop-runner:
+    name: Stop self-hosted EC2 runner
+    needs:
+      - start-runner
+      - build-and-push-image
+      - build-and-push-image-intel
+      - integration-tests
+    runs-on: ubuntu-latest
+    env:
+      AWS_REGION: us-east-1
+    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Stop EC2 runner
+        uses: philschmid/philschmid-ec2-github-runner@main
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+
+  build-and-push-image-rocm:
     concurrency:
-      group: ${{ github.workflow }}-build-and-push-image-intel-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
     needs:
       - start-runner
@@ -290,9 +319,6 @@ jobs:
       # with sigstore/fulcio when running outside of PRs.
       id-token: write
       security-events: write
-    outputs:
-      # env is not available in the later `container:`, but previous job outputs are.
-      short_sha: ${{ env.GITHUB_SHA_SHORT }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
@@ -335,7 +361,7 @@ jobs:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
           tags: |
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
       # If main, release or tag
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name != 'pull_request' }}
@@ -349,85 +375,52 @@ jobs:
             ghcr.io/huggingface/text-generation-inference
             db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
-            type=semver,pattern={{version}}-intel
-            type=semver,pattern={{major}}.{{minor}}-intel
-            type=raw,value=latest-intel,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
+            type=semver,pattern={{version}}-rocm
+            type=semver,pattern={{major}}.{{minor}}-rocm
+            type=raw,value=latest-rocm,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
       - name: Build and push Docker image
         id: build-and-push
         uses: docker/build-push-action@v4
         with:
           context: .
-          file: Dockerfile_intel
+          file: Dockerfile_amd
           push: true
           platforms: 'linux/amd64'
           build-args: |
             GIT_SHA=${{ env.GITHUB_SHA }}
-            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-intel
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
           tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
+  integration-tests-rocm:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on: [amd-gpu-tgi, mi250]
     needs:
-      - start-runner
-      - build-and-push-image
       - build-and-push-image-rocm
-      - build-and-push-image-intel
-      - integration-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
     steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
+      - uses: actions/setup-python@v5
        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
-
-  # TODO: Move this to `build_amd.yml` (and `build_nvidia.yml`)
-
-  # integration-tests-rocm:
-  #   concurrency:
-  #     group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
-  #     cancel-in-progress: true
-  #   needs:
-  #     - start-runner
-  #     - build-and-push-image
-  #     - integration-tests
-  #     - build-and-push-image-rocm
-  #     - stop-runner
-  #   runs-on: [self-hosted, amd-gpu, multi-gpu, mi300]
-  #   container:
-  #     image: registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ needs.build-and-push-image-rocm.outputs.short_sha }}-rocm
-  #     options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/cache
-  #   env:
-  #     DOCKER_VOLUME: /cache
-  #   steps:
-  #     - name: ROCM-SMI
-  #       run: |
-  #         rocm-smi
-  #     - name: ROCM-INFO
-  #       run: |
-  #         rocminfo | grep "Agent" -A 14
-  #     - name: Show ROCR environment
-  #       run: |
-  #         echo "ROCR: $ROCR_VISIBLE_DEVICES"
-  #     - name: Install
-  #       run: |
-  #         make install-integration-tests
-  #     - name: Run tests
-  #       run: |
-  #         export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-  #         pytest -s -vv integration-tests
+          python-version: '3.10'
+      - uses: actions/checkout@v4
+      - name: install deps
+        run: |
+          make install-integration-tests
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+      - name: Run tests
+        run: |
+          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
+          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export DOCKER_DEVICES=/dev/kfd,/dev/dri
+          python -m pytest -s -vv integration-tests/models/test_flash_gpt2.py
diff --git a/Dockerfile b/Dockerfile
index 659e2673680..d8efe6d8b1b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,5 @@
 # Rust builder
+# Dummy.
 FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
 WORKDIR /usr/src
 
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 2ef85da6c7b..3f89090a938 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -34,6 +34,7 @@
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
 HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
+DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
 
 
 class ResponseComparator(JSONSnapshotExtension):
@@ -435,6 +436,7 @@ def docker_launcher(
             container = client.containers.get(container_name)
             container.stop()
             container.wait()
+            client.containers.prune()
         except NotFound:
             pass
 
@@ -453,16 +455,27 @@ def docker_launcher(
     if DOCKER_VOLUME:
         volumes = [f"{DOCKER_VOLUME}:/data"]
 
+    if DOCKER_DEVICES:
+        devices = DOCKER_DEVICES.split(",")
+        visible = os.getenv("ROCR_VISIBLE_DEVICES")
+        if visible:
+            env["ROCR_VISIBLE_DEVICES"] = visible
+        device_requests = []
+    else:
+        devices = []
+        device_requests = [
+            docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
+        ]
+
     container = client.containers.run(
         DOCKER_IMAGE,
         command=args,
         name=container_name,
         environment=env,
         auto_remove=False,
+        devices=devices,
+        device_requests=device_requests,
         detach=True,
-        device_requests=[
-            docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
-        ],
         volumes=volumes,
         ports={"80/tcp": port},
         shm_size="1G",
diff --git a/integration-tests/models/test_flash_gpt2.py b/integration-tests/models/test_flash_gpt2.py
index 0c7977d0d64..1e906521abc 100644
--- a/integration-tests/models/test_flash_gpt2.py
+++ b/integration-tests/models/test_flash_gpt2.py
@@ -3,7 +3,7 @@
 
 @pytest.fixture(scope="module")
 def flash_gpt2_handle(launcher):
-    with launcher("openai-community/gpt2", num_shard=2) as handle:
+    with launcher("openai-community/gpt2", num_shard=1) as handle:
         yield handle
 
 
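Note: the core behavioral change above is the device-selection branch added to docker_launcher() in integration-tests/conftest.py. Below is a minimal, self-contained sketch of that logic for reference, assuming the docker Python SDK is installed; the helper name resolve_docker_devices and the example values are illustrative and not part of the patch, only the branching mirrors the diff.

# Minimal sketch of the device-selection branch added to docker_launcher().
# Helper name and example values are illustrative; only the branching logic
# mirrors the conftest.py change above.
import os

import docker


def resolve_docker_devices(gpu_count, env):
    """Return (devices, device_requests) for client.containers.run()."""
    docker_devices = os.getenv("DOCKER_DEVICES")
    if docker_devices:
        # ROCm-style launch: pass device nodes (e.g. /dev/kfd,/dev/dri)
        # through directly and forward ROCR_VISIBLE_DEVICES when set.
        devices = docker_devices.split(",")
        visible = os.getenv("ROCR_VISIBLE_DEVICES")
        if visible:
            env["ROCR_VISIBLE_DEVICES"] = visible
        device_requests = []
    else:
        # NVIDIA-style launch: no explicit device nodes; request GPUs
        # from the Docker runtime instead.
        devices = []
        device_requests = [
            docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
        ]
    return devices, device_requests


if __name__ == "__main__":
    env = {}
    print(resolve_docker_devices(gpu_count=1, env=env), env)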