-
Notifications
You must be signed in to change notification settings - Fork 68
127 lines (122 loc) · 4.9 KB
/
sagemaker-integration.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
name: SageMaker PythonSDK Integration Tests
on:
workflow_dispatch:
inputs:
mode:
description: "candidate release version, or nightly. Default is nightly"
required: false
default: 'nightly'
sagemaker-repository:
description: 'Link to Github repository for SageMaker Python SDK. Can be a personal fork.'
required: false
default: ''
repository-branch:
description: 'The branch from the SagMaker Python SDK fork to use for testing'
required: false
default: ''
run_benchmark:
description: 'Runs benchmark and upload to cloud watch metrics'
required: false
default: 'true'
schedule:
- cron: '0 4 * * *'
jobs:
create-runners:
runs-on: [self-hosted, scheduler]
steps:
- name: Create new CPU instance
id: create_cpu1
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu2
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
outputs:
cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}
cpu_instance_id2: ${{ steps.create_cpu2.outputs.action_cpu_instance_id }}
endpoint-tests:
runs-on: [ self-hosted, cpu ]
timeout-minutes: 120
needs: create-runners
strategy:
fail-fast: false
matrix:
container: [lmi, tensorrt-llm]
env:
run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
image_type: ${{ github.event.inputs.mode || 'nightly' }}
steps:
- uses: actions/checkout@v4
- name: Set up Python3
uses: actions/setup-python@v5
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install -U boto3 awscli
- name: Install SageMaker Python SDK
working-directory: tests/integration
run: |
./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} ${{ github.event.inputs.repository-branch }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-west-2
- name: Test llama3-8b
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py llama3-8b sme ${image_type} ${{ matrix.container}} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test mistral-7b
if: ${{ always() }}
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py mistral-7b sme ${image_type} ${{ matrix.container}} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test phi-2
if: ${{ always() }}
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py phi-2 sme ${image_type} ${{ matrix.container}} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test llava-v1.6
if: ${{ always() && matrix.container == 'lmi' }}
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py llava-v1.6 sme ${image_type} ${{ matrix.container }} ${run_benchmark}
- name: Test Multi Model Endpoint
if: ${{ always() }}
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py mme_common mme ${image_type} ${{ matrix.container}} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners, endpoint-tests ]
steps:
- name: Cleanup dangling SageMaker resources
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
./cleanup_sagemaker_resources.sh sm-integration-test us-west-2
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-runners.outputs.cpu_instance_id1 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id2 }}
./stop_instance.sh $instance_id