-
Notifications
You must be signed in to change notification settings - Fork 17
/
spotty_a100.yaml
78 lines (74 loc) · 2.46 KB
/
spotty_a100.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# NOTE: Disks created on GCP must be deleted manually:
# https://console.cloud.google.com/compute/disks?project=hear2021-evaluation
# Project name and the sync filters applied to the local project directory.
project:
  name: spotty-heareval
  syncFilters:
    # Paths matching these patterns are excluded from the sync.
    - exclude:
        - .idea/*
        - .git/*
        - '*/__pycache__/*'
        - _workdir/*
        - tasks/*
        - embeddings/*
        - .mypy_cache/*
        - lightning_logs/*
        - heareval.egg-info/*
        - pretrained/*
        - wandb/*
# Container definition: project directory, image, and volume mounts.
containers:
  - projectDir: /workspace/project
    # Alternative (disabled): build from a Dockerfile instead of using `image`.
    # file: docker/Dockerfile-cuda11.2
    image: turian/heareval
    # Alternative (disabled): CUDA 11.2 image tag.
    # image: turian/heareval:cuda11.2
    # Port forwarding (disabled):
    # ports:
    #   # TensorBoard
    #   - containerPort: 6006
    #     hostPort: 6006
    #   # Jupyter
    #   - containerPort: 8888
    #     hostPort: 8888
    volumeMounts:
      # `name` must match a volume declared under the instance's `volumes`.
      - name: workspace
        mountPath: /workspace
    # Extra arguments for the container runtime; here a 20G shared-memory size.
    runtimeParameters: ['--shm-size', '20G']
# Instance definition: one preemptible GCP machine with 16 A100 GPUs.
instances:
  - name: jshier-a100
    provider: gcp
    parameters:
      # GPU availability per zone:
      # https://cloud.google.com/compute/docs/gpus/gpu-regions-zones
      zone: europe-west4-b
      # A100 machine type. A2 machine types come with a fixed number of
      # A100 GPUs, so `machineType` and `gpu.count` must be changed together:
      # https://cloud.google.com/compute/docs/gpus
      machineType: a2-megagpu-16g
      preemptibleInstance: true
      gpu:
        type: nvidia-tesla-a100
        count: 16
      imageUri: projects/ml-images/global/images/c0-deeplearning-common-cu113-v20211105-debian-10
      # Disabled parameters, kept for reference:
      # spotInstance: True
      # ports: [6006, 8888]
      # dockerDataRoot: /docker
      # NOTE(review): `volumes` is placed under `parameters` per the Spotty
      # configuration layout; confirm against the original file's nesting.
      volumes:
        # Mounted into the container via `volumeMounts` (name: workspace).
        - name: workspace
          parameters:
            size: 250
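# Pick the sample rate (SR) first: the first command below fetches every task
# archive, the second fetches only the sr48000 archives. Run one of them.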
# gsutil -m cp gs://hear2021/open-tasks/hear2021-task-*.tar . && for f in hear2021-task-*.tar; do tar xf "$f"; done
# gsutil -m cp gs://hear2021/open-tasks/hear2021-task-*sr48000.tar . && for f in hear2021-task-*sr48000.tar; do tar xf "$f"; done
# wget https://github.com/hearbenchmark/hear-baseline/raw/main/saved_models/naive_baseline.pt
# python3 -m heareval.embeddings.runner hearbaseline --model naive_baseline.pt
# python3 -m heareval.predictions.runner hearbaseline --model ./naive_baseline.pt
# python3 -m heareval.evaluation.runner
#
#scripts:
# get: |
# gsutil -m cp gs://hear2021/open-tasks/hear2021-task-*.tar . && for f in hear2021-task-*.tar; do tar xf "$f"; done
# setup: |
# bash setup.sh
# train: |
# bash train.sh
# tensorboard: |
# tensorboard --bind_all --port 6006 --logdir /workspace/project/logs
# jupyter: |
# jupyter notebook --allow-root --ip 0.0.0.0 --notebook-dir=/workspace/project