Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added GPU enabled sandbox image. #3256

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions docker/sandbox-bundled/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# syntax=docker/dockerfile:1.4-labs

# Stage "builder": runs on the build host's platform and executes the
# images/preload script against images/manifest.txt. A later stage copies
# /build/images/ out of this stage.
FROM --platform=${BUILDPLATFORM} mgoltzsche/podman:minimal AS builder

# Re-export the BuildKit target architecture as an environment variable
# (presumably read by the preload script -- confirm against images/preload).
ARG TARGETARCH
ENV TARGETARCH="${TARGETARCH}"

WORKDIR /build
COPY images/manifest.txt images/preload /build/

# Needs the security.insecure entitlement (see the buildx flags in the Makefile).
RUN --security=insecure ./preload manifest.txt

# Stage "bootstrap": builds the flyte-sandbox-bootstrap Go binary on the build
# host's platform, cross-compiling for the requested target architecture.
FROM --platform=${BUILDPLATFORM} golang:1.19-bullseye AS bootstrap

ARG TARGETARCH
# CGO disabled, Linux target, architecture taken from the BuildKit build arg.
ENV CGO_ENABLED=0 \
    GOOS=linux \
    GOARCH="${TARGETARCH}"

WORKDIR /flyteorg/build

# Copy only the module files first so the download layer stays cached while
# go.mod / go.sum are unchanged.
COPY bootstrap/go.mod bootstrap/go.sum ./
RUN go mod download

COPY bootstrap/ ./
RUN --mount=type=cache,target=/root/.cache/go-build \
    --mount=type=cache,target=/root/go/pkg/mod \
    go build -o dist/flyte-sandbox-bootstrap cmd/bootstrap/main.go
# Final stage: GPU-enabled sandbox image. Starts from the CUDA base image,
# adds the NVIDIA container runtime, crictl and k3s, then layers in the
# preloaded images, manifests and helper binaries from the build context and
# the earlier stages.
FROM nvidia/cuda:12.1.1-base-ubuntu20.04

# Declared before the download steps so they can fetch binaries that match the
# target architecture (the steps fall back to amd64 when it is not set).
ARG TARGETARCH

RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections

# Update, install and clean up in one layer so the apt package lists are not
# baked into the image.
RUN apt-get update && \
    apt-get -y install gnupg2 curl lsb-release && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install NVIDIA Container Runtime.
# `curl -f` turns an HTTP error into a build failure instead of handing an
# error page to apt-key or writing it into the apt source list.
RUN curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey \
      -o /tmp/nvidia-container-runtime.gpgkey && \
    apt-key add /tmp/nvidia-container-runtime.gpgkey && \
    rm -f /tmp/nvidia-container-runtime.gpgkey && \
    curl -fsSL https://nvidia.github.io/nvidia-container-runtime/ubuntu20.04/nvidia-container-runtime.list \
      -o /etc/apt/sources.list.d/nvidia-container-runtime.list && \
    apt-get update && \
    apt-get -y install nvidia-docker2 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install crictl.
# Download, unpack and delete the tarball in a single layer so the archive does
# not stay in the image; the asset is selected by target architecture.
ENV CRICTL_VERSION="v1.26.0"
RUN crictl_tgz="crictl-${CRICTL_VERSION}-linux-${TARGETARCH:-amd64}.tar.gz" && \
    curl -fsSL "https://github.com/kubernetes-sigs/cri-tools/releases/download/${CRICTL_VERSION}/${crictl_tgz}" \
      -o "/tmp/${crictl_tgz}" && \
    tar zxvf "/tmp/${crictl_tgz}" -C /usr/local/bin && \
    rm -f "/tmp/${crictl_tgz}"

# Install k3s.
# The release asset is named "k3s" for amd64 and "k3s-arm64" for arm64; any
# other architecture aborts the build rather than installing a wrong binary.
ARG K3S_VERSION="v1.24.9+k3s1"
RUN case "${TARGETARCH:-amd64}" in \
      amd64) k3s_asset="k3s" ;; \
      arm64) k3s_asset="k3s-arm64" ;; \
      *) echo "unsupported TARGETARCH for k3s: ${TARGETARCH}" >&2; exit 1 ;; \
    esac && \
    curl -fsSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/${k3s_asset}" \
      -o /usr/bin/k3s && \
    chmod u+x /usr/bin/k3s && \
    echo "alias kubectl='k3s kubectl'" >> /root/.bashrc

# Setup containerd for nvidia
COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
ENV CRI_CONFIG_FILE="/var/lib/rancher/k3s/agent/etc/crictl.yaml"

# ENV that signals this container should have gpu enabled
# (checked by bin/k3d-entrypoint-gpu-check.sh at container start).
ENV FLYTE_GPU="ENABLED"

ARG FLYTE_SANDBOX_VERSION
ENV FLYTE_SANDBOX_VERSION="${FLYTE_SANDBOX_VERSION}"

COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/
COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/
COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/
COPY bin/ /bin/

COPY --from=bootstrap /flyteorg/build/dist/flyte-sandbox-bootstrap /bin/

VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log


ENTRYPOINT [ "/bin/k3d-entrypoint.sh" ]
CMD [ "server", "--disable=traefik", "--disable=servicelb" ]
15 changes: 15 additions & 0 deletions docker/sandbox-bundled/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ manifests:
--load-restrictor=LoadRestrictionsNone \
kustomize/dev > manifests/dev.yaml


# manifests-gpu: run the regular `manifests` target, then append the NVIDIA
# device-plugin HelmChart (kustomize/gpu-operator.yaml) to
# manifests/complete.yaml so the GPU image ships it.
# NOTE(review): the append (>>) modifies manifests/complete.yaml in place; this
# is only repeatable if `manifests` regenerates that file on every run -- confirm.
.PHONY: manifests-gpu
manifests-gpu: manifests
cat kustomize/gpu-operator.yaml >> manifests/complete.yaml

.PHONY: build
build: flyte manifests
[ -n "$(shell docker buildx ls | awk '/^flyte-sandbox / {print $$1}')" ] || \
Expand All @@ -38,6 +43,16 @@ build: flyte manifests
docker buildx build --builder flyte-sandbox --allow security.insecure --load \
--tag flyte-sandbox:latest .

# build-gpu: build the GPU sandbox image from Dockerfile.gpu and load it into
# the local Docker image store as flyte-sandbox-gpu:latest.
# Depends on `flyte` and `manifests-gpu`.
# If no buildx builder named `flyte-sandbox` is listed, one is created with the
# security.insecure entitlement allowed; Dockerfile.gpu uses
# `RUN --security=insecure`, which requires it.
.PHONY: build-gpu
build-gpu: flyte manifests-gpu
[ -n "$(shell docker buildx ls | awk '/^flyte-sandbox / {print $$1}')" ] || \
docker buildx create --name flyte-sandbox \
--driver docker-container --driver-opt image=moby/buildkit:master \
--buildkitd-flags '--allow-insecure-entitlement security.insecure'
docker buildx build --builder flyte-sandbox --allow security.insecure --load \
--tag flyte-sandbox-gpu:latest -f Dockerfile.gpu .


# Port map
# 6443 - k8s API server
# 30000 - Docker Registry
Expand Down
15 changes: 15 additions & 0 deletions docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/sh
#
# Entrypoint hook: when FLYTE_GPU is set (non-empty), verify that the NVIDIA
# driver is reachable from inside the container by running nvidia-smi.
#
# Environment:
#   FLYTE_GPU - any non-empty value enables the check.
# Exit status:
#   0   - GPU support not requested, or nvidia-smi succeeded.
#   255 - GPU support requested but nvidia-smi failed or is missing.

check_gpu() {
  if [ -z "${FLYTE_GPU}" ]; then
    echo "GPU not enabled"
    return 0
  fi

  echo "GPU Enabled - checking if it's available"
  # Test the command directly rather than inspecting $? afterwards.
  if ! nvidia-smi; then
    >&2 echo "NVIDIA not available, enable it in docker like so: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html"
    exit 255
  fi
  echo "nvidia-smi working"
}

check_gpu
55 changes: 55 additions & 0 deletions docker/sandbox-bundled/config.toml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# containerd configuration template (Go template syntax) for the GPU sandbox.
# Dockerfile.gpu copies this file to
# /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl.
# NOTE(review): presumably derived from the stock k3s containerd template with
# only the GPU-related changes marked below -- confirm against the pinned k3s version.
[plugins.opt]
path = "{{ .NodeConfig.Containerd.Opt }}"

# CRI plugin: streaming server bound to 127.0.0.1:10010.
[plugins.cri]
stream_server_address = "127.0.0.1"
stream_server_port = "10010"

{{- if .IsRunningInUserNS }}
disable_cgroup = true
disable_apparmor = true
restrict_oom_score_adj = true
{{end}}

{{- if .NodeConfig.AgentConfig.PauseImage }}
sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}

{{- if not .NodeConfig.NoFlannel }}
[plugins.cri.cni]
bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}

[plugins.cri.containerd.runtimes.runc]
# ---- changed from 'io.containerd.runc.v2' for GPU support
# NOTE(review): the v1 linux runtime is deprecated in newer containerd
# releases -- confirm it is supported by the containerd bundled with this k3s.
runtime_type = "io.containerd.runtime.v1.linux"

# ---- added for GPU support
# Routes the v1 linux runtime through nvidia-container-runtime.
[plugins.linux]
runtime = "nvidia-container-runtime"

# Private registry mirrors and per-registry auth/TLS settings; rendered only
# when .PrivateRegistryConfig is set.
{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors."{{$k}}"]
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{end}}

{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins.cri.registry.configs."{{$k}}".auth]
{{ if $v.Auth.Username }}username = "{{ $v.Auth.Username }}"{{end}}
{{ if $v.Auth.Password }}password = "{{ $v.Auth.Password }}"{{end}}
{{ if $v.Auth.Auth }}auth = "{{ $v.Auth.Auth }}"{{end}}
{{ if $v.Auth.IdentityToken }}identitytoken = "{{ $v.Auth.IdentityToken }}"{{end}}
{{end}}
{{ if $v.TLS }}
[plugins.cri.registry.configs."{{$k}}".tls]
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{end}}
{{end}}
{{end}}
9 changes: 9 additions & 0 deletions docker/sandbox-bundled/kustomize/gpu-operator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# HelmChart resource (helm.cattle.io/v1) that installs the nvidia-device-plugin
# chart from the NVIDIA k8s-device-plugin Helm repository into kube-system.
# The Makefile's manifests-gpu target appends this file to manifests/complete.yaml.
# NOTE(review): this file is named gpu-operator.yaml but the chart is the
# device plugin, not the GPU operator -- confirm the intended chart/name.
# NOTE(review): no chart `version` is set, so presumably whatever version the
# repo currently serves gets installed -- consider pinning.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
name: nvidia-device-plugin
namespace: kube-system
spec:
chart: nvidia-device-plugin
repo: https://nvidia.github.io/k8s-device-plugin
Loading