Skip to content

Commit

Permalink
Rework image builder (#175)
Browse files Browse the repository at this point in the history
* Lock pip version for pyfunc and batch predictor dependencies, also add activeDeadlineSeconds to image building job

* Add nodeselector and tolerations for image building job

* Fix broken import for imagebuilder mocks

* Image building job using node pool name as node selector

* Adding jobSpec for image builder that container volumes, volumesMount, tolerations, resources and nodeSelector information

* Only make image builder tolerations, nodeselectors and maxRetry configurable

Co-authored-by: Tio Pramayudi <[email protected]>
  • Loading branch information
tiopramayudi and tiopramayudi authored Sep 3, 2021
1 parent 3fc018e commit 0308f1a
Show file tree
Hide file tree
Showing 26 changed files with 465 additions and 88 deletions.
13 changes: 10 additions & 3 deletions api/cmd/api/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ import (
"github.com/gojek/merlin/batch"
"github.com/gojek/merlin/cluster"
"github.com/gojek/merlin/config"
"github.com/gojek/merlin/imagebuilder"
"github.com/gojek/merlin/istio"
"github.com/gojek/merlin/log"
"github.com/gojek/merlin/mlp"
"github.com/gojek/merlin/models"
"github.com/gojek/merlin/pkg/imagebuilder"
"github.com/gojek/merlin/queue"
"github.com/gojek/merlin/queue/work"
"github.com/gojek/merlin/service"
Expand Down Expand Up @@ -86,7 +86,6 @@ func initMLPAPIClient(ctx context.Context, cfg config.MlpAPIConfig) mlp.APIClien
}

func initFeastCoreClient(feastCoreURL, feastAuthAudience string, enableAuth bool) core.CoreServiceClient {

dialOpts := []grpc.DialOption{grpc.WithInsecure()}
if enableAuth {
cred, err := feast.NewGoogleCredential(feastAuthAudience)
Expand Down Expand Up @@ -149,6 +148,10 @@ func initImageBuilder(cfg *config.Config, vaultClient vault.Client) (webserviceB
DockerRegistry: cfg.ImageBuilderConfig.DockerRegistry,
ContextSubPath: cfg.ImageBuilderConfig.ContextSubPath,
BuildTimeoutDuration: timeout,
KanikoImage: cfg.ImageBuilderConfig.KanikoImage,
Tolerations: cfg.ImageBuilderConfig.Tolerations,
NodeSelectors: cfg.ImageBuilderConfig.NodeSelectors,
MaximumRetry: cfg.ImageBuilderConfig.MaximumRetry,

ClusterName: cfg.ImageBuilderConfig.ClusterName,
GcpProject: cfg.ImageBuilderConfig.GcpProject,
Expand All @@ -165,6 +168,10 @@ func initImageBuilder(cfg *config.Config, vaultClient vault.Client) (webserviceB
DockerRegistry: cfg.ImageBuilderConfig.DockerRegistry,
ContextSubPath: cfg.ImageBuilderConfig.PredictionJobContextSubPath,
BuildTimeoutDuration: timeout,
KanikoImage: cfg.ImageBuilderConfig.KanikoImage,
Tolerations: cfg.ImageBuilderConfig.Tolerations,
NodeSelectors: cfg.ImageBuilderConfig.NodeSelectors,
MaximumRetry: cfg.ImageBuilderConfig.MaximumRetry,

ClusterName: cfg.ImageBuilderConfig.ClusterName,
GcpProject: cfg.ImageBuilderConfig.GcpProject,
Expand Down Expand Up @@ -506,7 +513,7 @@ func initLogService(cfg *config.Config, vaultClient vault.Client) service.LogSer
log.Panicf("unable to initialize cluster controller %v", err)
}

var clusterControllers = make(map[string]cluster.Controller)
clusterControllers := make(map[string]cluster.Controller)
clusterControllers[cfg.ImageBuilderConfig.ClusterName] = ctl

for _, env := range cfg.EnvironmentConfigs {
Expand Down
31 changes: 30 additions & 1 deletion api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"time"

"github.com/kelseyhightower/envconfig"
v1 "k8s.io/api/core/v1"

"github.com/gojek/mlp/api/pkg/instrumentation/newrelic"
"github.com/gojek/mlp/api/pkg/instrumentation/sentry"
Expand Down Expand Up @@ -111,8 +112,36 @@ type ImageBuilderConfig struct {
BuildNamespace string `envconfig:"IMG_BUILDER_NAMESPACE" default:"mlp"`
DockerRegistry string `envconfig:"IMG_BUILDER_DOCKER_REGISTRY"`
BuildTimeout string `envconfig:"IMG_BUILDER_TIMEOUT" default:"10m"`
KanikoImage string `envconfig:"IMG_BUILDER_KANIKO_IMAGE" default:"gcr.io/kaniko-project/executor:v1.6.0"`
// How long to keep the image building job resource in the Kubernetes cluster. Default: 2 days (48 hours).
Retention time.Duration `envconfig:"IMG_BUILDER_RETENTION" default:"48h"`
Retention time.Duration `envconfig:"IMG_BUILDER_RETENTION" default:"48h"`
Tolerations Tolerations `envconfig:"IMG_BUILDER_TOLERATIONS"`
NodeSelectors NodeSelectors `envconfig:"IMG_BUILDER_NODE_SELECTORS"`
MaximumRetry int32 `envconfig:"IMG_BUILDER_MAX_RETRY" default:"3"`
}

type Tolerations []v1.Toleration

func (spec *Tolerations) Decode(value string) error {
var tolerations Tolerations

if err := json.Unmarshal([]byte(value), &tolerations); err != nil {
return err
}
*spec = tolerations
return nil
}

type NodeSelectors map[string]string

func (ns *NodeSelectors) Decode(value string) error {
var nodeSelectors NodeSelectors

if err := json.Unmarshal([]byte(value), &nodeSelectors); err != nil {
return err
}
*ns = nodeSelectors
return nil
}

type VaultConfig struct {
Expand Down
14 changes: 13 additions & 1 deletion api/imagebuilder/config.go → api/pkg/imagebuilder/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@

package imagebuilder

import "time"
import (
"time"

v1 "k8s.io/api/core/v1"
)

type Config struct {
// GCS URL Containing build context
Expand All @@ -31,6 +35,14 @@ type Config struct {
DockerRegistry string
// Build timeout duration
BuildTimeoutDuration time.Duration
// Kaniko docker image
KanikoImage string
// Tolerations for Jobs Specification
Tolerations []v1.Toleration
// Node Selectors for Jobs Specification
NodeSelectors map[string]string
// Maximum number of retry of image builder job
MaximumRetry int32

// Cluster Name
ClusterName string
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,6 @@ import (
"strings"
"time"

"github.com/google/go-containerregistry/pkg/authn"
"github.com/google/go-containerregistry/pkg/name"
"github.com/google/go-containerregistry/pkg/v1/google"
"github.com/google/go-containerregistry/pkg/v1/remote"
"github.com/google/go-containerregistry/pkg/v1/remote/transport"
"github.com/pkg/errors"
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -37,6 +31,12 @@ import (
"github.com/gojek/merlin/log"
"github.com/gojek/merlin/mlp"
"github.com/gojek/merlin/models"
"github.com/google/go-containerregistry/pkg/authn"
"github.com/google/go-containerregistry/pkg/name"
"github.com/google/go-containerregistry/pkg/v1/google"
"github.com/google/go-containerregistry/pkg/v1/remote"
"github.com/google/go-containerregistry/pkg/v1/remote/transport"
"github.com/pkg/errors"
)

type ImageBuilder interface {
Expand All @@ -62,7 +62,6 @@ type imageBuilder struct {

const (
containerName = "pyfunc-image-builder"
kanikoImage = "gcr.io/kaniko-project/executor:v1.1.0"
kanikoSecretName = "kaniko-secret"
tickDurationSecond = 5

Expand All @@ -77,14 +76,11 @@ const (
var (
jobTTLSecondAfterComplete int32 = 3600 * 24 // 24 hours
jobCompletions int32 = 1
jobBackOffLimit int32 = 3
)

var (
defaultResourceRequests = v1.ResourceList{
v1.ResourceCPU: resource.MustParse("1"),
}
)
var defaultResourceRequests = v1.ResourceList{
v1.ResourceCPU: resource.MustParse("1"),
}

func newImageBuilder(kubeClient kubernetes.Interface, config Config, nameGenerator nameGenerator) ImageBuilder {
return &imageBuilder{
Expand Down Expand Up @@ -164,7 +160,6 @@ func (c *imageBuilder) GetContainers(project mlp.Project, model *models.Model, v
LabelSelector: fmt.Sprintf("job-name=%s", c.nameGenerator.generateBuilderJobName(project, model, version)),
FieldSelector: "status.phase!=Pending",
})

if err != nil {
return nil, err
}
Expand Down Expand Up @@ -285,18 +280,14 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo
kanikoPodName := c.nameGenerator.generateBuilderJobName(project, model, version)
imageRef := c.imageRef(project, model, version)

var labels = map[string]string{
labels := map[string]string{
labelTeamName: project.Team,
labelStreamName: project.Stream,
labelAppName: model.Name,
labelEnvironment: c.config.Environment,
labelOrchestratorName: "merlin",
}

for _, label := range project.Labels {
labels[fmt.Sprintf(labelUsersHeading, label.Key)] = label.Value
}

kanikoArgs := []string{
fmt.Sprintf("--dockerfile=%s", c.config.DockerfilePath),
fmt.Sprintf("--context=%s", c.config.BuildContextURL),
Expand All @@ -311,6 +302,8 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo
kanikoArgs = append(kanikoArgs, fmt.Sprintf("--context-sub-path=%s", c.config.ContextSubPath))
}

activeDeadlineSeconds := int64(c.config.BuildTimeoutDuration / time.Second)

return &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: kanikoPodName,
Expand All @@ -319,16 +312,17 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo
},
Spec: batchv1.JobSpec{
Completions: &jobCompletions,
BackoffLimit: &jobBackOffLimit,
BackoffLimit: &c.config.MaximumRetry,
TTLSecondsAfterFinished: &jobTTLSecondAfterComplete,
ActiveDeadlineSeconds: &activeDeadlineSeconds,
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
// https://stackoverflow.com/questions/54091659/kubernetes-pods-disappear-after-failed-jobs
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: containerName,
Image: kanikoImage,
Image: c.config.KanikoImage,
Args: kanikoArgs,
VolumeMounts: []v1.VolumeMount{
{
Expand Down Expand Up @@ -358,6 +352,8 @@ func (c *imageBuilder) createKanikoJobSpec(project mlp.Project, model *models.Mo
},
},
},
Tolerations: c.config.Tolerations,
NodeSelector: c.config.NodeSelectors,
},
},
},
Expand Down
Loading

0 comments on commit 0308f1a

Please sign in to comment.