diff --git a/clusterloader2/pkg/execservice/exec_service.go b/clusterloader2/pkg/execservice/exec_service.go index 0590be98fa..d0c579db5e 100644 --- a/clusterloader2/pkg/execservice/exec_service.go +++ b/clusterloader2/pkg/execservice/exec_service.go @@ -37,7 +37,7 @@ const ( execDeploymentNamespace = "cluster-loader" execDeploymentName = "exec-pod" execDeploymentPath = "pkg/execservice/manifest/exec_deployment.yaml" - execPodReplicas = 3 + execPodReplicas = 10 execPodSelector = "feature = exec" execPodCheckInterval = 10 * time.Second diff --git a/clusterloader2/pkg/measurement/common/etcd_metrics.go b/clusterloader2/pkg/measurement/common/etcd_metrics.go index 485280801d..80a643a4f7 100644 --- a/clusterloader2/pkg/measurement/common/etcd_metrics.go +++ b/clusterloader2/pkg/measurement/common/etcd_metrics.go @@ -193,14 +193,16 @@ func (e *etcdMetricsMeasurement) getEtcdMetrics(host string, provider provider.P // in order to bypass TLS credential requirement when checking etc /metrics and /health, you // need to provide the insecure http port number to access etcd, http://localhost:2382 for // example. - cmd := fmt.Sprintf("curl http://localhost:%d/metrics", port) - if samples, err := e.sshEtcdMetrics(cmd, host, provider); err == nil { - return samples, nil - } + // cmd := fmt.Sprintf("curl http://localhost:%d/metrics", port) + // if samples, err := e.sshEtcdMetrics(cmd, host, provider); err == nil { + // return samples, nil + // } // Use old endpoint if new one fails, "2379" is hard-coded here as well, it is kept as is since // we don't want to bloat the cluster config only for a fall-back attempt. etcdCert, etcdKey, etcdHost := os.Getenv("ETCD_CERTIFICATE"), os.Getenv("ETCD_KEY"), os.Getenv("ETCD_HOST") + etcdHost = host + cmd := "" if etcdHost == "" { etcdHost = "localhost" } @@ -208,7 +210,7 @@ func (e *etcdMetricsMeasurement) getEtcdMetrics(host string, provider provider.P klog.Warning("empty etcd cert or key, using http") cmd = fmt.Sprintf("curl http://%s:2379/metrics", etcdHost) } else { - cmd = fmt.Sprintf("curl -k --cert %s --key %s https://%s:2379/metrics", etcdCert, etcdKey, etcdHost) + cmd = fmt.Sprintf("sudo curl -k --cert %s --key %s https://%s:2379/metrics", etcdCert, etcdKey, etcdHost) } return e.sshEtcdMetrics(cmd, host, provider) diff --git a/clusterloader2/pkg/measurement/common/scheduler_latency.go b/clusterloader2/pkg/measurement/common/scheduler_latency.go index 8ae9294808..52e92eef99 100644 --- a/clusterloader2/pkg/measurement/common/scheduler_latency.go +++ b/clusterloader2/pkg/measurement/common/scheduler_latency.go @@ -293,6 +293,8 @@ func (s *schedulerLatencyMeasurement) sendRequestToScheduler(c clientset.Interfa return "", fmt.Errorf("unknown REST request") } + // luwang hack for cluster deployed by tanzu + masterRegistered = false var responseText string if masterRegistered { ctx, cancel := context.WithTimeout(context.Background(), singleRestCallTimeout) diff --git a/clusterloader2/pkg/measurement/common/service_creation_latency.go b/clusterloader2/pkg/measurement/common/service_creation_latency.go index 2f59b0c2c4..f7e7f05cab 100644 --- a/clusterloader2/pkg/measurement/common/service_creation_latency.go +++ b/clusterloader2/pkg/measurement/common/service_creation_latency.go @@ -20,6 +20,8 @@ import ( "context" "fmt" "time" + "sync" + "regexp" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -31,6 +33,7 @@ import ( "k8s.io/klog" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/perf-tests/clusterloader2/pkg/errors" 
"k8s.io/perf-tests/clusterloader2/pkg/execservice" "k8s.io/perf-tests/clusterloader2/pkg/measurement" measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" @@ -48,10 +51,21 @@ const ( defaultCheckInterval = 10 * time.Second pingBackoff = 1 * time.Second pingChecks = 10 + // backendThreshold used to check whether total backend number curled from the services is + // larger than/equal to this value. + // 0 means that the testing disabled this check. + // regexString is the regex expression string, which is used to filter the main caring info from the service curl output + // regexString define depends on the container image used by services' pod. + // backendThreshold need to be enabled with regexString, align with the container image. + // example is in testing/svc/ + backendThreshold = 0 + regexString = "" creatingPhase = "creating" ipAssigningPhase = "ipAssigning" reachabilityPhase = "reachability" + startCheckingPhase = "startChecking" + consecutiveSuccCheckStartPhase = "consecutiveSuccCheckStart" ) func init() { @@ -63,21 +77,29 @@ func init() { func createServiceCreationLatencyMeasurement() measurement.Measurement { return &serviceCreationLatencyMeasurement{ selector: measurementutil.NewObjectSelector(), - queue: workerqueue.NewWorkerQueue(serviceCreationLatencyWorkers), + //queue: workerqueue.NewWorkerQueue(serviceCreationLatencyWorkers), creationTimes: measurementutil.NewObjectTransitionTimes(serviceCreationLatencyName), + svcBackends: measurementutil.NewSvcBackends(serviceCreationLatencyName), pingCheckers: checker.NewMap(), } } type serviceCreationLatencyMeasurement struct { - selector *measurementutil.ObjectSelector - waitTimeout time.Duration - stopCh chan struct{} - isRunning bool - queue workerqueue.Interface - client clientset.Interface - creationTimes *measurementutil.ObjectTransitionTimes - pingCheckers checker.Map + selector *measurementutil.ObjectSelector + waitTimeout time.Duration + stopCh chan struct{} + isRunning bool + queue workerqueue.Interface + client clientset.Interface + creationTimes *measurementutil.ObjectTransitionTimes + svcBackends *measurementutil.SvcBackends + pingCheckers checker.Map + lock sync.Mutex + succeedCheckNums int + checkerWorkers int + backendThreshold int + regexString string + regexpObj *regexp.Regexp } // Execute executes service startup latency measurement actions. 
@@ -106,6 +128,33 @@ func (s *serviceCreationLatencyMeasurement) Execute(config *measurement.Config) if err != nil { return nil, err } + s.checkerWorkers, err = util.GetIntOrDefault(config.Params, "parallel_checker_workers", serviceCreationLatencyWorkers) + if err != nil { + return nil, err + } + s.queue = workerqueue.NewWorkerQueue(s.checkerWorkers) + s.succeedCheckNums, err = util.GetIntOrDefault(config.Params, "consecutive_succeed_checks", pingChecks) + if err != nil { + return nil, err + } + s.backendThreshold, err = util.GetIntOrDefault(config.Params, "backendThreshold", backendThreshold) + if err != nil { + return nil, err + } + s.regexString, err = util.GetStringOrDefault(config.Params, "regexString", regexString) + if err != nil { + return nil, err + } + s.regexString = "Server address: ([0-9.]+):80" + if s.regexString != "" { + s.regexpObj, err = regexp.Compile(s.regexString) + if err != nil { + return nil, err + } + } else { + s.regexpObj = nil + } + return nil, s.start() case "waitForReady": return nil, s.waitForReady() @@ -123,6 +172,8 @@ func (s *serviceCreationLatencyMeasurement) Dispose() { close(s.stopCh) } s.queue.Stop() + s.lock.Lock() + defer s.lock.Unlock() s.pingCheckers.Dispose() } @@ -181,6 +232,22 @@ var serviceCreationTransitions = map[string]measurementutil.Transition{ From: phaseName(creatingPhase, corev1.ServiceTypeClusterIP), To: phaseName(reachabilityPhase, corev1.ServiceTypeClusterIP), }, + "create_to_startchecking_clusterip": { + From: phaseName(creatingPhase, corev1.ServiceTypeClusterIP), + To: phaseName(startCheckingPhase, corev1.ServiceTypeClusterIP), + }, + "startchecking_to_consecutivestart_clusterip": { + From: phaseName(startCheckingPhase, corev1.ServiceTypeClusterIP), + To: phaseName(consecutiveSuccCheckStartPhase, corev1.ServiceTypeClusterIP), + }, + "startchecking_to_available_clusterip": { + From: phaseName(startCheckingPhase, corev1.ServiceTypeClusterIP), + To: phaseName(reachabilityPhase, corev1.ServiceTypeClusterIP), + }, + "consecutivestart_to_available_clusterip": { + From: phaseName(consecutiveSuccCheckStartPhase, corev1.ServiceTypeClusterIP), + To: phaseName(reachabilityPhase, corev1.ServiceTypeClusterIP), + }, "create_to_available_nodeport": { From: phaseName(creatingPhase, corev1.ServiceTypeNodePort), To: phaseName(reachabilityPhase, corev1.ServiceTypeNodePort), @@ -200,6 +267,8 @@ var serviceCreationTransitions = map[string]measurementutil.Transition{ } func (s *serviceCreationLatencyMeasurement) gather(identifier string) ([]measurement.Summary, error) { + var summaries []measurement.Summary + klog.V(2).Infof("%s: gathering service created latency measurement...", s) if !s.isRunning { return nil, fmt.Errorf("metric %s has not been started", s) @@ -213,7 +282,32 @@ func (s *serviceCreationLatencyMeasurement) gather(identifier string) ([]measure return nil, err } summary := measurement.CreateSummary(fmt.Sprintf("%s_%s", serviceCreationLatencyName, identifier), "json", content) - return []measurement.Summary{summary}, nil + summaries = append(summaries, summary) + + if s.backendThreshold != 0 { + svcBackendNum := s.svcBackends.CalculateBackendNum() + content, err = util.PrettyPrintJSON(svcBackendNum) + if err != nil { + return nil, err + } + summary = measurement.CreateSummary(fmt.Sprintf("%s_%s_BackendNums", serviceCreationLatencyName, identifier), "json", content) + summaries = append(summaries, summary) + + if len(svcBackendNum) == 0 { + err = errors.NewMetricViolationError( + "service creation latency", + fmt.Sprintf("%s_%s can not get 
any backends following pattern %s", serviceCreationLatencyName, identifier, s.regexString)) + } + for svc, num := range svcBackendNum { + if num < s.backendThreshold { + klog.Errorf("only found %d backends for svc %s, expected at least %d backends", num, svc, s.backendThreshold) + err = errors.NewMetricViolationError( + "service creation latency", + fmt.Sprintf("some services can not get at least %d backends", s.backendThreshold)) + } + } + } + return summaries, err } func (s *serviceCreationLatencyMeasurement) handleObject(oldObj, newObj interface{}) { @@ -257,6 +351,8 @@ func (s *serviceCreationLatencyMeasurement) deleteObject(svc *corev1.Service) er if err != nil { return fmt.Errorf("meta key created error: %v", err) } + s.lock.Lock() + defer s.lock.Unlock() s.pingCheckers.DeleteAndStop(key) return nil } @@ -284,12 +380,17 @@ func (s *serviceCreationLatencyMeasurement) updateObject(svc *corev1.Service) er s.creationTimes.Set(key, phaseName(ipAssigningPhase, svc.Spec.Type), time.Now()) } pc := &pingChecker{ - callerName: s.String(), - svc: svc, - creationTimes: s.creationTimes, - stopCh: make(chan struct{}), + callerName: s.String(), + svc: svc, + creationTimes: s.creationTimes, + svcBackends: s.svcBackends, + succeedCheckNums: s.succeedCheckNums, + regexpObj: s.regexpObj, + stopCh: make(chan struct{}), } pc.run() + s.lock.Lock() + defer s.lock.Unlock() s.pingCheckers.Add(key, pc) return nil @@ -300,10 +401,13 @@ func phaseName(phase string, serviceType corev1.ServiceType) string { } type pingChecker struct { - callerName string - svc *corev1.Service - creationTimes *measurementutil.ObjectTransitionTimes - stopCh chan struct{} + callerName string + svc *corev1.Service + creationTimes *measurementutil.ObjectTransitionTimes + svcBackends *measurementutil.SvcBackends + stopCh chan struct{} + succeedCheckNums int + regexpObj *regexp.Regexp } func (p *pingChecker) run() { @@ -329,25 +433,44 @@ func (p *pingChecker) run() { time.Sleep(pingBackoff) continue } + if _, exists := p.creationTimes.Get(key, phaseName(startCheckingPhase, p.svc.Spec.Type)); !exists { + p.creationTimes.Set(key, phaseName(startCheckingPhase, p.svc.Spec.Type), time.Now()) + } + if success == 0 { + p.creationTimes.Set(key, phaseName(consecutiveSuccCheckStartPhase, p.svc.Spec.Type), time.Now()) + } + msg := "" + cmd := "" switch p.svc.Spec.Type { case corev1.ServiceTypeClusterIP: - cmd := fmt.Sprintf("curl %s:%d", p.svc.Spec.ClusterIP, p.svc.Spec.Ports[0].Port) - _, err = execservice.RunCommand(pod, cmd) + // curl parameter is https://www.mit.edu/afs.new/sipb/user/ssen/src/curl-7.11.1/docs/curl.html + // we use 3 as the value of -m, instead of the default timeout value 120s, to make the service creation time more precise + cmd = fmt.Sprintf("curl -m 3 -s -S %s:%d", p.svc.Spec.ClusterIP, p.svc.Spec.Ports[0].Port) + msg, err = execservice.RunCommand(pod, cmd) case corev1.ServiceTypeNodePort: - cmd := fmt.Sprintf("curl %s:%d", pod.Status.HostIP, p.svc.Spec.Ports[0].NodePort) - _, err = execservice.RunCommand(pod, cmd) + cmd = fmt.Sprintf("curl -m 3 -s -S %s:%d", pod.Status.HostIP, p.svc.Spec.Ports[0].NodePort) + msg, err = execservice.RunCommand(pod, cmd) case corev1.ServiceTypeLoadBalancer: - cmd := fmt.Sprintf("curl %s:%d", p.svc.Status.LoadBalancer.Ingress[0].IP, p.svc.Spec.Ports[0].Port) - _, err = execservice.RunCommand(pod, cmd) + cmd = fmt.Sprintf("curl -m 3 -s -S %s:%d", p.svc.Status.LoadBalancer.Ingress[0].IP, p.svc.Spec.Ports[0].Port) + msg, err = execservice.RunCommand(pod, cmd) } if err != nil { + 
klog.V(2).Infof("cmd %v in pod %v is error: %v", cmd, pod.Name, msg) success = 0 time.Sleep(pingBackoff) continue } + if p.regexpObj != nil { + ip := p.regexpObj.FindStringSubmatch(msg) + // [luwang-vmware] will think a more generic method to filter the user expected value + if len(ip) >= 2 { + p.svcBackends.Set(p.svc.Spec.ClusterIP, ip[1]) + } + } success++ - if success == pingChecks { + if success == p.succeedCheckNums { p.creationTimes.Set(key, phaseName(reachabilityPhase, p.svc.Spec.Type), time.Now()) + klog.V(2).Infof("%v succeed to check", key) } } } diff --git a/clusterloader2/pkg/measurement/common/system_pod_metrics.go b/clusterloader2/pkg/measurement/common/system_pod_metrics.go index 0958d517e5..1488eff706 100644 --- a/clusterloader2/pkg/measurement/common/system_pod_metrics.go +++ b/clusterloader2/pkg/measurement/common/system_pod_metrics.go @@ -130,14 +130,30 @@ func getPodMetrics(config *measurement.Config) (*systemPodsMetrics, error) { return extractMetrics(lst), nil } -func getPodList(client kubernetes.Interface) (*v1.PodList, error) { +//func getPodList(client kubernetes.Interface) (*v1.PodList, error) { +func getPodList(client kubernetes.Interface) ([]v1.Pod, error) { lst, err := client.CoreV1().Pods(systemNamespace).List(context.TODO(), metav1.ListOptions{ ResourceVersion: "0", // to read from cache }) if err != nil { return nil, err } - return lst, nil + //return lst, nil + pods_lst := lst.Items + + ns_lst := []string{ "avi-system", "tanzu-system", "tkg-system", "tkg-system-public"} + for _, ns := range ns_lst { + lst_t, err := client.CoreV1().Pods(ns).List(context.TODO(), metav1.ListOptions{ + ResourceVersion: "0", // to read from cache + }) + if err != nil { + klog.V(2).Info("failed to collect pod metrics in ns %v", ns) + continue + } + pods_lst = append(pods_lst,lst_t.Items...) 
+ } + + return pods_lst, nil } func subtractInitialRestartCounts(metrics *systemPodsMetrics, initMetrics *systemPodsMetrics) { @@ -230,11 +246,13 @@ func getThresholdOverrides(config *measurement.Config) (map[string]int, error) { return parsed, nil } -func extractMetrics(lst *v1.PodList) *systemPodsMetrics { +//func extractMetrics(lst *v1.PodList) *systemPodsMetrics { +func extractMetrics(lst []v1.Pod) *systemPodsMetrics { metrics := systemPodsMetrics{ - Pods: []podMetrics{}, - } - for _, pod := range lst.Items { + Pods: []podMetrics{}, + } + // for _, pod := range lst.Items { + for _, pod := range lst { podMetrics := podMetrics{ Containers: []containerMetrics{}, Name: pod.Name, diff --git a/clusterloader2/pkg/measurement/util/gatherers/container_resource_gatherer.go b/clusterloader2/pkg/measurement/util/gatherers/container_resource_gatherer.go index ea7055b458..a91b4db09b 100644 --- a/clusterloader2/pkg/measurement/util/gatherers/container_resource_gatherer.go +++ b/clusterloader2/pkg/measurement/util/gatherers/container_resource_gatherer.go @@ -103,10 +103,10 @@ func NewResourceUsageGatherer(c clientset.Interface, host string, port int, prov provider: provider, }) } else { - pods, err := c.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("listing pods error: %v", err) - } + // pods, err := c.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{}) + // if err != nil { + // return nil, fmt.Errorf("listing pods error: %v", err) + // } nodeList, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { @@ -121,21 +121,43 @@ func NewResourceUsageGatherer(c clientset.Interface, host string, port int, prov } nodesToConsider := make(map[string]bool) - for _, pod := range pods.Items { - if (options.Nodes == MasterAndNonDaemons) && !masterNodes.Has(pod.Spec.NodeName) && isDaemonPod(&pod) { - continue - } - for _, container := range pod.Status.InitContainerStatuses { - g.containerIDs = append(g.containerIDs, container.Name) - } - for _, container := range pod.Status.ContainerStatuses { - g.containerIDs = append(g.containerIDs, container.Name) - } - if options.Nodes == MasterAndNonDaemons { - nodesToConsider[pod.Spec.NodeName] = true + // for _, pod := range pods.Items { + // if (options.Nodes == MasterAndNonDaemons) && !masterNodes.Has(pod.Spec.NodeName) && isDaemonPod(&pod) { + // continue + // } + // for _, container := range pod.Status.InitContainerStatuses { + // g.containerIDs = append(g.containerIDs, container.Name) + // } + // for _, container := range pod.Status.ContainerStatuses { + // g.containerIDs = append(g.containerIDs, container.Name) + // } + // if options.Nodes == MasterAndNonDaemons { + // nodesToConsider[pod.Spec.NodeName] = true + // } + // } + + // hardcode for now, will change to param + ns_lst := []string{namespace, "avi-system", "tkg-system-public", "tkg-system", "tanzu-system"} + for _, ns := range ns_lst { + pods, err := c.CoreV1().Pods(ns).List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("listing pods error: %v", err) + } + for _, pod := range pods.Items { + if (options.Nodes == MasterAndNonDaemons) && !masterNodes.Has(pod.Spec.NodeName) && isDaemonPod(&pod) { + continue + } + for _, container := range pod.Status.InitContainerStatuses { + g.containerIDs = append(g.containerIDs, container.Name) + } + for _, container := range pod.Status.ContainerStatuses { + g.containerIDs = append(g.containerIDs, container.Name) + } + if options.Nodes 
== MasterAndNonDaemons { + nodesToConsider[pod.Spec.NodeName] = true + } } } - for _, node := range nodeList.Items { if options.Nodes == AllNodes || masterNodes.Has(node.Name) || nodesToConsider[node.Name] { g.workerWg.Add(1) diff --git a/clusterloader2/pkg/measurement/util/kubelet/kubelet.go b/clusterloader2/pkg/measurement/util/kubelet/kubelet.go index 3bed9a3ca3..0ccb1822eb 100644 --- a/clusterloader2/pkg/measurement/util/kubelet/kubelet.go +++ b/clusterloader2/pkg/measurement/util/kubelet/kubelet.go @@ -57,7 +57,7 @@ func GetOneTimeResourceUsageOnNode(c clientset.Interface, nodeName string, port } // Process container infos that are relevant to us. containers := containerNames() - usageMap := make(util.ResourceUsagePerContainer, len(containers)+len(summary.Node.SystemContainers)) + usageMap := make(util.ResourceUsagePerContainer, len(containers)+len(summary.Node.SystemContainers)) + 1 for _, pod := range summary.Pods { for _, container := range pod.Containers { isInteresting := false @@ -82,6 +82,17 @@ func GetOneTimeResourceUsageOnNode(c clientset.Interface, nodeName string, port usageMap[nodeName+"/"+sysContainer.Name] = usage } } + // process the nodes information + node_usage := &util.ContainerResourceUsage{ + Name: nodeName, + Timestamp: summary.Node.StartTime.Time, + CPUUsageInCores: float64(removeUint64Ptr(summary.Node.CPU.UsageNanoCores)) / 1000000000, + MemoryUsageInBytes: removeUint64Ptr(summary.Node.Memory.UsageBytes), + MemoryWorkingSetInBytes: removeUint64Ptr(summary.Node.Memory.WorkingSetBytes), + MemoryRSSInBytes: removeUint64Ptr(summary.Node.Memory.RSSBytes), + CPUInterval: 0, + } + usageMap[nodeName+"/node"] = node_usage return usageMap, nil } diff --git a/clusterloader2/pkg/measurement/util/phase_latency.go b/clusterloader2/pkg/measurement/util/phase_latency.go index e62fbefd5a..b68a5f5bac 100644 --- a/clusterloader2/pkg/measurement/util/phase_latency.go +++ b/clusterloader2/pkg/measurement/util/phase_latency.go @@ -167,3 +167,58 @@ func LatencyMapToPerfData(latency map[string]*LatencyMetric) *PerfData { } return perfData } + +type BackendSet struct { + members map[string]struct{} +} + +func (b *BackendSet) Add(m string) { + if _, exists := b.members[m]; !exists { + b.members[m] = struct{}{} + } +} + +func (b *BackendSet) Len() int { + return len(b.members) +} + +func NewBackendSet() *BackendSet { + return &BackendSet{ + members: make(map[string]struct{}), + } +} + +type SvcBackends struct { + name string + lock sync.Mutex + // map of svc_ip -> BackendSet + svcBackendsMap map[string]*BackendSet +} + +// NewSvcBackends creates new SvcBackends instance. 
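+// The returned SvcBackends records, per service ClusterIP, the set of distinct backend
+// addresses extracted from curl replies by the ServiceCreationLatency ping checkers;
+// at gather time CalculateBackendNum reports the set sizes so they can be compared
+// against the backendThreshold measurement parameter.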
+func NewSvcBackends(name string) *SvcBackends { + return &SvcBackends{ + name: name, + svcBackendsMap: make(map[string]*BackendSet), + } +} + +// Set backend_ip for a given svc +func (s *SvcBackends) Set(svc, backend string) { + s.lock.Lock() + defer s.lock.Unlock() + if _, exists := s.svcBackendsMap[svc]; !exists { + s.svcBackendsMap[svc] = NewBackendSet() + } + s.svcBackendsMap[svc].Add(backend) +} + +func (s *SvcBackends) CalculateBackendNum() map[string]int { + data := make(map[string]int) + s.lock.Lock() + defer s.lock.Unlock() + for k := range s.svcBackendsMap { + data[k] = s.svcBackendsMap[k].Len() + } + return data +} \ No newline at end of file diff --git a/clusterloader2/pkg/prometheus/manifests/0ssd-storage-class.yaml b/clusterloader2/pkg/prometheus/manifests/0ssd-storage-class.yaml index 65aa0098c9..0529126dd3 100644 --- a/clusterloader2/pkg/prometheus/manifests/0ssd-storage-class.yaml +++ b/clusterloader2/pkg/prometheus/manifests/0ssd-storage-class.yaml @@ -5,9 +5,9 @@ kind: StorageClass apiVersion: storage.k8s.io/v1 metadata: name: ssd -provisioner: {{$PROMETHEUS_STORAGE_CLASS_PROVISIONER}} +provisioner: csi.vsphere.vmware.com parameters: - type: {{$PROMETHEUS_STORAGE_CLASS_VOLUME_TYPE}} + datastoreurl: "ds:///vmfs/volumes/vsan:5249834a9f288b9b-4e35d9ee48be834d/" {{if .RetainPD}} reclaimPolicy: Retain {{end}} diff --git a/clusterloader2/pkg/prometheus/manifests/grafana-service.yaml b/clusterloader2/pkg/prometheus/manifests/grafana-service.yaml index 3acdf1e888..b36be667a7 100644 --- a/clusterloader2/pkg/prometheus/manifests/grafana-service.yaml +++ b/clusterloader2/pkg/prometheus/manifests/grafana-service.yaml @@ -6,6 +6,7 @@ metadata: name: grafana namespace: monitoring spec: + type: NodePort ports: - name: http port: 3000 diff --git a/clusterloader2/pkg/prometheus/manifests/master-ip/master-endpoints.yaml b/clusterloader2/pkg/prometheus/manifests/master-ip/master-endpoints.yaml index 3a36e10cdd..879ee0be3a 100644 --- a/clusterloader2/pkg/prometheus/manifests/master-ip/master-endpoints.yaml +++ b/clusterloader2/pkg/prometheus/manifests/master-ip/master-endpoints.yaml @@ -16,12 +16,12 @@ subsets: {{end}} ports: - name: apiserver - port: 443 + port: 6443 {{if not $PROMETHEUS_SCRAPE_APISERVER_ONLY}} - name: etcd-2379 port: 2379 - - name: etcd-2382 - port: 2382 + # - name: etcd-2382 + # port: 2382 - name: kubelet port: 10250 - name: kube-scheduler diff --git a/clusterloader2/pkg/prometheus/manifests/master-ip/master-service.yaml b/clusterloader2/pkg/prometheus/manifests/master-ip/master-service.yaml index 4a5ca7846f..854a92bae4 100644 --- a/clusterloader2/pkg/prometheus/manifests/master-ip/master-service.yaml +++ b/clusterloader2/pkg/prometheus/manifests/master-ip/master-service.yaml @@ -14,12 +14,12 @@ spec: clusterIP: None ports: - name: apiserver - port: 443 + port: 6443 {{if not $PROMETHEUS_SCRAPE_APISERVER_ONLY}} - name: etcd-2379 port: 2379 - - name: etcd-2382 - port: 2382 + # - name: etcd-2382 + # port: 2382 - name: kubelet port: 10250 - name: kube-scheduler diff --git a/clusterloader2/pkg/prometheus/manifests/master-ip/master-serviceMonitor.yaml b/clusterloader2/pkg/prometheus/manifests/master-ip/master-serviceMonitor.yaml index bc7048a40f..47457dc633 100644 --- a/clusterloader2/pkg/prometheus/manifests/master-ip/master-serviceMonitor.yaml +++ b/clusterloader2/pkg/prometheus/manifests/master-ip/master-serviceMonitor.yaml @@ -16,8 +16,13 @@ spec: {{if $PROMETHEUS_SCRAPE_ETCD}} - interval: 5s port: etcd-2379 - - interval: 5s - port: etcd-2382 + scheme: https + tlsConfig: 
+ certFile: /etc/prometheus/secrets/kube-etcd-client-certs/server.crt + insecureSkipVerify: true + keyFile: /etc/prometheus/secrets/kube-etcd-client-certs/server.key + # - interval: 5s + # port: etcd-2382 {{end}} {{if $PROMETHEUS_SCRAPE_NODE_EXPORTER}} # TODO(mborsz): Debug why node-exporter is that slow and change interval back to 5s. @@ -38,21 +43,25 @@ spec: - interval: 5s port: kubelet scheme: https + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token tlsConfig: insecureSkipVerify: true - interval: 5s port: kubelet + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token path: /metrics/cadvisor scheme: https tlsConfig: insecureSkipVerify: true - interval: 5s port: kube-scheduler + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token scheme: https tlsConfig: insecureSkipVerify: true - interval: 5s port: kube-controller-manager + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token scheme: https tlsConfig: insecureSkipVerify: true diff --git a/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml b/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml index 0632cd0be2..f93ecbbe99 100644 --- a/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml +++ b/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml @@ -33,6 +33,8 @@ spec: ruleSelector: matchLabels: prometheus: k8s + secrets: + - kube-etcd-client-certs securityContext: fsGroup: 2000 runAsNonRoot: true diff --git a/clusterloader2/pkg/prometheus/manifests/prometheus-service.yaml b/clusterloader2/pkg/prometheus/manifests/prometheus-service.yaml index 4f61e88ab7..fac63def78 100644 --- a/clusterloader2/pkg/prometheus/manifests/prometheus-service.yaml +++ b/clusterloader2/pkg/prometheus/manifests/prometheus-service.yaml @@ -6,6 +6,7 @@ metadata: name: prometheus-k8s namespace: monitoring spec: + type: NodePort ports: - name: web port: 9090 @@ -13,4 +14,4 @@ spec: selector: app: prometheus prometheus: k8s - sessionAffinity: ClientIP + # sessionAffinity: ClientIP diff --git a/clusterloader2/pkg/prometheus/prometheus.go b/clusterloader2/pkg/prometheus/prometheus.go index bed661e2a7..72370ef960 100644 --- a/clusterloader2/pkg/prometheus/prometheus.go +++ b/clusterloader2/pkg/prometheus/prometheus.go @@ -379,7 +379,7 @@ func (pc *Controller) isPrometheusReady() (bool, error) { return CheckTargetsReady( // 1 out of 2 etcd targets should be ready. pc.framework.GetClientSets().GetClient(), func(t Target) bool { return isEtcdEndpoint(t.Labels["endpoint"]) }, - 2, // expected targets: etcd-2379 and etcd-2382 + 1, // expected targets: etcd-2379 and etcd-2382 1) // one of them should be healthy } return CheckAllTargetsReady( diff --git a/clusterloader2/pkg/provider/vsphere.go b/clusterloader2/pkg/provider/vsphere.go index 941c9326b4..ceb9c846cd 100644 --- a/clusterloader2/pkg/provider/vsphere.go +++ b/clusterloader2/pkg/provider/vsphere.go @@ -61,6 +61,7 @@ func (p *VsphereProvider) RunSSHCommand(cmd, host string) (string, string, int, return "", "", 0, err } user := defaultSSHUser() + user = "capv" return sshutil.RunSSHCommand(cmd, user, host, signer) } diff --git a/clusterloader2/testing/svc/config.yaml b/clusterloader2/testing/svc/config.yaml new file mode 100644 index 0000000000..db212579b1 --- /dev/null +++ b/clusterloader2/testing/svc/config.yaml @@ -0,0 +1,442 @@ +# ASSUMPTIONS: +# - This test is designed for 100+ node cluster. 
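+#
+# Namespace layout (what the AddInt expressions in the namespaceRange blocks below
+# resolve to, given the *_NS constants): base uses namespaces 1-30, small 31-40,
+# medium 41-50, big 51-60.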
+ +#Constants +{{$POD_THROUGHPUT := DefaultParam .POD_THROUGHPUT 5}} +{{$POD_STARTUP_LATENCY_THRESHOLD := DefaultParam .POD_STARTUP_LATENCY_THRESHOLD "5s"}} +{{$OPERATION_TIMEOUT := DefaultParam .OPERATION_TIMEOUT "11m"}} +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}} +{{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}} +{{$RESTART_COUNT_THRESHOLD_OVERRIDES:= DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}} +{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}} + +#Constants +{{$CONTAINER_IMAGE := "gcr.io/eminent-nation-87317/webserver:v1"}} +{{$REGEXP_PATTERN := "Server address: ([0-9.]+):80"}} +{{$BASE_BACKEND_SIZE := 3}} +{{$SMALL_BACKEND_SIZE := 10}} +{{$MEDIUM_BACKEND_SIZE := 50}} +{{$BIG_BACKEND_SIZE:= 250}} + +{{$BASE_BACKEND_LABEL := "base"}} +{{$SMALL_BACKEND_LABEL := "small"}} +{{$MEDIUM_BACKEND_LABEL := "medium"}} +{{$BIG_BACKEND_LABEL := "big"}} + +{{$BASE_REPLICAS_NS := 30}} +{{$SMALL_REPLICAS_NS := 1}} +{{$MEDIUM_REPLICAS_NS := 1}} +{{$BIG_REPLICAS_NS := 1}} + +{{$BASE_NS := 30}} +{{$SMALL_NS := 10}} +{{$MEDIUM_NS := 10}} +{{$BIG_NS := 10}} + +{{$svcQPS := 1}} + +#Test +name: svc +namespace: + number: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS $BIG_NS}} + +tuningSets: +- name: SVCConstantQPS + qpsLoad: + averageQps: {{$svcQPS}} +- name: UniformQPS + qpsLoad: + qps: {{$POD_THROUGHPUT}} +- name: SlowQPS + qpsLoad: + qps: 1 + +steps: +- name: Start monitoring + measurements: + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + clusterOOMsIgnoredProcesses: {{$CLUSTER_OOMS_IGNORED_PROCESSES}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = svc-load + threshold: {{$POD_STARTUP_LATENCY_THRESHOLD}} + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = svc-load + operationTimeout: 120m + +- name: Create Base Deployment + phases: + - namespaceRange: + min: 1 + max: {{$BASE_NS}} + replicasPerNamespace: {{$BASE_REPLICAS_NS}} + tuningSet: UniformQPS + objectBundle: + - basename: base-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$BASE_BACKEND_SIZE}} + Image: {{$CONTAINER_IMAGE}} + +- name: Create Small Deployment + phases: + - namespaceRange: + min: {{AddInt $BASE_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS}} + replicasPerNamespace: {{$SMALL_REPLICAS_NS}} + tuningSet: SlowQPS + objectBundle: + - basename: small-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$SMALL_BACKEND_SIZE}} + Image: {{$CONTAINER_IMAGE}} + +- name: Create Medium Deployment + phases: + - namespaceRange: + min: {{AddInt $BASE_NS $SMALL_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS}} + replicasPerNamespace: {{$MEDIUM_REPLICAS_NS}} + tuningSet: SlowQPS + objectBundle: + - basename: medium-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$MEDIUM_BACKEND_SIZE}} + Image: {{$CONTAINER_IMAGE}} + +- name: Create Big Deployment + phases: + - namespaceRange: + min: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS 1}} + 
max: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS $BIG_NS}} + replicasPerNamespace: {{$BIG_REPLICAS_NS}} + tuningSet: SlowQPS + objectBundle: + - basename: big-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$BIG_BACKEND_SIZE}} + Image: {{$CONTAINER_IMAGE}} + +- name: Wait Pods Running + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + + +- name: Start Base Svc Monitor + measurements: + - Identifier: Base + Method: ServiceCreationLatency + Params: + action: start + parallel_checker_workers: 10 + consecutive_succeed_checks: {{DivideInt $BASE_BACKEND_SIZE 1}} + backendThreshold: {{DivideInt $BASE_BACKEND_SIZE 2}} + labelSelector: size = {{$BASE_BACKEND_LABEL}} + regexString: {{$REGEXP_PATTERN}} + +- name: Create Base Svc + phases: + - namespaceRange: + min: 1 + max: {{$BASE_NS}} + replicasPerNamespace: {{$BASE_REPLICAS_NS}} + tuningSet: SVCConstantQPS + objectBundle: + - basename: base-dep-svc + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: base-dep + SVCSizeLabel: {{$BASE_BACKEND_LABEL}} + +- name: Wait Base Svc Ready + measurements: + - Identifier: Base + Method: ServiceCreationLatency + Params: + action: waitForReady + +- name: Collect Base Svc Data + measurements: + - Identifier: Base + Method: ServiceCreationLatency + Params: + action: gather + +- name: Wait 2 min + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 2m + +- name: Start Small Svc Monitor + measurements: + - Identifier: Small + Method: ServiceCreationLatency + Params: + action: start + parallel_checker_workers: {{MultiplyInt $SMALL_NS $SMALL_REPLICAS_NS}} + consecutive_succeed_checks: {{MultiplyInt $SMALL_BACKEND_SIZE 2}} + backendThreshold: {{DivideInt $SMALL_BACKEND_SIZE 1}} + regexString: {{$REGEXP_PATTERN}} + labelSelector: size = {{$SMALL_BACKEND_LABEL}} + +- name: Create Small Svc + phases: + - namespaceRange: + min: {{AddInt $BASE_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS}} + replicasPerNamespace: {{$SMALL_REPLICAS_NS}} + tuningSet: SVCConstantQPS + objectBundle: + - basename: small-dep-svc + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: small-dep + SVCSizeLabel: {{$SMALL_BACKEND_LABEL}} + +- name: Wait Small Svc Ready + measurements: + - Identifier: Small + Method: ServiceCreationLatency + Params: + action: waitForReady + +- name: Collect Small Svc Data + measurements: + - Identifier: Small + Method: ServiceCreationLatency + Params: + action: gather + +- name: Wait 2 min + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 2m + +- name: Start Medium Svc + measurements: + - Identifier: Medium + Method: ServiceCreationLatency + Params: + action: start + parallel_checker_workers: {{MultiplyInt $MEDIUM_NS $MEDIUM_REPLICAS_NS}} + consecutive_succeed_checks: {{DivideInt $MEDIUM_BACKEND_SIZE 2}} + backendThreshold: {{DivideInt $MEDIUM_BACKEND_SIZE 5}} + regexString: {{$REGEXP_PATTERN}} + labelSelector: size = {{$MEDIUM_BACKEND_LABEL}} + +- name: Create Medium Svc + phases: + - namespaceRange: + min: {{AddInt $BASE_NS $SMALL_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS}} + replicasPerNamespace: {{$MEDIUM_REPLICAS_NS}} + tuningSet: SVCConstantQPS + objectBundle: + - basename: medium-dep-svc + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: medium-dep + SVCSizeLabel: {{$MEDIUM_BACKEND_LABEL}} + +- name: Wait Medium Svc Ready + measurements: + - Identifier: Medium + Method: ServiceCreationLatency + 
Params: + action: waitForReady + +- name: Collect Medium Svc Data + measurements: + - Identifier: Medium + Method: ServiceCreationLatency + Params: + action: gather + +- name: Wait 2 min + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 2m + +- name: Start Big Svc + measurements: + - Identifier: Big + Method: ServiceCreationLatency + Params: + action: start + parallel_checker_workers: {{MultiplyInt $BIG_NS $BIG_REPLICAS_NS}} + consecutive_succeed_checks: {{DivideInt $BIG_BACKEND_SIZE 5}} + backendThreshold: {{DivideInt $BIG_BACKEND_SIZE 15}} + regexString: {{$REGEXP_PATTERN}} + labelSelector: size = {{$BIG_BACKEND_LABEL}} + +- name: Create Big Svc + phases: + - namespaceRange: + min: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS $BIG_NS}} + replicasPerNamespace: {{$BIG_REPLICAS_NS}} + tuningSet: SVCConstantQPS + objectBundle: + - basename: bigall-dep-svc + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: big-dep + SVCSizeLabel: {{$BIG_BACKEND_LABEL}} + +- name: Wait Big Svc Ready + measurements: + - Identifier: Big + Method: ServiceCreationLatency + Params: + action: waitForReady + +- name: Collect Big Data + measurements: + - Identifier: Big + Method: ServiceCreationLatency + Params: + action: gather + +- name: Wait 2 min + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 2m + +- name: Delete Base Svc + phases: + - namespaceRange: + min: 1 + max: {{$BASE_NS }} + replicasPerNamespace: 0 + tuningSet: SVCConstantQPS + objectBundle: + - basename: base-dep-svc + objectTemplatePath: service.yaml + +- name: Delete Small Svc + phases: + - namespaceRange: + min: {{AddInt $BASE_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS}} + replicasPerNamespace: 0 + tuningSet: SVCConstantQPS + objectBundle: + - basename: small-dep-svc + objectTemplatePath: service.yaml + +- name: Delete Medium Svc + phases: + - namespaceRange: + min: {{AddInt $BASE_NS $SMALL_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS}} + replicasPerNamespace: 0 + tuningSet: SVCConstantQPS + objectBundle: + - basename: medium-dep-svc + objectTemplatePath: service.yaml + +- name: Delete Big Svc + phases: + - namespaceRange: + min: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS $BIG_NS}} + replicasPerNamespace: 0 + tuningSet: SVCConstantQPS + objectBundle: + - basename: big-dep-svc + objectTemplatePath: service.yaml + +- name: Delete Base Deployment + phases: + - namespaceRange: + min: 1 + max: {{$BASE_NS}} + replicasPerNamespace: 0 + tuningSet: UniformQPS + objectBundle: + - basename: base-dep + objectTemplatePath: dep.yaml + +- name: Delete Small Deployment + phases: + - namespaceRange: + min: {{AddInt $BASE_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS}} + replicasPerNamespace: 0 + tuningSet: SlowQPS + objectBundle: + - basename: small-dep + objectTemplatePath: dep.yaml + +- name: Delete Medium Deployment + phases: + - namespaceRange: + min: {{AddInt $BASE_NS $SMALL_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS}} + replicasPerNamespace: 0 + tuningSet: SlowQPS + objectBundle: + - basename: medium-dep + objectTemplatePath: dep.yaml + +- name: Delete Big Deployment + phases: + - namespaceRange: + min: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS 1}} + max: {{AddInt $BASE_NS $SMALL_NS $MEDIUM_NS $BIG_NS}} + replicasPerNamespace: 0 + tuningSet: SlowQPS + objectBundle: + - basename: big-dep + objectTemplatePath: dep.yaml + +- name: Wait Pod Disappear + measurements: + - Identifier: 
WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collect Data + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} \ No newline at end of file diff --git a/clusterloader2/testing/svc/dep.yaml b/clusterloader2/testing/svc/dep.yaml new file mode 100644 index 0000000000..e064998eab --- /dev/null +++ b/clusterloader2/testing/svc/dep.yaml @@ -0,0 +1,22 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: svc-load +spec: + replicas: {{.NumReplicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: svc-load + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: {{.Image}} + ports: + - containerPort: 8080 \ No newline at end of file diff --git a/clusterloader2/testing/svc/service.yaml b/clusterloader2/testing/svc/service.yaml new file mode 100644 index 0000000000..01d33c0408 --- /dev/null +++ b/clusterloader2/testing/svc/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} + labels: + size: {{.SVCSizeLabel}} +spec: + type: ClusterIP + selector: + name: {{.DeploymentBaseName}}-{{.Index}} + ports: + - port: 8080 + targetPort: 80 \ No newline at end of file diff --git a/clusterloader2/testing/tkgm_density/pods.yaml b/clusterloader2/testing/tkgm_density/pods.yaml new file mode 100644 index 0000000000..a31da0e2c9 --- /dev/null +++ b/clusterloader2/testing/tkgm_density/pods.yaml @@ -0,0 +1,100 @@ +# ASSUMPTIONS: +# - This test is designed for node-density testing. 
Please only label one worker node with mtyp=density + +#Constants +{{$POD_COUNT := DefaultParam .POD_COUNT 250}} +{{$POD_THROUGHPUT := DefaultParam .POD_THROUGHPUT 5}} +{{$CONTAINER_IMAGE := DefaultParam .CONTAINER_IMAGE "k8s.gcr.io/pause:3.1"}} +{{$POD_STARTUP_LATENCY_THRESHOLD := DefaultParam .POD_STARTUP_LATENCY_THRESHOLD "5s"}} +{{$OPERATION_TIMEOUT := DefaultParam .OPERATION_TIMEOUT "15m"}} + +name: pods-density +namespace: + number: {{$POD_COUNT}} +tuningSets: +- name: UniformQPS + qpsLoad: + qps: {{$POD_THROUGHPUT}} +steps: +- measurements: + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + clusterOOMsIgnoredProcesses: {{$CLUSTER_OOMS_IGNORED_PROCESSES}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 1}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = density + threshold: {{$POD_STARTUP_LATENCY_THRESHOLD}} +- measurements: + - Identifier: WaitForRunningDensityPods + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: v1 + kind: ReplicationController + labelSelector: group = density + operationTimeout: {{$OPERATION_TIMEOUT}} +- phases: + - namespaceRange: + min: 1 + max: {{$POD_COUNT}} + replicasPerNamespace: 1 + tuningSet: UniformQPS + objectBundle: + - basename: density-pod + objectTemplatePath: rc.yaml + templateFillMap: + Replicas: 1 + Group: density + Image: {{$CONTAINER_IMAGE}} +- measurements: + - Identifier: WaitForRunningDensityPods + Method: WaitForControlledPodsRunning + Params: + action: gather +- phases: + - namespaceRange: + min: 1 + max: {{$POD_COUNT}} + replicasPerNamespace: 0 + tuningSet: UniformQPS + objectBundle: + - basename: density-pod + objectTemplatePath: rc.yaml +- measurements: + - Identifier: WaitForRunningDensityPods + Method: WaitForControlledPodsRunning + Params: + action: gather +# Collect measurements +- measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} \ No newline at end of file diff --git a/clusterloader2/testing/tkgm_density/pv.yaml b/clusterloader2/testing/tkgm_density/pv.yaml new file mode 100644 index 0000000000..4f000465e8 --- /dev/null +++ b/clusterloader2/testing/tkgm_density/pv.yaml @@ -0,0 +1,106 @@ +# ASSUMPTIONS: +# - This test is designed for node-density testing. 
Please only label one worker node with mtyp=density + +#Constants +{{$POD_COUNT := DefaultParam .POD_COUNT 59}} +{{$POD_THROUGHPUT := DefaultParam .POD_THROUGHPUT 2}} +{{$CONTAINER_IMAGE := DefaultParam .CONTAINER_IMAGE "k8s.gcr.io/pause:3.1"}} +{{$POD_STARTUP_LATENCY_THRESHOLD := DefaultParam .POD_STARTUP_LATENCY_THRESHOLD "360"}} +{{$OPERATION_TIMEOUT := DefaultParam .OPERATION_TIMEOUT "60m"}} + +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}} +{{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}} +{{$RESTART_COUNT_THRESHOLD_OVERRIDES:= DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}} +{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}} + +name: pv-density +namespace: + number: {{$POD_COUNT}} +tuningSets: +- name: UniformQPS + qpsLoad: + qps: {{$POD_THROUGHPUT}} +steps: +- measurements: + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + clusterOOMsIgnoredProcesses: {{$CLUSTER_OOMS_IGNORED_PROCESSES}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 1}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = density + threshold: {{$POD_STARTUP_LATENCY_THRESHOLD}} +- measurements: + - Identifier: WaitForRunningDensitySTS + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: StatefulSet + labelSelector: group = density + operationTimeout: {{$OPERATION_TIMEOUT}} +- phases: + - namespaceRange: + min: 1 + max: {{$POD_COUNT}} + replicasPerNamespace: 1 + tuningSet: UniformQPS + objectBundle: + - basename: density-sts + objectTemplatePath: sts.yaml + templateFillMap: + Replicas: 1 + Group: density + Image: {{$CONTAINER_IMAGE}} +- measurements: + - Identifier: WaitForRunningDensitySTS + Method: WaitForControlledPodsRunning + Params: + action: gather +- phases: + - namespaceRange: + min: 1 + max: {{$POD_COUNT}} + replicasPerNamespace: 0 + tuningSet: UniformQPS + objectBundle: + - basename: density-sts + objectTemplatePath: sts.yaml +- measurements: + - Identifier: WaitForRunningDensityTS + Method: WaitForControlledPodsRunning + Params: + action: gather +# Collect measurements +- measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} \ No newline at end of file diff --git a/clusterloader2/testing/tkgm_density/rc.yaml b/clusterloader2/testing/tkgm_density/rc.yaml new file mode 100644 index 0000000000..a698325715 --- /dev/null +++ b/clusterloader2/testing/tkgm_density/rc.yaml @@ 
-0,0 +1,36 @@ +apiVersion: v1 +kind: ReplicationController +metadata: + name: {{.Name}} + labels: + group: density +spec: + replicas: {{.Replicas}} + selector: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: density + spec: + nodeSelector: + mtype: density + # Do not automount default service account, to eliminate its impact. + automountServiceAccountToken: false + containers: + - image: {{.Image}} + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/clusterloader2/testing/tkgm_density/sts.yaml b/clusterloader2/testing/tkgm_density/sts.yaml new file mode 100644 index 0000000000..31477f465d --- /dev/null +++ b/clusterloader2/testing/tkgm_density/sts.yaml @@ -0,0 +1,62 @@ +{{$HostNetworkMode := DefaultParam .CL2_USE_HOST_NETWORK_PODS false}} +{{$EnablePVs := DefaultParam .CL2_ENABLE_PVS true}} + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{.Name}} + labels: + group: density +spec: + podManagementPolicy: Parallel + selector: + matchLabels: + group: density + name: {{.Name}} + serviceName: {{.Name}} + replicas: {{.Replicas}} + template: + metadata: + labels: + group: density + name: {{.Name}} + spec: + nodeSelector: + mtype: density + hostNetwork: {{$HostNetworkMode}} + containers: + - name: {{.Name}} + image: {{.Image}} + resources: + requests: + cpu: 10m + memory: "10M" + {{if $EnablePVs}} + volumeMounts: + - name: pv + mountPath: /var/pv + {{end}} + terminationGracePeriodSeconds: 1 + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + {{if $EnablePVs}} + # NOTE: PVs created this way should be cleaned-up manually, as deleting the StatefulSet doesn't automatically delete PVs. + # To avoid deleting all the PVs at once during namespace deletion, they should be deleted explicitly via Phase. + volumeClaimTemplates: + - metadata: + name: pv + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 100Mi + {{end}} diff --git a/clusterloader2/tkgm/README.md b/clusterloader2/tkgm/README.md new file mode 100644 index 0000000000..1a579728ab --- /dev/null +++ b/clusterloader2/tkgm/README.md @@ -0,0 +1,38 @@ +# Method to run clusterloader2 on clusters deployed by tanzu in vsphere platform + +## Deploy a workload cluster by tanzu +1. need to edit vsphere ytt file(.tanzu/tkg/providers/infrastructure-vsphere/v0.7.7/ytt/base-template.yaml) to + a. add `` kube-api-qps: "100" kube-api-burst: "100" `` for kubelet, kube-scheduler and kube-controller + b. bind `` bind-address: 0.0.0.0`` to kube-scheduler and kube-controller +2. to add ``max-pods: "254"`` to kubelet, if want to test 250 pods per node +3. then create a workload cluster for testing + + +## Sync the code +1. git clone https://github.com/luwang-vmware/perf-tests.git +2. git checkout -b tkr_1.22_fips origin/tkr_1.22_fips +2. make softlink ``ln -s /src/k8s.io/perf-test`` + +## Sync the time of cluster with the hosting ESXi +1. 
Check set_time_align_esx.sh and update the vCenter IP and datacenter for your test environment
+2. ``bash set_time_align_esx.sh <vm-name-prefix>``
+
+## Retrieve the etcd certificates from one of the control plane nodes
+1. ``bash retrieve_crt.sh <control-plane-ip>``
+
+## Update the environment variables and export them
+1. Edit export_env.rc with the correct control plane IP addresses
+2. ``source export_env.rc``
+
+## Run clusterloader2
+1. cd /src/k8s.io/perf-tests; ./run-e2e.sh --testconfig=./testing/node-throughput/config.yaml --report-dir=/tmp/1 --masterip=$masterip --master-internal-ip=$masterip --enable-prometheus-server=true --tear-down-prometheus-server=false --prometheus-scrape-etcd=true --prometheus-scrape-kube-proxy=false --prometheus-scrape-node-exporter=false --prometheus-manifest-path /src/k8s.io/perf-tests/clusterloader2/pkg/prometheus/manifests/ --alsologtostderr --provider vsphere 2>&1
+
+
+## Take a Prometheus snapshot
+1. Enable the snapshot admin API: ``kubectl patch prometheus k8s -n monitoring --type merge --patch '{"spec":{"enableAdminAPI":true}}'``
+2. Get the Prometheus pod IP: ``kubectl get pods -n monitoring -o wide``
+3. Create a curl pod: ``kubectl run curl --image harbor-repo.vmware.com/relengfortkgi/appropriate/curl --restart=Never -- sleep 36000``
+4. Issue a snapshot request: ``kubectl exec -it curl -- curl -XPOST http://100.96.35.115:9090/api/v1/admin/tsdb/snapshot?skip_head=true``
+{"status":"success","data":{"name":"20210719T090905Z-13e016af8a51d965"}}
+5. Copy the snapshot out of the Prometheus pod: ``kubectl cp -n monitoring prometheus-k8s-0:/prometheus/snapshots/20210719T090905Z-13e016af8a51d965/ -c prometheus ./``
+6.
\ No newline at end of file
diff --git a/clusterloader2/tkgm/export_env.rc b/clusterloader2/tkgm/export_env.rc
new file mode 100644
index 0000000000..8dad05b833
--- /dev/null
+++ b/clusterloader2/tkgm/export_env.rc
@@ -0,0 +1,8 @@
+export ETCD_CERTIFICATE=/etc/kubernetes/pki/etcd/server.crt
+export ETCD_KEY=/etc/kubernetes/pki/etcd/server.key
+export KUBE_SSH_KEY="/root/.ssh/id_rsa"
+export KUBE_SSH_USER="capv"
+export masterip=20.20.162.112,20.20.162.115,20.20.162.116
+export CL2_ENABLE_HUGE_SERVICES=true
+export CL2_SCHEDULER_THROUGHPUT_THRESHOLD=0
+export CL2_ENABLE_DNS_PROGRAMMING=true
diff --git a/clusterloader2/tkgm/retrieve_crt.sh b/clusterloader2/tkgm/retrieve_crt.sh
new file mode 100644
index 0000000000..161620f8a5
--- /dev/null
+++ b/clusterloader2/tkgm/retrieve_crt.sh
@@ -0,0 +1,10 @@
+set -e
+
+MASTER_IP=$1
+ssh -o StrictHostKeyChecking=no capv@${MASTER_IP} "sudo cp /etc/kubernetes/pki/etcd/server.* .; sudo chmod 644 server.*;"
+ssh -o StrictHostKeyChecking=no capv@${MASTER_IP} "sudo cp /etc/kubernetes/pki/etcd/ca.* .; sudo chmod 644 ca.*;"
+scp -o StrictHostKeyChecking=no capv@${MASTER_IP}:~/server.* .
+scp -o StrictHostKeyChecking=no capv@${MASTER_IP}:~/ca.* .
+
+kubectl create ns monitoring
+kubectl create secret generic kube-etcd-client-certs --from-file=server.crt=server.crt --from-file=server.key=server.key -n monitoring
diff --git a/clusterloader2/tkgm/set_time_align_esx.sh b/clusterloader2/tkgm/set_time_align_esx.sh
new file mode 100644
index 0000000000..eb3b690682
--- /dev/null
+++ b/clusterloader2/tkgm/set_time_align_esx.sh
@@ -0,0 +1,11 @@
+export GOVC_URL='Administrator@vsphere.local:Admin!23@20.20.0.6'
+export GOVC_INSECURE=true
+PREFIX=$1
+
+
+VMS=$(govc find /tkg-dc -type m -name "${PREFIX}-*" | tr '\n' ' ')
+for VM in ${VMS[@]}
+do
+  echo $VM
+  govc vm.change -vm ${VM} -sync-time-with-host=True
+done
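For the Prometheus snapshot procedure in clusterloader2/tkgm/README.md above, the curl in step 4 can also be issued from a small Go helper, provided it runs somewhere that can reach the Prometheus pod IP (for example inside the curl pod's network). A rough sketch under those assumptions; the pod IP below is the placeholder from the README and must be replaced with the address found in step 2:

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Replace with the prometheus-k8s-0 pod IP from `kubectl get pods -n monitoring -o wide`.
	url := "http://100.96.35.115:9090/api/v1/admin/tsdb/snapshot?skip_head=true"

	resp, err := http.Post(url, "application/json", nil)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out struct {
		Status string `json:"status"`
		Data   struct {
			Name string `json:"name"` // snapshot directory name used by `kubectl cp` in step 5
		} `json:"data"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Status, out.Data.Name)
}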