Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: get node and control plane costs #324

Merged
merged 2 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cmd/agent/args/args.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ var (
argLocal = flag.Bool("local", false, "Whether you're running the operator locally.")
argProfiler = flag.Bool("profiler", false, "Enable pprof handler. By default it will be exposed on localhost:7777 under '/debug/pprof'")
argDisableResourceCache = flag.Bool("disable-resource-cache", false, "Control whether resource cache should be enabled or not.")
argEnableKubecostProxy = flag.Bool("enable-kubecost-proxy", false, "If set, will proxy a Kubecost API request through the K8s API server.")

argMaxConcurrentReconciles = flag.Int("max-concurrent-reconciles", 20, "Maximum number of concurrent reconciles which can be run.")
argResyncSeconds = flag.Int("resync-seconds", 300, "Resync duration in seconds.")
Expand Down Expand Up @@ -112,6 +113,10 @@ func DisableHelmTemplateDryRunServer() bool {
return *argDisableHelmTemplateDryRunServer
}

// EnableKubecostProxy reports the value of the "enable-kubecost-proxy" flag
// (default false). When set, Kubecost API requests are proxied through the
// Kubernetes API server.
func EnableKubecostProxy() bool {
return *argEnableKubecostProxy
}

// EnableHelmDependencyUpdate returns the current value of the
// argEnableHelmDependencyUpdate flag.
func EnableHelmDependencyUpdate() bool {
return *argEnableHelmDependencyUpdate
}
Expand Down
2 changes: 2 additions & 0 deletions cmd/agent/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ func registerKubeReconcilersOrDie(
config *rest.Config,
extConsoleClient consoleclient.Client,
discoveryClient discovery.DiscoveryInterface,
enableKubecostProxy bool,
) {

rolloutsClient, dynamicClient, kubeClient, metricsClient := initKubeClientsOrDie(config)
Expand Down Expand Up @@ -250,6 +251,7 @@ func registerKubeReconcilersOrDie(
KubeClient: kubeClient,
ExtConsoleClient: extConsoleClient,
Tasks: cmap.New[context.CancelFunc](),
Proxy: enableKubecostProxy,
}).SetupWithManager(manager); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "MetricsAggregate")
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func main() {
cache.InitGateCache(args.ControllerCacheTTL(), extConsoleClient)

registerConsoleReconcilersOrDie(consoleManager, config, kubeManager.GetClient(), kubeManager.GetScheme(), extConsoleClient)
registerKubeReconcilersOrDie(ctx, kubeManager, consoleManager, config, extConsoleClient, discoveryClient)
registerKubeReconcilersOrDie(ctx, kubeManager, consoleManager, config, extConsoleClient, discoveryClient, args.EnableKubecostProxy())

//+kubebuilder:scaffold:builder

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ require (
github.com/opencost/opencost/core v0.0.0-20241216191657-30e5d9a27f41
github.com/orcaman/concurrent-map/v2 v2.0.1
github.com/pkg/errors v0.9.1
github.com/pluralsh/console/go/client v1.25.2
github.com/pluralsh/console/go/client v1.25.3
github.com/pluralsh/controller-reconcile-helper v0.1.0
github.com/pluralsh/gophoenix v0.1.3-0.20231201014135-dff1b4309e34
github.com/pluralsh/polly v0.1.10
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1190,8 +1190,8 @@ github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjL
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pluralsh/console/go/client v1.25.2 h1:Ha/ZF5t+ilJ0MVZPeDO46tK7HyKmi74c1DIHPA2sDY0=
github.com/pluralsh/console/go/client v1.25.2/go.mod h1:lpoWASYsM9keNePS3dpFiEisUHEfObIVlSL3tzpKn8k=
github.com/pluralsh/console/go/client v1.25.3 h1:6MvNz0AuxGwH+zWQyXakMCKf/UG8YfalZBLED9pWLoU=
github.com/pluralsh/console/go/client v1.25.3/go.mod h1:lpoWASYsM9keNePS3dpFiEisUHEfObIVlSL3tzpKn8k=
github.com/pluralsh/controller-reconcile-helper v0.1.0 h1:BV3dYZFH5rn8ZvZjtpkACSv/GmLEtRftNQj/Y4ddHEo=
github.com/pluralsh/controller-reconcile-helper v0.1.0/go.mod h1:RxAbvSB4/jkvx616krCdNQXPbpGJXW3J1L3rASxeFOA=
github.com/pluralsh/gophoenix v0.1.3-0.20231201014135-dff1b4309e34 h1:ab2PN+6if/Aq3/sJM0AVdy1SYuMAnq4g20VaKhTm/Bw=
Expand Down
131 changes: 106 additions & 25 deletions internal/controller/kubecostextractor_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"encoding/json"
"fmt"
"io"
"math/rand"
"net/http"
"net/url"
"strconv"
Expand Down Expand Up @@ -50,6 +51,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"
)

// kubeCostJitter is the exclusive upper bound of the random delay that
// RunOnInterval adds to the polling interval.
const kubeCostJitter = time.Minute * 5

// kubecostResourceTypes lists the workload kinds for which
// getRecommendationAttributes requests allocation data.
var kubecostResourceTypes = []string{"deployment", "statefulset", "daemonset"}

// KubecostExtractorReconciler reconciles a KubecostExtractor object
Expand All @@ -59,6 +62,7 @@ type KubecostExtractorReconciler struct {
KubeClient kubernetes.Interface
ExtConsoleClient consoleclient.Client
Tasks cmap.ConcurrentMap[string, context.CancelFunc]
Proxy bool
}

func (r *KubecostExtractorReconciler) RunOnInterval(ctx context.Context, key string, interval time.Duration, condition wait.ConditionWithContextFunc) {
Expand All @@ -69,7 +73,7 @@ func (r *KubecostExtractorReconciler) RunOnInterval(ctx context.Context, key str
r.Tasks.Set(key, cancel)

go func() {
_ = wait.PollUntilContextCancel(ctxCancel, interval, true, condition)
_ = wait.PollUntilContextCancel(ctxCancel, interval+time.Duration(rand.Int63n(int64(kubeCostJitter))), true, condition)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this will fully work, because it keeps the jitter constant for the whole poll loop. I'd rather we sleep for the jitter amount at the top of the loop, so that it is random on each run.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you are right; I will change it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

}()
}

Expand Down Expand Up @@ -134,24 +138,21 @@ func (r *KubecostExtractorReconciler) Reconcile(ctx context.Context, req ctrl.Re
reterr = err
}
}()
clusterCostAttr, err := r.getClusterCost(ctx, kubecostService, kubecost.Spec.GetPort(), kubecost.Spec.GetInterval())
clusterCostAttr, err := r.getClusterCost(ctx, kubecostService, kubecost.Spec.GetPort())
if err != nil {
logger.Error(err, "Unable to fetch cluster cost")
utils.MarkCondition(kubecost.SetCondition, v1alpha1.ReadyConditionType, v1.ConditionFalse, v1alpha1.ErrorConditionReason, err.Error())
return false, nil
}
namespacesCostAtrr, err := r.getNamespacesCost(ctx, kubecostService, kubecost.Spec.GetPort(), kubecost.Spec.GetInterval())
namespacesCostAtrr, err := r.getNamespacesCost(ctx, kubecostService, kubecost.Spec.GetPort())
if err != nil {
logger.Error(err, "Unable to fetch namespacesCostAtrr cost")
utils.MarkCondition(kubecost.SetCondition, v1alpha1.ReadyConditionType, v1.ConditionFalse, v1alpha1.ErrorConditionReason, err.Error())
return false, nil
}

recommendations, err := r.getRecommendationAttributes(ctx, kubecostService, kubecost.Spec.GetPort(), kubecost.Spec.GetInterval(), recommendationThreshold)
recommendations, err := r.getRecommendationAttributes(ctx, kubecostService, kubecost.Spec.GetPort(), recommendationThreshold)
if err != nil {
logger.Error(err, "Unable to fetch recommendations")
// utils.MarkCondition(kubecost.SetCondition, v1alpha1.ReadyConditionType, v1.ConditionFalse, v1alpha1.ErrorConditionReason, err.Error())
// return false, nil
utils.MarkCondition(kubecost.SetCondition, v1alpha1.ReadyConditionType, v1.ConditionFalse, v1alpha1.ErrorConditionReason, err.Error())
}

// nothing for specified time window
Expand Down Expand Up @@ -193,8 +194,8 @@ func (r *KubecostExtractorReconciler) fetch(host, path string, params map[string
ResponseHeaderTimeout: 120 * time.Second,
}

client := &http.Client{Transport: tr}
resp, err := client.Get(fmt.Sprintf("http://%s%s%s", host, path, query))
httpClient := &http.Client{Transport: tr}
resp, err := httpClient.Get(fmt.Sprintf("http://%s%s%s", host, path, query))
if err != nil {
return nil, err
}
Expand All @@ -216,18 +217,25 @@ func (r *KubecostExtractorReconciler) getAllocation(ctx context.Context, srv *co
"accumulate": "true",
}

bytes, err := r.fetch(fmt.Sprintf("%s.%s:%s", srv.Name, srv.Namespace, servicePort), "/model/allocation", queryParams)
var response []byte
var err error
if r.Proxy {
response, err = r.KubeClient.CoreV1().Services(srv.Namespace).ProxyGet("", srv.Name, servicePort, "/model/allocation", queryParams).DoRaw(ctx)
} else {
response, err = r.fetch(fmt.Sprintf("%s.%s:%s", srv.Name, srv.Namespace, servicePort), "/model/allocation", queryParams)

}
if err != nil {
return nil, err
}
ar := &allocationResponse{}
if err = json.Unmarshal(bytes, ar); err != nil {
if err = json.Unmarshal(response, ar); err != nil {
return nil, err
}
return ar, nil
}

func (r *KubecostExtractorReconciler) getRecommendationAttributes(ctx context.Context, srv *corev1.Service, servicePort string, interval time.Duration, recommendationThreshold float64) ([]*console.ClusterRecommendationAttributes, error) {
func (r *KubecostExtractorReconciler) getRecommendationAttributes(ctx context.Context, srv *corev1.Service, servicePort string, recommendationThreshold float64) ([]*console.ClusterRecommendationAttributes, error) {
var result []*console.ClusterRecommendationAttributes
for _, resourceType := range kubecostResourceTypes {
ar, err := r.getAllocation(ctx, srv, servicePort, resourceType)
Expand Down Expand Up @@ -256,7 +264,7 @@ func (r *KubecostExtractorReconciler) getRecommendationAttributes(ctx context.Co
return result, nil
}

func (r *KubecostExtractorReconciler) getNamespacesCost(ctx context.Context, srv *corev1.Service, servicePort string, interval time.Duration) ([]*console.CostAttributes, error) {
func (r *KubecostExtractorReconciler) getNamespacesCost(ctx context.Context, srv *corev1.Service, servicePort string) ([]*console.CostAttributes, error) {
var result []*console.CostAttributes
ar, err := r.getAllocation(ctx, srv, servicePort, "namespace")
if err != nil {
Expand All @@ -265,15 +273,15 @@ func (r *KubecostExtractorReconciler) getNamespacesCost(ctx context.Context, srv
if ar.Code != http.StatusOK {
return nil, fmt.Errorf("unexpected status code: %d", ar.Code)
}
for _, clusterCosts := range ar.Data {
if clusterCosts == nil {
for _, namespaceCosts := range ar.Data {
if namespaceCosts == nil {
continue
}
for namespace, allocation := range clusterCosts {
for namespace, allocation := range namespaceCosts {
if namespace == opencost.IdleSuffix {
continue
}
attr := convertCostAttributes(allocation)
attr := convertCostAttributes(allocation, nil, nil)
attr.Namespace = lo.ToPtr(namespace)
result = append(result, attr)
}
Expand All @@ -282,17 +290,20 @@ func (r *KubecostExtractorReconciler) getNamespacesCost(ctx context.Context, srv
return result, nil
}

func (r *KubecostExtractorReconciler) getClusterCost(ctx context.Context, srv *corev1.Service, servicePort string, interval time.Duration) (*console.CostAttributes, error) {
bytes, err := r.fetch(fmt.Sprintf("%s.%s:%s", srv.Name, srv.Namespace, servicePort), "/model/clusterInfo", nil)
func (r *KubecostExtractorReconciler) getClusterCost(ctx context.Context, srv *corev1.Service, servicePort string) (*console.CostAttributes, error) {
controlPlaneCost, err := r.getControlPlaneCost(ctx, srv, servicePort)
if err != nil {
return nil, err
}
var resp clusterinfoResponse
err = json.Unmarshal(bytes, &resp)
nodeCost, err := r.getNodeCost(ctx, srv, servicePort)
if err != nil {
return nil, err
}

clusterID, err := r.getClusterID(ctx, srv, servicePort)
if err != nil {
return nil, err
}
ar, err := r.getAllocation(ctx, srv, servicePort, "cluster")
if err != nil {
return nil, err
Expand All @@ -304,15 +315,83 @@ func (r *KubecostExtractorReconciler) getClusterCost(ctx context.Context, srv *c
if clusterCosts == nil {
continue
}
clusterCostAllocation, ok := clusterCosts[resp.Data.ClusterID]

allocation, ok := clusterCosts[clusterID]
if ok {
return convertCostAttributes(clusterCostAllocation), nil
return convertCostAttributes(allocation, nodeCost, controlPlaneCost), nil
}
}

return nil, nil
}

// getControlPlaneCost requests the "controller" allocation via getAllocation
// and returns the total cost of the first entry keyed by
// opencost.UnallocatedSuffix.
//
// It returns an error when the request fails or the response code is not
// http.StatusOK, and (nil, nil) when no such entry is present.
func (r *KubecostExtractorReconciler) getControlPlaneCost(ctx context.Context, srv *corev1.Service, servicePort string) (*float64, error) {
	resp, err := r.getAllocation(ctx, srv, servicePort, "controller")
	switch {
	case err != nil:
		return nil, err
	case resp.Code != http.StatusOK:
		return nil, fmt.Errorf("unexpected status code: %d", resp.Code)
	}

	for _, costs := range resp.Data {
		// A lookup on a nil map simply reports "not found", so nil entries
		// need no separate guard.
		if unallocated, found := costs[opencost.UnallocatedSuffix]; found {
			return lo.ToPtr(unallocated.TotalCost()), nil
		}
	}

	return nil, nil
}

// getNodeCost requests the "node" allocation via getAllocation and adds up
// the total cost of every entry except the one named opencost.IdleSuffix.
//
// It returns an error when the request fails or the response code is not
// http.StatusOK, and (nil, nil) when the accumulated cost is not positive.
func (r *KubecostExtractorReconciler) getNodeCost(ctx context.Context, srv *corev1.Service, servicePort string) (*float64, error) {
	resp, err := r.getAllocation(ctx, srv, servicePort, "node")
	if err != nil {
		return nil, err
	}
	if status := resp.Code; status != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", status)
	}

	var sum float64
	for _, costs := range resp.Data {
		// Ranging over a nil map performs zero iterations, so nil entries are
		// skipped implicitly.
		for node, alloc := range costs {
			if node != opencost.IdleSuffix {
				sum += alloc.TotalCost()
			}
		}
	}

	if sum > 0 {
		return &sum, nil
	}
	return nil, nil
}

// getClusterID reads "/model/clusterInfo" from the given service and returns
// the cluster ID found in the decoded response.
//
// When r.Proxy is set the request goes through the Kubernetes API server's
// service proxy; otherwise it is sent to "<name>.<namespace>:<port>" via
// r.fetch. Request and JSON decoding errors are returned unchanged.
func (r *KubecostExtractorReconciler) getClusterID(ctx context.Context, srv *corev1.Service, servicePort string) (string, error) {
	const endpoint = "/model/clusterInfo"

	var (
		raw []byte
		err error
	)
	switch {
	case r.Proxy:
		raw, err = r.KubeClient.CoreV1().Services(srv.Namespace).ProxyGet("", srv.Name, servicePort, endpoint, nil).DoRaw(ctx)
	default:
		host := fmt.Sprintf("%s.%s:%s", srv.Name, srv.Namespace, servicePort)
		raw, err = r.fetch(host, endpoint, nil)
	}
	if err != nil {
		return "", err
	}

	info := clusterinfoResponse{}
	if err := json.Unmarshal(raw, &info); err != nil {
		return "", err
	}
	return info.Data.ClusterID, nil
}

func (r *KubecostExtractorReconciler) getObjectInfo(ctx context.Context, resourceType console.ScalingRecommendationType, namespace, name string) (container, serviceId *string, err error) {
gvk := schema.GroupVersionKind{
Group: "apps",
Expand Down Expand Up @@ -394,7 +473,7 @@ func (r *KubecostExtractorReconciler) convertClusterRecommendationAttributes(ctx
return result
}

func convertCostAttributes(allocation opencost.Allocation) *console.CostAttributes {
func convertCostAttributes(allocation opencost.Allocation, nodeCost, controlPlaneCost *float64) *console.CostAttributes {
attr := &console.CostAttributes{
Memory: lo.ToPtr(allocation.RAMBytes()),
CPU: lo.ToPtr(allocation.CPUCores()),
Expand All @@ -405,6 +484,8 @@ func convertCostAttributes(allocation opencost.Allocation) *console.CostAttribut
MemoryCost: lo.ToPtr(allocation.RAMCost),
GpuCost: lo.ToPtr(allocation.GPUCost),
LoadBalancerCost: lo.ToPtr(allocation.LoadBalancerCost),
ControlPlaneCost: controlPlaneCost,
NodeCost: nodeCost,
}
if allocation.GPUAllocation != nil {
attr.GpuUtil = allocation.GPUAllocation.GPUUsageAverage
Expand Down
Loading