Refactor config
v-zhuravlev committed Nov 13, 2023
1 parent 79b4153 · commit 784cf59
Showing 4 changed files with 135 additions and 148 deletions.
docs/node-observ-lib/README.md (18 additions, 16 deletions)

@@ -20,14 +20,15 @@ You can use observ-lib to fill in monitoring-mixin structure:
 local nodelib = import 'node-observ-lib/main.libsonnet';
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  )
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  })
   + nodelib.withConfigMixin(
     {
       // enable loki logs
@@ -51,14 +52,15 @@ local g = import './g.libsonnet';
 local nodelib = import 'node-observ-lib/main.libsonnet';
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  )
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  })
   + {
     grafana+: {
       panels+: {
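Both README snippets use the same pattern: `new()` now takes no arguments and builds everything from the defaults in `config.libsonnet`, while each `withConfigMixin` call merges a plain object over those defaults. A minimal sketch of chaining two mixins, using only option names that appear in this commit:

local nodelib = import 'node-observ-lib/main.libsonnet';

// Later mixins win on conflicting keys, so overrides can be layered.
nodelib.new()
+ nodelib.withConfigMixin({
  filteringSelector: 'job=~".*node.*"',
  uid: 'node',
})
+ nodelib.withConfigMixin({
  enableLokiLogs: true,  // optional features can be toggled in a separate mixin
})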
docs/node-observ-lib/config.libsonnet (new file, 104 additions)

@@ -0,0 +1,104 @@
+{
+
+  // any modular observability library should include as inputs:
+  // 'dashboardNamePrefix' - use as a prefix for all dashboards and (optional) rule groups
+  // 'filteringSelector' - static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules
+  // 'groupLabels' - one or more labels that can be used to identify a 'group' of instances. In simple cases, can be 'job' or 'cluster'.
+  // 'instanceLabels' - one or more labels that can be used to identify a single entity of instances. In simple cases, can be 'instance' or 'pod'.
+  // 'uid' - UID to prefix all dashboards' original uids
+
+  filteringSelector: std.get(self, 'nodeExporterSelector', default='job="node"'),
+  groupLabels: ['job'],
+  instanceLabels: ['instance'],
+  dashboardNamePrefix: 'Node exporter / ',
+  uid: 'node',
+
+  dashboardTags: [self.uid],
+
+  // Select the fstype for filesystem-related queries. If left
+  // empty, all filesystems are selected. If you have unusual
+  // filesystems you don't want to include in dashboards and
+  // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
+  fsSelector: 'fstype!=""',
+
+  // Select the mountpoint for filesystem-related queries. If left
+  // empty, all mountpoints are selected. For example if you have a
+  // special purpose tmpfs instance that has a fixed size and will
+  // always be 100% full, but you still want alerts and dashboards for
+  // other tmpfs instances, you can exclude those by mountpoint prefix
+  // like so: 'mountpoint!~"/var/lib/foo.*"'.
+  fsMountpointSelector: 'mountpoint!=""',
+
+  // Select the device for disk-related queries. If left empty, all
+  // devices are selected. If you have unusual devices you don't
+  // want to include in dashboards and alerting, you can exclude
+  // them here, e.g. 'device!="tmpfs"'.
+  diskDeviceSelector: 'device!=""',
+
+  // Some of the alerts are meant to fire if a critical failure of a
+  // node is imminent (e.g. the disk is about to run full). In a
+  // true “cloud native” setup, failures of a single node should be
+  // tolerated. Hence, even imminent failure of a single node is no
+  // reason to create a paging alert. However, in practice there are
+  // still many situations where operators like to get paged in time
+  // before a node runs out of disk space. nodeCriticalSeverity can
+  // be set to the desired severity for this kind of alert. This
+  // can even be templated to depend on labels of the node, e.g. you
+  // could make this critical for traditional database masters but
+  // just a warning for K8s nodes.
+  nodeCriticalSeverity: 'critical',
+
+  // CPU utilization (%) on which to trigger the
+  // 'NodeCPUHighUsage' alert.
+  cpuHighUsageThreshold: 90,
+  // Load average 1m (per core) on which to trigger the
+  // 'NodeSystemSaturation' alert.
+  systemSaturationPerCoreThreshold: 2,
+
+  // Available disk space (%) thresholds on which to trigger the
+  // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
+  // usage grows in a way that it is predicted to run out in 4h or 1d
+  // and if the provided thresholds have been reached right now.
+  // In some cases you'll want to adjust these, e.g. by default Kubernetes
+  // runs the image garbage collection when the disk usage reaches 85%
+  // of its available space. In that case, you'll want to reduce the
+  // critical threshold below to something like 14 or 15, otherwise
+  // the alert could fire under normal node usage.
+  fsSpaceFillingUpWarningThreshold: 40,
+  fsSpaceFillingUpCriticalThreshold: 20,
+
+  // Available disk space (%) thresholds on which to trigger the
+  // 'NodeFilesystemAlmostOutOfSpace' alerts.
+  fsSpaceAvailableWarningThreshold: 5,
+  fsSpaceAvailableCriticalThreshold: 3,
+
+  // Memory utilization (%) level on which to trigger the
+  // 'NodeMemoryHighUtilization' alert.
+  memoryHighUtilizationThreshold: 90,
+
+  // Threshold for the rate of memory major page faults to trigger the
+  // 'NodeMemoryMajorPagesFaults' alert.
+  memoryMajorPagesFaultsThreshold: 500,
+
+  // Disk IO queue level above which to trigger the
+  // 'NodeDiskIOSaturation' alert.
+  diskIOSaturationThreshold: 10,
+
+  rateInterval: '5m',
+
+  dashboardPeriod: 'now-1h',
+  dashboardTimezone: 'default',
+  dashboardRefresh: '1m',
+
+  // logs lib related
+  enableLokiLogs: false,
+  extraLogLabels: ['transport', 'unit', 'level'],
+  logsVolumeGroupBy: 'level',
+  showLogsVolume: true,
+  logsFilteringSelector: self.filteringSelector,
+  logsExtraFilters:
+    |||
+      | label_format timestamp="{{__timestamp__}}"
+      | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}`
+    |||,
+}
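Because every option above lives in a single plain object, consumers only override the keys that differ from these defaults. A sketch of a targeted override (the values are illustrative, not recommendations):

local nodelib = import 'node-observ-lib/main.libsonnet';

nodelib.new()
+ nodelib.withConfigMixin({
  // Per the comment above: Kubernetes image GC starts at 85% disk usage,
  // so lower the critical prediction threshold to avoid false pages.
  fsSpaceFillingUpCriticalThreshold: 15,
  // In a redundant fleet, a single node filling up need not page anyone.
  nodeCriticalSeverity: 'warning',
})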
docs/node-observ-lib/main.libsonnet (3 additions, 124 deletions)

@@ -1,5 +1,6 @@
 local alerts = import './alerts.libsonnet';
 local annotations = import './annotations.libsonnet';
+local config = import './config.libsonnet';
 local dashboards = import './dashboards.libsonnet';
 local datasources = import './datasources.libsonnet';
 local g = import './g.libsonnet';
@@ -15,132 +16,10 @@ local commonlib = import 'common-lib/common/main.libsonnet';
     config+: config,
   },
 
-  // any modular observability library should include as inputs:
-  // 'dashboardNamePrefix' - use as a prefix for all dashboards and (optional) rule groups
-  // 'filteringSelector' - static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules
-  // 'groupLabels' - one or more labels that can be used to identify a 'group' of instances. In simple cases, can be 'job' or 'cluster'.
-  // 'instanceLabels' - one or more labels that can be used to identify a single entity of instances. In simple cases, can be 'instance' or 'pod'.
-  // 'uid' - UID to prefix all dashboards' original uids
-
-  new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=[uid],
-    uid,
-  ): {
+  new(): {
 
     local this = self,
-    config: {
-
-      groupLabels: groupLabels,
-      instanceLabels: instanceLabels,
-
-      dashboardTags: dashboardTags,
-      uid: uid,
-      dashboardNamePrefix: dashboardNamePrefix,
-
-      // optional
-
-      // Selectors are inserted between {} in Prometheus queries.
-      // Select the metrics coming from the node exporter. Note that all
-      // the selected metrics are shown stacked on top of each other in
-      // the 'USE Method / Cluster' dashboard. Consider disabling that
-      // dashboard if mixing up all those metrics in the same dashboard
-      // doesn't make sense (e.g. because they are coming from different
-      // clusters).
-      nodeExporterSelector: filteringSelector,
-      filteringSelector: self.nodeExporterSelector,
-
-      // Select the fstype for filesystem-related queries. If left
-      // empty, all filesystems are selected. If you have unusual
-      // filesystems you don't want to include in dashboards and
-      // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
-      fsSelector: 'fstype!=""',
-
-      // Select the mountpoint for filesystem-related queries. If left
-      // empty, all mountpoints are selected. For example if you have a
-      // special purpose tmpfs instance that has a fixed size and will
-      // always be 100% full, but you still want alerts and dashboards for
-      // other tmpfs instances, you can exclude those by mountpoint prefix
-      // like so: 'mountpoint!~"/var/lib/foo.*"'.
-      fsMountpointSelector: 'mountpoint!=""',
-
-      // Select the device for disk-related queries. If left empty, all
-      // devices are selected. If you have unusual devices you don't
-      // want to include in dashboards and alerting, you can exclude
-      // them here, e.g. 'device!="tmpfs"'.
-      diskDeviceSelector: 'device!=""',
-
-      // Some of the alerts are meant to fire if a critical failure of a
-      // node is imminent (e.g. the disk is about to run full). In a
-      // true “cloud native” setup, failures of a single node should be
-      // tolerated. Hence, even imminent failure of a single node is no
-      // reason to create a paging alert. However, in practice there are
-      // still many situations where operators like to get paged in time
-      // before a node runs out of disk space. nodeCriticalSeverity can
-      // be set to the desired severity for this kind of alert. This
-      // can even be templated to depend on labels of the node, e.g. you
-      // could make this critical for traditional database masters but
-      // just a warning for K8s nodes.
-      nodeCriticalSeverity: 'critical',
-
-      // CPU utilization (%) on which to trigger the
-      // 'NodeCPUHighUsage' alert.
-      cpuHighUsageThreshold: 90,
-      // Load average 1m (per core) on which to trigger the
-      // 'NodeSystemSaturation' alert.
-      systemSaturationPerCoreThreshold: 2,
-
-      // Available disk space (%) thresholds on which to trigger the
-      // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
-      // usage grows in a way that it is predicted to run out in 4h or 1d
-      // and if the provided thresholds have been reached right now.
-      // In some cases you'll want to adjust these, e.g. by default Kubernetes
-      // runs the image garbage collection when the disk usage reaches 85%
-      // of its available space. In that case, you'll want to reduce the
-      // critical threshold below to something like 14 or 15, otherwise
-      // the alert could fire under normal node usage.
-      fsSpaceFillingUpWarningThreshold: 40,
-      fsSpaceFillingUpCriticalThreshold: 20,
-
-      // Available disk space (%) thresholds on which to trigger the
-      // 'NodeFilesystemAlmostOutOfSpace' alerts.
-      fsSpaceAvailableWarningThreshold: 5,
-      fsSpaceAvailableCriticalThreshold: 3,
-
-      // Memory utilization (%) level on which to trigger the
-      // 'NodeMemoryHighUtilization' alert.
-      memoryHighUtilizationThreshold: 90,
-
-      // Threshold for the rate of memory major page faults to trigger the
-      // 'NodeMemoryMajorPagesFaults' alert.
-      memoryMajorPagesFaultsThreshold: 500,
-
-      // Disk IO queue level above which to trigger the
-      // 'NodeDiskIOSaturation' alert.
-      diskIOSaturationThreshold: 10,
-
-      rateInterval: '5m',
-
-      dashboardPeriod: 'now-1h',
-      dashboardTimezone: 'default',
-      dashboardRefresh: '1m',
-
-      // logs lib related
-      enableLokiLogs: false,
-      extraLogLabels: ['transport', 'unit', 'level'],
-      logsVolumeGroupBy: 'level',
-      showLogsVolume: true,
-      logsFilteringSelector: self.filteringSelector,
-      logsExtraFilters:
-        |||
-          | label_format timestamp="{{__timestamp__}}"
-          | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}`
-        |||,
-    },
-
+    config: config,
     grafana: {
       variables: variables.new(this),
       targets: targets.new(this),
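The refactor leans on two small pieces of Jsonnet semantics: `config: config` points `new()` at the imported defaults, and `withConfigMixin` declares `config+: config` so that `+:` merges a mixin's fields into the existing `config` object rather than replacing it. A standalone sketch of that merge behaviour:

// Evaluates to { config: { uid: 'node', rateInterval: '10m' } }:
// the `+:` field is merged over the left-hand `config`, not substituted.
local defaults = { uid: 'node', rateInterval: '5m' };
{ config: defaults }
+ { config+: { rateInterval: '10m' } }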
docs/node-observ-lib/mixin.libsonnet (10 additions, 8 deletions)

@@ -1,15 +1,17 @@
 local g = import './g.libsonnet';
 local nodelib = import './main.libsonnet';
 
 
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  );
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  });
 
 {
   grafanaDashboards+:: linux.grafana.dashboards,
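The exported object follows the usual monitoring-mixin convention, and the `::` on `grafanaDashboards` makes the field hidden (not rendered) yet still addressable by name. A sketch of consuming it, assuming the dashboards object is keyed by dashboard file name as in other mixins; this wrapper is hypothetical, not part of the commit:

local mixin = import './mixin.libsonnet';

// Re-export each dashboard as a visible field, e.g. for `jsonnet -m <outdir>`.
{
  [name]: mixin.grafanaDashboards[name]
  for name in std.objectFields(mixin.grafanaDashboards)
}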
