diff --git a/docs/node-observ-lib/README.md b/docs/node-observ-lib/README.md
index ef80c5e528..106e05c840 100644
--- a/docs/node-observ-lib/README.md
+++ b/docs/node-observ-lib/README.md
@@ -20,14 +20,15 @@ You can use observ-lib to fill in monitoring-mixin structure:
 local nodelib = import 'node-observ-lib/main.libsonnet';
 
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  )
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  })
   + nodelib.withConfigMixin(
     {
       // enable loki logs
@@ -51,14 +52,15 @@ local g = import './g.libsonnet';
 local nodelib = import 'node-observ-lib/main.libsonnet';
 
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  )
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  })
   + {
       grafana+: {
         panels+: {
diff --git a/docs/node-observ-lib/config.libsonnet b/docs/node-observ-lib/config.libsonnet
new file mode 100644
index 0000000000..b136c492ef
--- /dev/null
+++ b/docs/node-observ-lib/config.libsonnet
@@ -0,0 +1,104 @@
+{
+
+  // any modular observability library should include as inputs:
+  // 'dashboardNamePrefix' - Use as a prefix for all dashboards and (optional) rule groups
+  // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules.
+  // 'groupLabels' - one or more labels that can be used to identify a 'group' of instances. In simple cases, can be 'job' or 'cluster'.
+  // 'instanceLabels' - one or more labels that can be used to identify a single instance. In simple cases, can be 'instance' or 'pod'.
+  // 'uid' - UID to prefix all dashboards' original uids
+
+  filteringSelector: std.get(self, 'nodeExporterSelector', default='job="node"'),
+  groupLabels: ['job'],
+  instanceLabels: ['instance'],
+  dashboardNamePrefix: 'Node exporter / ',
+  uid: 'node',
+
+  dashboardTags: [self.uid],
+
+  // Select the fstype for filesystem-related queries. If left
+  // empty, all filesystems are selected. If you have unusual
+  // filesystems you don't want to include in dashboards and
+  // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
+  fsSelector: 'fstype!=""',
+
+  // Select the mountpoint for filesystem-related queries. If left
+  // empty, all mountpoints are selected. For example if you have a
+  // special purpose tmpfs instance that has a fixed size and will
+  // always be 100% full, but you still want alerts and dashboards for
+  // other tmpfs instances, you can exclude those by mountpoint prefix
+  // like so: 'mountpoint!~"/var/lib/foo.*"'.
+  fsMountpointSelector: 'mountpoint!=""',
+
+  // Select the device for disk-related queries. If left empty, all
+  // devices are selected. If you have unusual devices you don't
+  // want to include in dashboards and alerting, you can exclude
+  // them here, e.g. 'device!="tmpfs"'.
+  diskDeviceSelector: 'device!=""',
+
+  // Some of the alerts are meant to fire if a critical failure of a
+  // node is imminent (e.g. the disk is about to run full). In a
+  // true “cloud native” setup, failures of a single node should be
+  // tolerated. Hence, even imminent failure of a single node is no
+  // reason to create a paging alert. However, in practice there are
+  // still many situations where operators like to get paged in time
+  // before a node runs out of disk space. nodeCriticalSeverity can
+  // be set to the desired severity for this kind of alert. This
+  // can even be templated to depend on labels of the node, e.g. you
+  // could make this critical for traditional database masters but
+  // just a warning for K8s nodes.
+  nodeCriticalSeverity: 'critical',
+
+  // CPU utilization (%) on which to trigger the
+  // 'NodeCPUHighUsage' alert.
+  cpuHighUsageThreshold: 90,
+  // Load average 1m (per core) on which to trigger the
+  // 'NodeSystemSaturation' alert.
+  systemSaturationPerCoreThreshold: 2,
+
+  // Available disk space (%) thresholds on which to trigger the
+  // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
+  // usage grows in a way that it is predicted to run out in 4h or 1d
+  // and if the provided thresholds have been reached right now.
+  // In some cases you'll want to adjust these, e.g. by default Kubernetes
+  // runs the image garbage collection when the disk usage reaches 85%
+  // of its available space. In that case, you'll want to reduce the
+  // critical threshold below to something like 14 or 15, otherwise
+  // the alert could fire under normal node usage.
+  fsSpaceFillingUpWarningThreshold: 40,
+  fsSpaceFillingUpCriticalThreshold: 20,
+
+  // Available disk space (%) thresholds on which to trigger the
+  // 'NodeFilesystemAlmostOutOfSpace' alerts.
+  fsSpaceAvailableWarningThreshold: 5,
+  fsSpaceAvailableCriticalThreshold: 3,
+
+  // Memory utilization (%) level on which to trigger the
+  // 'NodeMemoryHighUtilization' alert.
+  memoryHighUtilizationThreshold: 90,
+
+  // Threshold for the rate of memory major page faults to trigger the
+  // 'NodeMemoryMajorPagesFaults' alert.
+  memoryMajorPagesFaultsThreshold: 500,
+
+  // Disk IO queue level above which to trigger the
+  // 'NodeDiskIOSaturation' alert.
+  diskIOSaturationThreshold: 10,
+
+  rateInterval: '5m',
+
+  dashboardPeriod: 'now-1h',
+  dashboardTimezone: 'default',
+  dashboardRefresh: '1m',
+
+  // logs lib related
+  enableLokiLogs: false,
+  extraLogLabels: ['transport', 'unit', 'level'],
+  logsVolumeGroupBy: 'level',
+  showLogsVolume: true,
+  logsFilteringSelector: self.filteringSelector,
+  logsExtraFilters:
+    |||
+      | label_format timestamp="{{__timestamp__}}"
+      | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}`
+    |||,
+}
diff --git a/docs/node-observ-lib/main.libsonnet b/docs/node-observ-lib/main.libsonnet
index 688c7d2450..0fb13f70d2 100644
--- a/docs/node-observ-lib/main.libsonnet
+++ b/docs/node-observ-lib/main.libsonnet
@@ -1,5 +1,6 @@
 local alerts = import './alerts.libsonnet';
 local annotations = import './annotations.libsonnet';
+local config = import './config.libsonnet';
 local dashboards = import './dashboards.libsonnet';
 local datasources = import './datasources.libsonnet';
 local g = import './g.libsonnet';
@@ -15,132 +16,10 @@ local commonlib = import 'common-lib/common/main.libsonnet';
     config+: config,
   },
 
-  // any modular observability library should inlcude as inputs:
-  // 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups
-  // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules.
-  // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'.
-  // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'.
-  // 'uid' - UID to prefix all dashboards original uids
-
-  new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=[uid],
-    uid,
-  ): {
+  new(): {
     local this = self,
 
-    config: {
-
-      groupLabels: groupLabels,
-      instanceLabels: instanceLabels,
-
-      dashboardTags: dashboardTags,
-      uid: uid,
-      dashboardNamePrefix: dashboardNamePrefix,
-
-      // optional
-
-      // Selectors are inserted between {} in Prometheus queries.
-      // Select the metrics coming from the node exporter. Note that all
-      // the selected metrics are shown stacked on top of each other in
-      // the 'USE Method / Cluster' dashboard. Consider disabling that
-      // dashboard if mixing up all those metrics in the same dashboard
-      // doesn't make sense (e.g. because they are coming from different
-      // clusters).
-      nodeExporterSelector: filteringSelector,
-      filteringSelector: self.nodeExporterSelector,
-
-      // Select the fstype for filesystem-related queries. If left
-      // empty, all filesystems are selected. If you have unusual
-      // filesystem you don't want to include in dashboards and
-      // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
-      fsSelector: 'fstype!=""',
-
-      // Select the mountpoint for filesystem-related queries. If left
-      // empty, all mountpoints are selected. For example if you have a
-      // special purpose tmpfs instance that has a fixed size and will
-      // always be 100% full, but you still want alerts and dashboards for
-      // other tmpfs instances, you can exclude those by mountpoint prefix
-      // like so: 'mountpoint!~"/var/lib/foo.*"'.
-      fsMountpointSelector: 'mountpoint!=""',
-
-      // Select the device for disk-related queries. If left empty, all
-      // devices are selected. If you have unusual devices you don't
-      // want to include in dashboards and alerting, you can exclude
-      // them here, e.g. 'device!="tmpfs"'.
-      diskDeviceSelector: 'device!=""',
-
-      // Some of the alerts are meant to fire if a critical failure of a
-      // node is imminent (e.g. the disk is about to run full). In a
-      // true “cloud native” setup, failures of a single node should be
-      // tolerated. Hence, even imminent failure of a single node is no
-      // reason to create a paging alert. However, in practice there are
-      // still many situations where operators like to get paged in time
-      // before a node runs out of disk space. nodeCriticalSeverity can
-      // be set to the desired severity for this kind of alerts. This
-      // can even be templated to depend on labels of the node, e.g. you
-      // could make this critical for traditional database masters but
-      // just a warning for K8s nodes.
-      nodeCriticalSeverity: 'critical',
-
-      // CPU utilization (%) on which to trigger the
-      // 'NodeCPUHighUsage' alert.
-      cpuHighUsageThreshold: 90,
-      // Load average 1m (per core) on which to trigger the
-      // 'NodeSystemSaturation' alert.
-      systemSaturationPerCoreThreshold: 2,
-
-      // Available disk space (%) thresholds on which to trigger the
-      // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
-      // usage grows in a way that it is predicted to run out in 4h or 1d
-      // and if the provided thresholds have been reached right now.
-      // In some cases you'll want to adjust these, e.g. by default Kubernetes
-      // runs the image garbage collection when the disk usage reaches 85%
-      // of its available space. In that case, you'll want to reduce the
-      // critical threshold below to something like 14 or 15, otherwise
-      // the alert could fire under normal node usage.
-      fsSpaceFillingUpWarningThreshold: 40,
-      fsSpaceFillingUpCriticalThreshold: 20,
-
-      // Available disk space (%) thresholds on which to trigger the
-      // 'NodeFilesystemAlmostOutOfSpace' alerts.
-      fsSpaceAvailableWarningThreshold: 5,
-      fsSpaceAvailableCriticalThreshold: 3,
-
-      // Memory utilzation (%) level on which to trigger the
-      // 'NodeMemoryHighUtilization' alert.
-      memoryHighUtilizationThreshold: 90,
-
-      // Threshold for the rate of memory major page faults to trigger
-      // 'NodeMemoryMajorPagesFaults' alert.
-      memoryMajorPagesFaultsThreshold: 500,
-
-      // Disk IO queue level above which to trigger
-      // 'NodeDiskIOSaturation' alert.
-      diskIOSaturationThreshold: 10,
-
-      rateInterval: '5m',
-
-      dashboardPeriod: 'now-1h',
-      dashboardTimezone: 'default',
-      dashboardRefresh: '1m',
-
-      // logs lib related
-      enableLokiLogs: false,
-      extraLogLabels: ['transport', 'unit', 'level'],
-      logsVolumeGroupBy: 'level',
-      showLogsVolume: true,
-      logsFilteringSelector: self.filteringSelector,
-      logsExtraFilters:
-        |||
-          | label_format timestamp="{{__timestamp__}}"
-          | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}`
-        |||,
-    },
-
+    config: config,
     grafana: {
       variables: variables.new(this),
       targets: targets.new(this),
diff --git a/docs/node-observ-lib/mixin.libsonnet b/docs/node-observ-lib/mixin.libsonnet
index 6c4b3eae6d..f902699f3e 100644
--- a/docs/node-observ-lib/mixin.libsonnet
+++ b/docs/node-observ-lib/mixin.libsonnet
@@ -1,15 +1,17 @@
 local g = import './g.libsonnet';
 local nodelib = import './main.libsonnet';
+
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  );
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  });
 
 {
   grafanaDashboards+:: linux.grafana.dashboards,
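For context, a minimal consumption sketch (not part of the diff above): with the defaults now centralized in config.libsonnet, any of its keys can be overridden through the same withConfigMixin(config) helper shown in the main.libsonnet context, which merges the supplied object into this.config. The identity fields reuse the values from mixin.libsonnet above; the optional overrides (nodeCriticalSeverity, fsSelector, enableLokiLogs) are keys defined in the new config.libsonnet, and the values chosen for them here are purely illustrative.

```jsonnet
// Hypothetical consumer file; override values below are illustrative only.
local nodelib = import 'node-observ-lib/main.libsonnet';

local linux =
  nodelib.new()
  + nodelib.withConfigMixin({
    // required identity settings, same values as mixin.libsonnet above
    filteringSelector: 'job=~".*node.*"',
    groupLabels: ['job'],
    instanceLabels: ['instance'],
    dashboardNamePrefix: 'Node exporter / ',
    dashboardTags: ['node-exporter-mixin'],
    uid: 'node',
  })
  + nodelib.withConfigMixin({
    // any key from config.libsonnet can be overridden the same way
    nodeCriticalSeverity: 'warning',  // warn instead of page on single-node disk pressure
    fsSelector: 'fstype!="tmpfs"',    // exclude tmpfs from filesystem panels and alerts
    enableLokiLogs: true,             // add the Loki logs dashboard
  });

{
  grafanaDashboards+:: linux.grafana.dashboards,
}
```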