Refactor config
v-zhuravlev committed Nov 13, 2023
1 parent 79b4153 · commit 784cf59
Showing 4 changed files with 135 additions and 148 deletions.
docs/node-observ-lib/README.md (18 additions, 16 deletions)

@@ -20,14 +20,15 @@ You can use observ-lib to fill in monitoring-mixin structure:
 local nodelib = import 'node-observ-lib/main.libsonnet';
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  )
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  })
   + nodelib.withConfigMixin(
     {
       // enable loki logs
@@ -51,14 +52,15 @@ local g = import './g.libsonnet';
 local nodelib = import 'node-observ-lib/main.libsonnet';
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  )
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  })
   + {
     grafana+: {
       panels+: {
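Both README snippets use the same pattern: `new()` now takes no arguments and builds everything from the defaults in `config.libsonnet`, while each `withConfigMixin` call merges a plain object over those defaults. A minimal sketch of chaining two mixins, using only option names that appear in this commit:

local nodelib = import 'node-observ-lib/main.libsonnet';

// Later mixins win on conflicting keys, so overrides can be layered.
nodelib.new()
+ nodelib.withConfigMixin({
  filteringSelector: 'job=~".*node.*"',
  uid: 'node',
})
+ nodelib.withConfigMixin({
  enableLokiLogs: true,  // optional features can be toggled in a separate mixin
})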
docs/node-observ-lib/config.libsonnet (new file, 104 additions)

@@ -0,0 +1,104 @@
+{
+
+  // any modular observability library should include as inputs:
+  // 'dashboardNamePrefix' - use as a prefix for all dashboards and (optional) rule groups
+  // 'filteringSelector' - static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules
+  // 'groupLabels' - one or more labels that can be used to identify a 'group' of instances. In simple cases, can be 'job' or 'cluster'.
+  // 'instanceLabels' - one or more labels that can be used to identify a single entity of instances. In simple cases, can be 'instance' or 'pod'.
+  // 'uid' - UID to prefix all dashboards' original uids
+
+  filteringSelector: std.get(self, 'nodeExporterSelector', default='job="node"'),
+  groupLabels: ['job'],
+  instanceLabels: ['instance'],
+  dashboardNamePrefix: 'Node exporter / ',
+  uid: 'node',
+
+  dashboardTags: [self.uid],
+
+  // Select the fstype for filesystem-related queries. If left
+  // empty, all filesystems are selected. If you have unusual
+  // filesystems you don't want to include in dashboards and
+  // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
+  fsSelector: 'fstype!=""',
+
+  // Select the mountpoint for filesystem-related queries. If left
+  // empty, all mountpoints are selected. For example if you have a
+  // special purpose tmpfs instance that has a fixed size and will
+  // always be 100% full, but you still want alerts and dashboards for
+  // other tmpfs instances, you can exclude those by mountpoint prefix
+  // like so: 'mountpoint!~"/var/lib/foo.*"'.
+  fsMountpointSelector: 'mountpoint!=""',
+
+  // Select the device for disk-related queries. If left empty, all
+  // devices are selected. If you have unusual devices you don't
+  // want to include in dashboards and alerting, you can exclude
+  // them here, e.g. 'device!="tmpfs"'.
+  diskDeviceSelector: 'device!=""',
+
+  // Some of the alerts are meant to fire if a critical failure of a
+  // node is imminent (e.g. the disk is about to run full). In a
+  // true “cloud native” setup, failures of a single node should be
+  // tolerated. Hence, even imminent failure of a single node is no
+  // reason to create a paging alert. However, in practice there are
+  // still many situations where operators like to get paged in time
+  // before a node runs out of disk space. nodeCriticalSeverity can
+  // be set to the desired severity for this kind of alert. This
+  // can even be templated to depend on labels of the node, e.g. you
+  // could make this critical for traditional database masters but
+  // just a warning for K8s nodes.
+  nodeCriticalSeverity: 'critical',
+
+  // CPU utilization (%) on which to trigger the
+  // 'NodeCPUHighUsage' alert.
+  cpuHighUsageThreshold: 90,
+  // Load average 1m (per core) on which to trigger the
+  // 'NodeSystemSaturation' alert.
+  systemSaturationPerCoreThreshold: 2,
+
+  // Available disk space (%) thresholds on which to trigger the
+  // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
+  // usage grows in a way that it is predicted to run out in 4h or 1d
+  // and if the provided thresholds have been reached right now.
+  // In some cases you'll want to adjust these, e.g. by default Kubernetes
+  // runs the image garbage collection when the disk usage reaches 85%
+  // of its available space. In that case, you'll want to reduce the
+  // critical threshold below to something like 14 or 15, otherwise
+  // the alert could fire under normal node usage.
+  fsSpaceFillingUpWarningThreshold: 40,
+  fsSpaceFillingUpCriticalThreshold: 20,
+
+  // Available disk space (%) thresholds on which to trigger the
+  // 'NodeFilesystemAlmostOutOfSpace' alerts.
+  fsSpaceAvailableWarningThreshold: 5,
+  fsSpaceAvailableCriticalThreshold: 3,
+
+  // Memory utilization (%) level on which to trigger the
+  // 'NodeMemoryHighUtilization' alert.
+  memoryHighUtilizationThreshold: 90,
+
+  // Threshold for the rate of memory major page faults to trigger the
+  // 'NodeMemoryMajorPagesFaults' alert.
+  memoryMajorPagesFaultsThreshold: 500,
+
+  // Disk IO queue level above which to trigger the
+  // 'NodeDiskIOSaturation' alert.
+  diskIOSaturationThreshold: 10,
+
+  rateInterval: '5m',
+
+  dashboardPeriod: 'now-1h',
+  dashboardTimezone: 'default',
+  dashboardRefresh: '1m',
+
+  // logs lib related
+  enableLokiLogs: false,
+  extraLogLabels: ['transport', 'unit', 'level'],
+  logsVolumeGroupBy: 'level',
+  showLogsVolume: true,
+  logsFilteringSelector: self.filteringSelector,
+  logsExtraFilters:
+    |||
+      | label_format timestamp="{{__timestamp__}}"
+      | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}`
+    |||,
+}
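Because every option above lives in a single plain object, consumers only override the keys that differ from these defaults. A sketch of a targeted override (the values are illustrative, not recommendations):

local nodelib = import 'node-observ-lib/main.libsonnet';

nodelib.new()
+ nodelib.withConfigMixin({
  // Per the comment above: Kubernetes image GC starts at 85% disk usage,
  // so lower the critical prediction threshold to avoid false pages.
  fsSpaceFillingUpCriticalThreshold: 15,
  // In a redundant fleet, a single node filling up need not page anyone.
  nodeCriticalSeverity: 'warning',
})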
docs/node-observ-lib/main.libsonnet (3 additions, 124 deletions)

@@ -1,5 +1,6 @@
 local alerts = import './alerts.libsonnet';
 local annotations = import './annotations.libsonnet';
+local config = import './config.libsonnet';
 local dashboards = import './dashboards.libsonnet';
 local datasources = import './datasources.libsonnet';
 local g = import './g.libsonnet';
@@ -15,132 +16,10 @@ local commonlib = import 'common-lib/common/main.libsonnet';
     config+: config,
   },
 
-  // any modular observability library should include as inputs:
-  // 'dashboardNamePrefix' - use as a prefix for all dashboards and (optional) rule groups
-  // 'filteringSelector' - static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules
-  // 'groupLabels' - one or more labels that can be used to identify a 'group' of instances. In simple cases, can be 'job' or 'cluster'.
-  // 'instanceLabels' - one or more labels that can be used to identify a single entity of instances. In simple cases, can be 'instance' or 'pod'.
-  // 'uid' - UID to prefix all dashboards' original uids
-
-  new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=[uid],
-    uid,
-  ): {
+  new(): {
 
     local this = self,
-    config: {
-
-      groupLabels: groupLabels,
-      instanceLabels: instanceLabels,
-
-      dashboardTags: dashboardTags,
-      uid: uid,
-      dashboardNamePrefix: dashboardNamePrefix,
-
-      // optional
-
-      // Selectors are inserted between {} in Prometheus queries.
-      // Select the metrics coming from the node exporter. Note that all
-      // the selected metrics are shown stacked on top of each other in
-      // the 'USE Method / Cluster' dashboard. Consider disabling that
-      // dashboard if mixing up all those metrics in the same dashboard
-      // doesn't make sense (e.g. because they are coming from different
-      // clusters).
-      nodeExporterSelector: filteringSelector,
-      filteringSelector: self.nodeExporterSelector,
-
-      // Select the fstype for filesystem-related queries. If left
-      // empty, all filesystems are selected. If you have unusual
-      // filesystems you don't want to include in dashboards and
-      // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
-      fsSelector: 'fstype!=""',
-
-      // Select the mountpoint for filesystem-related queries. If left
-      // empty, all mountpoints are selected. For example if you have a
-      // special purpose tmpfs instance that has a fixed size and will
-      // always be 100% full, but you still want alerts and dashboards for
-      // other tmpfs instances, you can exclude those by mountpoint prefix
-      // like so: 'mountpoint!~"/var/lib/foo.*"'.
-      fsMountpointSelector: 'mountpoint!=""',
-
-      // Select the device for disk-related queries. If left empty, all
-      // devices are selected. If you have unusual devices you don't
-      // want to include in dashboards and alerting, you can exclude
-      // them here, e.g. 'device!="tmpfs"'.
-      diskDeviceSelector: 'device!=""',
-
-      // Some of the alerts are meant to fire if a critical failure of a
-      // node is imminent (e.g. the disk is about to run full). In a
-      // true “cloud native” setup, failures of a single node should be
-      // tolerated. Hence, even imminent failure of a single node is no
-      // reason to create a paging alert. However, in practice there are
-      // still many situations where operators like to get paged in time
-      // before a node runs out of disk space. nodeCriticalSeverity can
-      // be set to the desired severity for this kind of alert. This
-      // can even be templated to depend on labels of the node, e.g. you
-      // could make this critical for traditional database masters but
-      // just a warning for K8s nodes.
-      nodeCriticalSeverity: 'critical',
-
-      // CPU utilization (%) on which to trigger the
-      // 'NodeCPUHighUsage' alert.
-      cpuHighUsageThreshold: 90,
-      // Load average 1m (per core) on which to trigger the
-      // 'NodeSystemSaturation' alert.
-      systemSaturationPerCoreThreshold: 2,
-
-      // Available disk space (%) thresholds on which to trigger the
-      // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
-      // usage grows in a way that it is predicted to run out in 4h or 1d
-      // and if the provided thresholds have been reached right now.
-      // In some cases you'll want to adjust these, e.g. by default Kubernetes
-      // runs the image garbage collection when the disk usage reaches 85%
-      // of its available space. In that case, you'll want to reduce the
-      // critical threshold below to something like 14 or 15, otherwise
-      // the alert could fire under normal node usage.
-      fsSpaceFillingUpWarningThreshold: 40,
-      fsSpaceFillingUpCriticalThreshold: 20,
-
-      // Available disk space (%) thresholds on which to trigger the
-      // 'NodeFilesystemAlmostOutOfSpace' alerts.
-      fsSpaceAvailableWarningThreshold: 5,
-      fsSpaceAvailableCriticalThreshold: 3,
-
-      // Memory utilization (%) level on which to trigger the
-      // 'NodeMemoryHighUtilization' alert.
-      memoryHighUtilizationThreshold: 90,
-
-      // Threshold for the rate of memory major page faults to trigger the
-      // 'NodeMemoryMajorPagesFaults' alert.
-      memoryMajorPagesFaultsThreshold: 500,
-
-      // Disk IO queue level above which to trigger the
-      // 'NodeDiskIOSaturation' alert.
-      diskIOSaturationThreshold: 10,
-
-      rateInterval: '5m',
-
-      dashboardPeriod: 'now-1h',
-      dashboardTimezone: 'default',
-      dashboardRefresh: '1m',
-
-      // logs lib related
-      enableLokiLogs: false,
-      extraLogLabels: ['transport', 'unit', 'level'],
-      logsVolumeGroupBy: 'level',
-      showLogsVolume: true,
-      logsFilteringSelector: self.filteringSelector,
-      logsExtraFilters:
-        |||
-          | label_format timestamp="{{__timestamp__}}"
-          | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}`
-        |||,
-    },
-
+    config: config,
     grafana: {
       variables: variables.new(this),
       targets: targets.new(this),
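The refactor leans on two small pieces of Jsonnet semantics: `config: config` points `new()` at the imported defaults, and `withConfigMixin` declares `config+: config` so that `+:` merges a mixin's fields into the existing `config` object rather than replacing it. A standalone sketch of that merge behaviour:

// Evaluates to { config: { uid: 'node', rateInterval: '10m' } }:
// the `+:` field is merged over the left-hand `config`, not substituted.
local defaults = { uid: 'node', rateInterval: '5m' };
{ config: defaults }
+ { config+: { rateInterval: '10m' } }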
docs/node-observ-lib/mixin.libsonnet (10 additions, 8 deletions)

@@ -1,15 +1,17 @@
 local g = import './g.libsonnet';
 local nodelib = import './main.libsonnet';
 
 
 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  );
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  });
 
 {
   grafanaDashboards+:: linux.grafana.dashboards,
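The exported object follows the usual monitoring-mixin convention, and the `::` on `grafanaDashboards` makes the field hidden (not rendered) yet still addressable by name. A sketch of consuming it, assuming the dashboards object is keyed by dashboard file name as in other mixins; this wrapper is hypothetical, not part of the commit:

local mixin = import './mixin.libsonnet';

// Re-export each dashboard as a visible field, e.g. for `jsonnet -m <outdir>`.
{
  [name]: mixin.grafanaDashboards[name]
  for name in std.objectFields(mixin.grafanaDashboards)
}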
