Skip to content

Commit

Permalink
Telegraf telemetry feedback (#628)
Browse files Browse the repository at this point in the history
  • Loading branch information
gracewehner authored Oct 9, 2023
1 parent 205ea2e commit d102719
Show file tree
Hide file tree
Showing 4 changed files with 231 additions and 7 deletions.
2 changes: 1 addition & 1 deletion otelcollector/build/linux/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ COPY --from=prom-config-validator-builder /src/prom-config-validator-builder/pro

COPY ./scripts/*.sh $tmpdir/
COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricextension/me_ds.config ./metricextension/me_ds_internal.config /usr/sbin/
COPY ./telegraf/telegraf-prometheus-collector.conf $tmpdir/telegraf/
COPY ./telegraf/ $tmpdir/telegraf/
COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/
COPY --from=fluent-bit-builder /src/out_appinsights.so $tmpdir/fluent-bit/bin/
COPY ./react /static/react
Expand Down
6 changes: 5 additions & 1 deletion otelcollector/scripts/main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,11 @@ echo_var "PROMETHEUS_VERSION" "$PROMETHEUS_VERSION"

echo "starting telegraf"
if [ "$TELEMETRY_DISABLED" != "true" ]; then
/usr/bin/telegraf --config /opt/telegraf/telegraf-prometheus-collector.conf &
if [ "$CONTROLLER_TYPE" == "ReplicaSet" ]; then
/usr/bin/telegraf --config /opt/telegraf/telegraf-prometheus-collector.conf &
else
/usr/bin/telegraf --config /opt/telegraf/telegraf-prometheus-collector-ds.conf &
fi
TELEGRAF_VERSION=`cat /opt/telegrafversion.txt`
echo_var "TELEGRAF_VERSION" "$TELEGRAF_VERSION"
fi
Expand Down
206 changes: 206 additions & 0 deletions otelcollector/telegraf/telegraf-prometheus-collector-ds.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
# Telegraf Configuration
#
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
#
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
#
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
#
# Environment variables can be used anywhere in this config file, simply prepend
# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"),
# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR)


# Global tags, in key = "value" form.
# Every tag below is used purely for telemetry; each value is taken from an
# environment variable. Keys are listed alphabetically. TOML keys are
# case-sensitive, so "Region" must keep its capital R.
[global_tags]
agentversion = "$AGENT_VERSION"
calias = "$AZMON_CLUSTER_ALIAS"
clabel = "$AZMON_CLUSTER_LABEL"
cluster = "$customResourceId"
computer = "$NODE_NAME"
controllertype = "$CONTROLLER_TYPE"
defaultmetricaccountname = "$AZMON_DEFAULT_METRIC_ACCOUNT_NAME"
macmode = "$MAC"
mip = "$MINIMAL_INGESTION_PROFILE"
mode = "$MODE"
namespace = "$POD_NAMESPACE"
nodeip = "$NODE_IP"
ostype = "$OS_TYPE"
podname = "$POD_NAME"
Region = "$AKSREGION"
winmode = "$WINMODE"

# Configuration for the telegraf agent itself: collection and flush timing,
# batching/buffering of writes, timestamp precision, logging and hostname
# handling. "interval" is only a default; inputs in this file set their own.
[agent]
## Default data collection interval for all inputs
interval = "60s"
## Rounds collection interval to 'interval'
## i.e., if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will send metrics to outputs in batches of at most
## metric_batch_size metrics.
## This controls the size of writes that Telegraf sends to output plugins.
metric_batch_size = 1000

## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
## This buffer only fills when writes fail to output plugin(s).
metric_buffer_limit = 10000

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"

## Default flushing interval for all outputs. You shouldn't set this below
## interval. Maximum flush_interval will be flush_interval + flush_jitter
## NOTE(review): 15s is below the 60s default interval above, although the
## procstat inputs in this file collect every 10s -- confirm this is intended.
flush_interval = "15s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## i.e., a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"

## By default or when set to "0s", precision will be set to the same
## timestamp order as the collection interval, with the maximum being 1s.
## i.e., when interval = "10s", precision will be "1s"
## when interval = "250ms", precision will be "1ms"
## Precision will NOT be used for service inputs. It is up to each individual
## service input to set the timestamp at the appropriate precision.
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
precision = ""

## Logging configuration:
## Run telegraf with debug log messages.
debug = false
## Run telegraf in quiet mode (error log messages only).
quiet = true
## Specify the log file name. The empty string means to log to stderr.
logfile = ""

## Override default hostname, if empty use os.Hostname()
#hostname = "placeholder_hostname"
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = true


###############################################################################
# OUTPUT PLUGINS #
###############################################################################

# The only output declared in this file: metrics are written to Application
# Insights. The instrumentation key is supplied through the
# TELEMETRY_APPLICATIONINSIGHTS_KEY environment variable.
[[outputs.application_insights]]
## Instrumentation key of the Application Insights resource.
instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY"

## Timeout for closing (default: 5s).
# timeout = "5s"

## Enable additional diagnostic logging.
# enable_diagnostic_logging = false

###############################################################################
# PROCESSOR PLUGINS #
###############################################################################
# Convert fields to float; the "*" glob selects every field name.
[[processors.converter]]
[processors.converter.fields]
float = ["*"]

# Transforms tag and field values as well as measurement, tag and field names with regex pattern.
# Here it rewrites the job_name tag of the opentelemetry_allocator_targets
# measurement.
# NOTE(review): none of the inputs declared in this file sets that measurement
# name (the procstat inputs are renamed to "otelcollector" and
# "metricsextension", and the prometheus input has no name_override), so this
# processor may never match anything here -- confirm whether it is still needed.
[[processors.regex]]
namepass = ["opentelemetry_allocator_targets"]

# Tag and field conversions are defined in separate sub-tables
[[processors.regex.tags]]
## Tag to change, "*" will change every tag
key = "job_name"
## Regular expression to match on a tag value.
## Capture group 2 holds "podmonitor" or "servicemonitor" when the value starts
## with one of those words followed by "/"; for any other value the trailing
## (.*) alternative matches instead and group 2 is empty.
pattern = '(^(podmonitor|servicemonitor)\/.*)|(.*)'
## Matches of the pattern will be replaced with this string. Use ${1}
## notation to use the text of the first submatch.
replacement = "${2}"

###############################################################################
# AGGREGATOR PLUGINS #
###############################################################################
# Summarise the two procstat measurements (see name_override on the
# inputs.procstat sections) as quantiles over a five-minute window.
[[aggregators.quantile]]
## Only these measurements are aggregated.
namepass = ["otelcollector", "metricsextension"]

## The period on which to flush & clear the aggregator.
period = "5m"
## The original samples are dropped; only the aggregated values go to the output.
drop_original = true

## Quantiles to report (0.50 and 0.95) and the algorithm used to compute them.
quantiles = [0.50, 0.95]
algorithm = "t-digest"
compression = 100.0

# Basic-stats aggregation for the opentelemetry_allocator_targets measurement:
# only the sample count is emitted, once per five-minute period.
# NOTE(review): no input declared in this file sets that measurement name, so
# this aggregator may never receive anything here -- confirm it is still needed.
[[aggregators.basicstats]]
## Which basic stats to push as fields.
stats = ["count"]

## How often the aggregator is flushed and cleared.
period = "5m"

## When true, the original metric is dropped by the aggregator and is not
## sent to the output plugins.
drop_original = true

## Restrict the aggregator to this measurement.
namepass = ["opentelemetry_allocator_targets"]

###############################################################################
# INPUT PLUGINS #
###############################################################################

# Read metrics about cpu usage
#[[inputs.cpu]]
## Whether to report per-cpu stats or not
# percpu = false
## Whether to report total system cpu stats or not
# totalcpu = true
## If true, collect raw CPU time metrics.
# collect_cpu_time = false
## If true, compute and report the sum of all non-idle CPU states.
# report_active = true
# fieldpass = ["usage_active","cluster","node","host","device"]
# taginclude = ["cluster","cpu","node"]

# Read metrics about memory usage
#[[inputs.mem]]
# fieldpass = ["used_percent", "cluster", "node","host","device"]
# taginclude = ["cluster","node"]

# Process stats for the otelcollector executable.
[[inputs.procstat]]
## Measurement name for these metrics; it is listed in the quantile
## aggregator's namepass.
name_override = "otelcollector"

## Find the process by executable name using pgrep and tag metrics with the PID.
pid_finder = "pgrep"
exe = "otelcollector"
pid_tag = true

## Collect every 10s instead of the agent-wide default interval.
interval = "10s"

## Only these two fields are kept.
fieldpass = ["cpu_usage", "memory_rss"]

## Additional tags for this input only, filled from environment variables.
[inputs.procstat.tags]
# Computer = "$NODE_NAME"
# NodeIp = "$NODE_IP"
cpulimit = "$CONTAINER_CPU_LIMIT"
memlimit = "$CONTAINER_MEMORY_LIMIT"
debugmodeenabled = "$DEBUG_MODE_ENABLED"
tadapterh = "$tokenadapterHealthyAfterSecs"
tadapterf = "$tokenadapterUnhealthyAfterSecs"

# Process stats for the MetricsExtension executable.
[[inputs.procstat]]
## Measurement name for these metrics; it is listed in the quantile
## aggregator's namepass.
name_override = "metricsextension"

## Find the process by executable name using pgrep and tag metrics with the PID.
pid_finder = "pgrep"
exe = "MetricsExtension"
pid_tag = true

## Collect every 10s instead of the agent-wide default interval.
interval = "10s"

## Only these two fields are kept.
fieldpass = ["cpu_usage", "memory_rss"]

# Scrape the Prometheus endpoint on localhost:8888 every five minutes.
# Presumably this is the collector's own metrics endpoint, judging by the
# otelcol_ prefix of the fields kept below -- confirm.
[[inputs.prometheus]]
urls = ["http://localhost:8888/metrics"]
interval = "5m"
response_timeout = "15s"
metric_version = 2

## The scraped URL is recorded under this tag name.
url_tag = "scrapeUrl"
## This tag is removed from every metric.
tagexclude = ["service_instance_id"]

## Only the fields listed here are kept.
fieldpass = [
  "otelcol_processor_dropped_metric_points",
  "otelcol_receiver_refused_metric_points",
  "otelcol_receiver_accepted_metric_points",
  "otelcol_exporter_sent_metric_points",
  "otelcol_exporter_queue_size",
  "otelcol_exporter_send_failed_metric_points",
  "otelcol_process_memory_rss",
  "otelcol_processor_batch_batch_send_size_bytes_sum",
  "otelcol_processor_batch_batch_send_size_bytes_count",
]
24 changes: 19 additions & 5 deletions otelcollector/telegraf/telegraf-prometheus-collector.conf
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,23 @@
[processors.converter.fields]
float = ["*"]

# Transforms tag and field values as well as measurement, tag and field names with regex pattern
[[processors.regex]]
namepass = ["target_allocator"]

# Tag and field conversions defined in a separate sub-tables
[[processors.regex.tags]]
## Tag to change, "*" will change every tag
key = "job_name"
## Regular expression to match on a tag value
pattern = '(^(podMonitor|serviceMonitor)\/.*)|(.*)'
## Matches of the pattern will be replaced with this string. Use ${1}
## notation to use the text of the first submatch.

## "Job" is necessary or else configmap jobs would not be replaced and would keep their
## original name
replacement = "${2}Job"

###############################################################################
# AGGREGATOR PLUGINS #
###############################################################################
Expand All @@ -128,7 +145,7 @@
algorithm = "t-digest"
compression = 100.0
namepass = ["otelcollector", "metricsextension"]

###############################################################################
# INPUT PLUGINS #
###############################################################################
Expand Down Expand Up @@ -214,8 +231,6 @@
metric_version = 2
url_tag = "scrapeUrl"
response_timeout = "15s"
[inputs.prometheus.tagdrop]
controllertype = [ "DaemonSet"]

[[inputs.prometheus]]
interval = "5m"
Expand All @@ -224,5 +239,4 @@
metric_version = 2
url_tag = "scrapeUrl"
response_timeout = "15s"
[inputs.prometheus.tagdrop]
controllertype = [ "DaemonSet"]
name_override = "target_allocator"

0 comments on commit d102719

Please sign in to comment.