Telegraf telemetry feedback (#628)

Azure · Oct 9, 2023 · d102719 · d102719
1 parent 205ea2e
commit d102719
Show file tree

Hide file tree

Showing 4 changed files with 231 additions and 7 deletions.
diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile
@@ -62,7 +62,7 @@ COPY --from=prom-config-validator-builder /src/prom-config-validator-builder/pro
 
 COPY ./scripts/*.sh $tmpdir/
 COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricextension/me_ds.config ./metricextension/me_ds_internal.config /usr/sbin/
-COPY ./telegraf/telegraf-prometheus-collector.conf $tmpdir/telegraf/
+COPY ./telegraf/ $tmpdir/telegraf/
 COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/
 COPY --from=fluent-bit-builder /src/out_appinsights.so $tmpdir/fluent-bit/bin/
 COPY ./react /static/react

diff --git a/otelcollector/scripts/main.sh b/otelcollector/scripts/main.sh
@@ -263,7 +263,11 @@ echo_var "PROMETHEUS_VERSION" "$PROMETHEUS_VERSION"
 
 echo "starting telegraf"
 if [ "$TELEMETRY_DISABLED" != "true" ]; then
-  /usr/bin/telegraf --config /opt/telegraf/telegraf-prometheus-collector.conf &
+  if [ "$CONTROLLER_TYPE" == "ReplicaSet" ]; then
+    /usr/bin/telegraf --config /opt/telegraf/telegraf-prometheus-collector.conf &
+  else
+    /usr/bin/telegraf --config /opt/telegraf/telegraf-prometheus-collector-ds.conf &
+  fi
   TELEGRAF_VERSION=`cat /opt/telegrafversion.txt`
   echo_var "TELEGRAF_VERSION" "$TELEGRAF_VERSION"
 fi

diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf b/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf
@@ -0,0 +1,206 @@
+# Telegraf Configuration
+#
+# Telegraf is entirely plugin driven. All metrics are gathered from the
+# declared inputs, and sent to the declared outputs.
+#
+# Plugins must be declared in here to be active.
+# To deactivate a plugin, comment out the name and any variables.
+#
+# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
+# file would generate.
+#
+# Environment variables can be used anywhere in this config file, simply prepend
+# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"),
+# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR)
+
+
+# Global tags can be specified here in key="value" format.
+[global_tags]
+  # The tags below are used solely for telemetry
+  agentversion = "$AGENT_VERSION"
+  cluster = "$customResourceId"
+  calias = "$AZMON_CLUSTER_ALIAS"
+  clabel = "$AZMON_CLUSTER_LABEL"
+  Region = "$AKSREGION"
+  computer = "$NODE_NAME"
+  nodeip = "$NODE_IP"
+  mode = "$MODE"
+  winmode = "$WINMODE"
+  macmode = "$MAC"
+  controllertype = "$CONTROLLER_TYPE"
+  defaultmetricaccountname = "$AZMON_DEFAULT_METRIC_ACCOUNT_NAME"
+  namespace = "$POD_NAMESPACE"
+  podname = "$POD_NAME"
+  ostype = "$OS_TYPE"
+  mip="$MINIMAL_INGESTION_PROFILE"
+
+# Configuration for telegraf agent
+[agent]
+  ## Default data collection interval for all inputs
+  interval = "60s"
+  ## Rounds collection interval to 'interval'
+  ## ie, if interval="10s" then always collect on :00, :10, :20, etc.
+  round_interval = true
+
+  ## Telegraf will send metrics to outputs in batches of at most
+  ## metric_batch_size metrics.
+  ## This controls the size of writes that Telegraf sends to output plugins.
+  metric_batch_size = 1000
+
+  ## For failed writes, telegraf will cache metric_buffer_limit metrics for each
+  ## output, and will flush this buffer on a successful write. Oldest metrics
+  ## are dropped first when this buffer fills.
+  ## This buffer only fills when writes fail to output plugin(s).
+  metric_buffer_limit = 10000
+
+  ## Collection jitter is used to jitter the collection by a random amount.
+  ## Each plugin will sleep for a random time within jitter before collecting.
+  ## This can be used to avoid many plugins querying things like sysfs at the
+  ## same time, which can have a measurable effect on the system.
+  collection_jitter = "0s"
+
+  ## Default flushing interval for all outputs. You shouldn't set this below
+  ## interval. Maximum flush_interval will be flush_interval + flush_jitter
+  flush_interval = "15s"
+  ## Jitter the flush interval by a random amount. This is primarily to avoid
+  ## large write spikes for users running a large number of telegraf instances.
+  ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
+  flush_jitter = "0s"
+
+  ## By default or when set to "0s", precision will be set to the same
+  ## timestamp order as the collection interval, with the maximum being 1s.
+  ##   ie, when interval = "10s", precision will be "1s"
+  ##       when interval = "250ms", precision will be "1ms"
+  ## Precision will NOT be used for service inputs. It is up to each individual
+  ## service input to set the timestamp at the appropriate precision.
+  ## Valid time units are "ns", "us" (or "µs"), "ms", "s".
+  precision = ""
+
+  ## Logging configuration:
+  ## Run telegraf with debug log messages.
+  debug = false
+  ## Run telegraf in quiet mode (error log messages only).
+  quiet = true
+  ## Specify the log file name. The empty string means to log to stderr.
+  logfile = ""
+
+  ## Override default hostname, if empty use os.Hostname()
+  #hostname = "placeholder_hostname"
+  ## If set to true, do not set the "host" tag in the telegraf agent.
+  omit_hostname = true
+
+
+###############################################################################
+#                            OUTPUT PLUGINS                                   #
+###############################################################################
+
+[[outputs.application_insights]]
+  ## Instrumentation key of the Application Insights resource.
+  instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY"
+
+  ## Timeout for closing (default: 5s).
+  # timeout = "5s"
+
+  ## Enable additional diagnostic logging.
+  # enable_diagnostic_logging = false
+
+###############################################################################
+#                            PROCESSOR PLUGINS                                #
+###############################################################################
+[[processors.converter]]
+  [processors.converter.fields]
+    float = ["*"]
+
+# Transforms tag and field values as well as measurement, tag and field names with regex pattern
+[[processors.regex]]
+  namepass = ["opentelemetry_allocator_targets"]
+
+  # Tag and field conversions are defined in separate sub-tables
+  [[processors.regex.tags]]
+    ## Tag to change, "*" will change every tag
+    key = "job_name"
+    ## Regular expression to match on a tag value
+    pattern = '(^(podmonitor|servicemonitor)\/.*)|(.*)'
+    ## Matches of the pattern will be replaced with this string.  Use ${1}
+    ## notation to use the text of the first submatch.
+    replacement = "${2}"
+
+###############################################################################
+#                            AGGREGATOR PLUGINS                               #
+###############################################################################
+[[aggregators.quantile]]
+  period = "5m"
+  drop_original = true
+  quantiles = [0.50,0.95]
+  algorithm = "t-digest"
+  compression = 100.0
+  namepass = ["otelcollector", "metricsextension"]
+
+# Keep the aggregate basicstats of each metric passing through.
+[[aggregators.basicstats]]
+  namepass = ["opentelemetry_allocator_targets"]
+
+  ## The period on which to flush & clear the aggregator.
+  period = "5m"
+
+  ## If true, the original metric will be dropped by the
+  ## aggregator and will not get sent to the output plugins.
+  drop_original = true
+
+  ## Configures which basic stats to push as fields
+  stats = ["count"]
+
+###############################################################################
+#                            INPUT PLUGINS                                    #
+###############################################################################
+
+# Read metrics about cpu usage
+#[[inputs.cpu]]
+  ## Whether to report per-cpu stats or not
+#  percpu = false
+  ## Whether to report total system cpu stats or not
+#  totalcpu = true
+  ## If true, collect raw CPU time metrics.
+#  collect_cpu_time = false
+  ## If true, compute and report the sum of all non-idle CPU states.
+#  report_active = true
+#  fieldpass = ["usage_active","cluster","node","host","device"]
+#  taginclude = ["cluster","cpu","node"]
+
+# Read metrics about memory usage
+#[[inputs.mem]]
+#  fieldpass = ["used_percent", "cluster", "node","host","device"]
+#  taginclude = ["cluster","node"]
+
+[[inputs.procstat]]
+   exe = "otelcollector"
+   interval = "10s"
+   pid_finder = "pgrep"
+   pid_tag = true
+   name_override = "otelcollector"
+   fieldpass = ["cpu_usage", "memory_rss"]
+   [inputs.procstat.tags]
+#    Computer = "$NODE_NAME"
+#    NodeIp = "$NODE_IP"
+    cpulimit = "$CONTAINER_CPU_LIMIT"
+    memlimit = "$CONTAINER_MEMORY_LIMIT" 
+    debugmodeenabled = "$DEBUG_MODE_ENABLED"
+    tadapterh="$tokenadapterHealthyAfterSecs"
+    tadapterf="$tokenadapterUnhealthyAfterSecs"
+
+[[inputs.procstat]]
+   exe = "MetricsExtension"
+   interval = "10s"
+   pid_finder = "pgrep"
+   pid_tag = true
+   name_override = "metricsextension"
+   fieldpass = ["cpu_usage", "memory_rss"]
+
+[[inputs.prometheus]]
+  interval = "5m"
+  urls = ["http://localhost:8888/metrics"]
+  fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"]
+  tagexclude = ["service_instance_id"]
+  metric_version = 2
+  url_tag = "scrapeUrl"
+  response_timeout = "15s"
diff --git a/otelcollector/telegraf/telegraf-prometheus-collector.conf b/otelcollector/telegraf/telegraf-prometheus-collector.conf
@@ -118,6 +118,23 @@
   [processors.converter.fields]
     float = ["*"]
 
+# Transforms tag and field values as well as measurement, tag and field names with regex pattern
+[[processors.regex]]
+  namepass = ["target_allocator"]
+
+  # Tag and field conversions are defined in separate sub-tables
+  [[processors.regex.tags]]
+    ## Tag to change, "*" will change every tag
+    key = "job_name"
+    ## Regular expression to match on a tag value
+    pattern = '(^(podMonitor|serviceMonitor)\/.*)|(.*)'
+    ## Matches of the pattern will be replaced with this string.  Use ${1}
+    ## notation to use the text of the first submatch.
+
+    ## The "Job" suffix is necessary; otherwise configmap jobs would not be
+    ## replaced and would keep their original names
+    replacement = "${2}Job"
+
 ###############################################################################
 #                            AGGREGATOR PLUGINS                               #
 ###############################################################################
@@ -128,7 +145,7 @@
   algorithm = "t-digest"
   compression = 100.0
   namepass = ["otelcollector", "metricsextension"]
- 
+
 ###############################################################################
 #                            INPUT PLUGINS                                    #
 ###############################################################################
@@ -214,8 +231,6 @@
   metric_version = 2
   url_tag = "scrapeUrl"
   response_timeout = "15s"
-  [inputs.prometheus.tagdrop]
-    controllertype = [ "DaemonSet"]
 
 [[inputs.prometheus]]
   interval = "5m"
@@ -224,5 +239,4 @@
   metric_version = 2
   url_tag = "scrapeUrl"
   response_timeout = "15s"
-  [inputs.prometheus.tagdrop]
-    controllertype = [ "DaemonSet"]
+  name_override = "target_allocator"