make check-marathon-apps not to fail on first faulty app (#24)

* make check-marathon-apps not to fail on first faulty app * address PR comments
sensu-plugins · Sep 11, 2018 · 70f1716 · 70f1716
1 parent aaba919
commit 70f1716
Show file tree

Hide file tree

Showing 5 changed files with 146 additions and 47 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 This CHANGELOG follows the format listed [here](https://github.com/sensu-plugins/community/blob/master/HOW_WE_CHANGELOG.md)
 
 ## [Unreleased]
+### Fixed
+- check-marathon-apps.rb: script should not fail on first faulty result (@bergerx)
+
+### Added
+- check-marathon-apps.rb: introduced `check-config-overrides` flag (@bergerx)
 
 ## [2.4.0] - 2018-03-20
 ### Changed

diff --git a/README.md b/README.md
@@ -38,7 +38,10 @@ another check result for the apps `status`.
 Check results can be customised by two ways:
 
 1. Default check result fields that are applied to all will be provided by a
-   default check config. Please see th esource code to see the whole defaults.
+   default check config. Please see the source code to see the whole defaults.
+   Since the whole default check config tends to be big, you can also use
+   `check-config-overrides` flag just to provide a few new fields or override
+   existing defaults.
 2. Application owners can override check results by using marathon labels. This
    allows each application to have different fields in the published result.
    e.g. per app escalation or aggregate can be controlled by applying Marathon

diff --git a/bin/check-marathon-apps.rb b/bin/check-marathon-apps.rb
@@ -172,6 +172,13 @@ class MarathonAppsCheck < Sensu::Plugin::Check::CLI
          description: 'Similar to `--default-check-config` but read from given file. If both parameters are provided  '\
                       '`--default-check-config` will override this one.'
 
+  # Partial check config, given as a JSON string (default '{}'), that #run
+  # parses and merges on top of the default check config with Hash#merge.
+  # NOTE(review): Hash#merge is shallow, so overriding a top-level key
+  # presumably replaces the whole nested default under it -- confirm this is
+  # the intended behaviour.
+  option :check_config_overrides,
+         long: '--check-config-overrides CHECK_CONFIG_OVERRIDES',
+         description: 'Instead of providing whole default-check-config if you just want to introduce some new fields '\
+                      'to the check config without having to provide whole config, this will be merged to the '\
+                      'default-check-config.',
+         default: '{}'
+
   option :sensu_client_url,
          description: 'Sensu client HTTP URL',
          long: '--sensu-client-url url',
@@ -203,57 +210,78 @@ def run
                        else
                          DEFAULT_CHECK_CONFIG
                        end
-    check_config = parse_json(check_config_str)
+    default_check_config = parse_json(check_config_str)
+    check_config_overrides = parse_json(config[:check_config_overrides])
+    check_config = default_check_config.merge(check_config_overrides)
 
     # Filter apps, if both exists exclude pattern will override match pattern
     apps.keep_if { |app| app['id'][/#{config[:match_pattern]}/] } if config[:match_pattern]
     apps.delete_if { |app| app['id'][/#{config[:exclude_pat]}/] } if config[:exclude_pat]
 
+    failed_apps_to_be_reported = 0
     apps.each do |app|
-      # Select app queue if any
-      app_queue = queue.select { |q| q['app']['id'][/^#{app['id']}$/] }.to_a.first
-
-      # Build check result
-      check_result = check_result_scaffold(app)
-
-      # Parse Marathon app labels
-      labels_config = parse_app_labels(app['labels'].to_h)
-
-      REFERENCES.each do |reference|
-        # / is and invalid character
-        check_result['name'] = "check_marathon_app#{app['id'].tr('/', '_')}_#{reference}"
-
-        state = case reference
-                when 'health'
-                  get_marathon_app_health(app)
-                when 'status'
-                  get_marathon_app_status(app, app_queue.to_h)
-                end
-
-        # Merge user provided check config
-        check_result.merge!(check_config.dig('_').to_h)
-        check_result.merge!(check_config.dig(reference, '_').to_h)
-        check_result.merge!(check_config.dig(reference, state).to_h)
-
-        # Merge Marathon parsed check config
-        check_result.merge!(labels_config.dig('_').to_h)
-        check_result.merge!(labels_config.dig(reference, '_').to_h)
-        check_result.merge!(labels_config.dig(reference, state).to_h)
-
-        # Build check result output
-        check_result['output'] = "#{reference.upcase} #{state.capitalize} - "\
-          "tasksRunning(#{app['tasksRunning'].to_i}), tasksStaged(#{app['tasksStaged'].to_i}), "\
-          "tasksHealthy(#{app['tasksHealthy'].to_i}), tasksUnhealthy(#{app['tasksUnhealthy'].to_i})"
-
-        # Make sure that check result data types are correct
-        enforce_sensu_field_types(check_result)
-
-        # Send the result to sensu-client HTTP socket
-        post_check_result(check_result)
-      end
+      failed_apps_to_be_reported += 1 unless process_app_results(app, queue, check_config)
+    end
+
+    if failed_apps_to_be_reported > 0
+      critical "#{failed_apps_to_be_reported} apps are failed to be reported to sensu"
+    else
+      ok 'Marathon Apps Status and Health check is running properly'
     end
+  end
+
+  def process_app_results(app, queue, check_config)
+    app_result_pushed = true
+
+    # Select app queue if any
+    app_queue = queue.select { |q| q['app']['id'][/^#{app['id']}$/] }.to_a.first
+
+    # Build check result
+    check_result = check_result_scaffold(app)
 
-    ok 'Marathon Apps Status and Health check is running properly'
+    # Parse Marathon app labels
+    labels_config = parse_app_labels(app['labels'].to_h)
+
+    REFERENCES.each do |reference|
+      # / is and invalid character
+      check_result['name'] = "check_marathon_app#{app['id'].tr('/', '_')}_#{reference}"
+
+      state = case reference
+              when 'health'
+                get_marathon_app_health(app)
+              when 'status'
+                get_marathon_app_status(app, app_queue.to_h)
+              end
+
+      # Merge user provided check config
+      check_result.merge!(check_config.dig('_').to_h)
+      check_result.merge!(check_config.dig(reference, '_').to_h)
+      check_result.merge!(check_config.dig(reference, state).to_h)
+
+      # Merge Marathon parsed check config
+      check_result.merge!(labels_config.dig('_').to_h)
+      check_result.merge!(labels_config.dig(reference, '_').to_h)
+      check_result.merge!(labels_config.dig(reference, state).to_h)
+
+      # Build check result output
+      check_result['output'] = "#{reference.upcase} #{state.capitalize} - "\
+        "tasksRunning(#{app['tasksRunning'].to_i}), tasksStaged(#{app['tasksStaged'].to_i}), "\
+        "tasksHealthy(#{app['tasksHealthy'].to_i}), tasksUnhealthy(#{app['tasksUnhealthy'].to_i})"
+
+      # Make sure that check result data types are correct
+      enforce_sensu_field_types(check_result)
+
+      # Send the result to sensu-client HTTP socket
+      app_result = post_check_result(check_result)
+
+      # mark if result cant be posted to sensu
+      app_result_pushed = if app_result_pushed && app_result
+                            true
+                          else
+                            false
+                          end
+    end
+    app_result_pushed
   end
 
   def check_result_scaffold(app)
@@ -311,8 +339,11 @@ def post_check_result(data)
                     data.to_json,
                     content_type: 'application/json',
                     timeout: config[:timeout])
+    true
   rescue RestClient::ExceptionWithResponse => e
-    critical "Error while trying to POST check result (#{config[:sensu_client_url]}/results): #{e.response}"
+    # print a message about failing POST but keep going
+    STDERR.puts "Error while trying to POST check result for #{data} (#{config[:sensu_client_url]}/results): #{e.response}"
+    false
   end
 
   def parse_json(json)

diff --git a/test/check_marathon_apps_spec.rb b/test/check_marathon_apps_spec.rb
@@ -24,10 +24,21 @@ def fetch_queue(*)
   end
 
   def post_check_result(res)
-    @check_results.push(res.dup)
+    # Simulate a failure from the sensu agent for the non-compliant fixture app
+    # by returning false without recording the result; every other result is
+    # recorded and reported as posted (true). See the overridden method in
+    # MarathonAppsCheck.
+    if res['name'] =~ /non-sensu-compliant-test/
+      false
+    else
+      @check_results.push(res.dup)
+      true
+    end
   end
 
   def ok(*); end # no-op stub: accepts any arguments and does nothing
+
+  # Test override of `critical`: records the CRITICAL status and prints the
+  # message through #output.
+  # NOTE(review): presumably this keeps the spec from terminating the way the
+  # stock Sensu plugin handler would -- confirm.
+  def critical(*args)
+    @status = 'CRITICAL'
+    output(*args)
+  end
 end
 
 describe 'MarathonTaskCheck' do
@@ -43,7 +54,8 @@ def ok(*); end
 
   describe '#run' do
     it 'tests multiple applications with different states' do
-      @check.run
+      expect { @check.run }.to output("CheckMarathonApps CRITICAL: 1 apps are failed to be reported to sensu\n").to_stdout
+
       expect(@check.check_results).to contain_hash_with_keys(
         'name' => 'check_marathon_app_sensu-test_health',
         'output' => 'HEALTH Unknown - tasksRunning(1), tasksStaged(0), tasksHealthy(0), tasksUnhealthy(0)',

diff --git a/test/fixtures/marathon_apps_with_embeds.json b/test/fixtures/marathon_apps_with_embeds.json
@@ -589,5 +589,53 @@
       "slaveId": "4c5cca36-8774-429e-a912-eaaad2765219-S1"
     },
     "taskStats": {}
+  },
+  {
+    "id": "/non-sensu-compliant-test",
+    "backoffFactor": 10,
+    "backoffSeconds": 10,
+    "cmd": "sleep 1000000",
+    "container": {
+      "type": "MESOS",
+      "volumes": []
+    },
+    "cpus": 0.1,
+    "disk": 0,
+    "executor": "",
+    "instances": 1,
+    "labels": {
+      "MARATHON_SENSU_AGGREGATE": "aggregate with space is not valid for sensu"
+    },
+    "maxLaunchDelaySeconds": 36000000,
+    "mem": 128,
+    "gpus": 0,
+    "networks": [
+      {
+        "mode": "host"
+      }
+    ],
+    "portDefinitions": [],
+    "requirePorts": false,
+    "upgradeStrategy": {
+      "maximumOverCapacity": 1,
+      "minimumHealthCapacity": 1
+    },
+    "version": "2018-03-08T14:47:25.3Z",
+    "versionInfo": {
+      "lastScalingAt": "2018-03-08T14:47:25.3Z",
+      "lastConfigChangeAt": "2018-03-08T14:47:25.3Z"
+    },
+    "killSelection": "YOUNGEST_FIRST",
+    "unreachableStrategy": {
+      "inactiveAfterSeconds": 0,
+      "expungeAfterSeconds": 0
+    },
+    "tasksStaged": 0,
+    "tasksRunning": 0,
+    "tasksHealthy": 0,
+    "tasksUnhealthy": 0,
+    "deployments": [],
+    "tasks": [],
+    "taskStats": {}
   }
 ]