Skip to content

Commit

Permalink
make check-marathon-apps not to fail on first faulty app (#24)
Browse files Browse the repository at this point in the history
* make check-marathon-apps not to fail on first faulty app

* address PR comments
  • Loading branch information
bergerx authored and majormoses committed Sep 11, 2018
1 parent aaba919 commit 70f1716
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 47 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
This CHANGELOG follows the format listed [here](https://github.com/sensu-plugins/community/blob/master/HOW_WE_CHANGELOG.md)

## [Unreleased]
### Fixed
- check-marathon-apps.rb: script should not fail on first faulty result (@bergerx)

### Added
- check-marathon-apps.rb: introduced `check-config-overrides` flag (@bergerx)

## [2.4.0] - 2018-03-20
### Changed
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ another check result for the apps `status`.
Check results can be customised by two ways:

1. Default check result fields that are applied to all will be provided by a
default check config. Please see th esource code to see the whole defaults.
default check config. Please see the source code to see the whole defaults.
Since the whole default check config tends to be big, you can also use
`check-config-overrides` flag just to provide a few new fields or override
existing defaults.
2. Application owners can override check results by using marathon labels. This
allows each application to have different fields in the published result.
e.g. per app escalation or aggregate can be controlled by applying Marathon
Expand Down
119 changes: 75 additions & 44 deletions bin/check-marathon-apps.rb
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,13 @@ class MarathonAppsCheck < Sensu::Plugin::Check::CLI
description: 'Similar to `--default-check-config` but read from given file. If both parameters are provided '\
'`--default-check-config` will override this one.'

# JSON string (defaults to an empty object) that `run` parses and merges over
# the default check config using Hash#merge. The merge is not recursive: a
# top-level key given here replaces the whole default value stored under that key.
option :check_config_overrides,
long: '--check-config-overrides CHECK_CONFIG_OVERRIDES',
description: 'Instead of providing whole default-check-config if you just want to introduce some new fields '\
'to the check config without having to provide whole config, this will be merged to the '\
'default-check-config.',
default: '{}'

option :sensu_client_url,
description: 'Sensu client HTTP URL',
long: '--sensu-client-url url',
Expand Down Expand Up @@ -203,57 +210,78 @@ def run
else
DEFAULT_CHECK_CONFIG
end
check_config = parse_json(check_config_str)
default_check_config = parse_json(check_config_str)
check_config_overrides = parse_json(config[:check_config_overrides])
check_config = default_check_config.merge(check_config_overrides)

# Filter apps, if both exists exclude pattern will override match pattern
apps.keep_if { |app| app['id'][/#{config[:match_pattern]}/] } if config[:match_pattern]
apps.delete_if { |app| app['id'][/#{config[:exclude_pat]}/] } if config[:exclude_pat]

failed_apps_to_be_reported = 0
apps.each do |app|
# Select app queue if any
app_queue = queue.select { |q| q['app']['id'][/^#{app['id']}$/] }.to_a.first

# Build check result
check_result = check_result_scaffold(app)

# Parse Marathon app labels
labels_config = parse_app_labels(app['labels'].to_h)

REFERENCES.each do |reference|
# / is and invalid character
check_result['name'] = "check_marathon_app#{app['id'].tr('/', '_')}_#{reference}"

state = case reference
when 'health'
get_marathon_app_health(app)
when 'status'
get_marathon_app_status(app, app_queue.to_h)
end

# Merge user provided check config
check_result.merge!(check_config.dig('_').to_h)
check_result.merge!(check_config.dig(reference, '_').to_h)
check_result.merge!(check_config.dig(reference, state).to_h)

# Merge Marathon parsed check config
check_result.merge!(labels_config.dig('_').to_h)
check_result.merge!(labels_config.dig(reference, '_').to_h)
check_result.merge!(labels_config.dig(reference, state).to_h)

# Build check result output
check_result['output'] = "#{reference.upcase} #{state.capitalize} - "\
"tasksRunning(#{app['tasksRunning'].to_i}), tasksStaged(#{app['tasksStaged'].to_i}), "\
"tasksHealthy(#{app['tasksHealthy'].to_i}), tasksUnhealthy(#{app['tasksUnhealthy'].to_i})"

# Make sure that check result data types are correct
enforce_sensu_field_types(check_result)

# Send the result to sensu-client HTTP socket
post_check_result(check_result)
end
failed_apps_to_be_reported += 1 unless process_app_results(app, queue, check_config)
end

if failed_apps_to_be_reported > 0
critical "#{failed_apps_to_be_reported} apps are failed to be reported to sensu"
else
ok 'Marathon Apps Status and Health check is running properly'
end
end

# Build and publish the check results of a single Marathon app: one result per
# entry in REFERENCES ('health' and 'status' are the references handled here).
#
# @param app [Hash] Marathon app definition; 'id', 'labels' and the tasks*
#   counters are read from it
# @param queue [Array<Hash>] Marathon queue entries, each exposing the queued
#   app id under ['app']['id'] (presumably the output of fetch_queue - verify
#   against the caller)
# @param check_config [Hash] check config, keyed by '_' and by reference name
# @return [Boolean] true only if post_check_result accepted every result of
#   this app, false as soon as at least one of them could not be posted
def process_app_results(app, queue, check_config)
  # Select app queue if any. Ids are compared literally: interpolating the id
  # into a regexp would treat characters such as '.' or '+' as metacharacters
  # and could pick the queue entry of a different app (or none at all).
  app_queue = queue.find { |q| q['app']['id'] == app['id'] }

  # Build check result
  check_result = check_result_scaffold(app)

  # Parse Marathon app labels
  labels_config = parse_app_labels(app['labels'].to_h)

  # `map` (rather than a short-circuiting `all?` with a block) guarantees that
  # the remaining references are still posted after one of them has failed.
  pushed = REFERENCES.map do |reference|
    # / is an invalid character
    check_result['name'] = "check_marathon_app#{app['id'].tr('/', '_')}_#{reference}"

    state = case reference
            when 'health'
              get_marathon_app_health(app)
            when 'status'
              get_marathon_app_status(app, app_queue.to_h)
            end

    # Merge the user provided check config first and the config parsed from
    # the Marathon labels second, so labels win. Within each config the
    # precedence is: global ('_'), per reference, per reference and state.
    [check_config, labels_config].each do |source|
      check_result.merge!(source.dig('_').to_h)
      check_result.merge!(source.dig(reference, '_').to_h)
      check_result.merge!(source.dig(reference, state).to_h)
    end

    # Build check result output
    check_result['output'] = "#{reference.upcase} #{state.capitalize} - "\
      "tasksRunning(#{app['tasksRunning'].to_i}), tasksStaged(#{app['tasksStaged'].to_i}), "\
      "tasksHealthy(#{app['tasksHealthy'].to_i}), tasksUnhealthy(#{app['tasksUnhealthy'].to_i})"

    # Make sure that check result data types are correct
    enforce_sensu_field_types(check_result)

    # Send the result to sensu-client HTTP socket; falsy when it can't be posted
    post_check_result(check_result)
  end

  pushed.all?
end

def check_result_scaffold(app)
Expand Down Expand Up @@ -311,8 +339,11 @@ def post_check_result(data)
data.to_json,
content_type: 'application/json',
timeout: config[:timeout])
true
rescue RestClient::ExceptionWithResponse => e
critical "Error while trying to POST check result (#{config[:sensu_client_url]}/results): #{e.response}"
# print a message about failing POST but keep going
STDERR.puts "Error while trying to POST check result for #{data} (#{config[:sensu_client_url]}/results): #{e.response}"
false
end

def parse_json(json)
Expand Down
16 changes: 14 additions & 2 deletions test/check_marathon_apps_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,21 @@ def fetch_queue(*)
end

# Spec stand-in for posting a check result: instead of doing any HTTP call it
# records a copy of every accepted result in @check_results.
#
# @param res [Hash] check result; only res['name'] is inspected
# @return [Boolean] false for results whose name contains
#   'non-sensu-compliant-test' (nothing is recorded for those), true otherwise
def post_check_result(res)
  # simulate failure from sensu agent, see the overridden method in MarathonAppsCheck
  return false if res['name'] =~ /non-sensu-compliant-test/

  # Record each accepted result exactly once; rejected ones are never stored.
  @check_results.push(res.dup)
  true
end

# No-op stub: accepts any arguments and ignores them.
def ok(*); end

# Spec stand-in for `critical`: stores the 'CRITICAL' status and forwards the
# given arguments to `output`.
def critical(*args)
# Assigned before calling output on purpose - presumably output reads @status
# when formatting the message (the spec expects "... CRITICAL: ..."); verify
# against the sensu-plugin implementation.
@status = 'CRITICAL'
output(*args)
end
end

describe 'MarathonTaskCheck' do
Expand All @@ -43,7 +54,8 @@ def ok(*); end

describe '#run' do
it 'tests multiple applications with different states' do
@check.run
expect { @check.run }.to output("CheckMarathonApps CRITICAL: 1 apps are failed to be reported to sensu\n").to_stdout

expect(@check.check_results).to contain_hash_with_keys(
'name' => 'check_marathon_app_sensu-test_health',
'output' => 'HEALTH Unknown - tasksRunning(1), tasksStaged(0), tasksHealthy(0), tasksUnhealthy(0)',
Expand Down
48 changes: 48 additions & 0 deletions test/fixtures/marathon_apps_with_embeds.json
Original file line number Diff line number Diff line change
Expand Up @@ -589,5 +589,53 @@
"slaveId": "4c5cca36-8774-429e-a912-eaaad2765219-S1"
},
"taskStats": {}
},
{
"id": "/non-sensu-compliant-test",
"backoffFactor": 10,
"backoffSeconds": 10,
"cmd": "sleep 1000000",
"container": {
"type": "MESOS",
"volumes": []
},
"cpus": 0.1,
"disk": 0,
"executor": "",
"instances": 1,
"labels": {
"MARATHON_SENSU_AGGREGATE": "aggregate with space is not valid for sensu"
},
"maxLaunchDelaySeconds": 36000000,
"mem": 128,
"gpus": 0,
"networks": [
{
"mode": "host"
}
],
"portDefinitions": [],
"requirePorts": false,
"upgradeStrategy": {
"maximumOverCapacity": 1,
"minimumHealthCapacity": 1
},
"version": "2018-03-08T14:47:25.3Z",
"versionInfo": {
"lastScalingAt": "2018-03-08T14:47:25.3Z",
"lastConfigChangeAt": "2018-03-08T14:47:25.3Z"
},
"killSelection": "YOUNGEST_FIRST",
"unreachableStrategy": {
"inactiveAfterSeconds": 0,
"expungeAfterSeconds": 0
},
"tasksStaged": 0,
"tasksRunning": 0,
"tasksHealthy": 0,
"tasksUnhealthy": 0,
"deployments": [],
"tasks": [],
"taskStats": {}
}
]

0 comments on commit 70f1716

Please sign in to comment.