forked from HariSekhon/Nagios-Plugins
-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_datastax_opscenter_cassandra_metrics.pl
executable file
·266 lines (239 loc) · 9.3 KB
/
check_datastax_opscenter_cassandra_metrics.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/perl -T
# nagios: -epn
#
# Author: Hari Sekhon
# Date: 2013-10-18 18:44:35 +0100 (Fri, 18 Oct 2013)
#
# https://github.com/harisekhon/nagios-plugins
#
# License: see accompanying LICENSE file
#
# http://www.datastax.com/documentation/opscenter/5.0/api/docs/metrics.html
# The first plugin I started for DataStax OpsCenter in 2013, but the last one to be finished. The idea for this is similar to the Cloudera Manager metrics plugin I wrote while working at Cloudera.
# Plugin description text. It is assigned as an undeclared package variable,
# which works because this statement sits above `use strict`.
# NOTE(review): presumably printed by the usage/help output of HariSekhonUtils - confirm.
$DESCRIPTION = "Nagios Plugin to fetch and optionally alert on Cassandra cluster metrics via DataStax OpsCenter's Rest API
DataStax OpsCenter has many useful metrics for your Cassandra cluster(s) such as read/write latency and number of operations, both per node as well as aggregated per datacenter. See this link for a list of metric keys to query
http://www.datastax.com/documentation/opscenter/5.0/api/docs/metrics.html#metrics-attribute-key-lists
Some metrics may only apply to Column-Families or Nodes, and will result in UNKNOWN if attempting to querying a combination that doesn't make sense or isn't supported by OpsCenter as OpsCenter will return 'null' in these cases.
Sometimes a metric's latest results will be undefined in OpsCenter. In this case the latest valid metric will be returned instead. Increase --time-period to fetch more historical metrics from the last N minutes so there is more chance of finding a last valid metric to return or specify --latest-only to only return the latest result regardless of whether it's defined or not (in which case you'll get UNKNOWN: 'metric'='undefined').
Tested on DataStax OpsCenter 3.2.2 and 5.0.0";
# Plugin version. Also a package variable in main; it is read back as
# $main::VERSION when the HTTP User-Agent string is built.
$VERSION = "0.1";
use strict;
use warnings;
BEGIN {
use File::Basename;
use lib dirname(__FILE__) . "/lib";
}
use HariSekhonUtils;
use HariSekhon::DataStax::OpsCenter;
use Data::Dumper;
use LWP::Simple '$ua';
# Identify this plugin in the User-Agent string of the LWP user agent.
$ua->agent("Hari Sekhon $progname version $main::VERSION");

# Command line state filled in by get_options().
# $dc defaults to "all" so that metrics are aggregated across datacenters
# unless the user narrows the query.
my $dc = "all";
my $cf;
my $device;
# min/max results don't seem to be returned despite API docs saying all 3 are returned by default and trying to fetch min/max results in no results
#my $max;
#my $nodes;
#my $node_group;
#my $node_aggregation;
my $metric;
#my $metrics;
#my $min;
my $period = 1; # last minute
my $latest_only;
#my @metrics;

# NOTE(review): presumably lets either environment variable override the
# "all" default for the datacenter before the command line is parsed - confirm
# against env_vars() in HariSekhonUtils.
env_vars(["DATASTAX_OPSCENTER_DATACENTER", "DATACENTER"], \$dc);

# Option table: "getopt spec" => [ \$target_variable, "help text" ].
# The %...option(s) hashes are shared option groups imported from the
# HariSekhon modules.
%options = (
    %hostoptions,
    %useroptions,
    %clusteroption,
    # Only one metric key is accepted: the value is later validated against
    # /^([\w-]+)$/, which rejects commas, so the help text must not advertise
    # comma separated lists.
    "m|metric=s" => [ \$metric, "Metric to fetch (required). Only a single metric key is supported. Thresholds may optionally be applied" ],
    "A|dc|datacenter=s" => [ \$dc, "Datacenter. Defaults to 'all' to aggregate metrics across all datacenters for the specified cluster unless a specific --node-ip or --dc is given (\$DATASTAX_OPSCENTER_DATACENTER, \$DATACENTER)" ],
    %keyspaceoption,
    "F|cf|column-family=s" => [ \$cf, "Column-family to be measured (optional, use with --keyspace for metric drill down)" ],
    %nodeipoption,
    #"nodes=s" => [ \$nodes, "Nodes comma separated to fetch metrics for. Optional" ],
    #"node-group=s" => [ \$node_group, "Node group to fetch metrics for. Optional" ],
    "E|device=s" => [ \$device, "Specify a device to fetch a metric for (eg. a specific disk or network interface or 'all' for all disks/network interfaces. Optional, can't be used with --keyspace/--column-family)" ],
    "T|time-period=s" => [ \$period, "Time period of last N mins to fetch results for (defaults to 1 for the last minute)" ],
    "L|latest-only" => [ \$latest_only, "Return latest result even if it's 'undefined'" ],
    #"min" => [ \$min, "Check thresholds against the minimum value returned for the last minute (defaults to comparing to the average)" ],
    #"max" => [ \$max, "Check thresholds against the maximum value returned for the last minute (defaults to comparing to the average)" ],
    %thresholdoptions,
);

# Insert this plugin's option names at index 6 of @usage_order without
# removing any existing entries.
# NOTE(review): presumably controls the order options are shown in --help - confirm.
splice @usage_order, 6, 0, qw/cluster metric metrics dc datacenter node-ip nodes node-group keyspace cf column-family device time-period latest-only list-clusters list-keyspaces min max/;
# Parse the command line, then validate every option that was supplied.
get_options();

# Connection and credential settings.
$host     = validate_host($host);
$port     = validate_port($port);
$user     = validate_user($user);
$password = validate_password($password);
validate_cluster();

# Optional drill-down selectors are only validated when given.
if ($keyspace) {
    validate_keyspace();
}
if ($dc) {
    $dc = validate_alnum($dc, "datacenter");
}

# A column family can only be addressed inside a keyspace.
if ($cf) {
    $keyspace or usage("must specify --keyspace if using --column-family");
    $cf = validate_alnum($cf, "column-family");
}

# Device metrics are mutually exclusive with keyspace / column-family metrics.
if ($device) {
    if ($keyspace or $cf) {
        usage("cannot mix --device with --keyspace / --column-family");
    }
    $device = validate_alnum($device, "device");
}

# --latest-only forces the window back to a single minute, logging when that
# overrides a user supplied --time-period.
if ($latest_only) {
    vlog_option("latest only", "true");
    unless ($period eq 1) {
        vlog2("resetting --time-period to 1 since --latest-only is set");
    }
    $period = 1;
}
$period = validate_int($period, "time period (last N minutes)", 1);
#if($metrics){
# foreach(split(",", $metrics)){
# $_ = trim($_);
# /^\s*([\w_-]+)\s*$/ or usage "invalid metric '$_' given, must be alphanumeric, may contain underscores in the middle";
# push(@metrics, $1);
# }
# @metrics or usage "no valid metrics given";
# @metrics = uniq_array @metrics;
# vlog_option "metrics", "[ " . join(" ", @metrics) . " ]";
#}
#$metric = $metrics[0];
# Finish option handling: thresholds, timeouts and the mandatory metric key.
validate_thresholds();
vlog2();

# The HTTP timeout is kept one second below the overall plugin timeout.
set_timeout();
set_http_timeout($timeout - 1);
if ($debug) {
    $ua->show_progress(1);
}

$status = "OK";

# NOTE(review): presumably these serve the --list-* options and exit when one
# was requested - confirm in HariSekhon::DataStax::OpsCenter.
list_clusters();
list_keyspaces();
list_nodes();

unless (defined($metric)) {
    usage("metric must be specified, see here for list of valid metric keys: http://www.datastax.com/documentation/opscenter/5.0/api/docs/metrics.html#metrics-attribute-key-lists");
}
# Re-assigning from the capture group also launders the value for taint
# mode (the script runs with -T).
unless ($metric =~ /^([\w-]+)$/) {
    usage("invalid metric given, must be alphanumeric, may contain underscores and dashes");
}
$metric = $1;
# list network interfaces for node
# "/$cluster/nodes/$node_ip/network_interfaces"
# "/$cluster/nodes/$node_ip/devices"
# "/$cluster/nodes/$node_ip/partitions"
# Relative API path of the metrics request. The caller sets the endpoint
# prefix and build_url() appends the drill-down segments.
my $url;

# Append the optional drill-down path segments to the file-scoped $url:
#   /<keyspace>[/<column-family>]   when a keyspace was given
#   /<metric>[/<device>]            when a metric was given
# Takes no arguments and works purely by side effect on $url; the return
# value is not meaningful.
# Declared without a prototype: an empty "()" prototype only changes how
# calls are parsed and adds nothing for a sub that is always called as
# build_url().
sub build_url {
    if ($keyspace) {
        $url .= "/$keyspace";
        $url .= "/$cf" if $cf;
    }
    if ($metric) {
        $url .= "/$metric";
        $url .= "/$device" if $device;
    }
    return;
}
# Choose the API endpoint prefix, then let build_url() append the
# keyspace / column-family / metric / device segments.
if ($node_ip) {
    # metrics of one specific node
    $url = sprintf("%s/metrics/%s", $cluster, $node_ip);
    build_url();
}
elsif ($dc) {
    # take an aggregate of the cluster
    $url = sprintf("%s/cluster-metrics/%s", $cluster, $dc);
    build_url();
}
else {
    # not implementing new-metrics API for now
    usage("must specify --dc or --node-ip to get metrics for");
    # new way of getting all metrics
    # $url = "$cluster/new-metrics?";
    # if($nodes){
    # $url .= "nodes=$nodes";
    # } elsif($node_group){
    # $url .= "node_group=$node_group";
    # } else {
    # $url .= "node_group=all";
    # }
    # if($metric){
    # $url .= "&metrics=$metric";
    # }
    # if($keyspace){
    # $url .= "&keyspace=$keyspace";
    # if($cf){
    # $url .= "&columnfamilies=$cf";
    # }
    # }
    # if($device){
    # $url .= "&devices=$device";
    # }
    # if($node_aggregation){
    # $url .= "&node_aggregation=1";
    # }
}

# Query window: from $period minutes ago up to now, as epoch seconds.
my $now = time;
my $window_start = $now - $period * 60;
$url .= "?start=$window_start&end=$now";
# retrieves the last --time-period minutes before now (N * 60 secs, 60 secs by default)
# new-metrics is in secs, otherwise in mins - default is in 1 min step
#$url .= "&step=";
#if($new_metrics){
# $url .= "60";
#} else {
# $url .= "1";
#}
# return all
#if($min){
# $url .= "&function=min";
#} elsif($max){
# $url .= "&function=max";
#}
# Fetch the metric from OpsCenter and pick the series of readings out of the
# decoded JSON. Depending on the metric and endpoint the series is found under
# "Average" (eg. heap-used / heap-max), "Total" (eg. write-ops) or under the
# node's IP address.
$json = curl_opscenter($url);
vlog3(Dumper($json));
my @results;
if (not isHash($json)) {
    # Guard before any hash dereference below: a non-hash reference would
    # otherwise abort with "Not a HASH reference".
    quit("UNKNOWN", "non-hash json returned by DataStax OpsCenter. $nagios_plugins_support_msg_api");
}
elsif (not %{$json}) {
    quit("UNKNOWN", "blank metrics returned by DataStax OpsCenter");
}
elsif (defined($json->{"Average"})) {
    # heap-used / heap-max
    quit("UNKNOWN", "metric not found for this combination") unless defined($json->{"Average"}->{"AVERAGE"});
    @results = get_field_array("Average.AVERAGE");
}
elsif (defined($json->{"Total"})) {
    # write-ops
    quit("UNKNOWN", "metric not found for this combination") unless defined($json->{"Total"}->{"AVERAGE"});
    @results = get_field_array("Total.AVERAGE");
}
elsif (defined($node_ip) and defined($json->{$node_ip})) {
    # $node_ip is unset when querying by --dc; checking it first avoids an
    # "uninitialized value in hash element" warning on that path.
    # The dots of the IP are backslash-escaped, presumably so that
    # get_field_array() does not treat them as path separators - confirm.
    (my $node_ip2 = $node_ip) =~ s/\./\\./g;
    @results = get_field_array("$node_ip2.AVERAGE");
}
else {
    quit("UNKNOWN", "neither Average nor Total json child found. $nagios_plugins_support_msg_api");
}
# Reduce the series of readings to a single value in $metric_result.
# It stays undef when no usable reading exists; with --latest-only it becomes
# the string 'undefined' when the last reading carries no float value.
my $metric_result;
quit("UNKNOWN", "no results returned") unless @results;
# iterate and take latest non-undef reading since results are in timestamp ascending order
foreach my $array_ref (@results) {
    # Validate the element itself before dereferencing it. Checking a
    # reference to the already-copied array would always succeed and the
    # dereference would abort first on bad data.
    isArray($array_ref) or quit("UNKNOWN", "non-array returned in results. $nagios_plugins_support_msg_api");
    my @reading = @{$array_ref};
    # The value is read from index 1, so shorter entries are skipped.
    # NOTE(review): index 0 is presumably the timestamp - confirm against the API.
    next unless scalar @reading > 1;
    if (defined($reading[1]) and isFloat($reading[1])) {
        $metric_result = sprintf("%.2f", $reading[1]);
    }
    elsif ($latest_only) {
        $metric_result = 'undefined';
    }
}
# Build the plugin output. $msg is assigned before check_thresholds() runs,
# and that order is kept on purpose.
if (defined($metric_result)) {
    $msg = "$metric = '$metric_result'";
    check_thresholds($metric_result);
}
else {
    unknown();
    $msg = "$metric = 'N/A'";
}

# Scope of the reading: a single node wins over the datacenter aggregate.
if ($node_ip) {
    $msg .= " for node '$node_ip'";
}
elsif ($dc) {
    $msg .= " for dc='$dc'";
}

# Append whichever drill-down selectors were used, in a fixed order.
foreach my $qualifier ([ "keyspace", $keyspace ], [ "column-family", $cf ], [ "device", $device ]) {
    my ($label, $value) = @{$qualifier};
    $msg .= " $label='$value'" if $value;
}

# Perfdata is only emitted when a reading was found.
if (defined($metric_result)) {
    $msg .= " | '$metric'=$metric_result";
    msg_perf_thresholds();
}

quit($status, $msg);