mirror of https://github.com/schoebel/mars
contrib: configurable Nagios checks
This commit is contained in:
parent
c39a2988b7
commit
e23f0c35f9
|
@ -0,0 +1,65 @@
|
|||
# Config file for mars_check.sh
|
||||
#
|
||||
# This file is part of MARS project: http://schoebel.github.io/mars/
|
||||
#
|
||||
# Copyright (C) 2015 Thomas Schoebel-Theuer
|
||||
# Copyright (C) 2015 1&1 Internet AG
|
||||
#
|
||||
# Copying and distribution of this file, with or without modification,
|
||||
# are permitted in any medium without royalty provided the copyright
|
||||
# notice and this notice are preserved. This file is offered as-is,
|
||||
# without any warranty.
|
||||
|
||||
|
||||
# For each variable $Var (as documented in "mars_check.sh --help"), the following relatives are always defined:
|
||||
#
|
||||
# $LastVar the old value from the last run of mars_check.sh (whenever it was called; recommendation: 5 minutes)
|
||||
# $DeltaLastVar the difference between $Var and $LastVar
|
||||
# $RateLastVar the $DeltaLastVar normalized to the elapsed time (unit: per minutes)
|
||||
#
|
||||
# $MediumVar the old value from a medium-term run of mars_check.sh ($window_medium, default 3600s)
|
||||
# $DeltaMediumVar the difference between $Var and $MediumVar
|
||||
# $RateMediumVar the $DeltaMediumVar normalized to the elapsed time (unit: per minutes)
|
||||
#
|
||||
# $LongtermVar the old value from a longterm run of mars_check.sh ($window_longterm, default 24h)
|
||||
# $DeltaLongtermVar the difference between $Var and $LongtermVar
|
||||
# $RateLongtermVar the $DeltaLongtermVar normalized to the elapsed time (unit: per minutes)
|
||||
|
||||
|
||||
# The first number in each line is the priority class.
|
||||
# Lower number = higher priority = takes precence over higher class numbers
|
||||
# Exception: 0 means that the check will appear unconditionally (class is irrelevant)
|
||||
#
|
||||
# Hint: checks can be simply disabled by commenting them out
|
||||
|
||||
|
||||
#####################################################################################
|
||||
# List of global checks
|
||||
|
||||
1 ModuleLoaded <= 0 CRITICAL: mars module is not loaded
|
||||
2 Responsive <= 0 CRITICAL: mars_light thread is not responsive / possibly hanging
|
||||
5 SpaceRest <= 4 CRITICAL: only $SpaceRest GiB left on /mars/
|
||||
6 SpacePercent >= 70 CRITICAL: Used space on /mars/ is $SpacePercent %
|
||||
7 SpacePercent >= 30 WARNING: Used space on /mars/ is $SpacePercent %
|
||||
|
||||
#####################################################################################
|
||||
# List of local checks = per resource. The resource name can be substituted via $res
|
||||
|
||||
# all hosts
|
||||
10 AliveAge >= 300 CRITICAL: resource $res: primary host ${Designated[$res]} is not reachable for $AliveAge seconds
|
||||
11 Alive <= 0 WARNING: resource $res: primary host ${Designated[$res]} is not reachable
|
||||
12 Emergency >= 1 CRITICAL: resource $res is in emergency mode, too less space on /mars/
|
||||
13 SplitBrain >= 1 CRITICAL: split brain on $res detected
|
||||
|
||||
# only secondaries
|
||||
30 Sync <= 0 WARNING: resource $res sync is switched off
|
||||
31 Fetch <= 0 WARNING: resource $res fetch is switched off
|
||||
32 Replay <= 0 WARNING: resource $res replay is switched off
|
||||
|
||||
40 SyncRest >= 999999 WARNING: resource $res SyncRest=${SyncRest[$res]} is too large
|
||||
41 FetchRest >= 999999 WARNING: resource $res FetchRest=${FetchRest[$res]} is too large
|
||||
42 ReplayRest >= 999999 WARNING: resource $res ReplayRest=${ReplayRest[$res]} is too large
|
||||
|
||||
50 DeltaLastSyncRest <= 99999 && SyncRest >= 1 WARNING: resource $res SyncRest=${SyncRest[$res]} sync has stopped
|
||||
51 DeltaLastFetchRest <= 99999 && FetchRest >= 1 WARNING: resource $res FetchRest=${FetchRest[$res]} fetch has stopped
|
||||
52 DeltaLastReplayRest <= 99999 && ReplayRest >= 1 WARNING: resource $res ReplayRest=${ReplayRest[$res]} replay has stopped
|
|
@ -0,0 +1,450 @@
|
|||
#!/bin/bash
|
||||
#
|
||||
# This file is part of MARS project: http://schoebel.github.io/mars/
|
||||
#
|
||||
# Copyright (C) 2015 Thomas Schoebel-Theuer
|
||||
# Copyright (C) 2015 1&1 Internet AG
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
|
||||
#
|
||||
# Nagios check, respecting debian package settings
|
||||
#
|
||||
# TST spring 2015 lab prototype
|
||||
#
|
||||
# Verbose mode and options / help is not yet supported.
|
||||
|
||||
#set -e
|
||||
orig_vars="$(set | grep '^[_A-Za-z0-9]\+=' | cut -d= -f1)"
|
||||
|
||||
# Config file for defaults.
|
||||
# May be used for hard override of the following definitions.
|
||||
default_file="${default_file:-/etc/default/mars}"
|
||||
|
||||
# Defaults for configuration variables
|
||||
service="${service:-MARS}"
|
||||
check_enable=${check_enable:-1}
|
||||
alive_window=${alive_window:-120} # seconds
|
||||
responsive_window=${responsive_window:-600} # seconds
|
||||
class_mode=${class_mode:-1}
|
||||
warnings=${warnings:-0}
|
||||
simulate=${simulate:-0}
|
||||
verbose=${verbose:-0}
|
||||
help=${help:-0}
|
||||
|
||||
mars_dir="${mars_dir:-/mars}"
|
||||
config_dir="${config_dir:-/etc/mars}"
|
||||
config_file="${config_file:-mars.rules}"
|
||||
statusfile_dir="${statusfile_dir:-/var/cache/mars}"
|
||||
status_last="${status_last:-$statusfile_dir/last.status}"
|
||||
status_medium="${status_medium:-$statusfile_dir/medium.status}"
|
||||
status_longterm="${status_long:-$statusfile_dir/longterm.status}"
|
||||
window_medium=${window_medium:-3600}
|
||||
window_longterm=${window_longterm:-$(( 3600 * 24 ))}
|
||||
|
||||
# Enable this script by default
|
||||
ENABLED="true"
|
||||
|
||||
param_vars="$(set | grep '^[_A-Za-z0-9]\+=' | cut -d= -f1)"
|
||||
|
||||
# Derived from the defaults
|
||||
file_list="./$config_file $config_dir/$config_file"
|
||||
|
||||
# Nagios Exit Codes
|
||||
OK=0
|
||||
WARNING=1
|
||||
CRITICAL=2
|
||||
UNKNOWN=3
|
||||
|
||||
function abort
|
||||
{
|
||||
local msg="$1"
|
||||
echo "$service Unknown: $msg"
|
||||
exit $UNKNOWN
|
||||
}
|
||||
|
||||
function source_when_possible
|
||||
{
|
||||
local file="$1"
|
||||
local type="$2"
|
||||
|
||||
if [[ -r "$file" ]]; then
|
||||
. "$file" || abort "$type file $file is not parsable"
|
||||
fi
|
||||
}
|
||||
|
||||
source_when_possible "$default_file" "config"
|
||||
|
||||
# Allow forceful override of any _known_ variable at the command line
|
||||
for i; do
|
||||
if [[ "$i" =~ ^--[-_A-Za-z0-9]+$ ]]; then
|
||||
param="${i#--}"
|
||||
var="${param//-/_}"
|
||||
[[ "$(eval "echo \"\$$var\"")" = "" ]] && abort "Variable '$var' is unknown"
|
||||
eval "$var=1"
|
||||
elif [[ "$i" =~ ^--[-_A-Za-z0-9]+= ]]; then
|
||||
param="${i#--}"
|
||||
var="${param%%=*}"
|
||||
var="${var//-/_}"
|
||||
val="${param#*=}"
|
||||
[[ "$(eval "echo \"\$$var\"")" = "" ]] && abort "Variable '$var' is unknown"
|
||||
eval "$var=$val"
|
||||
elif [[ "$i" =~ ^-h$ ]]; then
|
||||
help=1
|
||||
elif [[ "$i" =~ ^-v$ ]]; then
|
||||
(( verbose++ ))
|
||||
else
|
||||
abort "bad parameter syntax '$i'"
|
||||
fi
|
||||
done
|
||||
|
||||
# Almost silently exit if not enabled
|
||||
if (( !check_enable)) || [[ "$ENABLED" != "true" ]] && (( !help )); then
|
||||
echo "${service}_IS_DISABLED OK"
|
||||
exit $OK
|
||||
fi
|
||||
|
||||
########################
|
||||
# Prepare Variables
|
||||
|
||||
var_list="ListOfPrimary ListOfNotYetPrimary ListOfRemainsPrimary ListOfSecondary ListOfAny"
|
||||
val_list="ElapsedLast ElapsedMedium ElapsedLongterm ModuleLoaded Responsive SpacePercent SpaceRest"
|
||||
array_list="SplitBrain Designated Alive AliveAge Sync Fetch Replay SyncRest FetchRest ReplayRest Emergency"
|
||||
|
||||
start_vars="$(set | grep '^[_A-Za-z0-9]\+=' | cut -d= -f1)"
|
||||
|
||||
for i in $var_list $val_list; do
|
||||
eval "$i=''"
|
||||
for age in Last Medium Longterm; do
|
||||
eval "$age$i=''"
|
||||
done
|
||||
done
|
||||
|
||||
for i in $array_list; do
|
||||
eval "declare -A $i"
|
||||
for age in Last Medium Longterm; do
|
||||
eval "declare -A $age$i"
|
||||
done
|
||||
done
|
||||
|
||||
basic_vars="$(set | grep '^[_A-Za-z0-9]\+=' | cut -d= -f1)"
|
||||
|
||||
########################
|
||||
# Help text
|
||||
|
||||
if (( help )); then
|
||||
all_vars="$(set | grep '^[_A-Za-z0-9]\+=' | cut -d= -f1 | sort)"
|
||||
cat<<EOF
|
||||
Nagios-compatible plugin for service $service
|
||||
|
||||
Usage: $0 [-h] {-v} {--<switch>} {--<var>=<value>}
|
||||
|
||||
The following parameter variables can be either passed by the
|
||||
environment, or used for hard overrinding on the command line:
|
||||
|
||||
$(
|
||||
declare -A orig
|
||||
for i in $orig_vars; do
|
||||
orig[$i]=1
|
||||
done
|
||||
declare -A param
|
||||
for i in $param_vars; do
|
||||
param[$i]=1
|
||||
done
|
||||
for i in $all_vars; do
|
||||
[[ "$i" =~ _vars$ ]] && continue
|
||||
if (( param[$i] && !orig[$i] )); then
|
||||
echo "$i=$(eval "echo \${$i}")"
|
||||
fi
|
||||
done
|
||||
)
|
||||
|
||||
The following CamelCase basic variables may be used in one of the
|
||||
rules files $file_list:
|
||||
|
||||
$(
|
||||
declare -A start
|
||||
for i in $start_vars; do
|
||||
start[$i]=1
|
||||
done
|
||||
declare -A basic
|
||||
for i in $basic_vars; do
|
||||
basic[$i]=1
|
||||
done
|
||||
for i in $all_vars; do
|
||||
[[ "$i" =~ _vars$ ]] && continue
|
||||
if (( basic[$i] && !start[$i] )); then
|
||||
keys="$(eval echo "\${!$i[@]}")"
|
||||
if [[ "$keys" != "0" ]]; then
|
||||
echo "${i}[\$res]"
|
||||
else
|
||||
echo "$i"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
)
|
||||
EOF
|
||||
exit 0
|
||||
fi
|
||||
|
||||
########################
|
||||
# Read in old status files
|
||||
|
||||
source_when_possible "$status_last" "last status"
|
||||
source_when_possible "$status_medium" "medium-term status"
|
||||
source_when_possible "$status_longterm" "longterm status"
|
||||
|
||||
marsadm=${marsadm:-$(which marsadm)}
|
||||
# exit if marsadm is not found
|
||||
command -v $marsadm > /dev/null || abort "Command marsadm '$marsadm' is not installed"
|
||||
|
||||
########################
|
||||
# get Global variables
|
||||
|
||||
ElapsedLast=$(( $(date +%s) - $(stat --printf="%Y" $status_last 2> /dev/null || echo "0") ))
|
||||
ElapsedMedium=$(( $(date +%s) - $(stat --printf="%Y" $status_medium 2> /dev/null || echo "0") ))
|
||||
ElapsedLongterm=$(( $(date +%s) - $(stat --printf="%Y" $status_longterm 2> /dev/null || echo "0") ))
|
||||
|
||||
ModuleLoaded="$( [[ -d /prco/sys/mars ]]; echo $? )"
|
||||
|
||||
Responsive="$($marsadm --macro="%is-alive{%{host}}" --window=$responsive_window view 2> /dev/null)"
|
||||
|
||||
SpacePercent="$(df $mars_dir | grep -o "[0-9]\%" | tail -1 | sed 's/\%//g' 2> /dev/null)"
|
||||
|
||||
SpaceRest="$($marsadm view-rest-space 2> /dev/null)"
|
||||
|
||||
# get a list of Primary and Secondary resource names
|
||||
# don't run the while loop in a subshell, use the main shell
|
||||
while read dashes txt res; do
|
||||
read role
|
||||
eval "ListOf$role+=' $res'"
|
||||
eval "ListOfAny+=' $res'"
|
||||
done <<EOF 2> /dev/null
|
||||
$($marsadm view-role all 2> /dev/null)
|
||||
EOF
|
||||
|
||||
########################
|
||||
# get Resource variables
|
||||
|
||||
for i in $ListOfAny; do
|
||||
SplitBrain[$i]="$($marsadm view-is-split-brain $i 2> /dev/null)"
|
||||
Emergency[$i]="$($marsadm view-is-emergency $i < /dev/null 2> /dev/null)"
|
||||
done
|
||||
|
||||
for i in $ListOfPrimary; do
|
||||
: #echo "Pri '$i'"
|
||||
done
|
||||
|
||||
for i in $ListOfSecondary; do
|
||||
Designated[$i]="$($marsadm view-get-primary $i 2> /dev/null)"
|
||||
Alive[$i]="$($marsadm --macro="%is-alive{${Designated[$i]}}" --window=$alive_window view $i 2> /dev/null)"
|
||||
AliveAge[$i]="$($marsadm view-alive-age $i 2> /dev/null)"
|
||||
Sync[$i]="$($marsadm view-todo-sync $i 2> /dev/null)"
|
||||
Fetch[$i]="$($marsadm view-todo-fetch $i 2> /dev/null)"
|
||||
Replay[$i]="$($marsadm view-todo-replay $i 2> /dev/null)"
|
||||
SyncRest[$i]="$($marsadm view-sync-rest $i 2> /dev/null)"
|
||||
FetchRest[$i]="$($marsadm view-fetch-rest $i 2> /dev/null)"
|
||||
ReplayRest[$i]="$($marsadm view-replay-rest $i 2> /dev/null)"
|
||||
done
|
||||
|
||||
|
||||
########################
|
||||
# compute Delta variables (when possible)
|
||||
|
||||
for i in $val_list; do
|
||||
for age in Last Medium Longterm; do
|
||||
if [[ "$(eval echo "\${$age$i}")" != "" ]]; then
|
||||
declare Delta$age$i
|
||||
eval "Delta$age$i=$(( $(eval echo "\${$i}") - $(eval echo "\${$age$i}") ))"
|
||||
declare Rate$age$i
|
||||
eval "Rate$age$i=$(( $(eval echo "\${Delta$age$i}") * 60 / Elapsed${age} ))"
|
||||
fi
|
||||
done
|
||||
done
|
||||
for i in $array_list; do
|
||||
for j in $(eval echo "\${!$i[*]}"); do
|
||||
for age in Last Medium Longterm; do
|
||||
if [[ "$(eval echo "\${$age$i[$j]}")" != "" ]]; then
|
||||
declare -A Delta$age$i[$j]
|
||||
eval "Delta$age$i[$j]=$(( $(eval echo "\${$i[$j]}") - $(eval echo "\${$age$i[$j]}") ))"
|
||||
declare -A Rate$age$i[$j]
|
||||
eval "Rate$age$i[$j]=$(( $(eval echo "\${Delta$age$i[$j]}") * 60 / Elapsed${age} ))"
|
||||
fi
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
########################
|
||||
# Write out new status file
|
||||
|
||||
mkdir -p "$statusfile_dir"
|
||||
(
|
||||
for i in $var_list $val_list; do
|
||||
echo "Last$i='$(eval echo "\${$i}")'"
|
||||
done
|
||||
|
||||
for i in $array_list; do
|
||||
for j in $(eval echo "\${!$i[*]}"); do
|
||||
echo "Last$i[$j]='$(eval echo "\${$i[$j]}")'"
|
||||
done
|
||||
done
|
||||
) > "${status_last}.tmp" && \
|
||||
mv "${status_last}.tmp" "${status_last}" && \
|
||||
if ! [[ -r $status_medium.tmp ]] || (( $(stat --printf="%Y" $status_medium.tmp) < $(stat --printf="%Y" $status_last) - ( $window_medium / 2 ) )); then
|
||||
mv -f $status_medium.tmp $status_medium 2> /dev/null || true
|
||||
sed 's/^Last/Medium/' < $status_last > $status_medium.tmp2 && \
|
||||
mv $status_medium.tmp2 $status_medium.tmp
|
||||
fi &&\
|
||||
if ! [[ -r $status_longterm.tmp ]] || (( $(stat --printf="%Y" $status_longterm.tmp) < $(stat --printf="%Y" $status_last) - ( $window_longterm / 2 ) )); then
|
||||
mv -f $status_longterm.tmp $status_longterm 2> /dev/null || true
|
||||
sed 's/^Last/Longterm/' < $status_last > $status_longterm.tmp2 && \
|
||||
mv $status_longterm.tmp2 $status_longterm.tmp
|
||||
fi
|
||||
|
||||
########################
|
||||
# Output Handling
|
||||
|
||||
code_max=0
|
||||
|
||||
# this can be called multiple times.
|
||||
# it remembers the maximum error level in $code_max
|
||||
function do_check
|
||||
{
|
||||
local class="$1"
|
||||
local key="$2"
|
||||
|
||||
local file
|
||||
local rule_var
|
||||
local rule_op
|
||||
local rule_val
|
||||
local rule_txt
|
||||
local found_count=0
|
||||
local matches=0
|
||||
|
||||
for file in $file_list; do
|
||||
if [[ -r "$file" ]]; then
|
||||
while read rule_class rule_var rule_op rule_val rule_txt; do
|
||||
if [[ "$rule_var" = "$key" ]]; then
|
||||
(( ++found_count ))
|
||||
(( rule_class != class && class_mode )) && continue
|
||||
local keys="$(eval echo "\${!$rule_var[@]}")"
|
||||
if [[ "$keys" != "" ]] && [[ "$keys" != "0" ]]; then
|
||||
local res
|
||||
for res in $keys; do
|
||||
if [[ "$(eval echo "\${$rule_var[$res]}")" != "" ]]; then
|
||||
while (( $rule_var[$res] $rule_op $rule_val || simulate )); do
|
||||
(( ++matches ))
|
||||
if [[ "$rule_txt" =~ "&&" ]]; then
|
||||
read dummy rule_var rule_op rule_val rule_txt <<< "$rule_txt"
|
||||
else
|
||||
_out_txt "$rule_txt" "$res"
|
||||
break
|
||||
fi
|
||||
done
|
||||
else
|
||||
(( warnings )) && echo "Undefined variable '$rule_var[$res]'" >> /dev/stderr
|
||||
fi
|
||||
done
|
||||
else
|
||||
if [[ "$(eval echo "\${$rule_var}")" != "" ]]; then
|
||||
while (( $rule_var $rule_op $rule_val || simulate )); do
|
||||
(( ++matches ))
|
||||
if [[ "$rule_txt" =~ "&&" ]]; then
|
||||
read dummy rule_var rule_op rule_val rule_txt <<< "$rule_txt"
|
||||
else
|
||||
_out_txt "$rule_txt" "UNDEF"
|
||||
break
|
||||
fi
|
||||
done
|
||||
else
|
||||
(( warnings )) && echo "Undefined variable '$rule_var'" >> /dev/stderr
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
done <<EOF
|
||||
$(grep -v '^#' $file | grep -v '^\s*$')
|
||||
EOF
|
||||
fi
|
||||
done
|
||||
if (( warnings && !found_count )); then
|
||||
echo "Cannot find key '$key' in $config_file $config_dir/$config_file" >> /dev/stderr
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
function _out_txt
|
||||
{
|
||||
local txt="$1"
|
||||
local res="$2"
|
||||
|
||||
txt="$(echo "$txt" | sed 's/\${/\\\${/g')"
|
||||
# eval down (fixedpoint iteration)
|
||||
local old=""
|
||||
while [[ "$txt" != "$old" ]]; do
|
||||
old="$txt"
|
||||
txt="$(eval echo "$txt")"
|
||||
done
|
||||
echo "$service $txt"
|
||||
|
||||
local this_code=0
|
||||
echo "$txt" | grep -i -q "WARNING" && this_code=1
|
||||
echo "$txt" | grep -i -q "CRITICAL" && this_code=2
|
||||
(( this_code > code_max )) && code_max=$this_code
|
||||
return 0
|
||||
}
|
||||
|
||||
########################
|
||||
# Main program
|
||||
|
||||
class_list="$(cat $file_list 2>/dev/null | grep -v '^#' | grep -v '^\s*$' | cut -d" " -f1 | sort -n -u)"
|
||||
|
||||
for class in $class_list; do
|
||||
########################
|
||||
# Global checks
|
||||
|
||||
do_check "$class" ModuleLoaded
|
||||
do_check "$class" Responsive
|
||||
do_check "$class" SpacePercent
|
||||
do_check "$class" SpaceRest
|
||||
|
||||
########################
|
||||
# Resource checks
|
||||
|
||||
do_check "$class" Alive
|
||||
do_check "$class" AliveAge
|
||||
do_check "$class" Emergency
|
||||
do_check "$class" SplitBrain
|
||||
for i in $ListOfSecondary; do
|
||||
do_check "$class" Sync
|
||||
do_check "$class" Fetch
|
||||
do_check "$class" Replay
|
||||
do_check "$class" SyncRest
|
||||
do_check "$class" FetchRest
|
||||
do_check "$class" ReplayRest
|
||||
for age in Last Medium Longterm; do
|
||||
do_check "$class" Delta${age}SyncRest
|
||||
do_check "$class" Delta${age}FetchRest
|
||||
do_check "$class" Delta${age}ReplayRest
|
||||
done
|
||||
done
|
||||
(( !class_mode )) && break
|
||||
(( class > 0 && code_max > 0 )) && break
|
||||
done
|
||||
|
||||
if (( !code_max )); then
|
||||
echo "$service OK"
|
||||
fi
|
||||
exit $code_max
|
Loading…
Reference in New Issue