diff --git a/monitoring/README b/monitoring/README new file mode 100644 index 00000000..61e1535a --- /dev/null +++ b/monitoring/README @@ -0,0 +1,25 @@ +This directory contains files for displaying and monitoring the status of mars. + +mars-status - mars status overview +mars-status.8 - man page for mars-status +/zabbix/mars.conf - zabbix config file for mars +/zabbix/mars-cron-job - zabbix cron-job for mars +/zabbix/zabbix_mars_template.xml - zabbix template for mars + +TODO on mars-status: +-------------------- +1) write file to disk + +2) run mars-status with --help for more information + + +TODO on mars to zabbix: +----------------------- +1) add a cronjob for mars. this should run every 2-5 minutes. the job writes +a file '/tmp/zabbix.mars' with all the important values from mars. + +2) put the file 'mars.conf' in the '/etc/zabbix/zabbix_agentd.d' directory, +so it can be loaded by zabbix-agent. then restart the agent. + +3) load the template in zabbix and associate it with the mars servers. + diff --git a/userspace/mars-status b/monitoring/mars-status similarity index 99% rename from userspace/mars-status rename to monitoring/mars-status index 6821c623..5e20b4c0 100755 --- a/userspace/mars-status +++ b/monitoring/mars-status @@ -1,5 +1,6 @@ #!/usr/bin/perl -w # (c) 2012/2013 Joerg Mann / 1&1 Internet AG +# released under GPL # # last update at now ... 
@@ -20,7 +21,7 @@ use File::Basename; binmode STDOUT, ":utf8"; ### defaults -my $version = "0.072k"; +my $version = "0.072m"; my $alife_timeout = "30"; # sec for remote-nodes timeout my $is_tty = 0; my $mars_dir = '/mars'; @@ -533,7 +534,7 @@ sub check_ressource { foreach my $partner (@servers) { $partner =~ s/^data-//; if ( $partner eq $himself ) { next; } - $himselfip = check_link "$mars_dir/ips/ip-$himself"; + $himselfip = check_link "$mars_dir/ips/ip-$partner"; print_screen " -> remote Node ($partner [$himselfip]) as ", 'bold'; display_partner( ressource => $res, @@ -666,11 +667,11 @@ sub check_logfile { } } else { - # secondary + # secondary if ( $VersionFileCount eq $LPartner ) { print_screen "\t$Gls$Gfr$Gls$Gfr$Gfr$Gfr $Gao$Gab", "$Color_red"; } elsif ( $VersionFileCount > 1 ) { - print_screen "\t$Gls$Gfr$Gls$Gfr$Gfr$Gfr $Gkr$Gab", "$Color_red"; + print_screen "\t$Gls$Gfr$Gls$Gfr$Gfr $Gls$Gfr$Gkr$Gab", "$Color_red"; } else { print_screen "\t$Gls$Gfr$Gls$Gfr$Gfr $Gls$Gfr$Gau$Gab", "$Color_red"; } @@ -900,6 +901,7 @@ sub check_limit { } close MARS_LIMIT; } + $mars_limit_sol = 0 if ( !$mars_limit_sol); ### ist my $mars_limit_ist; @@ -910,6 +912,7 @@ sub check_limit { } close MARS_LIMIT; } + $mars_limit_ist = 0 if ( !$mars_limit_ist); ### presently results print_screen "$LimitText ", 'bold'; @@ -1111,4 +1114,4 @@ while(1) { sleep($params->{'interval'}); } -exit; \ No newline at end of file +exit; diff --git a/userspace/mars-status.8 b/monitoring/mars-status.8 similarity index 100% rename from userspace/mars-status.8 rename to monitoring/mars-status.8 diff --git a/monitoring/zabbix/mars-cron-job b/monitoring/zabbix/mars-cron-job new file mode 100755 index 00000000..e7c583db --- /dev/null +++ b/monitoring/zabbix/mars-cron-job @@ -0,0 +1,91 @@ +#!/bin/bash +# +# Copyright 2013 Joerg Mann / 1&1 Internet AG +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and 
this notice are preserved. This file is offered as-is, +# without any warranty. +##################################################################### +# +# use for mars release 2013/11/19 +# + +LOG="/tmp/zabbix.mars" +MDIR="/proc/sys/mars" +echo "Date: `date`" >$LOG + + +work() { # append "<path> <value>" for $MDIR/$FN to $LOG; the value is 0 when the file is missing + if [ -f "$MDIR/$FN" ]; then + echo "$MDIR/$FN $(cat "$MDIR/$FN")" >>$LOG + else + echo "$MDIR/$FN 0" >>$LOG + fi +} + +FN="copy_read_max_fly"; work +FN="copy_write_max_fly"; work +FN="io_flying_count"; work +FN="logger_mem_used_kb"; work +FN="logger_replay_timeout_sec"; work +FN="mapfree_period_sec"; work +FN="mem_used_raw_kb"; work +FN="propagate_interval_sec"; work +FN="scan_interval_sec"; work +FN="statusfiles_rollover_sec"; work +FN="sync_flip_interval_sec"; work +FN="tuning/aio_io_r_true_hit"; work +FN="tuning/aio_io_r_factor_percent"; work +FN="tuning/aio_io_w_true_hit"; work +FN="tuning/aio_io_w_factor_percent"; work +FN="tuning/aio_sync_true_hit"; work +FN="tuning/aio_submit_factor_percent"; work +FN="tuning/aio_submit_true_hit"; work +FN="tuning/aio_sync_factor_percent"; work +FN="tuning/bio_io_r_true_hit"; work +FN="tuning/bio_io_r_factor_percent"; work +FN="tuning/bio_io_w_true_hit"; work +FN="tuning/bio_io_w_factor_percent"; work +FN="tuning/bio_submit_factor_percent"; work +FN="tuning/bio_submit_true_hit"; work +FN="tuning/server_io_rate_kb"; work +FN="tuning/traffic_rate_kb"; work +FN="tuning/writeback_rate_kb"; work +FN="tuning/writeback_until_percent"; work +FN="mem_alloc_count"; work +FN="mem_alloc_max"; work +FN="mem_allow_freelist"; work +FN="tuning/write_throttle_count_ops"; work +FN="tuning/write_throttle_cumul_kb"; work +FN="tuning/write_throttle_end_percent"; work +FN="tuning/write_throttle_maxdelay_ms"; work +FN="tuning/write_throttle_maxwindow_ms"; work +FN="tuning/write_throttle_minwindow_ms"; work +FN="tuning/write_throttle_rate_kb"; work +FN="tuning/write_throttle_ratelimit_kb"; work +FN="tuning/write_throttle_size_threshold_kb"; work 
+FN="tuning/write_throttle_start_percent"; work +FN="tuning/writeback_count_ops"; work +FN="tuning/writeback_cumul_kb"; work +FN="tuning/writeback_maxdelay_ms"; work +FN="tuning/writeback_maxwindow_ms"; work +FN="tuning/writeback_minwindow_ms"; work +FN="tuning/traffic_count_ops"; work +FN="tuning/traffic_cumul_kb"; work +FN="tuning/traffic_maxdelay_ms"; work +FN="tuning/traffic_maxwindow_ms"; work +FN="tuning/traffic_minwindow_ms"; work +FN="tuning/server_io_count_ops"; work +FN="tuning/server_io_cumul_kb"; work +FN="tuning/server_io_maxdelay_ms"; work +FN="tuning/server_io_maxwindow_ms"; work +FN="tuning/server_io_minwindow_ms"; work + +HOSTNAME="`/bin/hostname`" +ls -ld /mars/resource-*|awk '{print $9}'|sed -e 's!.*resource-!!g' | \ +while read RESNAME; do + #echo "-> $HOSTNAME - $RESNAME" + ls -ld /mars/resource-$RESNAME/actual-$HOSTNAME/* | sed -e 's!.*/actual-.*/!!g' | awk '{print "status '$RESNAME' " $1 " " $3}' >>$LOG + ls -ld /mars/resource-$RESNAME/todo-$HOSTNAME/* | sed -e 's!.*/todo-.*/!!g' | awk '{print "status '$RESNAME' " $1 " " $3}' >>$LOG +done diff --git a/monitoring/zabbix/mars.conf b/monitoring/zabbix/mars.conf new file mode 100644 index 00000000..3f8b3144 --- /dev/null +++ b/monitoring/zabbix/mars.conf @@ -0,0 +1,27 @@ +# Copyright 2013 Joerg Mann / 1&1 Internet AG +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without any warranty. 
+##################################################################### +# +# zabbix mars config: items read /tmp/zabbix.mars, the snapshot written by mars-cron-job +# + +UserParameter=system.mars[*], echo -n "0"; cat /tmp/zabbix.mars|grep $1|awk '{print $$2}' +UserParameter=system.marsstatus[*], echo -n "0"; awk '$$1 == "status" && $$2 == "$1" && $$3 == "$2" {print $$4}' /tmp/zabbix.mars + +#status data1 device-data1 1 +#status data1 is-attached 1 +#status data1 is-copying 0 +#status data1 is-primary 1 +#status data1 is-replaying 0 +#status data1 is-syncing 0 +#status data1 open-count 1 +#status data1 replay_rate 0 +#status data1 allow-replay 1 +#status data1 attach 1 +#status data1 connect 1 +#status data1 sync 1 + diff --git a/monitoring/zabbix/zabbix_mars_template.xml b/monitoring/zabbix/zabbix_mars_template.xml new file mode 100644 index 00000000..2ce276a7 --- /dev/null +++ b/monitoring/zabbix/zabbix_mars_template.xml @@ -0,0 +1,3797 @@ + + + 2.0 + 2013-12-05T14:13:00Z + + + Templates + + + + + + + + {Template Mars Server:system.marsstatus[{$RESNAME},allow-replay].last(0)}=0 & {Template Mars Server:system.modules[mars].last(0)}=1 + MARS Device on {HOST.NAME} not allowed replay + + 0 + 5 + + 0 + + + + {Template Mars Server:system.marsstatus[{$RESNAME},attach].last(0)}=0 & {Template Mars Server:system.modules[mars].last(0)}=1 + MARS Device on {HOST.NAME} not attached + + 0 + 5 + + 0 + + + + {Template Mars Server:system.marsstatus[{$RESNAME},connect].last(0)}=0 & {Template Mars Server:system.modules[mars].last(0)}=1 + MARS Device on {HOST.NAME} not connected + + 0 + 5 + + 0 + + + + {Template Mars Server:system.marsstatus[{$RESNAME},is-attached].last(0)}=0 & {Template Mars Server:system.marsstatus[{$RESNAME},attach].last(0)}=1 & {Template Mars Server:system.modules[mars].last(0)}=1 + MARS Device on {HOST.NAME} not is-attached + + 0 + 1 + + 0 + + + + {Template Mars Server:system.marsstatus[{$RESNAME},is-replaying].last(0)}=0 & {Template Mars Server:system.marsstatus[{$RESNAME},allow-replay].last(0)}=1 & {Template Mars 
Server:system.modules[mars].last(0)}=1 + MARS Device on {HOST.NAME} not is-replaying + + 0 + 1 + + 0 + + + + {Template Mars Server:system.marsstatus[{$RESNAME},sync].last(0)}=0 & {Template Mars Server:system.modules[mars].last(0)}=1 + MARS Device on {HOST.NAME} not sync + + 0 + 2 + + 0 + + + + {Template Mars Server:system.modules[mars].last(0)}=0 + MARS Module on {HOST.NAME} not loaded + + 0 + 4 + + 0 + + + + + + mars sec + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + C80000 + 0 + 2 + 0 + + Template Mars Server + system.mars[logger_replay_timeout_sec] + + + + 1 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[mapfree_period_sec] + + + + 2 + 0 + 0000C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[propagate_interval_sec] + + + + 3 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[scan_interval_sec] + + + + 4 + 0 + 00C8C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[statusfiles_rollover_sec] + + + + 5 + 0 + C8C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[sync_flip_interval_sec] + + + + + + mars tuning count ops + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + C80000 + 0 + 2 + 0 + + Template Mars Server + system.mars[server_io_count_ops] + + + + 1 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[traffic_count_ops] + + + + 2 + 0 + 0000C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[writeback_count_ops] + + + + 3 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_count_ops] + + + + + + mars tuning cumul kb + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + C80000 + 0 + 1 + 0 + + Template Mars Server + system.mars[server_io_cumul_kb] + + + + 1 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[traffic_cumul_kb] + + + + 2 + 0 + 0000C8 + 0 + 4 + 0 + + Template Mars Server + system.mars[writeback_cumul_kb] + + + + 
3 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_cumul_kb] + + + + + + mars tuning hits + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + C80000 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/aio_io_r_true_hit] + + + + 1 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/aio_io_w_true_hit] + + + + 2 + 0 + 0000C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/aio_submit_true_hit] + + + + 3 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/aio_sync_true_hit] + + + + 4 + 0 + 00C8C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/bio_io_r_true_hit] + + + + 5 + 0 + C8C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/bio_io_w_true_hit] + + + + 6 + 0 + C8C8C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/bio_submit_true_hit] + + + + + + mars tuning kb + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/server_io_rate_kb] + + + + 1 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/traffic_rate_kb] + + + + 2 + 0 + C8C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/writeback_rate_kb] + + + + 3 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_ratelimit_kb] + + + + 4 + 0 + 00C8C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_rate_kb] + + + + 5 + 0 + C8C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_size_threshold_kb] + + + + + + mars tuning max- /min-window_ms + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + C80000 + 0 + 2 + 0 + + Template Mars Server + system.mars[server_io_maxwindow_ms] + + + + 1 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[server_io_minwindow_ms] + + + + 2 + 0 + 0000C8 + 0 + 2 + 0 + + Template 
Mars Server + system.mars[traffic_maxwindow_ms] + + + + 3 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[traffic_minwindow_ms] + + + + 4 + 0 + 00C8C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[writeback_maxwindow_ms] + + + + 5 + 0 + C8C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[writeback_minwindow_ms] + + + + 6 + 0 + C8C8C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_maxwindow_ms] + + + + 7 + 0 + 960000 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_minwindow_ms] + + + + + + mars tuning maxdelay ms + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + C80000 + 0 + 2 + 0 + + Template Mars Server + system.mars[server_io_maxdelay_ms] + + + + 1 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[traffic_maxdelay_ms] + + + + 2 + 0 + 0000C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[writeback_maxdelay_ms] + + + + 3 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_maxdelay_ms] + + + + + + mars tuning mem alloc + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + C80000 + 0 + 2 + 0 + + Template Mars Server + system.mars[mem_alloc_count] + + + + 1 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[mem_alloc_max] + + + + 2 + 0 + 0000C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[mem_allow_freelist] + + + + + + mars tuning percent + 900 + 200 + 0.0000 + 100.0000 + 1 + 1 + 0 + 1 + 0 + 0.0000 + 0.0000 + 0 + 0 + 0 + 0 + + + 0 + 0 + C80000 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/aio_io_r_factor_percent] + + + + 1 + 0 + 00C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/aio_io_w_factor_percent] + + + + 2 + 0 + 0000C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/aio_submit_factor_percent] + + + + 3 + 0 + C800C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/aio_sync_factor_percent] + + + + 
4 + 0 + 00C8C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/bio_io_r_factor_percent] + + + + 5 + 0 + C8C800 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/bio_io_w_factor_percent] + + + + 6 + 0 + C8C8C8 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/bio_submit_factor_percent] + + + + 7 + 0 + 960000 + 0 + 2 + 0 + + Template Mars Server + system.mars[tuning/writeback_until_percent] + + + + 8 + 0 + 009600 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_end_percent] + + + + 9 + 0 + 000096 + 0 + 2 + 0 + + Template Mars Server + system.mars[write_throttle_start_percent] + + + + + +