mirror of
https://github.com/schoebel/mars
synced 2024-12-27 00:52:21 +00:00
Merge branch 'mars0.1.y' into mars0.1a.y
This commit is contained in:
commit
7a8f6c2baf
@ -293,6 +293,15 @@ Hint: branch 0.1a will get a merge from here, and then get the
|
||||
(except Football related ones) will then go to 0.1b.
|
||||
Finally, when 0.1a is stable, I will close this branch.
|
||||
|
||||
mars0.1stable62
|
||||
* Minor fix: race between join-resource and log-rotate.
|
||||
* Minor fix: report split brain logfile amount only when
|
||||
actually detectable.
|
||||
* Minor improvement: shift annoying error message over
|
||||
to Orphan state detection.
|
||||
* Football: update to Football-2.0-RC12
|
||||
* doc: some updates.
|
||||
|
||||
mars0.1stable61
|
||||
* Minor fix: in very rare cases where some symlinks are missing,
|
||||
don't abort in try_to_avoid_splitbrain().
|
||||
|
BIN
docu/Football_FrOSCon2018.pdf
Normal file
BIN
docu/Football_FrOSCon2018.pdf
Normal file
Binary file not shown.
@ -12,7 +12,7 @@ Actions for resource migration:
|
||||
Run the sequence
|
||||
migrate_prepare ; migrate_wait ; migrate_finish; migrate_cleanup.
|
||||
|
||||
Dto for testing of phases:
|
||||
Dto for testing (do not rely on it):
|
||||
|
||||
./football.sh migrate_prepare <resource> <target_primary> [<target_secondary>]
|
||||
Allocate LVM space at the targets and start MARS replication.
|
||||
@ -32,7 +32,7 @@ Actions for inplace FS shrinking:
|
||||
./football.sh shrink <resource> <percent>
|
||||
Run the sequence shrink_prepare ; shrink_finish ; shrink_cleanup.
|
||||
|
||||
Dto for testing of phases:
|
||||
Dto for testing (do not rely on it):
|
||||
|
||||
./football.sh shrink_prepare <resource> [<percent>]
|
||||
Allocate temporary LVM space (when possible) and create initial
|
||||
@ -114,6 +114,10 @@ Actions for (manual) repair in emergency situations:
|
||||
Manually lock or unlock an item at all of the given hosts, in
|
||||
an atomic fashion. In most cases, use "ALL" for the item.
|
||||
|
||||
Only for testing / development (no stable interfaces):
|
||||
|
||||
./football.sh manual_call_hook <name> <args>
|
||||
|
||||
Global maintenance:
|
||||
|
||||
./football.sh lv_cleanup <resource>
|
||||
@ -253,7 +257,7 @@ Configuration:
|
||||
# IMPORTANT: some intermediate progress report is absolutely needed,
|
||||
# because otherwise a false-positive TIMEOUT may be assumed when
|
||||
# no output is generated for several hours.
|
||||
rsync_opt="${rsync_opt:- -aSH --info=progress2,STATS}"
|
||||
rsync_opt="${rsync_opt:- -aH --inplace --info=progress2,STATS}"
|
||||
|
||||
## rsync_opt_prepare
|
||||
# Additional rsync options for preparation and updating
|
||||
@ -338,6 +342,16 @@ Configuration:
|
||||
# of a failed command.
|
||||
serious_status="${serious_status:-198}"
|
||||
|
||||
## interrupted_status
|
||||
# This is the "magic" exit code indicating a manual interruption
|
||||
# (e.g. keypress Ctl-c)
|
||||
interrupted_status="${interrupted_status:-190}"
|
||||
|
||||
## illegal_status
|
||||
# This is the "magic" exit code indicating an illegal command
|
||||
# (e.g. syntax error, illegal arguments, etc)
|
||||
illegal_status="${illegal_status:-191}"
|
||||
|
||||
## pre_hand or --pre-hand=
|
||||
# Set this to do an ordinary handover to a new start position
|
||||
# (in the source cluster) before doing anything else.
|
||||
@ -370,6 +384,10 @@ Configuration:
|
||||
# should be called
|
||||
finished_regex="${finished_regex:-^(migrate_finish|migrate|migrate+|shrink_finish|shrink)}"
|
||||
|
||||
## call_finished
|
||||
# Whether to call the hook football_failed at failures.
|
||||
call_finished="${call_finished:-1}"
|
||||
|
||||
## lock_break_timeout
|
||||
# When remote ssh commands are failing, remote locks may sustain forever.
|
||||
# Avoid deadlocks by breaking remote locks after this timeout has elapsed.
|
||||
@ -393,10 +411,24 @@ Configuration:
|
||||
# Normally not needed.
|
||||
resource_pre_check="${resource_pre_check:-0}"
|
||||
|
||||
## enable_background_reporting
|
||||
# Progress reporting to screener.
|
||||
# Runs in the background, in parallel to foreground processes
|
||||
# like rsync or tar.
|
||||
enable_background_reporting="${enable_background_reporting:-1}"
|
||||
|
||||
## condition_check_interval
|
||||
# How often conditions should be re-evaluated.
|
||||
condition_check_interval="${condition_check_interval:-180}" # Seconds
|
||||
|
||||
## lease_time
|
||||
# Any intents (e.g. for creation of new resources) are recorded.
|
||||
# This is needed for race avoidance, when multiple resources
|
||||
# are migrated in _parallel_ to the _same_ target.
|
||||
# This might lead to livelocks when there would be no lease time
|
||||
# after which the intents are regarded as "invalid".
|
||||
lease_time="${lease_time:-3600}" # seconds
|
||||
|
||||
## limit_syncs
|
||||
# Limit the number of actually running syncs by waiting
|
||||
# until less than this number of syncs are running at any
|
||||
@ -420,6 +452,16 @@ Configuration:
|
||||
# new primary site.
|
||||
limit_mars_logfile="${limit_mars_logfile:-1024}" # MiB
|
||||
|
||||
## shrink_min_ram_gb
|
||||
# When set, check that the target machines for shrinking
|
||||
# have enough RAM.
|
||||
# Rationale: even incremental rsync needs the Dentry cache of the
|
||||
# kernel. When there is not enough RAM, and when there are some millions
|
||||
# of inodes, the customer downtime may rise to some hours or even some days
|
||||
# instead of some minutes (only when the dentry+inode cache does not
|
||||
# fit into kernel memory <<<=== this is the crucial point)
|
||||
shrink_min_ram_gb="${shrink_min_ram_gb:-0}" # GiB
|
||||
|
||||
## optimize_dentry_cache
|
||||
# Don't umount the temporary shrink space unnecessarily.
|
||||
# Try to shutdown the VM / container without umounting.
|
||||
@ -469,6 +511,23 @@ Configuration:
|
||||
xfs_dump="${xfs_dump:-xfs_quota -x -c dump}"
|
||||
xfs_restore="${xfs_restore:-xfs_quota -x -c restore}"
|
||||
|
||||
## shortcut_tar_percent
|
||||
# Percentage when a shrink space should no longer be considered
|
||||
# as "initial" (or empty).
|
||||
shortcut_tar_percent="${shortcut_tar_percent:-5}"
|
||||
|
||||
## max_rsync_downtime
|
||||
# When set, check the _expected_ duration of customer downtime.
|
||||
# if it takes longer than this limit, abort without causing
|
||||
# customer downtime.
|
||||
# Afterward, sysadmins need to decide what to do:
|
||||
# For example, move the resource to faster hardware with more RAM, or similar.
|
||||
max_rsync_downtime="${max_rsync_downtime:-0}" # seconds
|
||||
|
||||
## merge_shrink_secondaries
|
||||
# This is only needed when targets are not yet pre-merged.
|
||||
merge_shrink_secondaries="${merge_shrink_secondaries:-0}"
|
||||
|
||||
## fs_resize_cmd
|
||||
# Command for online filesystem expansion.
|
||||
fs_resize_cmd="${fs_resize_cmd:-xfs_growfs -d}"
|
||||
@ -601,11 +660,18 @@ Specific features with plugin football-cm3:
|
||||
# and thus must be pingable over network.
|
||||
skip_resource_ping="${skip_resource_ping:-0}"
|
||||
|
||||
## date_lock
|
||||
# Don't enter critical sections at certain days of the week,
|
||||
# and/or during certain hours.
|
||||
# This is a regex matching against "date +%u_%H"
|
||||
date_lock="${date_lock:-}"
|
||||
## business_hours
|
||||
# When set, critical sections are only entered during certain
|
||||
# days of the week, and/or during certain hours.
|
||||
# This is a regex matching against "date +%u_%H".
|
||||
# Example regex: [1-5]_(0[8-9]|1[0-8])
|
||||
# This means Monday to Friday from 8 to 18 o'clock.
|
||||
business_hours="${business_hours:-}"
|
||||
|
||||
## cm3_stop_safeguard_cmd
|
||||
# Workaround for a bug.
|
||||
# Sometimes a systemd unit does not go away.
|
||||
cm3_stop_safeguard_cmd="${cm3_stop_safeguard_cmd:-{ sleep 2; try=0; while (( try++ < 10 )) && systemctl show $res.scope | grep ActiveState | grep =active; do systemctl stop $res.scope; sleep 6; done; if mountpoint /vol/$res; then umount /vol/$res; fi; }}"
|
||||
|
||||
## check_ping_rounds
|
||||
# Number of pings to try before a container is assumed to
|
||||
@ -653,7 +719,9 @@ Specific features with plugin football-cm3:
|
||||
# a BigCluster consisting of several thousands of machines.
|
||||
# When a future version of mars0.1b.y (or 0.2.y) will allow this,
|
||||
# this can be disabled.
|
||||
do_split_cluster="${do_split_cluster:-1}"
|
||||
# do_split_cluster >= 2 means that the resulting MARS clusters should
|
||||
# not exceed this number of members, when possible.
|
||||
do_split_cluster="${do_split_cluster:-2}"
|
||||
|
||||
## forbidden_hosts
|
||||
# Regex for excluding hostnames from any Football actions.
|
||||
@ -760,7 +828,26 @@ Specific features with plugin football-cm3:
|
||||
|
||||
## monitis_downtime_duration
|
||||
# ShaHoLin-internal
|
||||
monitis_downtime_duration="${monitis_downtime_duration:-20}" # Minutes
|
||||
monitis_downtime_duration="${monitis_downtime_duration:-60}" # Minutes
|
||||
|
||||
## orwell_downtime_script
|
||||
# ShaHoLin-internal
|
||||
orwell_downtime_script="${orwell_downtime_script:-}"
|
||||
|
||||
## orwell_tz
|
||||
# Deal with differences in clock timezones.
|
||||
orwell_tz="${orwell_tz:-Europe/Berlin}"
|
||||
|
||||
## orwell_downtime_duration
|
||||
# ShaHoLin-internal
|
||||
orwell_downtime_duration="${orwell_downtime_duration:-20}" # Minutes
|
||||
|
||||
## orwell_workaround_sleep
|
||||
# Workaround for a race condition in Orwell.
|
||||
# Try to ensure that another check has been executed before
|
||||
# the downtime is removed.
|
||||
# 0 = dont remove the downtime at all.
|
||||
orwell_workaround_sleep="${orwell_workaround_sleep:-300}" # Seconds
|
||||
|
||||
## shaholin_customer_report_cmd
|
||||
# Action script when the hardware has improved.
|
||||
@ -770,14 +857,46 @@ Specific features with plugin football-cm3:
|
||||
shaholin_src_cpus="${shaholin_src_cpus:-4}"
|
||||
shaholin_dst_cpus="${shaholin_dst_cpus:-32}"
|
||||
|
||||
## ip_renumber_cmd
|
||||
# Cross-call with another independent project.
|
||||
ip_renumber_cmd="${ip_renumber_cmd:-}"
|
||||
|
||||
## shaholin_finished_log
|
||||
# ShaHoLin-specific logfile, reporting _only_ successful completion
|
||||
# of an action.
|
||||
shaholin_finished_log="${shaholin_finished_log:-$football_logdir/shaholin-finished.log}"
|
||||
|
||||
## shaholin_action
|
||||
## update_cmd
|
||||
# OPTIONAL: specific action script with parameters.
|
||||
shaholin_action="${shaholin_action:-}"
|
||||
update_cmd="${update_cmd:-}"
|
||||
|
||||
## update_host
|
||||
# To be provided in a *.conf or *.preconf file.
|
||||
update_host="${update_host:-}"
|
||||
|
||||
## parse_ticket
|
||||
# Regex for identifying tickets from script outputs or arguments
|
||||
parse_ticket="${parse_ticket:-TECCM-[0-9]+}"
|
||||
|
||||
## prefer_parsed_ticket
|
||||
# Workaround bugs from getting inconsistent ticket IDs from different sources.
|
||||
prefer_parsed_ticket="${prefer_parsed_ticket:-0}"
|
||||
|
||||
## translate_db_state
|
||||
# Whether to use the following mapping definitions.
|
||||
translate_db_state="${translate_db_state:-0}"
|
||||
|
||||
## db_state_*
|
||||
# Map logical names to the ones in the database.
|
||||
db_state_init="${db_state_init:-}"
|
||||
db_state_prepare="${db_state_prepare:-}"
|
||||
db_state_finish="${db_state_finish:-}"
|
||||
db_state_cleanup="${db_state_cleanup:-}"
|
||||
db_state_done="${db_state_done:-}"
|
||||
|
||||
## use_type_for_ticket
|
||||
# Internal ticketing convention.
|
||||
use_type_for_ticket="${use_type_for_ticket:-1}"
|
||||
|
||||
## auto_handover
|
||||
# Load-balancing across locations.
|
||||
@ -791,6 +910,11 @@ Specific features with plugin football-cm3:
|
||||
# Thus it tries to reduce unnecessary handovers to other locations.
|
||||
auto_handover="${auto_handover:-1}"
|
||||
|
||||
## preferred_location
|
||||
# When set, override any other pre-handover to this location.
|
||||
# Useful for maintenance of a whole datacenter.
|
||||
preferred_location="${preferred_location:-}"
|
||||
|
||||
|
||||
PLUGIN football-ticket
|
||||
|
||||
@ -847,6 +971,32 @@ PLUGIN football-ticket
|
||||
# directories $football_creds $football_confs $football_includes
|
||||
ticket_require_comment="${ticket_require_comment:-1}"
|
||||
|
||||
## ticket_for_migrate
|
||||
# Optional 1&1-specific: separate ticket for migrate.
|
||||
# Useful when migrate+shrink need to post into separate tickets.
|
||||
ticket_for_migrate="${ticket_for_migrate:-}"
|
||||
|
||||
## ticket_for_shrink
|
||||
# Optional 1&1-specific: separate ticket for shrink.
|
||||
# Useful when migrate+shrink need to post into separate tickets.
|
||||
ticket_for_shrink="${ticket_for_shrink:-}"
|
||||
|
||||
## ticket_prefer_cached
|
||||
# Workaround a bug in ticket ID retrieval:
|
||||
# Trust my own cached values more than trust the "inconsistent read".
|
||||
ticket_prefer_cached="${ticket_prefer_cached:-1}"
|
||||
|
||||
## ticket_code
|
||||
# List of operation:res:shard
|
||||
ticket_code="${ticket_code:-}"
|
||||
|
||||
## get_ticket_code
|
||||
get_ticket_code="${get_ticket_code:-}"
|
||||
|
||||
## max_start_ticket
|
||||
# Maximum number of instances to start per call
|
||||
max_start_ticket="${max_start_ticket:-1}"
|
||||
|
||||
|
||||
PLUGIN football-basic
|
||||
|
||||
@ -1006,8 +1156,13 @@ PLUGIN football-waiting
|
||||
# By setting this, you can delay the cleanup operations for some time.
|
||||
# This way, you are keeping the old LV contents as a kind of "backup"
|
||||
# for some limited time.
|
||||
# HINT: dont set to wait_before_cleanuplarge values, because it can
|
||||
# seriously slow down Football.
|
||||
#
|
||||
# HINT1: dont set wait_before_cleanup to very large values, because it can
|
||||
# seriously slow down Football.
|
||||
#
|
||||
# HINT2: the waiting time starts when the last MARS replica was created.
|
||||
# Only when the syncing times are _smaller_ than this value,
|
||||
# an _additional_ delay will be produced.
|
||||
enable_cleanup_delayed="${enable_cleanup_delayed:-0}"
|
||||
wait_before_cleanup="${wait_before_cleanup:-180}" # Minutes
|
||||
|
||||
|
@ -11,7 +11,7 @@ Actions for resource migration:
|
||||
Run the sequence
|
||||
migrate_prepare ; migrate_wait ; migrate_finish; migrate_cleanup.
|
||||
|
||||
Dto for testing of phases:
|
||||
Dto for testing (do not rely on it):
|
||||
|
||||
./football.sh migrate_prepare <resource> <target_primary> [<target_secondary>]
|
||||
Allocate LVM space at the targets and start MARS replication.
|
||||
@ -31,7 +31,7 @@ Actions for inplace FS shrinking:
|
||||
./football.sh shrink <resource> <percent>
|
||||
Run the sequence shrink_prepare ; shrink_finish ; shrink_cleanup.
|
||||
|
||||
Dto for testing of phases:
|
||||
Dto for testing (do not rely on it):
|
||||
|
||||
./football.sh shrink_prepare <resource> [<percent>]
|
||||
Allocate temporary LVM space (when possible) and create initial
|
||||
@ -113,6 +113,10 @@ Actions for (manual) repair in emergency situations:
|
||||
Manually lock or unlock an item at all of the given hosts, in
|
||||
an atomic fashion. In most cases, use "ALL" for the item.
|
||||
|
||||
Only for testing / development (no stable interfaces):
|
||||
|
||||
./football.sh manual_call_hook <name> <args>
|
||||
|
||||
Global maintenance:
|
||||
|
||||
./football.sh lv_cleanup <resource>
|
||||
|
1006
docu/mars-manual.lyx
1006
docu/mars-manual.lyx
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -13,11 +13,15 @@ Synopsis:
|
||||
./screener.sh --help [--verbose]
|
||||
./screener.sh list-running
|
||||
./screener.sh list-waiting
|
||||
./screener.sh list-interrupted
|
||||
./screener.sh list-illegal
|
||||
./screener.sh list-timeouted
|
||||
./screener.sh list-failed
|
||||
./screener.sh list-critical
|
||||
./screener.sh list-serious
|
||||
./screener.sh list-done
|
||||
./screener.sh list
|
||||
./screener.sh list-archive
|
||||
./screener.sh list-screens
|
||||
./screener.sh run <file.csv> [<condition_list>]
|
||||
./screener.sh start <screen_id> <cmd> <args...>
|
||||
@ -162,6 +166,9 @@ Cleanup / bookkeeping:
|
||||
|
||||
./screener.sh clear-critical <screen_id>
|
||||
./screener.sh clear-serious <screen_id>
|
||||
./screener.sh clear-interrupted <screen_id>
|
||||
./screener.sh clear-illegal <screen_id>
|
||||
./screener.sh clear-timeouted <screen_id>
|
||||
./screener.sh clear-failed <screen_id>
|
||||
Mark the status as "done" and move the logfile away.
|
||||
|
||||
@ -227,7 +234,8 @@ Options:
|
||||
|
||||
## session_timeout
|
||||
# Detect hanging sessions when they don't produce any output anymore
|
||||
# for a longer time. Hanging sessions are then marked as failed or critical.
|
||||
# for a longer time. Hanging sessions are then marked as either
|
||||
# 'timeout' or 'critical'.
|
||||
session_timeout="${session_timeout:-$(( 3600 * 3 ))}" # seconds
|
||||
|
||||
## screener_logdir or logdir
|
||||
@ -250,6 +258,11 @@ Options:
|
||||
# from $screener_logdir/*/ when this period is exceeded.
|
||||
screener_log_purge_period="${screener_log_purge_period:-30}" # Days
|
||||
|
||||
## screener_log_purge_archive
|
||||
# When set, the logfiles will be moved to $screener_logdir/archive/
|
||||
# Otherwise they will be deleted.
|
||||
screener_log_purge_archive="${screener_log_purge_archive:-1}"
|
||||
|
||||
## dry_run
|
||||
# Dont actually start screen sessions when set.
|
||||
dry_run="${dry_run:-0}"
|
||||
@ -296,6 +309,21 @@ Options:
|
||||
# of a failed command.
|
||||
serious_status="${serious_status:-198}"
|
||||
|
||||
## interrupted_status
|
||||
# This is the "magic" exit code indicating a manual interruption
|
||||
# (e.g. keypress Ctl-c)
|
||||
interrupted_status="${interrupted_status:-190}"
|
||||
|
||||
## illegal_status
|
||||
# This is the "magic" exit code indicating an illegal command
|
||||
# (e.g. syntax error, illegal arguments, etc)
|
||||
illegal_status="${illegal_status:-191}"
|
||||
|
||||
## timeouted_status
|
||||
# This is the "magic" internal code indicating a
|
||||
# hanging session (see $session_timeout).
|
||||
timeouted_status="${timeouted_status:-195}"
|
||||
|
||||
## less_cmd
|
||||
# Used at $0 less $id
|
||||
less_cmd="${less_cmd:-less -r}"
|
||||
@ -326,6 +354,11 @@ Options:
|
||||
export user_name="${user_name:-$(ssh-add -l | grep -o '[^ ]+@[^ ]+' | sort -u | tail -1)}"
|
||||
export user_name="${user_name:-$LOGNAME}"
|
||||
|
||||
## screener_break_timeout
|
||||
# Avoid deadlocks by breaking a screener lock after this timeout has elapsed.
|
||||
# NOTICE: this type of lock is only intended for short-term locking.
|
||||
screener_break_timeout="${screener_break_timeout:-30}" # seconds
|
||||
|
||||
## tmp_dir and tmp_stub
|
||||
# Where temporary files are residing
|
||||
tmp_dir="${tmp_dir:-/tmp}"
|
||||
|
@ -12,11 +12,15 @@ Synopsis:
|
||||
./screener.sh --help [--verbose]
|
||||
./screener.sh list-running
|
||||
./screener.sh list-waiting
|
||||
./screener.sh list-interrupted
|
||||
./screener.sh list-illegal
|
||||
./screener.sh list-timeouted
|
||||
./screener.sh list-failed
|
||||
./screener.sh list-critical
|
||||
./screener.sh list-serious
|
||||
./screener.sh list-done
|
||||
./screener.sh list
|
||||
./screener.sh list-archive
|
||||
./screener.sh list-screens
|
||||
./screener.sh run <file.csv> [<condition_list>]
|
||||
./screener.sh start <screen_id> <cmd> <args...>
|
||||
@ -161,6 +165,9 @@ Cleanup / bookkeeping:
|
||||
|
||||
./screener.sh clear-critical <screen_id>
|
||||
./screener.sh clear-serious <screen_id>
|
||||
./screener.sh clear-interrupted <screen_id>
|
||||
./screener.sh clear-illegal <screen_id>
|
||||
./screener.sh clear-timeouted <screen_id>
|
||||
./screener.sh clear-failed <screen_id>
|
||||
Mark the status as "done" and move the logfile away.
|
||||
|
||||
|
2
football
2
football
@ -1 +1 @@
|
||||
Subproject commit 5364ccf7e376e509ff8decb207cd0a8a96d29e22
|
||||
Subproject commit ac9cdb926c9355f3fedc50b5ae4b4cc06276165b
|
@ -364,7 +364,6 @@ const char *rot_keys[] = {
|
||||
"err-replay-stop",
|
||||
// from _check_logging_status()
|
||||
"inf-replay-tolerance",
|
||||
"err-replay-size",
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -3504,8 +3503,6 @@ int _check_logging_status(struct mars_rotate *rot, int *log_nr, long long *oldpo
|
||||
*newpos = rot->aio_info.current_size;
|
||||
|
||||
if (unlikely(rot->aio_info.current_size < *oldpos_start)) {
|
||||
MARS_ERR_TO(rot->log_say, "oops, bad replay position attempted at logfile '%s' (file length %lld should never be smaller than requested position %lld, is your filesystem corrupted?) => please repair this by hand\n", rot->aio_dent->d_path, rot->aio_info.current_size, *oldpos_start);
|
||||
make_rot_msg(rot, "err-replay-size", "oops, bad replay position attempted at logfile '%s' (file length %lld should never be smaller than requested position %lld, is your filesystem corrupted?) => please repair this by hand", rot->aio_dent->d_path, rot->aio_info.current_size, *oldpos_start);
|
||||
status = -EBADF;
|
||||
goto done;
|
||||
}
|
||||
|
@ -1485,14 +1485,22 @@ sub detect_splitbrain {
|
||||
# dynamic programming
|
||||
return $detected_splits{$res} if defined($detected_splits{$res});
|
||||
my $basedir = "$mars/resource-$res";
|
||||
my $retry = 2;
|
||||
my $ok = 1;
|
||||
my @list = glob("$mars/resource-$res/replay-*");
|
||||
my @hosts = map { $_ =~ s:.*/replay-::; $_ } @list;
|
||||
AGAIN:
|
||||
foreach my $host1 (@hosts) {
|
||||
foreach my $host2 (@hosts) {
|
||||
next if $host1 ge $host2;
|
||||
my ($point, $split, $size1, $size2) = get_common_ancestor($basedir, $host1, $host2);
|
||||
if ($split) {
|
||||
# Workaround races.
|
||||
if ($retry > 0) {
|
||||
$retry--;
|
||||
sleep(2);
|
||||
next AGAIN;
|
||||
}
|
||||
$ok = 0;
|
||||
if ($do_report) {
|
||||
my $age = "";
|
||||
@ -1508,8 +1516,10 @@ sub detect_splitbrain {
|
||||
$age = " age ~" . seconds2human(mars_time() - $stamp) if $stamp;
|
||||
}
|
||||
lwarn "SPLIT BRAIN of resource '$res' after logfile '$point'$age\n";
|
||||
lwarn " hostA = '$host1' logfile_amount='$size1' (" . number2human($size1) . ")\n";
|
||||
lwarn " hostB = '$host2' logfile_amount='$size2' (" . number2human($size2) . ")\n";
|
||||
if ($point) {
|
||||
lwarn " hostA = '$host1' logfile_amount='$size1' (" . number2human($size1) . ")\n";
|
||||
lwarn " hostB = '$host2' logfile_amount='$size2' (" . number2human($size2) . ")\n";
|
||||
}
|
||||
} else {
|
||||
return $ok;
|
||||
}
|
||||
@ -2516,7 +2526,7 @@ sub create_res {
|
||||
mkdir("$resdir/actual-$host");
|
||||
my $todo = "$resdir/todo-$host";
|
||||
mkdir($todo);
|
||||
set_link("1", "$todo/attach");
|
||||
set_link("0", "$todo/attach");
|
||||
set_link("1", "$todo/connect");
|
||||
set_link("1", "$todo/sync");
|
||||
set_link("1", "$todo/allow-replay");
|
||||
@ -2549,8 +2559,17 @@ sub create_res {
|
||||
set_link("0", "$resdir/syncstatus-$host");
|
||||
finish_links();
|
||||
rsync_cmd($primary, "--max-size=1 --update $file $primary:$mars/resource-$res/", 1);
|
||||
# Re-read the primary replaylink because it might have log-rotated in the meantime
|
||||
wait_cluster($cmd, $res, $primary);
|
||||
my $replay = get_link("$resdir/replay-$primary");
|
||||
if ($replay =~ m/^log-([0-9]+)-/) {
|
||||
$replay_nr = $1;
|
||||
_set_replaylink($resdir, $replay_nr, $primary, "");
|
||||
}
|
||||
lprint "successfully joined resource '$res'\n";
|
||||
}
|
||||
set_link("1", "$todo/attach");
|
||||
finish_links();
|
||||
_systemd_trigger($cmd);
|
||||
}
|
||||
|
||||
@ -4130,12 +4149,21 @@ sub eval_fn {
|
||||
my $peer = parse_macro($arg1, $env);
|
||||
$peer = $$env{"host"} unless $peer;
|
||||
my $replay = get_link($$env{"resdir"} . "/replay-$peer", 1);
|
||||
$replay =~ m/^(log-[^,]+),/;
|
||||
$replay =~ m/^(log-[^,]+),([0-9]*)/;
|
||||
my $logfile = $$env{"resdir"} . "/" . $1;
|
||||
if (-r $logfile) {
|
||||
return 0;
|
||||
my $logpos = $2;
|
||||
if (! -r $logfile) {
|
||||
return 1;
|
||||
}
|
||||
return 1;
|
||||
my @stat = stat($logfile);
|
||||
if (!@stat) {
|
||||
return 1;
|
||||
}
|
||||
my $size= $stat[7];
|
||||
if ($size < $logpos) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
if (/^is[-_]?(almost[-_]?)?consistent$/) {
|
||||
my $almost = $1;
|
||||
|
Loading…
Reference in New Issue
Block a user