diff --git a/ChangeLog b/ChangeLog index e5f935a1..52a8b34c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -285,6 +285,17 @@ Hint: branch 0.1a will get a merge from here, and then get the (except Football related ones) will then go to 0.1b. Finally, when 0.1a is stable, I will close this branch. +mars0.1stable60 + * Major improvement: new option --ignore-sync allows primary + Handover without --force even when some sync is running + somewhere. Any running syncs will restart from scratch + (which might take some time, depending on LV size and + many more factors like the network). + * Minor fix: split-cluster did not work correctly when no + resources were existing anymore, at all. + * Doc: major update. More explanation on CAP theorem, and + on differences / commonalities with DRBD. + mars0.1stable59 * Major fix: "marsadm up" did not work when sync could not be started. Now does "best effort". diff --git a/docu/football-verbose.help b/docu/football-verbose.help index a9c95d6d..d582a42f 100644 --- a/docu/football-verbose.help +++ b/docu/football-verbose.help @@ -44,7 +44,9 @@ Actions for inplace FS shrinking: Actions for inplace FS extension: + ./football.sh expand ./football.sh extend + Increase mounted filesystem size during operations. Combined actions: @@ -168,6 +170,11 @@ General features: # Set this to your convenience. football_logdir="${football_logdir:-${logdir:-$HOME/football-logs}}" + ## football_backup_dir + # In this directory, various backups are created. + # Intended for manual repair. + football_backup_dir="${football_backup_dir:-$football_logdir/backups}" + ## screener # When enabled, handover execution to the screener. # Very useful for running Football in masses. @@ -201,6 +208,11 @@ General features: # of the temporary shrink mirror filesystem. rsync_opt_prepare="${rsync_opt_prepare:---exclude='.filemon2' --delete}" + ## rsync_opt_hot + # This is only used at the final rsync, immediately before going + # online again. 
+ rsync_opt_hot="${rsync_opt_hot:---delete}" + ## rsync_nice # Typically, the preparation steps are run with background priority. rsync_nice="${rsync_nice:-nice -19}" @@ -215,6 +227,42 @@ General features: # Number of rsync lines to skip in output (avoid overflow of logfiles). rsync_skip_lines="${rsync_skip_lines:-1000}" + ## use_tar + # Use incremental Gnu tar in place of rsync: + # 0 = don't use tar + # 1 = only use for the first (full) data transfer, then use rsync + # 2 = always use tar + # Experience: tar has better performance on local data than rsync, but + # it tends to produce false-positive failure return codes on online + # filesystems which are altered during tar. + # The combined mode 1 tries to find a good compromise between both + # alternatives. + use_tar="${use_tar:-1}" + + ## tar_exe + # Use this for activation of patched tar versions, such as the + # 1&1-internal patched spacetools-tar. + tar_exe="${tar_exe:-/bin/tar}" + + ## tar_options_src and tar_options_dst + # Here you may give different options for both sides of tar invocations + # (source and destination), such as verbosity options etc. + tar_options_src="${tar_options_src:-}" + tar_options_dst="${tar_options_dst:-}" + + ## tar_is_fixed + # Tell whether your tar version reports false-positive transfer errors, + # or not. + tar_is_fixed="${tar_is_fixed:-0}" + + ## tar_state_dir + # This directory is used for keeping incremental tar state information. + tar_state_dir="${tar_state_dir:-/var/tmp}" + + ## buffer_cmd + # Speed up tar by intermediate buffering. + buffer_cmd="${buffer_cmd:-buffer -m 16m -S 1024m || cat}" + ## wait_timeout # Avoid infinite loops upon waiting. wait_timeout="${wait_timeout:-$(( 24 * 60 ))}" # Minutes @@ -223,6 +271,11 @@ General features: # Some LVM versions are requiring this for unattended batch operations. 
lvremove_opt="${lvremove_opt:--f}" + ## automatic recovery options: enable_failure_* + enable_failure_restart_vm="${enable_failure_restart_vm:-1}" + enable_failure_recreate_cluster="${enable_failure_recreate_cluster:-0}" + enable_failure_rebuild_mars="${enable_failure_rebuild_mars:-1}" + ## critical_status # This is the "magic" exit code indicating _criticality_ # of a failed command. @@ -281,5 +334,394 @@ General features: PLUGIN football-1and1config 1&1 specfic plugin for dealing with the cm3 clusters - and its concrete configuration . + and its concrete configuration. + + ## enable_1and1config + # ShaHoLin-specifc plugin for working with the infong platform + # (istore, icpu, infong) via 1&1-specific clustermanager cm3 + # and related toolsets. Much of it is bound to a singleton database + # instance (clustermw & siblings). + enable_1and1config="${enable_1and1config:-$(if [[ "$0" =~ tetris ]]; then echo 1; else echo 0; fi)}" + + +PLUGIN football-cm3 + + 1&1 specfic plugin for dealing with the cm3 cluster manager + and its concrete operating enviroment (singleton instance). + + Current maximum cluster size limit: + + Maximum #syncs running before migration can start: + + Following marsadm --version must be installed: + + Following mars kernel modules must be loaded: + +Specific actions for plugin football-cm3: + + ./football.sh clustertool {GET|PUT} + Call through to the clustertool via REST. + Useful for manual inspection and repair. + + ## enable_cm3 + # ShaHoLin-specifc plugin for working with the infong platform + # (istore, icpu, infong) via 1&1-specific clustermanager cm3 + # and related toolsets. Much of it is bound to a singleton database + # instance (clustermw & siblings). + enable_cm3="${enable_cm3:-$(if [[ "$0" =~ tetris ]]; then echo 1; else echo 0; fi)}" + + ## skip_resource_ping + # Enable this only for testing. 
Normally, a resource name denotes a + # container name == machine name which must be running as a precondition, + # and thus must be pingable over network. + skip_resource_ping="${skip_resource_ping:-0}" + + ## date_lock + # Don't enter critical sections at certain days of the week, + # and/or during certain hours. + # This is a regex matching against "date +%u_%H" + date_lock="${date_lock:-}" + + ## check_ping_rounds + # Number of pings to try before a container is assumed to + # not respond. + check_ping_rounds="${check_ping_rounds:-5}" + + ## workaround_firewall + # Documentation of technical debt for later generations: + # This is needed since July 2017. In the many years before, no firewalling + # was effective at the replication network, because it is a physically + # separate network from the rest of the networking infrastructure. + # An attacker would first need to gain root access to the _hypervisor_ + # (not only to the LXC container and/or to KVM) before gaining access to + # those physical replication network interfaces. + # Since about that time, which is about the same time when the requirements + # for Container Football had been communicated, somebody introduced some + # unnecessary firewall rules, based on "security arguments". + # These arguments were however explicitly _not_ required by the _real_ + # security responsible person, and explicitly _not_ recommended by him. + # Now the problem is that it is almost politically impossible to get + # rid of such a "security feature". + # Until the problem is resolved, Container Football requires + # the _entire_ local firewall to be _temporarily_ shut down in order to + # allow marsadm commands over ssh to work. + # Notice: this is _not_ increasing the general security in any way. + # LONGTERM solution / TODO: future versions of mars should no longer + # depend on ssh. + # Then this "feature" can be turned off.
+ workaround_firewall="${workaround_firewall:-1}" + + ## ip_magic + # Similarly to workaround_firewall, this is needed since somebody + # introduced additional firewall rules also disabling sysadmin ssh + # connections at the _ordinary_ sysadmin network. + ip_magic="${ip_magic:-1}" + + ## do_split_cluster + # The current MARS branch 0.1a.y is not yet constructed for forming + # a BigCluster consisting of several thousands of machines. + # When a future version of mars0.1b.y (or 0.2.y) will allow this, + # this can be disabled. + do_split_cluster="${do_split_cluster:-1}" + + ## forbidden_hosts + # Regex for excluding hostnames from any Football actions. + # The script will fail when some of these is encountered. + forbidden_hosts="${forbidden_hosts:-}" + + ## forbidden_flavours + # Regex for excluding flavours from any Football actions. + # The script will fail when some of these is encountered. + forbidden_flavours="${forbidden_flavours:-}" + + ## forbidden_bz_ids + # PROVISIONARY regex for excluding certain bz_ids from any Football actions. + # NOTICE: bz_ids are deprecated and should not be used in future + # (technical debts). + # The script will fail when some of these is encountered. + forbidden_bz_ids="${forbidden_bz_ids:-}" + + ## clustertool_host + # URL prefix of the internal configuration database REST interface. + clustertool_host="${clustertool_host:-http://clustermw:3042}" + + ## clustertool_user + # Username for clustertool access. + # By default, scans for a *.password file (see next option). + clustertool_user="${clustertool_user:-$(get_cred_file "*.password" | head -1 | sed 's:.*/::g' | cut -d. -f1)}" + + ## clustertool_passwd_file + # Here you can supply the encrypted password. + # By default, a file $clustertool_user.password is used + # containing the encrypted password.
+ clustertool_passwd_file="${clustertool_passwd_file:-$(get_cred_file "$clustertool_user.password")}" + + ## clustertool_passwd + # Here you may override the password via config file. + # For security reasons, don't provide this at the command line. + clustertool_passwd="${clustertool_passwd:-$(< $clustertool_passwd_file)}" || echo "cannot read a password file *.password for clustermw: you MUST supply the credentials via default curl config files (see man page)" + + ## do_migrate + # Keep this enabled. Only disable for testing. + do_migrate="${do_migrate:-1}" # must be enabled; disable for dry-run testing + + ## always_migrate + # Only use for testing, or for special situations. + # This skips the test whether the resource has already been migrated. + always_migrate="${always_migrate:-0}" # only enable for testing + + ## check_segments + # 0 = disabled + # 1 = only display the segment names + # 2 = check for equality + # WORKAROUND, potentially harmful when used inadequately. + # The historical physical segment borders need to be removed for + # Container Football. + # Unfortunately, the subproject aiming to accomplish this did not + # proceed for one year now. In the meantime, Container Football can + # be only played within the ancient segment borders. + # After this big impediment is eventually resolved, this option + # should be switched off. + check_segments="${check_segments:-1}" + + ## enable_mod_deflate + # Internal, for support. + enable_mod_deflate="${enable_mod_deflate:-1}" + + ## enable_segment_move + # Seems to be needed by some other tooling.
+ enable_segment_move="${enable_segment_move:-1}" + + ## override_hwclass_id + # When necessary, override this from $include_dir/plugins/*.conf + override_hwclass_id="${override_hwclass_id:-}" # typically 25007 + + ## override_hvt_id + # When necessary, override this from $include_dir/plugins/*.conf + override_hvt_id="${override_hvt_id:-}" # typically 8057 or 8059 + + ## override_overrides + # When this is set and other override_* variables are not set, + # then try to _guess_ some values. + # No guarantees for correctness either. + override_overrides=${override_overrides:-1} + + ## iqn_base and iet_type and iscsi_eth and iscsi_tid + # Workaround: this is needed for _dynamic_ generation of iSCSI sessions + # bypassing the ordinary ones as automatically generated by the + # cm3 cluster manager (only at the old istore architecture). + # Notice: not needed for regular operations, only for testing. + # Normally, you dont want to shrink over a _shared_ 1MBit iSCSI line. + iqn_base="${iqn_base:-iqn.2000-01.info.test:test}" + iet_type="${iet_type:-blockio}" + iscsi_eth="${iscsi_eth:-eth1}" + iscsi_tid="${iscsi_tid:-4711}" + + ## monitis_downtime_script + # ShaHoLin-internal + monitis_downtime_script="${monitis_downtime_script:-}" + + ## monitis_downtime_duration + # ShaHoLin-internal + monitis_downtime_duration="${monitis_downtime_duration:-20}" # Minutes + + ## shaholin_finished_log + # ShaHoLin-specific logfile, reporting _only_ successful completion + # of an action. + shaholin_finished_log="${shaholin_finished_log:-$football_logdir/shaholin-finished.log}" + + ## ticket + # OPTIONAL: the meaning is ShaHoLin specific. + # This can be used for updating JIRA tickets. 
+ # Can be set on the command line like "./tetris.sh $args --ticket=TECCM-4711 + ticket="${ticket:-}" + + ## ticket_get_cmd + # Optional: when set, this script can be used for retrieving ticket IDs + # in place of commandline option --ticket= + ticket_get_cmd="${ticket_get_cmd:-}" + + ## ticket_update_cmd + # This can be used for calling an external command which updates + # the ticket(s) given by the $ticket parameter. + ticket_update_cmd="${ticket_update_cmd:-}" + + ## shaholin_action + # OPTIONAL: specific action script with parameters. + shaholin_action="${shaholin_action:-}" + + +PLUGIN football-basic + + Generic driver for systemd-controlled MARS pools. + The current version supports only a flat model: + (1) There is a single "big cluster" at metadata level. + All cluster members are joined via merge-cluster. + All occurring names need to be globally unique. + (2) The network uses BGP or other means, thus any hypervisor + can (potentially) start any VM at any time. + (3) iSCSI or remote devices are not supported for now + (LocalSharding model). This may be extended in a future + release. + This plugin is exclusive-or with cm3. + +Plugin specific actions: + + ./football.sh basic_add_host + Manually add another host to the hostname cache. + + ## pool_cache_dir + # Directory for caching the pool status. + pool_cache_dir="${pool_cache_dir:-$script_dir/pool-cache}" + + ## initial_hostname_file + # This file must contain a list of storage and/or hypervisor hostnames + # where a /mars directory must exist. + # These hosts are then scanned for further cluster members, + # and the transitive closure of all host names is computed. + initial_hostname_file="${initial_hostname_file:-./hostnames.input}" + + ## hostname_cache + # This file contains the transitive closure of all host names. + hostname_cache="${hostname_cache:-$pool_cache_dir/hostnames.cache}" + + ## resources_cache + # This file contains the transitive closure of all resource names. 
+ resources_cache="${resources_cache:-$pool_cache_dir/resources.cache}" + + ## res2hyper_cache + # This file contains the association between resources and hypervisors. + res2hyper_cache="${res2hyper_cache:-$pool_cache_dir/res2hyper.assoc}" + + ## enable_basic + # This plugin is exclusive-or with cm3. + enable_basic="${enable_basic:-$(if [[ "$0" =~ football ]]; then echo 1; else echo 0; fi)}" + + ## ssh_port + # Set this for separating sysadmin access from customer access + ssh_port="${ssh_port:-}" + + ## basic_mnt_dir + # Names the mountpoint directory at hypervisors. + # This must co-incide with the systemd mountpoints. + basic_mnt_dir="${basic_mnt_dir:-/mnt}" + + +PLUGIN football-downtime + + Generic plugin for communication of customer downtime. + + ## downtime_cmd_{set,unset} + # External command for setting / unsetting (or communicating) a downtime + # Empty = don't do anything + downtime_cmd_set="${downtime_cmd_set:-}" + downtime_cmd_unset="${downtime_cmd_unset:-}" + + +PLUGIN football-motd + + Generic plugin for motd. Communicate that Football is running + at login via motd. + + ## enable_motd + # whether to use the motd plugin. + enable_motd="${enable_motd:-0}" + + ## update_motd_cmd + # Distro-specific command for generating motd from several sources. + # Only tested for Debian Jessie at the moment. + update_motd_cmd="${update_motd_cmd:-update-motd}" + + ## download_motd_script and motd_script_dir + # When no script has been installed into /etc/update-motd.d/ + # you can do it dynamically here, bypassing any "official" deployment + # methods. Use this only for testing! + # An example script (which should be deployed via your ordinary methods) + # can be found under $script_dir/update-motd.d/67-football-running + download_motd_script="${download_motd_script:-}" + motd_script_dir="${motd_script_dir:-/etc/update-motd.d}" + + ## motd_file + # This will contain the reported motd message. + # It is created by this plugin. 
+ motd_file="${motd_file:-/var/motd/football.txt}" + + ## motd_color_on and motd_color_off + # ANSI escape sequences for coloring the generated motd message. + motd_color_on="${motd_color_on:-\\033[31m}" + motd_color_off="${motd_color_off:-\\033[0m}" + + +PLUGIN football-report + + Generic plugin for communication of reports. + + ## report_cmd_{start,warning,failed,finished} + # External command which is called at start / warning / failure / finish + # of Football. + # The following variables can be used (e.g. as parameters) when + # escaped with a backslash: + # $res = name of the resource (LV, container, etc) + # $primary = the current (old) primary + # $secondary_list = list of current (old) secondaries + # $target_primary = the target primary name + # $target_secondary = list of target secondaries + # $operation = the operation name + # $target_percent = the value used for shrinking + # $txt = some informative text from Football + # Further variables are possible by looking at the sourcecode, or by + # defining your own variables or functions externally or via plugins. + # Empty = don't do anything + report_cmd_start="${report_cmd_start:-}" + report_cmd_warning="${report_cmd_warning:-$script_dir/screener.sh notify "$res" warning "$txt"}" + report_cmd_failed="${report_cmd_failed:-}" + report_cmd_finished="${report_cmd_finished:-}" + + +PLUGIN football-waiting + + Generic plugin, interfacing with screener: when this is used + by your script and enabled, then you will be able to wait for + "screener.sh continue" operations at certain points in your + script.
+ + ## enable_*_waiting + # + # When this is enabled, and when Football had been started by screener, + # then football will delay the start of several operations until a sysadmin + # does one of the following manually: + # + # a) ./screener.sh continue $session + # b) ./screener.sh resume $session + # c) ./screener.sh attach $session and press the RETURN key + # d) doing nothing, and $wait_timeout has been exceeded + # + # CONVENTION: football resource names are used as screener session ids. + # This ensures that only 1 operation can be started for the same resource, + # and it simplifies the handling for junior sysadmins. + # + enable_startup_waiting="${enable_startup_waiting:-0}" + enable_handover_waiting="${enable_handover_waiting:-0}" + enable_migrate_waiting="${enable_migrate_waiting:-0}" + enable_shrink_waiting="${enable_shrink_waiting:-0}" + + ## enable_cleanup_delayed and wait_before_cleanup + # By setting this, you can delay the cleanup operations for some time. + # This way, you are keeping the old LV contents as a kind of "backup" + # for some limited time. + # HINT: don't set wait_before_cleanup to large values, because it can + # seriously slow down Football. + enable_cleanup_delayed="${enable_cleanup_delayed:-0}" + wait_before_cleanup="${wait_before_cleanup:-180}" # Minutes + + ## reduce_wait_msg + # Instead of reporting the waiting status once per minute, + # decrease the frequency of reporting. + # Warning: don't increase this too much. Do not exceed + # session_timeout/2 from screener. Because of the Nyquist criterion, + # stay on the safe side by setting session_timeout to at least _twice_ + # the value given here.
+ reduce_wait_msg="${reduce_wait_msg:-60}" # Minutes + + \end{verbatim} diff --git a/docu/football.help b/docu/football.help index 9cf8ea3c..2f3e9257 100644 --- a/docu/football.help +++ b/docu/football.help @@ -43,7 +43,9 @@ Actions for inplace FS shrinking: Actions for inplace FS extension: + ./football.sh expand ./football.sh extend + Increase mounted filesystem size during operations. Combined actions: @@ -126,7 +128,8 @@ General features: PLUGIN football-1and1config 1&1 specfic plugin for dealing with the cm3 clusters - and its concrete configuration . + and its concrete configuration. + PLUGIN football-cm3 @@ -141,6 +144,12 @@ PLUGIN football-cm3 Following mars kernel modules must be loaded: +Specific actions for plugin football-cm3: + + ./football.sh clustertool {GET|PUT} + Call through to the clustertool via REST. + Useful for manual inspection and repair. + PLUGIN football-basic @@ -162,6 +171,11 @@ Plugin specific actions: Manually add another host to the hostname cache. +PLUGIN football-downtime + + Generic plugin for communication of customer downtime. + + PLUGIN football-motd Generic plugin for motd. Communicate that Football is running @@ -180,4 +194,5 @@ PLUGIN football-waiting "screener.sh continue" operations at certain points in your script. 
+ \end{verbatim} diff --git a/docu/images/cap-drbd-connected.fig b/docu/images/cap-drbd-connected.fig new file mode 100644 index 00000000..b3dd9505 --- /dev/null +++ b/docu/images/cap-drbd-connected.fig @@ -0,0 +1,17 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 + 450 1800 1800 0 3150 1800 450 1800 +2 1 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1800 0 3150 1800 +4 0 0 50 -1 18 12 0.0000 4 195 1515 2115 90 C = Consistency\001 +4 0 0 50 -1 18 12 0.0000 4 195 1410 405 2115 A = Availability\001 +4 0 0 50 -1 18 12 0.0000 4 195 2445 3060 2070 P = Partitioning Tolerance\001 +4 0 4 50 -1 18 40 0.0000 4 480 435 450 2025 X\001 diff --git a/docu/images/cap-drbd-disconnected.fig b/docu/images/cap-drbd-disconnected.fig new file mode 100644 index 00000000..54d49cba --- /dev/null +++ b/docu/images/cap-drbd-disconnected.fig @@ -0,0 +1,17 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 + 450 1800 1800 0 3150 1800 450 1800 +2 1 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3150 1800 450 1800 +4 0 0 50 -1 18 12 0.0000 4 195 1515 2115 90 C = Consistency\001 +4 0 0 50 -1 18 12 0.0000 4 195 1410 405 2115 A = Availability\001 +4 0 0 50 -1 18 12 0.0000 4 195 2445 3060 2070 P = Partitioning Tolerance\001 +4 0 4 50 -1 18 40 0.0000 4 480 435 1755 360 X\001 diff --git a/docu/images/cap-drbd-operational.fig b/docu/images/cap-drbd-operational.fig new file mode 100644 index 00000000..f1f7278c --- /dev/null +++ b/docu/images/cap-drbd-operational.fig @@ -0,0 +1,16 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 + 450 1800 1800 0 3150 1800 450 1800 +2 1 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1800 0 450 1800 +4 0 0 50 -1 18 12 0.0000 4 195 1515 2115 90 C = Consistency\001 +4 0 0 50 -1 18 12 0.0000 4 195 1410 405 2115 A = 
Availability\001 +4 0 0 50 -1 18 12 0.0000 4 195 2445 3060 2070 P = Partitioning Tolerance\001 diff --git a/docu/images/cap-mars.fig b/docu/images/cap-mars.fig new file mode 100644 index 00000000..c98b1207 --- /dev/null +++ b/docu/images/cap-mars.fig @@ -0,0 +1,16 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 + 450 1800 1800 0 3150 1800 450 1800 +2 1 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3150 1800 450 1800 +4 0 0 50 -1 18 12 0.0000 4 195 1410 405 2115 A = Availability\001 +4 0 0 50 -1 18 12 0.0000 4 195 2445 3060 2070 P = Partitioning Tolerance\001 +4 0 0 50 -1 18 12 0.0000 4 195 1515 2115 90 C = Consistency\001 diff --git a/docu/images/cap-theorem.fig b/docu/images/cap-theorem.fig new file mode 100644 index 00000000..cd80540c --- /dev/null +++ b/docu/images/cap-theorem.fig @@ -0,0 +1,14 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4 + 450 1800 1800 0 3150 1800 450 1800 +4 0 0 50 -1 18 12 0.0000 4 195 1515 2115 90 C = Consistency\001 +4 0 0 50 -1 18 12 0.0000 4 195 1410 405 2115 A = Availability\001 +4 0 0 50 -1 18 12 0.0000 4 195 2445 3060 2070 P = Partitioning Tolerance\001 diff --git a/docu/mars-manual.lyx b/docu/mars-manual.lyx index d6123826..be87c154 100644 --- a/docu/mars-manual.lyx +++ b/docu/mars-manual.lyx @@ -141,7 +141,7 @@ tst@1und1.de \end_layout \begin_layout Date -Version 0.1a-12 +Version 0.1a-13 \end_layout \begin_layout Lowertitleback @@ -483,12 +483,195 @@ eventually consistent \end_layout \begin_layout Standard -There are some consequences from this definition: +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that the term +\begin_inset Quotes eld +\end_inset + +network +\begin_inset Quotes erd +\end_inset + + does not occur in this definition. 
+ However, the term +\begin_inset Quotes eld +\end_inset + +distributed resources +\begin_inset Quotes erd +\end_inset + + is implying +\emph on +some(!) +\emph default + kind of network. \end_layout -\begin_layout Enumerate -Distributed Storage, in particular BigCluster architectures (see section +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Important! The definition does +\emph on +not +\emph default + imply some +\emph on +specific +\emph default + type of network, such as a +\series bold +storage network +\series default + which must be capable of transporting masses of IO operations in +\series bold +realtime +\series default +. + We are free to use other types of networks, such as +\series bold +replication networks +\series default +, which need not be dimensioned for realtime IO traffic, but are usable + for +\series bold +background data migration +\series default +, and even over long distances, where the network typically has some bottlenecks. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that the definition says nothing about the +\series bold +time scale +\series default + of operations +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice: go down to a time scale of microseconds. + You will then notice that typical IO operations will require several hundreds + of machine instructions between IO request +\emph on +submission +\emph default + and the corresponding IO request +\emph on +completion +\emph default +. + This is not only true for local IO. + In network clusters like Ceph, it will even involve creation of network + packets, and lead to additional IO latencies implied by the network packet + transfer latencies. +\end_layout + +\end_inset + +. 
+ We are free to implement certain operations, such as background data migrations +, in a rather long timescale (from a human point of view). + Example: increasing the number of replicas in an operational Ceph cluster, + already containing a few hundreds of terabytes of data, will not only require + additional storage hardware, but also take a rather long time, implied + by the very nature of such reorganisational tasks. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +The famous CAP theorem is one of the motivations behind requirement (4) +\begin_inset Quotes eld +\end_inset + +eventually consistent +\begin_inset Quotes erd +\end_inset + +. + This is not an accident. + There is a +\emph on +reason +\emph default + for it, although it is not a +\emph on +hard +\emph default + requirement. + Strict consistency is not needed for many applications running on top of + cloud storage. + In addition, the CAP theorem and some other theorems cited at +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +https://en.wikipedia.org/wiki/CAP_theorem +\end_layout + +\end_inset + + are telling us that Strict Consistency would be +\series bold + difficult and expensive +\series default + to achieve at global level in a bigger Distributed System, and at the cost + of other properties. + More detailed explanations are in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Explanation-via-CAP" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +There are some consequences from this definition of Cloud Storage, for each + of our high-level storage architectures: +\end_layout + +\begin_layout Description +Distributed +\begin_inset space ~ +\end_inset + +Storage, in particular +\family typewriter +BigCluster +\family default + architectures (see section \begin_inset CommandInset ref LatexCommand ref reference "sec:Distributed-vs-Local:" @@ -501,8 +684,12 @@ s. of data. \end_layout -\begin_layout Enumerate -Centralized Storage: does not conform to (1) and to (4) by definition +\begin_layout Description +Centralized +\begin_inset space ~ +\end_inset + +Storage: does not conform to (1) and to (4) by definition \begin_inset Foot status open @@ -533,11 +720,11 @@ almost sub-component \emph default ). - Typical granularity is replication of whole storage pools, or of LVs, or - of filesystem data. + Typical granularity is replication of whole internal storage pools, or + of LVs, or of filesystem data. \end_layout -\begin_layout Enumerate +\begin_layout Description LocalStorage, and some further models like RemoteSharding (see section \begin_inset CommandInset ref LatexCommand ref @@ -576,8 +763,38 @@ Big Virtual LVM Pool \end_layout \begin_layout Description -(4) can be achieved by MARS, which provides two different consistency guarantees - at different levels, +(4) at least Eventually Consistent or better can be alternatively achieved + by +\end_layout + +\begin_deeper +\begin_layout Description +(4a) +\series bold +DRBD +\series default +, which provides Strict consistency during +\family typewriter +connected +\family default + state, but works only reliably with passive crossover cables over short + distances (see CAP theorem in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Explanation-via-CAP" + +\end_inset + +). 
+\end_layout + +\begin_layout Description +(4b) +\series bold +MARS +\series default +, which works over long distances and provides two different consistency + guarantees at different levels, \emph on both at the same time \emph default @@ -590,7 +807,7 @@ locally: Strict local consistency at LV granularity, also \emph on within \emph default - any LV replica. + each of the LV replicas. \end_layout \begin_layout Description @@ -603,6 +820,53 @@ between \end_deeper \end_deeper +\end_deeper +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice: +\family typewriter +BigCluster +\family default + architectures are creating +\emph on +virtual +\emph default + storage pools out of physically distributed storage servers. + For fairness reasons, creation of a big virtual LVM pool, must be considered + as +\emph on +another +\emph default + valid Cloud Storage +\emph on +model +\emph default +, matching the above definition of Cloud Storage. + The main architectural difference is granularity, as explained in section + +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Granularity-at-Architecture" + +\end_inset + +, and the stacking order of sub-components. + Notice that Football is creating +\series bold +location transparency +\series default + inside of the distributed virtual LVM pool. + This is an important (though not always required) basic property of any + type of clusters and/or grids. 
+\end_layout + \begin_layout Section Granularity at Architecture \begin_inset CommandInset label @@ -2307,7 +2571,7 @@ Summary: CentralStorage is something for \begin_layout Itemize \noindent -small to medium-sized companies which don't have the +Small to medium-sized companies which don't have the \series bold manpower \series default @@ -2324,7 +2588,7 @@ skills \series bold \emph on -monolithic +Monolithic \emph default enterprise applications \series default @@ -2337,7 +2601,7 @@ Vendor Lock-In \end_layout \begin_layout Itemize -when your application +When your application \series bold is neither shardable \series default @@ -2955,8 +3219,8 @@ e way, big cluster architectures as implemented for example in Ceph or Swift \end_layout \begin_layout Standard -When sharding is possible, it is the preferred model due to reliability - and cost and performance reasons. +In the following sections, we will see: when sharding is possible, it is + the preferred model due to reliability and cost and performance reasons. \end_layout \begin_layout Subsection @@ -3032,7 +3296,7 @@ RemoteSharding This variant needs a (possibly dedicated) storage network, \begin_inset Formula $O(n)$ \end_inset -. + in total. Each storage server exports a block device over iSCSI (or over another transport) to at most \begin_inset Formula $O(k)$ @@ -3220,7 +3484,7 @@ ng model in favor of BigClusterSharding when ... \begin_layout Itemize ... - when more than 1 LV instance is placed onto your + when more than 1 LV instance would be placed onto your \begin_inset Quotes eld \end_inset @@ -8075,7 +8339,7 @@ reference "subsec:Reliability-Differences-CentralStorage" . Notice that the current self-built backup solution for a total of 15 billions of inodes is based on a sharding model; converting this to some more or - less centralized solution turns out as another challenge. + less centralized solution would turn out as another challenge. 
\end_layout \begin_layout Standard @@ -8617,7 +8881,7 @@ true \end_inset - Filesystems on top of object stores are no true filesystems. + Filesystems on top of object stores are no true intermediate filesystems. They are violating Dijkstra's important layering rules, as stated in his famous articles on THE. A similar argument holds for block devices on top of object stores. @@ -10458,6 +10722,862 @@ There may be some exceptions, e.g. We recommend to use MARS in such use cases. \end_layout +\begin_layout Section +Explanation via CAP Theorem +\begin_inset CommandInset label +LatexCommand label +name "sec:Explanation-via-CAP" + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-theorem.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The famous CAP theorem, also called Brewer's theorem, is important for a + deeper understanding of the differences between DRBD and MARS. + A good explanation can be found at +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +https://en.wikipedia.org/wiki/CAP_theorem +\end_layout + +\end_inset + + (retrieved July 2018). +\end_layout + +\begin_layout Standard +The CAP theorem states that only 2 out of 3 properties can be achieved at + the same time, when a Distributed System is under pressure: C = Consistency + means +\series bold +\emph on +Strict +\series default +\emph default + Consistency at the level of the +\emph on +distributed +\emph default + system (which is +\emph on +not +\emph default + the same as strict consistency +\emph on +inside +\emph default + of one of the +\emph on +local +\emph default + systems), A = Availability = intuitively clear from a user's perspective, + and P = Partitioning Tolerance = the network may have its own outages at + any time (which is a negative criterion). 
+\end_layout + +\begin_layout Standard +As explained in the Wikipedia article, the P = Partitioning Tolerance is + a property which is important at least in +\emph on +wide-distance +\emph default + data replication scenarios, and possibly in some other scenarios. +\end_layout + +\begin_layout Subsection +CAP Differences between DRBD and MARS +\begin_inset CommandInset label +LatexCommand label +name "subsec:CAP-Differences" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you are considering only short distances like passive crossover cables + between racks, +\emph on +then +\emph default + (and +\emph on +only then +\emph default +) you may +\emph on +assume(!) +\emph default + that P is not required. + Then, and only then, you can get both A and C at the same time, without + sacrificing P, because P is already for free by assumption. + In such a crossover cable scenario, getting all three C and A and P is + possible, similarly to an explanation in the Wikipedia article. +\end_layout + +\begin_layout Standard +This is the classical use case for DRBD: when both DRBD replicas are always + staying physically connected via a passive crossover cable (which is +\emph on +assumed +\emph default + to never break down), you can get both strict global consistency and availabili +ty, even in cases where one of the DRBD nodes is failing +\begin_inset Foot +status open + +\begin_layout Plain Layout +In addition, you will need some further components like Pacemaker, iSCSI + failover, etc. +\end_layout + +\end_inset + +. + Both C and A are provided by DRBD during +\family typewriter +connected +\family default + state, while P is assumed to be provided by a passive component.
+ By addition of iSCSI failover, A can be achieved even in case of single + storage node failures, while retaining C from the viewpoint +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice: the CAP theorem does not deal with node failures, only with +\emph on +network +\emph default + failures. + Node failures would always violate C by some +\begin_inset Quotes eld +\end_inset + +strong +\begin_inset Quotes erd +\end_inset + + definition. + By some +\begin_inset Quotes eld +\end_inset + +weaker +\begin_inset Quotes erd +\end_inset + + definition, the downtime plus recovery time (e.g. + DRBD re-sync) can be taken out of the game. + Notice: while a node can always +\begin_inset Quotes eld +\end_inset + +know +\begin_inset Quotes erd +\end_inset + + whether it has failed (at least after reboot), network failures cannot + be distinguished from failures of remote nodes in general. + Therefore node failures and network failures are fundamentally different + by their nature. +\end_layout + +\end_inset + + of the application. +\end_layout + +\begin_layout Standard +This is explained by the thick line in the following variant of the graphics, + which is only valid for crossover cables where P need not be guaranteed + by the replication because it is already assumed for free: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-drbd-operational.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Now look at the case of a truly Distributed System, where P cannot be assumed + as for free. + For example, try to use DRBD in a long-distance replication scenario. + There we cannot assume P as already given. + We +\series bold +must +\emph on +tolerate +\series default +\emph default + replication network outages. + DRBD is reacting to this differently in two different modes. 
+\end_layout + +\begin_layout Standard +First we look at the (short) time interval +\emph on +before +\emph default + DRBD recognizes the replication network incident, and before it leaves + the +\family typewriter +connected +\family default + state. + During this phase, the application IO will +\series bold +hang +\series default + for some time, indicating the (temporary) sacrifice (from a user's perspective) + by a red X: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-drbd-connected.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Because Availability is one of the highest goods of enterprise-critical + IT operations, you will typically configure DRBD such that it automatically + switches to some variant of a +\family typewriter +disconnected +\family default + state after some timeout, thereby giving up consistency between both replicas. + The red X indicates not only loss of global strict consistency in the sense + of the CAP theorem, but also that your replica will become +\family typewriter +Inconsistent +\family default + during the following re-sync: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-drbd-disconnected.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +You may wonder what the difference to MARS is. + As explained in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Requirements-for-Cloud" + +\end_inset + +, MARS is not only intended for wide distances, but also for +\series bold +Cloud Storage +\series default + where no strict consistency is required at global level by definition, + but instead +\series bold +Eventually Consistent +\series default + is the preferred model for the Distributed System. 
+ Therefore, +\emph on +strict +\emph default + consistency (in the sense of the CAP theorem) is +\emph on +not required by definition +\emph default +. + Therefore, the red X is not present in the following graphics, showing + the state where MARS is remaining +\emph on +locally consistent +\emph default + all the time +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that the +\emph on +initial +\emph default + full sync is not considered here, neither for DRBD, nor for MARS. + +\emph on +Setup +\emph default + of the Distributed System is its own scenario, not considered here. + +\emph on +Repair +\emph default + of a +\emph on +damaged +\emph default + system is also a different scenario, also not considered here. + Notice that MARS' emergency mode also belongs to the class of +\begin_inset Quotes eld +\end_inset + +damages +\begin_inset Quotes erd +\end_inset + +, as well as DRBD's disk failure modes, where it has some additional functionalit +y compared to the current version of MARS. +\end_layout + +\end_inset + +, even when a network outage occurs: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-mars.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice: MARS does not guarantee strict consistency +\emph on +between +\emph default + LV replicas at the level of the Distributed System, but only Eventually + Consistent. + However, +\emph on +at the same time +\emph default + it +\emph on +also +\emph default + guarantees strict consistency +\emph on +locally +\emph default +, and even at +\emph on +each +\emph default + of the passive replicas, each by each. + Don't confuse these different levels. + There are different consistency guarantees at different levels, at the + same time.
+ This might be confusing if you are not looking at the system at different + levels: (1) overall Distributed System versus (2) each of the local system + instances. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Why does MARS do this? Because a better way is not possible at all. + The CAP theorem tells us that there exists no better way when both A has + to be guaranteed (as almost everywhere in enterprise-critical IT operations), + and P has to be ensured in datacenter disaster scenarios or some other + scenarios. + Similarly to natural laws like Einstein's laws of the speed of light, there + +\emph on +does not exist +\emph default + a better way! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Conclusion from the CAP theorem: when P is a +\emph on +hard +\emph default + +\emph on +requirement +\emph default +, don't use DRBD (or other +\emph on +synchronous +\emph default + replication implementations) for long-distance and/or Cloud Storage scenarios. + The red X is in particular problematic during re-sync, after the network + has become healthy again (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Behaviour-of-DRBD" + +\end_inset + +). + MARS has no red X at C because of its +\series bold +Anytime Consistency +\series default +, which refers to +\emph on +local +\emph default + consistency, and which is violated by DRBD during certain important phases + of its regular operation.
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Another conclusion from the CAP theorem: when A+C is a +\emph on +hard requirement +\emph default +, and when P can be faithfully assumed as already given by passive crossover + cables, then don't use the current version of MARS. + Use DRBD instead. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + If you think that you require all three properties C+A+P, but you don't + have passive crossover cables over short distances, you are requiring something + which is +\series bold +impossible +\series default +. + There exists no solution, with whatever component, or from whatever commercial + storage vendor. + The CAP theorem is as hard as Einstein's natural laws are. + Rethink your complete concept, from end to end. + Something is wrong, somewhere. + Ignoring this on enterprise-critical use cases can endanger a company and/or + your career. +\end_layout + +\begin_layout Subsection +CAP Commonalities between DRBD and MARS +\begin_inset CommandInset label +LatexCommand label +name "subsec:CAP-Commonalities" + +\end_inset + + +\end_layout + +\begin_layout Standard +In this subsection, we look at the case that P is not for free, but has + to be ensured by the Distributed Storage system. +\end_layout + +\begin_layout Standard +You may have noticed that MARS' ordinary CAP behaviour is similar to DRBD's + CAP picture in +\family typewriter +disconnected +\family default + state, or during similar states when the replication network is interrupted. +\end_layout + +\begin_layout Standard +Replication network interruption is also known as +\begin_inset Quotes eld +\end_inset + +Network Partitioning +\begin_inset Quotes erd +\end_inset + +. + This is where property P = Partitioning Tolerance comes into play.
+\end_layout + +\begin_layout Standard +When a network partition has +\emph on +actually occurred +\emph default +, both DRBD and MARS allow you to do the same: you may +\series bold +forcefully switch +\series default + the +\family typewriter +primary +\family default + role, which means activation of a former +\family typewriter +secondary +\family default + node. + In such a situation, you can issue commands like +\family typewriter + drbdadm primary --force +\family default + or +\family typewriter +marsadm primary --force +\family default +. + It is no accident that both commands are looking similar to each other. +\end_layout + +\begin_layout Standard +The outcome will be the same: you will most likely get a +\family typewriter +\series bold +SplitBrain +\family default +\series default + situation. +\end_layout + +\begin_layout Standard +The possibility of getting a split brain is no specific property of either + DRBD or MARS. + It will also happen with any other replication system, whether synchronous + or asynchronous. +\end_layout + +\begin_layout Standard +It is one of the consequences from the CAP theorem when (1a) P has to be + assured, and (1b) a network partition has +\emph on +actually occurred +\emph default +, and (2) when A = Availability is enforced at both sides of the network + partition. + The result is that C = global Consistency is violated, by creation of two + or more versions of the data. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Careful: at least for some application classes, it is a bad idea to systematica +lly create split brain via automatic cluster managers, e.g. + Pacemaker or similar.
+ As explained in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Inappropriate-Clustermanger" + +\end_inset + +, some cluster managers were originally constructed for truly shared disk + scenarios, where no split brain can occur by construction. + Using them in masses on versioned data in truly distributed systems can + result in existential surprises, once a bigger network partition and/or + a flaky replication network triggers them in masses, and at some moments + where you didn't really want to do what they now are doing automatically, + and in masses. + Split brain should not be provoked when not +\emph on +absolutely +\emph default + necessary. +\end_layout + +\begin_layout Standard +Split brain resolution is anything but easy in general. + When the data is in a generic block device, you typically will have no + general means for merging both versions. + This means, split brain resolution is typically only possible by +\series bold +throwing away +\series default + some of the versions. +\end_layout + +\begin_layout Standard +This kind of split brain resolution problem is no specific property of DRBD + or of MARS. + It is a fundamental property of generic block devices. +\end_layout + +\begin_layout Standard +DRBD and MARS have some commands like +\family typewriter +drbdadm invalidate +\family default + or +\family typewriter +marsadm invalidate +\family default + for this. + Again, the similarity is no accident. +\end_layout + +\begin_layout Standard +Notice that classical filesystems aren't typically better than raw block + devices. + There are even more possibilities for tricky types of +\series bold +conflicts +\series default + (e.g. + on path names in addition to file content).
+\end_layout + +\begin_layout Standard +Similarly, BigCluster object stores are often suffering from similar (or + even worse) problems, because higher application layers may have some hidden + internal dependencies between object versions, while the object store itself + is agnostic of version dependencies in general +\begin_inset Foot +status open + +\begin_layout Plain Layout +There exist lots of types of potential dependencies between objects. + Temporal ones are easy to capture, but this is not sufficient in general + for everything. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + When stacking block devices or filesystems (or something else) on top of + some BigCluster object store, the latter will not magically resolve any + split brain for you. + Check whether your favorite object store implementation has some kind of + equivalent of a +\family typewriter +primary --force +\family default + command, and some equivalent +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice: BigCluster architectures are typically discriminating between + client servers and storage servers. + This will typically introduce some more possibilities into the game, such + as forced client failover, independently from forced storage failover. +\end_layout + +\end_inset + + of an +\family typewriter +invalidate +\family default + command. + If it doesn't have one, or only a restricted one, you should be +\emph on +alerted +\emph default +. + In case of a long-lasting storage network partition, you might need suchalike + +\emph on +desperately +\emph default + for ensuring A, even at the cost of C.
+ Check: whether you need this is heavily depending on the +\series bold +\emph on +application class +\series default +\emph default + (see also the Cloud Storage definition in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Requirements-for-Cloud" + +\end_inset + +, or look at webhosting, etc). + When you +\emph on +would +\emph default + need it, but you are +\series bold +not prepared for suchalike scenarios at your enterprise-critical data +\series default +, it could cost you a lot of money and/or reputation and/or even your existence. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Notice: the +\emph on +concept +\emph default + of +\family typewriter +SplitBrain +\family default + is occurring almost everywhere in truly Distributed Systems when C can + be violated in favour of A+P. + It is a very general consequence +\begin_inset Foot +status open + +\begin_layout Plain Layout +There exist only few opportunities for generic conflict resolution, even + in classical databases where +\emph on +some +\emph default + knowledge about the structure of the data is available. + Typically, there are some more hidden dependencies. + Lossless +\family typewriter +SplitBrain +\family default + resolution will thus need to be implemented at application layer, if it + is possible at all. +\end_layout + +\end_inset + + of the CAP theorem. +\end_layout + +\begin_layout Standard +The only reliable way for avoiding split brain in truly distributed systems + would be: don't insist on A = Availability. + Notice that there exist some application classes, like certain types of + banking, where C is typically a higher good than A. 
+\end_layout + +\begin_layout Standard +Notice that both DRBD and MARS are supporting this also: just don't add + the option +\family typewriter +--force +\family default + to the +\family typewriter +primary +\family default + switch command. +\end_layout + +\begin_layout Standard +However: even in banking, some +\emph on +extremely extraordinary +\emph default + scenarios might occur, where sacrifice of C in favour of A could be necessary + (e.g. + when +\emph on +manual cleanup +\emph default + of C is cheaper than long-lasting violations of A). + Good to know that both DRBD and MARS have some emergency measure for killing + C in favour of A! +\end_layout + \begin_layout Section Higher Consistency Guarantees vs Actuality \end_layout @@ -10815,6 +11935,67 @@ https://github.com/schoebel/blkreplay/raw/master/doc/blkreplay.pdf ). \end_layout +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Don't set your hardware BBU cache to +\begin_inset Quotes eld +\end_inset + +writethrough +\begin_inset Quotes erd +\end_inset + + mode. + This may lead to tremendous performance degradation. + Use the +\begin_inset Quotes eld +\end_inset + +writeback +\begin_inset Quotes erd +\end_inset + + strategy instead. + It should be operationally safe, because in case of power loss the BBU + cache content will be preserved thanks to the battery, and/or thanks to + goldcaps for saving the cache content into some flash chips. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +For better performance, use newer MARS versions from branch +\family typewriter +mars0.1a.y +\family default + or later. + You may also play around with +\family typewriter +/proc/sys/mars/aio_sync_mode +\family default + when actuality is less important. 
+ Further tuning of +\family typewriter +/proc/sys/mars/io_tuning/ +\family default + and many more tunables is currently only recommended for experts. + Future versions of MARS are planned to provide better performance with + software RAID. +\end_layout + \begin_layout Standard Typically, you will need more than one RAID set \begin_inset Foot @@ -11717,6 +12898,31 @@ contents \end_inset +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Check that state +\family typewriter +Orphan +\family default + is left after a while on B. + Notice that +\family typewriter +join-resource +\family default + is only +\emph on +starting +\emph default + a new replica, but does not wait for its completion. +\begin_inset Newline newline +\end_inset + + \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 @@ -12890,6 +14096,32 @@ view-replinfo but also be inaccurate. \end_layout +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Planned handover is refused +\emph on +by default +\emph default + when some sync is running somewhere. + By adding the option +\family typewriter +--ignore-sync +\family default +, you are no longer protected by this +\emph on +safety measure +\emph default +, and you are willing to accept that any already running syncs will restart + from point 0, in order to ensure consistency. +\end_layout + \begin_layout Subsubsection Forced Switching \begin_inset CommandInset label @@ -13842,6 +15074,31 @@ reference "chap:Alternative-Methods-for" . \end_layout +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Check that state +\family typewriter +Orphan +\family default + is left after a while. 
+ Notice that +\family typewriter +invalidate +\family default + is only +\emph on +restarting +\emph default + an existing replica, but does not wait for its completion. +\end_layout + \begin_layout Paragraph Keeping a Split Brain Version \end_layout @@ -17814,6 +19071,14 @@ EMEGENCY MODE HYSTERESIS Until that it had been halted. \end_layout +\begin_layout Enumerate +Recommendation: check at secondaries that state +\family typewriter +Orphan +\family default + has been left after a while. +\end_layout + \begin_layout Standard Alternatively, there is another method by roughly following the instructions from appendix @@ -18784,6 +20049,44 @@ Orphan marsadm invalidate \family default needs to be done. +\begin_inset Newline newline +\end_inset + +There is an exception: shortly after +\family typewriter +join-resource +\family default + or +\family typewriter +invalidate +\family default +, it may take some time until state +\family typewriter +Orphan +\family default + may be left, and until the newest logfile has appeared at your secondary + site (depending on the size of logfiles, and on your network). + In case of network problems, this may take very long. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + This state tells you that your replica is not current, and currently not + being updated at all. + Don't forget to +\series bold +monitor +\series default + for longer occurrences of this state! Otherwise you may get a big surprise + when you need a forceful emergency failover, but your replica is very old + or even does not really exist at all.
+\end_layout + \begin_layout Labeling @@ -24335,7 +25638,7 @@ The following table documents common options which work with (almost) any \size scriptsize \begin_inset Tabular - + @@ -24641,6 +25944,120 @@ for no good reason \end_inset +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--ignore-sync +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +almost +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Use this for a +\emph on +planned +\emph default + handover instead of +\family typewriter +--force +\family default +. + Only one precondition is relaxed: some sync may be running somewhere. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Careful when using this on extremely huge LVs where the sync may take several + days, or weeks. + It is your sysadmin decision what you want to prefer: restarting the sync, + or planned handover.
+\end_layout + +\end_inset + + \end_layout \end_inset @@ -27381,7 +28798,29 @@ $res \size scriptsize Notice: when the size of $disk_dev is strictly greater than the size of - the resource, you will unnecessarily waste some space.. + the resource, you will unnecessarily waste some space. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +After a while, state +\family typewriter +Orphan +\family default + should be left. + Don't forget to regularly monitor for longer occurrences of +\family typewriter +Orphan +\family default +! \end_layout \end_inset @@ -32262,7 +33701,7 @@ Switches the designated primary \series default . - There are two variants: + There are three variants: \end_layout \begin_layout Plain Layout @@ -32390,6 +33829,23 @@ join-resource \size scriptsize 2) \series bold +Handover ignoring running syncs, +\series default + by adding the option +\family typewriter +--ignore-sync +\family default +. + Any running syncs will restart from scratch, in order to ensure consistency. + Use this only when the planned handover is more important than the sync + time. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +3) +\series bold Forced switching \series default : by giving –force while @@ -34485,6 +35941,28 @@ marsadm {pause,resume}-sync again. \end_layout +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +After a while, state +\family typewriter +Orphan +\family default + should be left. + Don't forget to regularly monitor for longer occurrences of +\family typewriter +Orphan +\family default +! 
+\end_layout + \end_inset @@ -40259,6 +41737,13 @@ Tips and Tricks \begin_layout Section Avoiding Inappropriate Clustermanager Types for Medium and Long-Distance Replication +\begin_inset CommandInset label +LatexCommand label +name "sec:Inappropriate-Clustermanger" + +\end_inset + + \end_layout \begin_layout Standard diff --git a/docu/marsadm.help b/docu/marsadm.help index 327d0963..16a5359e 100644 --- a/docu/marsadm.help +++ b/docu/marsadm.help @@ -13,6 +13,10 @@ marsadm [] view[-] [ | all ] Use this only when you really know what you are doing! Warning! This is dangerous! First try --dry-run. Not combinable with 'all'. + --ignore-sync + Allow primary handover even when some sync is running somewhere. + This is less rude than --force because it checks for all else + preconditions. --dry-run Don't modify the symlink tree, but tell what would be done. Use this before starting potentially harmful actions such as @@ -567,5 +571,4 @@ marsadm [] view[-] [ | all ] {sync,fetch,replay,work}-{rest,{almost-,threshold-,}reached,percent,permille,vector} {sync,fetch,replay}-{rate,remain} {time,real-time} - {tree,features}-version \end{verbatim} diff --git a/userspace/marsadm b/userspace/marsadm index fb45c31f..c92269d7 100755 --- a/userspace/marsadm +++ b/userspace/marsadm @@ -106,7 +106,7 @@ sub lwarn { my $Id = '$Id$ '; my $user_version = 0.1; -my $marsadm_version = 2.3; # some rough hint at newer features +my $marsadm_version = 2.4; # some rough hint at newer features my $mars = "/mars"; my $host = `uname -n` or ldie "cannot determine my network node name\n"; chomp $host; @@ -114,6 +114,7 @@ check_id($host); my $real_host = $host; my $backup_dir = "$mars/backups-" . time(); my $force = 0; +my $ignore_sync = 0; my $cron_mode = 0; my $timeout = -1; my $ip = ""; @@ -2623,7 +2624,7 @@ sub split_cluster { $cmd .= "shopt -s nullglob; "; $cmd .= "for i in $mars/resource-*; do if ! [[ -e \$i/data-$peer ]] && ! 
[[ -e \$i/replay-$peer ]]; then rm -rf $backup_dir/\${i##*/}; mv \$i $backup_dir/; fi; done; "; $cmd .= "mkdir -p $mars/ips; "; - my $sub_list = "{ for i in \$(ls $mars/resource-*/data-$peer | cut -d/ -f1-3 | sort -u); do (cd \$i; ls data-*); done; echo x-$peer; }"; + my $sub_list = "{ for dir in $mars/resource-*/data-$peer; do (cd \${dir%/*} && for i in data-*; do echo \$i; done); done; echo x-$peer; }"; my $sub_cmd = "echo RESTORE IP \$j; cp -a $ips_backup/ip-\$j $mars/ips/"; $cmd .= "for j in \$($sub_list | cut -d- -f2- | sort -u); do $sub_cmd; done"; lprint "$cmd\n"; @@ -3172,16 +3173,19 @@ sub primary_phase0 { lprint "Current designated primary: $old\n"; if ($cmd eq "primary") { if ($host ne $old) { - check_sync_finished($res, $host); + lprint "Allowing handover in cases of sync: ignore_sync=$ignore_sync\n" if $ignore_sync; + check_sync_finished($res, $host, $ignore_sync); # also check that other secondaries won't loose their sync primary my @names = glob("$mars/resource-$res/data-*"); # for k <= 2 replicas, the previous check must have been sufficient if (scalar(@names) > 2) { + my $allow_anyway = ($force || $ignore_sync); + lprint "Allowing handover in cases of sync: force=$force ignore_sync=$ignore_sync\n" if $allow_anyway; foreach my $name (@names) { $name =~ m:/data-(.+):; my $peer = $1; next if ($peer eq $old || $peer eq $host); - check_sync_finished($res, $peer, $force); + check_sync_finished($res, $peer, $allow_anyway); } } } @@ -6120,6 +6124,10 @@ marsadm [] view[-] [ | all ] Use this only when you really know what you are doing! Warning! This is dangerous! First try --dry-run. Not combinable with 'all'. + --ignore-sync + Allow primary handover even when some sync is running somewhere. + This is less rude than --force because it checks for all else + preconditions. --dry-run Don't modify the symlink tree, but tell what would be done. 
Use this before starting potentially harmful actions such as @@ -6237,6 +6245,9 @@ foreach my $arg (@ARGV) { if ($arg eq "--force" || $arg eq "-f") { $force++; next; + } elsif ($arg eq "--ignore-sync") { + $ignore_sync++; + next; } elsif ($arg eq "--dry-run" || $arg eq "-d") { $dry_run++; next;