From 8d2d1194148ed4ba86b4e0e9607dda6e99d81e86 Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Sun, 21 Jun 2015 17:05:14 +0200 Subject: [PATCH] doc: new section on clustermanagers --- docu/images/clustermanager-hierarchy.fig | 21 + docu/images/fencing-hierarchy.fig | 37 + docu/images/shared-disk-model.fig | 46 + docu/images/shared-nothing-model.fig | 64 + docu/images/split-brain-history.fig | 18 + docu/images/split-brain-resolved.fig | 18 + docu/mars-manual.lyx | 2224 +++++++++++++++++++++- 7 files changed, 2426 insertions(+), 2 deletions(-) create mode 100644 docu/images/clustermanager-hierarchy.fig create mode 100644 docu/images/fencing-hierarchy.fig create mode 100644 docu/images/shared-disk-model.fig create mode 100644 docu/images/shared-nothing-model.fig create mode 100644 docu/images/split-brain-history.fig create mode 100644 docu/images/split-brain-resolved.fig diff --git a/docu/images/clustermanager-hierarchy.fig b/docu/images/clustermanager-hierarchy.fig new file mode 100644 index 00000000..4b53e078 --- /dev/null +++ b/docu/images/clustermanager-hierarchy.fig @@ -0,0 +1,21 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 225 1125 5175 1125 5175 1575 225 1575 225 1125 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 225 1800 5175 1800 5175 2250 225 2250 225 1800 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 225 450 5175 450 5175 900 225 900 225 450 +4 1 0 50 -1 0 12 0.0000 4 180 4740 2700 1395 Mechanics Layer: Handover+Failover of whole Datacenter\001 +4 1 0 50 -1 0 12 0.0000 4 180 4455 2700 2070 Mechanics Layer: Handover+Failover of single Cluster\001 +4 0 0 50 -1 0 10 0.0000 4 135 3015 5355 2070 (several hundreds / thousands of instances)\001 +4 0 0 50 -1 0 10 0.0000 4 135 1950 5355 1395 (about a dozen of instances)\001 +4 0 0 50 -1 0 10 0.0000 4 150 2370 5355 720 (one globally distributed instance)\001 +4 1 0 50 -1 0 12 0.0000 4 180 4395 2700 720 
Automatics Layer: Failover of {Datacenters...Clusters}\001 diff --git a/docu/images/fencing-hierarchy.fig b/docu/images/fencing-hierarchy.fig new file mode 100644 index 00000000..7b96fd2e --- /dev/null +++ b/docu/images/fencing-hierarchy.fig @@ -0,0 +1,37 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1575 1125 2700 1125 2700 1800 1575 1800 1575 1125 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 0 1125 1125 1125 1125 1800 0 1800 0 1125 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3150 1125 4275 1125 4275 1800 3150 1800 3150 1125 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 2.00 60.00 135.00 + 1575 675 990 1125 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 2.00 60.00 135.00 + 2115 675 2115 1125 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 2.00 60.00 135.00 + 2700 675 3330 1125 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 2.00 60.00 135.00 + 3375 675 4770 1305 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 900 225 3375 225 3375 675 900 675 900 225 +4 1 0 50 -1 0 11 0.0000 4 120 480 2115 1755 Traffic\001 +4 1 0 50 -1 0 11 0.0000 4 165 825 2115 1530 Application\001 +4 1 0 50 -1 0 11 0.0000 4 165 960 2115 1305 Fencing from\001 +4 1 0 50 -1 0 11 0.0000 4 165 540 540 1620 Storage\001 +4 1 0 50 -1 0 11 0.0000 4 165 960 540 1395 Fencing from\001 +4 1 0 50 -1 2 20 0.0000 4 60 270 4770 1395 ...\001 +4 1 0 50 -1 0 11 0.0000 4 165 1845 2160 495 General Fencing Methods\001 +4 1 0 50 -1 0 11 0.0000 4 120 780 3690 1485 STONITH\001 diff --git a/docu/images/shared-disk-model.fig b/docu/images/shared-disk-model.fig new file mode 100644 index 00000000..5894ed95 --- /dev/null +++ b/docu/images/shared-disk-model.fig @@ -0,0 +1,46 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +6 2610 2835 3780 3645 +1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 3195 3105 585 270 3195 3105 3780 3375 +1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 3195 3375 585 270 3195 
3375 3780 3645 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2610 3105 2610 3330 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3780 3105 3780 3330 +-6 +6 225 450 2250 1350 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 225 450 2250 450 2250 1350 225 1350 225 450 +4 1 0 50 -1 0 13 0.0000 4 195 1665 1260 810 App Cluster Side A\001 +4 1 0 50 -1 0 13 0.0000 4 195 1470 1260 1080 (currently active)\001 +-6 +6 4050 450 6075 1350 +2 2 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 5 + 4050 450 6075 450 6075 1350 4050 1350 4050 450 +4 1 0 50 -1 0 13 0.0000 4 195 1650 5040 810 App Cluster Side B\001 +4 1 0 50 -1 0 13 0.0000 4 195 1575 5085 1080 (currently passive)\001 +-6 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 + 1 1 3.00 60.00 120.00 + 1 1 3.00 60.00 120.00 + 2025 1350 2835 2880 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 1 1 2 + 1 1 3.00 60.00 120.00 + 1 1 3.00 60.00 120.00 + 4269 1329 3510 2835 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 + 1 1 3.00 60.00 120.00 + 1 1 3.00 60.00 120.00 + 2248 901 4050 900 +4 1 0 50 -1 0 13 0.0000 4 135 1035 3150 3870 Shared Disk\001 +4 1 0 50 -1 0 13 0.0000 4 195 1320 3195 855 Clustermanager\001 +4 1 0 50 -1 0 13 0.0000 4 180 1305 3195 1080 e.g. PaceMaker\001 +4 1 0 50 -1 0 11 0.0000 4 165 735 4320 2160 e.g. iSCSI\001 +4 1 0 50 -1 0 11 0.0000 4 165 735 1980 2160 e.g. 
iSCSI\001 diff --git a/docu/images/shared-nothing-model.fig b/docu/images/shared-nothing-model.fig new file mode 100644 index 00000000..51df4042 --- /dev/null +++ b/docu/images/shared-nothing-model.fig @@ -0,0 +1,64 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +6 225 450 2250 1350 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 225 450 2250 450 2250 1350 225 1350 225 450 +4 1 0 50 -1 0 13 0.0000 4 195 1665 1260 810 App Cluster Side A\001 +4 1 0 50 -1 0 13 0.0000 4 195 1470 1260 1080 (currently active)\001 +-6 +6 4050 450 6075 1350 +2 2 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 5 + 4050 450 6075 450 6075 1350 4050 1350 4050 450 +4 1 0 50 -1 0 13 0.0000 4 195 1650 5040 810 App Cluster Side B\001 +4 1 0 50 -1 0 13 0.0000 4 195 1575 5085 1080 (currently passive)\001 +-6 +6 675 2700 1845 3510 +1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 1260 2970 585 270 1260 2970 1845 3240 +1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 1260 3240 585 270 1260 3240 1845 3510 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 675 2970 675 3195 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1845 2970 1845 3195 +-6 +6 4455 2700 5625 3510 +1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 5040 2970 585 270 5040 2970 5625 3240 +1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 5040 3240 585 270 5040 3240 5625 3510 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 4455 2970 4455 3195 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 5625 2970 5625 3195 +-6 +6 1305 1890 2115 2070 +4 1 0 50 -1 0 11 0.0000 4 165 735 1710 2025 e.g. iSCSI\001 +-6 +6 4230 1890 5040 2070 +4 1 0 50 -1 0 11 0.0000 4 165 735 4635 2025 e.g. 
iSCSI\001 +-6 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 + 1 1 3.00 60.00 120.00 + 1 1 3.00 60.00 120.00 + 2248 901 4050 900 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 + 1 1 3.00 60.00 120.00 + 1 1 3.00 60.00 120.00 + 1260 1350 1260 2655 +2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 1 1 2 + 1 1 3.00 60.00 120.00 + 1 1 3.00 60.00 120.00 + 5085 1350 5085 2655 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2 + 1 1 3.00 60.00 120.00 + 1 1 3.00 60.00 120.00 + 1843 3150 4455 3150 +4 1 0 50 -1 0 13 0.0000 4 195 1320 3195 855 Clustermanager\001 +4 1 0 50 -1 0 13 0.0000 4 135 600 1260 3735 Disk A\001 +4 1 0 50 -1 0 13 0.0000 4 135 585 5085 3735 Disk B\001 +4 1 0 50 -1 0 13 0.0000 4 195 1230 3240 3060 Disk Coupling\001 +4 1 0 50 -1 0 13 0.0000 4 180 1830 3240 3330 e.g. DRBD or MARS\001 diff --git a/docu/images/split-brain-history.fig b/docu/images/split-brain-history.fig new file mode 100644 index 00000000..ff55d142 --- /dev/null +++ b/docu/images/split-brain-history.fig @@ -0,0 +1,18 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 3 + 5 1 1.00 60.00 120.00 + 450 675 2925 675 3600 450 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 5 1 1.00 60.00 120.00 + 2925 675 3600 900 +4 0 0 50 -1 0 12 0.0000 4 135 135 3735 540 A\001 +4 0 0 50 -1 0 12 0.0000 4 135 135 3735 900 B\001 +4 1 0 50 -1 0 10 0.0000 4 135 1650 1755 585 common part of history\001 diff --git a/docu/images/split-brain-resolved.fig b/docu/images/split-brain-resolved.fig new file mode 100644 index 00000000..c964ea5f --- /dev/null +++ b/docu/images/split-brain-resolved.fig @@ -0,0 +1,18 @@ +#FIG 3.2 Produced by xfig version 3.2.5c +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 1 0 3 + 5 1 1.00 60.00 120.00 + 450 675 2925 675 3600 900 +2 1 1 2 0 7 50 -1 -1 6.000 0 0 -1 1 0 2 + 5 0 1.00 60.00 120.00 + 2925 675 3600 450 +4 0 0 50 -1 0 12 0.0000 4 135 135 3735 540 A\001 +4 0 0 50 -1 0 12 0.0000 4 135 135 3735 
900 B\001 +4 1 0 50 -1 0 10 0.0000 4 135 1650 1755 585 common part of history\001 diff --git a/docu/mars-manual.lyx b/docu/mars-manual.lyx index afdb23cd..5ab48328 100644 --- a/docu/mars-manual.lyx +++ b/docu/mars-manual.lyx @@ -28945,6 +28945,2226 @@ demand Tips and Tricks \end_layout +\begin_layout Section +Avoiding Inappropriate Clustermanager Types for Medium and Long-Distance + Replication +\end_layout + +\begin_layout Standard +This section addresses some widespread misconceptions. + Its main target audience is developers, but sysadmins will profit from + +\series bold +detailed explanations of problems and pitfalls +\series default +. + When the problems described in this section are solved at some point in the future, + this section will be shortened and some relevant parts moved to the appendix. +\end_layout + +\begin_layout Standard +Doing +\series bold +High Availability (HA) +\series default + wrong at +\emph on +concept level +\emph default + may easily get you into trouble, and may cost you several millions of € + or $ in larger installations, or even knock you out of business when disasters + are badly dealt with at higher levels such as clustermanagers. +\end_layout + +\begin_layout Subsection +General Cluster Models +\end_layout + +\begin_layout Standard +The most commonly known cluster model is called +\series bold +shared-disk +\series default +, and typically controlled by clustermanagers like +\family typewriter +PaceMaker +\family default +: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/shared-disk-model.fig + width 50col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The most important property of shared-disk is that there exists only a single + disk instance. + Nowadays, this disk often has some +\emph on +internal +\emph default + redundancy such as RAID. 
+ At +\emph on +system +\emph default + architecture layer / network level, there exists no redundant disk at all. + Only the application cluster is built redundant. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + It should be immediately clear that shared-disk clusters are only suitable + for short-distance operations in the same datacenter. + Although running one of the data access lines over short distances between + very near-by datacenters (e.g. + 1 km) would be theoretically possible, there would be no sufficient protection + against failure of a whole datacenter. +\end_layout + +\begin_layout Standard +Both DRBD and MARS belong to a different architectural model called +\series bold +shared-nothing +\series default +: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/shared-nothing-model.fig + width 50col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The characteristic feature of a shared-nothing model is (additional) +\series bold + redundancy at network level +\series default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Shared-nothing +\begin_inset Quotes eld +\end_inset + +clusters +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that the term +\begin_inset Quotes eld +\end_inset + +cluster computing +\begin_inset Quotes erd +\end_inset + + usually refers to short-distance only. + Long-distance coupling should be called +\begin_inset Quotes eld +\end_inset + +grid computing +\begin_inset Quotes erd +\end_inset + + in preference. + As known from the scientific literature, grid computing requires different + concepts and methods in general. 
+ Only for the sake of simplicity, we use +\begin_inset Quotes eld +\end_inset + +cluster +\begin_inset Quotes erd +\end_inset + + and +\begin_inset Quotes eld +\end_inset + +grid +\begin_inset Quotes erd +\end_inset + + interchangeably. +\end_layout + +\end_inset + + +\begin_inset Quotes erd +\end_inset + + could theoretically be built for +\emph on +any +\emph default + distances, from short to medium to long distances. + However, concrete technologies of disk coupling such as synchronous operation + may pose practical limits on the distances (see chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Use-Cases-for" + +\end_inset + +). +\end_layout + +\begin_layout Standard +In general, clustermanagers must fit to the model. + Some clustermanager can be configured to fit to multiple models. + If so, this must be done properly, or you may get into serious trouble. +\end_layout + +\begin_layout Standard +Some people don't know, or they don't believe, that different architectural + models like shared-disk or shared-nothing will +\emph on +require +\emph default + an +\emph on +appropriate +\emph default + type of clustermanager and/or a different configuration. + Failing to do so, by selection of an inappropriate clustermanager type + and/or an inappropriate configuration may be hazardous. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Selection of the right model alone is not sufficient. + Some, if not many, clustermanagers have not been designed for long distances. + As explained in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Special-Requirements-for" + +\end_inset + +, long distances have further +\series bold +hard requirements +\series default +. + Disregarding them may be also hazardous! 
+\end_layout + +\begin_layout Subsection +Handover / Failover Reasons and Scenarios +\end_layout + +\begin_layout Standard +From a sysadmin perspective, there exist a number of different +\series bold +reasons +\series default + why the application workload must be switched from the currently active + side A to the currently passive side B: +\end_layout + +\begin_layout Enumerate +Some +\series bold +defect +\series default + has occurred at cluster side A or at some corresponding part of the network. +\end_layout + +\begin_layout Enumerate +Some +\series bold +maintenance +\series default + has to be done at side A which would cause a longer downtime (e.g. + security kernel update or replacement of core network equipment or maintenance + of UPS or of the BBU cache etc - hardware isn't 24/7/365 in practice, although + some vendors +\emph on +claim +\emph default + it - it is either not really true, or it becomes +\emph on +extremely +\emph default + expensive). +\end_layout + +\begin_layout Standard +Both reasons are valid and must be automatically handled in larger installations. + In order to deal with all of these reasons, the following basic mechanisms + can be used in either model: +\end_layout + +\begin_layout Enumerate + +\series bold +Failover +\series default + (triggered either manually or automatically) +\end_layout + +\begin_layout Enumerate + +\series bold +Handover +\series default + (triggered manually +\begin_inset Foot +status open + +\begin_layout Plain Layout +Automatic triggering could be feasible for prophylactic treatments. +\end_layout + +\end_inset + +) +\end_layout + +\begin_layout Standard +It is important to not confuse handover with failover at concept level. + Not only the reasons / preconditions are very different, but also the +\emph on +requirements +\emph default +. 
+ Example: precondition for handover is that +\emph on +both +\emph default + cluster sides are healthy, while precondition for failover is that +\emph on +some relevant(!) +\emph default + failure has been +\emph on +detected +\emph default + somewhere (whether this is +\emph on +really +\emph default + true is another matter). + Typically, failover must be able to run in masses, while planned handover + often has lower scaling requirements. +\end_layout + +\begin_layout Standard +Not all existing clustermanagers are dealing with all of these cases (or + their variants) equally well, and some are not even dealing with some of + these cases / variants +\emph on +at all +\emph default +. + +\end_layout + +\begin_layout Standard +Some clustermanagers cannot easily express the concept of +\begin_inset Quotes eld +\end_inset + +automatic triggering +\begin_inset Quotes erd +\end_inset + + versus +\begin_inset Quotes eld +\end_inset + +manual triggering +\begin_inset Quotes erd +\end_inset + + of an action. + There exists simply no cluster-global switch which selects either +\begin_inset Quotes eld +\end_inset + +manual mode +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +automatic mode +\begin_inset Quotes erd +\end_inset + + (except when you start to hack the code and/or write new plugins; then + you might notice that there is almost no architectural layering / sufficient + separation between mechanism and strategy). + Being forced to permanently use an automatic mode for several hundreds + or even thousands of clusters is not only boring, but bears a considerable + risk when automatics do a wrong decision at hundreds of instances in parallel. 
+\end_layout + +\begin_layout Subsection +Granularity and Layering Hierarchy for Long Distances +\end_layout + +\begin_layout Standard +Many existing clustermanager solutions are dealing with a single cluster + instance, as the term +\begin_inset Quotes eld +\end_inset + + +\emph on +cluster +\emph default +manager +\begin_inset Quotes erd +\end_inset + + suggests. + However, when running several hundreds or thousands of cluster instances, + you likely will not want to manage each of them individually. + In addition, failover should +\emph on +not only +\emph default + be +\emph on +triggered +\emph default + (not to be confused with +\emph on +executed +\emph default +) individually at cluster level, but likely +\emph on +also +\emph default + at a higher granularity such as a room, or a whole datacenter. + Otherwise, some chaos is likely to happen. +\end_layout + +\begin_layout Standard +Here is what you probably will +\series bold +need +\series default +, possibly in difference to what you may find on the market (whether OpenSource + or not). + For simplicity, the following diagram shows only two levels of granularity, + but can be easily extended to multiple layers of granularity, or to some + concept of various +\emph on +subsets of clusters +\emph default +: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/clustermanager-hierarchy.fig + width 70col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Notice that many existing clustermanager solutions are not addressing the + datacenter granularity at all. 
+ Typically, they use concepts like +\series bold +quorums +\series default + for determining failures +\emph on +at cluster level +\emph default + solely, and then immediately executing failover of the cluster, sometimes + without clean architectural distinction between trigger and execution (similar + to the +\begin_inset Quotes eld +\end_inset + +separation of concerns +\begin_inset Quotes erd +\end_inset + + between +\series bold +mechanism +\series default + and +\series bold +strategy +\series default + in Operating Systems). + Sometimes there is even no internal software layering / modularization + according to this separation of concerns at all. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + When there is no distinction between different levels of granularity, you + are hopelessly bound to a non-extensible and thus non-adaptable system + when you need to operate masses of clusters. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + A lacking distinction between automatic mode and manual mode, and/or lack + of corresponding +\series bold +architectural software layers +\series default + is not only a blatant ignoration of well-established best practices of + +\series bold +software engineering +\series default +, but will bind you even more firmly to an inflexible system. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Terminology: for practical reasons, we use the general term +\begin_inset Quotes eld +\end_inset + +clustermanager +\begin_inset Quotes erd +\end_inset + + also for speaking about layers dealing with higher granularity, such as + datacenter layers, and also for long-distance replication scenarios, although + some terminology from grid computing would be more appropriate in a scientific + background. +\end_layout + +\begin_layout Standard +Please consider the following: when it comes to long-distance HA, the above + layering architecture is also motivated by vastly different numbers of + instances for each layer. + Ideally, the topmost automatics layer should be able to overview several + datacenters in parallel, in order to cope with (almost) global network + problems such as network partitions. + Additionally, it should also detect single cluster failures, or intermediate + problems like +\begin_inset Quotes eld +\end_inset + +rack failure +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +room failure +\begin_inset Quotes erd +\end_inset + +, as well as various types of (partial / intermediate) (replication) network + failures. + Incompatible decisions at each of the different granularities would be + a no-go in practice. + Somewhere and somehow, you need one single +\begin_inset Foot +status open + +\begin_layout Plain Layout +If you have +\emph on +logical pairs of datacenters +\emph default + which are firmly bound together, you could also have several topmost automatics + instances, e.g. + for each +\emph on +pair +\emph default + of datacenters. + However, that would be very +\series bold +inflexible +\series default +, because then you cannot easily mix locations or migrate your servers between + datacenters. 
+ Using +\begin_inset Formula $k>2$ +\end_inset + + replicas with MARS would also become a nightmare. + In your own interest, please don't create any concepts where masses of + hardware are firmly bound to fixed constants at some software layers. +\end_layout + +\end_inset + + top-most +\emph on +logical +\emph default + problem detection / ranking instance, which should be +\emph on +internally distributed +\emph default + of course, typically using some +\series bold +distributed consensus protocol +\series default +; but in difference to many published distributed consensus algorithms it + should be able to work with multiple granularities at the same time. +\end_layout + +\begin_layout Subsection +Methods and their Appropriateness +\end_layout + +\begin_layout Subsubsection +Failover Methods +\begin_inset CommandInset label +LatexCommand label +name "sub:Failover-Methods" + +\end_inset + + +\end_layout + +\begin_layout Standard +Failover methods are only needed in case of an incident. + They should not be used for regular handover. +\end_layout + +\begin_layout Paragraph +STONITH-like Methods +\end_layout + +\begin_layout Standard +STONITH = Shoot The Other Node In The Head +\end_layout + +\begin_layout Standard +These methods are widely known, although they have several serious drawbacks. + Some people even believe that +\emph on +any +\emph default + clustermanager must +\emph on +always +\emph default + have some STONITH-like functionality. + This is wrong. + There +\emph on +exist +\emph default + alternatives, as shown in the next paragraph. +\end_layout + +\begin_layout Standard +The most obvious drawback is that STONITH will always create a +\series bold +damage +\series default +, by definition. +\end_layout + +\begin_layout Standard +Example: a typical contemporary STONITH implementation uses IPMI for automatical +ly powering off your servers, or at least pushes the (virtual) reset button. 
+ This will +\emph on +always +\emph default + create a certain type of damage: the affected systems will definitely not + be available, at least for some time until they have (manually) rebooted. +\end_layout + +\begin_layout Standard +This is a conceptual contradiction: the reason for starting failover is + that you want to restore availability as soon as possible, but in order + to do so you will first +\emph on +destroy +\emph default + the availability of a particular +\emph on +component +\emph default +. + This may be counter-productive. +\end_layout + +\begin_layout Standard +Example: when your hot standby node B does not work as expected, or if it + works even +\emph on +worse +\emph default + than A before, you will loose some time until you +\emph on +can +\emph default + become operational again at the old side A. +\end_layout + +\begin_layout Standard +Here is an example method for handling a failure scenario. + The old active side A is assumed to be no longer healthy anymore. + The method uses a sequential state transition chain with a STONITH-like + step: +\end_layout + +\begin_layout Description +Phase1 Check whether the hot standby B is currently usable. + If this is violated (which may happen during certain types of disasters), + abort the failover for any affected resources. +\end_layout + +\begin_layout Description +Phase2 +\emph on +Try +\emph default + to shutdown the damaged side A (in the +\emph on +hope +\emph default + that there is no +\emph on +serious +\emph default + damage). +\end_layout + +\begin_layout Description +Phase3 In case phase2 did not work during a grace period / after a timeout, + assume that A is badly damaged and therefore STONITH it. +\end_layout + +\begin_layout Description +Phase4 Start the application at the hot standby B. 
+\end_layout + +\begin_layout Standard +Notice: any cleanup actions, such as +\series bold +repair +\series default + of defective hard- or software etc, are outside the scope of failover processes. + Typically, they are executed much later when restoring redundancy. +\end_layout + +\begin_layout Standard +Also notice: this method is a +\emph on +heavily +\emph default + distributed one, in the sense that sequential actions are alternated multiple + times on different hosts. + This is known to be cumbersome in distributed systems, in particular in + presence of network problems. +\end_layout + +\begin_layout Standard +\begin_inset CommandInset label +LatexCommand label +name "Phase4-in-more" + +\end_inset + +Phase4 in more detail for DRBD, augmented with some pseudo code for application + control: +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +drbdadm disconnect all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +drbdadm primary --force all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +applicationmanager start all +\end_layout + +\begin_layout Standard +The same phase4 using MARS: +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +marsadm pause-fetch all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +marsadm primary --force all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +applicationmanager start all +\end_layout + +\begin_layout Standard +This sequential 4-phase method is far from optimal, for the following reasons: +\end_layout + +\begin_layout Itemize +The method tries to handle both failover and handover scenarios with one + single sequential receipe. + In case of a true failover scenario where it is +\emph on +already known for sure +\emph default + that side A is badly damaged, this method will unnecessarily waste time + for phase 2. 
+ This could be fixed by introduction of a conceptual distinction between + handover and failover, but it would not fix the following problems. +\end_layout + +\begin_layout Itemize +Before phase4 is started (which will re-establish the service from a user's + perspective), a lot of time is wasted by +\emph on +both +\emph default + phases 2 +\emph on +and +\emph default + 3. + Even if phase 2 would be skipped, phase 3 would unnecessarily cost some + time. + In the next paragraph, an alternative method is explained which eliminates + any unnecessary waiting time at all. +\end_layout + +\begin_layout Itemize +The above method is adapted to the shared-disk model. + It does not take advantage of the shared-nothing model, where further possibili +ties for better solutions exist. +\end_layout + +\begin_layout Itemize +In case of long-distance network partitions and/or sysadmin / system management + subnetwork outages, you may not even be able to (remotely) start STONITH + at all. + Thus the above method misses an important failure scenario. +\end_layout + +\begin_layout Standard +Some people seem to have a +\emph on +binary +\emph default + view at the healthiness of a system: in their view, a system is either + operational, or it is damaged. + This kind of view is ignoring the fact that some systems may be half-alive, + showing only +\emph on +minor +\emph default + problems, or occurring only from time to time. +\end_layout + +\begin_layout Standard +It is obvious that damaging a healthy system is a bad idea by itself. 
+ Even +\emph on +generally +\emph default + damaging a half-alive system in order to +\begin_inset Quotes eld +\end_inset + +fix +\begin_inset Quotes erd +\end_inset + + problems is not generally a good idea, because it may increase the damage + when you don't know the +\emph on +real +\emph default + reason +\begin_inset Foot +status open + +\begin_layout Plain Layout +Example, occurring in masses: an incorrectly installed bootloader, or a + wrong BIOS boot priority order which unexpectedly lead to hangs or infinite + reboot cycles once the DHCP or BOOTP servers are not longer available / + reachable. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +Even worse: in a distributed system +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice: the STONITH concept is more or less associated with short-distance + scenarios where +\series bold +crossover cables +\series default + or similare equipment are used. + The assumption is that crossover cables can't go defective, or at least + it would be an extremely unlikely scenario. + For long-distance replication, this assumption is simply not true. +\end_layout + +\end_inset + + you sometimes +\emph on +cannot(!) +\emph default + know whether a system is healthy, or to what degree it is healthy. + Typical STONITH methods as used in some contemporary clustermanagers are + +\series bold +assuming a worst case +\series default +, even if that worst case is currently not for real. +\end_layout + +\begin_layout Standard +Therefore, avoid the following +\series bold +fundamental flaws +\series default + in failover concepts and healthiness models, which apply to implementors + / configurators of clustermanagers: +\end_layout + +\begin_layout Itemize +Don't mix up knowledge with conclusions about a (sub)system, and also don't + mix this up with the real state of that (sub)system. + In reality, you don't have any knowledge about a complex distributed system. 
+ You only may have
+\emph on
+some
+\emph default
+ knowledge about
+\emph on
+some
+\emph default
+ parts of the system, but you cannot
+\begin_inset Quotes eld
+\end_inset
+
+see
+\begin_inset Quotes erd
+\end_inset
+
+ a complex distributed system as a whole.
+ What you think is your knowledge, isn't knowledge in reality: in many cases,
+ it is
+\emph on
+conclusion
+\emph default
+, not knowledge.
+ Don't mix this up!
+\end_layout
+
+\begin_layout Itemize
+Some systems are more complex than your model of them.
+ Don't neglect important parts (such as networks, routers, switches, cables,
+ plugs) which may lead you to wrong conclusions!
+\end_layout
+
+\begin_layout Itemize
+Don't restrict your mind to boolean models of healthiness.
+ Doing so can easily create unnecessary damage by construction, and even
+ at concept level.
+ You should know from software engineering that defects in concepts or models
+ are much more serious than simple bugs in implementations.
+ Choosing the wrong model cannot be fixed as easily as a typical bug or
+ a typo.
+\end_layout
+
+\begin_layout Itemize
+Try to deduce the state of a system as
+\series bold
+reliably
+\series default
+ as possible.
+ If you don't know something for sure, don't generally assume that it has
+ gone wrong.
+ Don't confuse missing knowledge with the conclusion that something is bad.
+ Boolean algebra restricts your mind to either
+\begin_inset Quotes eld
+\end_inset
+
+good
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+bad
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Use at least
+\series bold
+tri-state algebra
+\series default
+ which has a means for expressing
+\series bold
+ 
+\begin_inset Quotes eld
+\end_inset
+
+unknown
+\begin_inset Quotes erd
+\end_inset
+
+ 
+\series default
+.
+ Even better: attach a probability to anything you (believe to) know.
+ Errare humanum est: nothing is absolutely sure.
+\end_layout
+
+\begin_layout Itemize
+Oversimplification: don't report an
+\begin_inset Quotes eld
+\end_inset
+
+unknown
+\begin_inset Quotes erd
+\end_inset
+
+ or even a
+\begin_inset Quotes eld
+\end_inset
+
+broken
+\begin_inset Quotes erd
+\end_inset
+
+ state for a complex system whenever a smaller subsystem exists for which
+ you have some knowledge (or you can conclude something about it with reasonable
+ evidence).
+ Otherwise, your users / sysadmins may draw wrong conclusions, and assume
+ that the whole system is broken, while in reality only some minor part
+ has some minor problem.
+ Users could then likely make wrong decisions, which may then easily lead
+ to bigger damages.
+\end_layout
+
+\begin_layout Itemize
+Murphy's law: 
+\series bold
+never assume that something can't go wrong!
+\series default
+ Doing so is a blatant misconception at topmost level: the
+\emph on
+purpose
+\emph default
+ of a clustermanager is creating High Availability (HA) out of more or less
+ 
+\begin_inset Quotes eld
+\end_inset
+
+unreliable
+\begin_inset Quotes erd
+\end_inset
+
+ components.
+ It is the damn duty of both a clustermanager and its configurator to try
+ to compensate
+\emph on
+any
+\emph default
+ failures, 
+\emph on
+regardless of their probability
+\emph default
+ 
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Never claim that something has only low probability (and therefore it were
+ not relevant).
+ In the HA area, you simply
+\series bold
+cannot know
+\series default
+ that, because you typically have
+\emph on
+sporadic
+\emph default
+ incidents.
+ In extreme cases, the
+\emph on
+purpose
+\emph default
+ of your HA solution is protection against 1 failure per 10 years.
+ You simply don't have the time to wait for creating incident statistics
+ about that!
+\end_layout
+
+\end_inset
+
+, as best as possible.
+\end_layout
+
+\begin_layout Itemize
+Never confuse
+\series bold
+probability
+\series default
+ with
+\series bold
+ expectancy value!
+\series default
+ If you don't know the mathematical term
+\begin_inset Quotes eld
+\end_inset
+
+expectancy value
+\begin_inset Quotes erd
+\end_inset
+
+, or if you don't know what this means
+\emph on
+in practice
+\emph default
+, don't take responsibility for millions of € or $.
+\end_layout
+
+\begin_layout Itemize
+When operating masses of hard- and software: never assume that a particular
+ failure can occur only at a low number of instances.
+ There are
+\series bold
+\emph on
+unknown(!)
+\emph default
+ systematic errors
+\series default
+ which may pop up at the wrong time and in huge masses when you don't expect
+ them.
+\end_layout
+
+\begin_layout Itemize
+Multiple layers of fallback: 
+\emph on
+any
+\emph default
+ action can fail.
+ Be prepared to have a plan B, and even a plan C, and even better a plan
+ D, wherever possible.
+\end_layout
+
+\begin_layout Itemize
+Never increase any damage anywhere, unnecessarily! Always try to
+\emph on
+minimize
+\emph default
+ any damage! It can be mathematically proven that in deterministic probabilistic
+ systems having finite state, increases of a damage level
+\emph on
+at the wrong place
+\emph default
+ will
+\emph on
+introduce
+\emph default
+ an
+\emph on
+additional
+\emph default
+ 
+\emph on
+risk
+\emph default
+ of getting into an
+\series bold
+endless loop
+\series default
+.
+ This is also true for nondeterministic systems, as known from formal language
+ theory
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Finite automatons are known to be transformable to deterministic ones, usually
+ by an exponential increase in the number of states.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+Use the
+\series bold
+best effort principle
+\series default
+.
+ You should be aware of the following fact: in general, it is impossible + to create an +\emph on +absolutely reliable system +\emph default + out of unreliable components. + You can +\emph on +lower +\emph default + the risk of failures to any +\begin_inset Formula $\epsilon>0$ +\end_inset + + by investing a lot of resources and of money, but whatever you do: +\begin_inset Formula $\epsilon=0$ +\end_inset + + is impossible. + Therefore, be careful with boolean algebra. + Prefer approximation methods / optimizing methods instead. + Always do +\emph on +your +\emph default + best, instead of trying to reach a +\emph on +global +\emph default + optimum which likely does not exist at all (because the +\begin_inset Formula $\epsilon$ +\end_inset + + can only +\emph on +converge +\emph default + to an optimum, but will never actually reach it). + The best effort principle means the following: if you discover a method + for improving your operating state by reduction of a (potential) damage + in a reasonable time and with reasonable effort, then +\series bold +simply do it +\series default +. + Don't argue that a particular step is no 100% solution for all of your + problems. + +\emph on +Any +\emph default + +\emph on +improvement +\emph default + is valuable. + +\series bold +Don't miss any valuable step +\series default + having reasonable costs with respect to your budget. + Missing valuable measures which have low costs are certainly a violation + of the best effort principle, because you are not doing +\emph on +your +\emph default + best. + Keep that in mind. +\begin_inset Newline newline +\end_inset + +If you have +\emph on +understood +\emph default + this (e.g. + deeply think at least one day about it), you will no longer advocate STONITH + methods +\emph on +in general +\emph default +, when there are alternatives. 
+ STONITH methods are only valuable when you +\emph on +know in advance +\emph default + that the final outcome (after reboot) will most likely be better, and that + waiting for reboot will most likely +\emph on +pay off +\emph default +. + In general, this condition is +\emph on +not true +\emph default + if you have a healthy hot standby system. + This should be easy to see. + But there exist well-known clustermanager solutions / configurations blatantly + ignoring +\begin_inset Foot +status open + +\begin_layout Plain Layout +For some +\emph on +special(!) +\emph default + cases of the shared-disk model, there exist some justifications for doing + STONITH +\emph on +before +\emph default + starting the application at the hot standby. + Under certain circumstances, it can happen that system A running amok could + destroy the data on your single shared disk (example: a filesystem doubly + mounted +\emph on +in parallel +\emph default +, which will certainly destroy your data, except you are using +\family typewriter +ocfs2 +\family default + or suchalike). + This argument is only valid for +\emph on +passive +\emph default + disks which are +\emph on +directly +\emph default + attached to +\emph on +both +\emph default + systems A and B, such that there is no +\emph on +external +\emph default + means for fencing the disk. + In case of iSCSI running over ordinary network equipment such as routers + or switches, the argument +\begin_inset Quotes eld +\end_inset + +fencing the disk is otherwise not possible +\begin_inset Quotes erd +\end_inset + + does not apply. + You can interrupt iSCSI connection at the network gear, or you can often + do it at cluster A or at the iSCSI target. + Even commercial storage appliances speaking iSCSI can be remotely controlled + for forcefully aborting iSCSI sessions. + In modern times, the STONITH method has no longer such a justification. 
+ The justification stems from ancient times when a disk was a purely passive
+ mechanical device, and its disk controller was part of the server system.
+\end_layout
+
+\end_inset
+
+ this.
+ Only when the former standby system does not work as expected (this means
+ that
+\emph on
+all
+\emph default
+ of your redundant systems are not healthy enough for your application),
+ 
+\emph on
+only then
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that STONITH may be needed for (manual or partially automatic)
+\emph on
+repair
+\emph default
+ in some cases, e.g.
+ when you know that a system has a kernel crash.
+ Don't mix up the repair phase with failover or handover phases.
+ Typically, they are executed at different times.
+ The repair phase is outside the scope of this section.
+\end_layout
+
+\end_inset
+
+ 
+\emph default
+ STONITH is inevitable as a
+\emph on
+last resort
+\emph default
+ option.
+\begin_inset Newline newline
+\end_inset
+
+In short: blindly using STONITH without true need during failover is a violation
+ of the best effort principle.
+ You are simply not doing your best.
+\end_layout
+
+\begin_layout Itemize
+When your budget is limited, carefully select those improvements which make
+ your system
+\series bold
+as reliable as possible
+\series default
+, given your fixed budget.
+\end_layout
+
+\begin_layout Itemize
+Create statistics on the duration of your actions.
+ Based on this, try to get a
+\emph on
+balanced
+\emph default
+ optimum between time and costs.
+\end_layout
+
+\begin_layout Itemize
+Whatever actions you can
+\series bold
+start in parallel
+\series default
+ for saving time, do it.
+ Otherwise you are disregarding the best effort principle, and your solution
+ will be sub-optimal.
+ You will require deep knowledge of parallel systems, as well as experience
+ with dealing with problems like (distributed) races.
+ Notice that +\emph on +any +\emph default + distributed system is +\emph on +inherently parallel +\emph default +. + Don't believe that sequential methods can deliver an optimum solution in + such a difficult area. +\end_layout + +\begin_layout Itemize +If you don't have the +\series bold +necessary skills +\series default + for (a) recognizing already existing parallelism, (b) dealing with parallelism + at concept level, (c) programming and/or configuring parallelism race-free + and deadlock-free (or if you even don't know what a race condition is and + where it may occur in practice), then don't take responsibility for millions + of € or $. +\end_layout + +\begin_layout Itemize +Avoid hard timeouts wherever possible. + Use +\series bold +adaptive timeouts +\series default + instead. + Reason: depending on hardware or workload, the same action A may take a + very short time on cluster 1, but take a very long time on cluster 2. + If you need to guard action A from hanging (which is almost always the + case because of Murphy's law), don't configure any fixed timeout for it. + When having several hundreds of clusters, you would need to use the +\emph on +worst case value +\emph default +, which is the longest time occurring somewhere at the very slow clusters + / slow parts of the network. + This wastes a lot of time in case one of the fast clusters is hanging. + Adaptive timeouts work differently: they use a kind of +\begin_inset Quotes eld +\end_inset + +progress bar +\begin_inset Quotes erd +\end_inset + + to monitor the +\emph on +progress +\emph default + of an action. + They will abort only if there is +\emph on +no progress +\emph default + for a certain amount of time. + Hint: among others, +\family typewriter +marsadm view-*-rest +\family default + commands or macros are your friend. 
+\end_layout + +\begin_layout Paragraph +ITON = Ignore The Other Node +\end_layout + +\begin_layout Standard +This means +\series bold +fencing from application traffic +\series default +, and can be used as an alternative to STONITH when done properly. +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/fencing-hierarchy.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Fencing from application traffic is best suited for the shared-nothing model, + but can also be adapted to the shared-disk model with some quirks. +\end_layout + +\begin_layout Standard +The idea is simple: always route your application network traffic to the + current (logically) active side, whether it is currently A or B. + Just don't route any application requests to the current (logically) passive + side at all. +\end_layout + +\begin_layout Standard +For failover (and +\emph on +only +\emph default + for that), you +\emph on +should not care about +\emph default + any split brain occurring at the low-level generic block device: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/split-brain-history.fig + width 50col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Although having a split brain at the generic low-level block device, you + now define the +\begin_inset Quotes eld +\end_inset + +logically active +\begin_inset Quotes erd +\end_inset + + and +\begin_inset Quotes eld +\end_inset + +logically passive +\begin_inset Quotes erd +\end_inset + + side by yourself by +\emph on +logically ignoring +\emph default + the +\begin_inset Quotes eld +\end_inset + +wrong +\begin_inset Quotes erd +\end_inset + + side as defined by yourself: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/split-brain-resolved.fig + width 50col% + +\end_inset + + +\end_layout + +\begin_layout 
Standard +\noindent +This is possible because the generic block devices provided by DRBD or MARS + are completely +\series bold +agnostic +\series default + of the +\begin_inset Quotes eld +\end_inset + +meaning +\begin_inset Quotes erd +\end_inset + + of either version A or B. + Higher levels such as clustermanagers (or humans like sysadmins) can assign + them a meaning like +\begin_inset Quotes eld +\end_inset + +relevant +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +not relevant +\begin_inset Quotes erd +\end_inset + +, or +\begin_inset Quotes eld +\end_inset + +logically active +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +logically passive +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +As a result of fencing from application traffic, the +\begin_inset Quotes eld +\end_inset + +logically passive +\begin_inset Quotes erd +\end_inset + + side will +\emph on +logically +\emph default + cease any actions such as updating user data, even if it is +\begin_inset Quotes eld +\end_inset + +physically active +\begin_inset Quotes erd +\end_inset + + during split-brain (when two primaries exist in DRBD or MARS sense +\begin_inset Foot +status open + +\begin_layout Plain Layout +Hint: some clustermanagers and/or some people seem to define the term +\begin_inset Quotes eld +\end_inset + +split-brain +\begin_inset Quotes erd +\end_inset + + differently from DRBD or MARS. + In the context of generic block devices, split brain means that the +\emph on +history +\emph default + of both versions has been split to a Y-like +\series bold +fork +\series default + (for whatever reason), such that re-joining them +\emph on +incrementally +\emph default + by ordinary write operations is no longer guaranteed to be possible. 
+ As a slightly simplified definition, you might alternatively use the definition + +\begin_inset Quotes eld +\end_inset + +two incompatible primaries are existing in parallel +\begin_inset Quotes erd +\end_inset + +, which means almost the same in practice. + Details of formal semantics are not the scope of this treatment. +\end_layout + +\end_inset + +). +\end_layout + +\begin_layout Standard +If you already have some load balancing, or BGP, or another +\emph on +mechanism +\emph default + for dynamic routing, you already have an important part for the ITON method. + Additionally, ensure by an appropriate +\emph on +strategy +\emph default + that your balancer status / BGP announcement etc does always coincide with + the +\begin_inset Quotes eld +\end_inset + +logically active +\begin_inset Quotes erd +\end_inset + + side (recall that even during split-brain +\emph on +you +\emph default + must define +\begin_inset Quotes eld +\end_inset + +logically active +\begin_inset Quotes erd +\end_inset + + +\series bold +uniquely +\series default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +A possible strategy is to use a Lamport clock for route changes: the change + with the most recent Lamport timestamp will always win over previous changes. +\end_layout + +\end_inset + + by yourself). +\end_layout + +\begin_layout Standard +Example: +\end_layout + +\begin_layout Description +Phase1 Check whether the hot standby B is currently usable. + If this is violated (which may happen during certain types of disasters), + abort the failover for any affected resources. +\end_layout + +\begin_layout Description +Phase2 Do the following +\emph on +in parallel +\begin_inset Foot +status open + +\begin_layout Plain Layout +For database applications where no transactions should get lost, you should + slightly modify the order of operations: first fence the old side A, then + start the application at standby side B. 
+ However, be warned that even this cannot guarantee that no transaction + is lost. + When the network between A and B is interrupted +\emph on +before +\emph default + the incident happens, DRBD will automatically disconnect, and MARS will + show a lagbehind. + In order to fully eliminate this possibility, you can either use DRBD and + configure it to hang forever during network outages (such that users will + be unable to commit any transactions at all), or you can use the shared-disk + model instead. + But in the latter case, you are introducing a SPOF at the single shared + disk. + The former case is logically almost equivalent to shared-disk, but avoiding + some parts of the physical SPOF. + In a truly distributed system, the famous CAP theorem is limiting your + possibilities. + Therefore, no general solution exists fulfilling all requirements at the + same time. +\end_layout + +\end_inset + +: +\end_layout + +\begin_deeper +\begin_layout Itemize +Start all affected applications at the hot standby B. + This can be done with the same DRBD or MARS procedure as described +\begin_inset CommandInset ref +LatexCommand vpageref +reference "Phase4-in-more" + +\end_inset + +. +\end_layout + +\begin_layout Itemize +Fence A by fixedly routing all affected application traffic to B. +\end_layout + +\end_deeper +\begin_layout Standard +That's all which has to be done for a shared-nothing model. + Of course, this will likely produce a split-brain (even when using DRBD + in place of MARS), but that will not matter from a user's perspective, + because the users will no longer +\begin_inset Quotes eld +\end_inset + +see +\begin_inset Quotes erd +\end_inset + + the +\begin_inset Quotes eld +\end_inset + +logically passive +\begin_inset Quotes erd +\end_inset + + side A through their network. 
+ Only during the relatively small time period where application traffic + was going to the old side A while not replicated to B due to the incident, + a very small number of updates +\emph on +could +\emph default + have gone lost. + In fields like webhosting, this is taken into account. + Users will usually not complain when some (smaller amount of) data is lost + due to split-brain. + They will complain when the service is unavailable. +\end_layout + +\begin_layout Standard +This method is the fastest for restoring availability, because it doesn't + try to execute any (remote) action at side A. + Only from a sysadmin's perspective, there remain some cleanup tasks to + be done during the following repair phase, such as split-brain resolution, + which are outside the scope of this treatment. +\end_layout + +\begin_layout Standard +By running the application fencing step +\emph on +sequentially +\emph default + (including wait for its partial successfulness such that the old side A + can no longer be reached by any users) in front of the failover step, you + may minimize the amount of lost data, but at the cost of total duration. + Your service will take longer to be available again, while the amount of + lost data is typically somewhat smaller. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + A few people might clamour when some data is lost. + In long-distance replication scenarios with high update traffic, there + is +\emph on +simply no way at all +\emph default + for guaranteeing that no data can be lost ever. + According to the laws of Einstein and the laws of Distributed Systems like + the famous CAP theorem, this isn't the fault of DRBD+proxy or MARS, but + simply the +\emph on +consequence +\emph default + of having long distances. 
+ If you want to protect against data loss as best as possible, then don't + use +\begin_inset Formula $k=2$ +\end_inset + + replicas. + Use +\begin_inset Formula $k\geq4$ +\end_inset + +, and spread them over different distances, such as mixed small + medium + + long distances. + Future versions of MARS will support adaptive pseudo-synchronous modes, + which will allow individual adaptation to network latencies / distances. +\end_layout + +\begin_layout Standard +The ITON method can be adapted to shared-disk by additionally fencing the + common disk from the (presumably) failed cluster node A. +\end_layout + +\begin_layout Subsubsection +Handover Methods +\end_layout + +\begin_layout Standard +Planned handover is conceptually simpler, because both sides must be (almost) + healthy as a +\emph on +precondition +\emph default +. + There are simply no pre-existing failures to deal with. +\end_layout + +\begin_layout Standard +Here is an example using DRBD, some application commands denoted as pseudo + code: +\end_layout + +\begin_layout Enumerate +at side A: +\family typewriter +applicationmanager stop all +\end_layout + +\begin_layout Enumerate +at side A: +\family typewriter +drbdadm secondary all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +drbdadm primary all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +applicationmanager start all +\end_layout + +\begin_layout Standard +MARS already has a conceptual distinction between handover and failover. 
+ With MARS, it becomes even simpler, because a generic handover procedure
+ is already built in:
+\end_layout
+
+\begin_layout Enumerate
+at side A: 
+\family typewriter
+applicationmanager stop all
+\end_layout
+
+\begin_layout Enumerate
+at side B: 
+\family typewriter
+marsadm primary all
+\end_layout
+
+\begin_layout Enumerate
+at side B: 
+\family typewriter
+applicationmanager start all
+\end_layout
+
+\begin_layout Subsubsection
+Hybrid Methods
+\end_layout
+
+\begin_layout Standard
+In general, a planned handover may fail at any stage.
+ Notice that such a failure is also a failure, but (partially) caused by
+ the planned handover.
+ You have the following alternatives for automatically dealing with such
+ cases:
+\end_layout
+
+\begin_layout Enumerate
+In case of a failure, switch back to the old side A.
+\end_layout
+
+\begin_layout Enumerate
+Instead, forcefully switch to the new side B, similar to the methods described
+ in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Failover-Methods"

+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Similar options exist for a failed failover (at least in theory), but chances
+ are lower for actually recovering if you have only
+\begin_inset Formula $k=2$
+\end_inset
+
+ replicas in total.
+\end_layout
+
+\begin_layout Standard
+Whatever you decide to do in what case in whatever priority order, whether
+ you decide it in advance or during the course of a failing action: it simply
+ means that according to the best effort principle, you should
+\series bold
+never leave your system in a broken state
+\series default
+ when there exists a chance to recover availability with any method.
+\end_layout
+
+\begin_layout Standard
+Therefore, you should
+\emph on
+implement
+\emph default
+ neither handover nor failover in their pure forms.
+ Always implement hybrid forms following the best effort principle.
+\end_layout
+
+\begin_layout Subsection
+Special Requirements for Long Distances
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Special-Requirements-for"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Most contemporary clustermanagers have been constructed for short distance
+ shared-nothing clusters, or even for
+\emph on
+local
+\emph default
+ shared-nothing clusters (c.f.
+ DRBD over crossover cables), or even for shared-disk clusters (
+\emph on
+originally
+\emph default
+, when their
+\emph on
+concepts
+\emph default
+ were developed).
+ Blindly using them for long-distance replication without modification /
+ adaptation bears some additional risks.
+\end_layout
+
+\begin_layout Itemize
+Notice that long-distance replication always
+\emph on
+requires
+\emph default
+ a
+\series bold
+shared-nothing
+\series default
+ model.
+\end_layout
+
+\begin_layout Itemize
+As a consequence,
+\series bold
+split brain
+\series default
+ can appear
+\emph on
+regularly
+\emph default
+ during failover.
+ There is no way for preventing it! This is an
+\emph on
+inherent property
+\emph default
+ of distributed systems, not limited to MARS (e.g.
+ also occurring with DRBD if you try to use it over long distances).
+ Therefore, you
+\emph on
+must
+\emph default
+ deal with occurrences of split-brain as a
+\emph on
+requirement
+\emph default
+.
+\end_layout
+
+\begin_layout Itemize
+The probability of
+\series bold
+network partitions
+\series default
+ is much higher: although you should have been required by Murphy's law
+ to deal with network partitions already in short-distance scenarios, it
+ now becomes
+\emph on
+mandatory
+\emph default
+.
+\end_layout
+
+\begin_layout Itemize
+Be prepared that in case of certain types of (more or less global) internet
+ partitions, you may not be able to trigger STONITH actions
+\emph on
+at all
+\emph default
+.
+ Therefore, +\series bold +fencing of application traffic +\series default + is +\emph on +mandatory +\emph default +. +\end_layout + \begin_layout Section Creating Backups via Pseudo Snapshots \end_layout @@ -30946,7 +33166,7 @@ cm3 \end_layout \begin_layout Standard -If suchalike doesn't work, or if you need to handover some ressource +If suchalike doesn't work, or if you need to handover some resource \family typewriter $res1 \family default @@ -31135,7 +33355,7 @@ If suchalike doesn't work, or when a handover attempt has failed several \emph on really need \emph default - forceful switching of some ressource + forceful switching of some resource \family typewriter $res1 \family default