diff --git a/docu/mars-architecture-guide.lyx b/docu/mars-architecture-guide.lyx
new file mode 100644
index 00000000..173a0c00
--- /dev/null
+++ b/docu/mars-architecture-guide.lyx
@@ -0,0 +1,60260 @@
+#LyX 2.3 created this file. For more info see http://www.lyx.org/
+\lyxformat 544
+\begin_document
+\begin_header
+\save_transient_properties true
+\origin unavailable
+\textclass scrreprt
+\begin_preamble
+\usepackage{listings}
+\end_preamble
+\options abstracton,dvipsnames
+\use_default_options true
+\begin_modules
+customHeadersFooters
+enumitem
+fixltx2e
+\end_modules
+\maintain_unincluded_children false
+\language english
+\language_package default
+\inputencoding auto
+\fontencoding global
+\font_roman "default" "default"
+\font_sans "default" "default"
+\font_typewriter "default" "default"
+\font_math "auto" "auto"
+\font_default_family rmdefault
+\use_non_tex_fonts false
+\font_sc false
+\font_osf false
+\font_sf_scale 100 100
+\font_tt_scale 100 100
+\use_microtype false
+\use_dash_ligatures false
+\graphics default
+\default_output_format default
+\output_sync 0
+\bibtex_command default
+\index_command default
+\paperfontsize 10
+\spacing single
+\use_hyperref true
+\pdf_title "MARS Manual"
+\pdf_author "Thomas Schöbel-Theuer"
+\pdf_bookmarks true
+\pdf_bookmarksnumbered false
+\pdf_bookmarksopen false
+\pdf_bookmarksopenlevel 1
+\pdf_breaklinks true
+\pdf_pdfborder true
+\pdf_colorlinks true
+\pdf_backref false
+\pdf_pdfusetitle true
+\papersize a4paper
+\use_geometry true
+\use_package amsmath 1
+\use_package amssymb 1
+\use_package cancel 1
+\use_package esint 1
+\use_package mathdots 1
+\use_package mathtools 1
+\use_package mhchem 1
+\use_package stackrel 1
+\use_package stmaryrd 1
+\use_package undertilde 1
+\cite_engine basic
+\cite_engine_type default
+\biblio_style plain
+\use_bibtopic false
+\use_indices false
+\paperorientation portrait
+\suppress_date false
+\justification true
+\use_refstyle 1
+\use_minted 0
+\index Index
+\shortcut idx
+\color #008000
+\end_index
+\leftmargin 3.7cm
+\topmargin 2.7cm
+\rightmargin 2.8cm
+\bottommargin 2.3cm
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\paragraph_indentation default
+\is_math_indent 0
+\math_numbering_side default
+\quotes_style english
+\dynamic_quotes 0
+\papercolumns 1
+\papersides 2
+\paperpagestyle headings
+\tracking_changes false
+\output_changes false
+\html_math_output 0
+\html_css_as_file 0
+\html_be_strict false
+\end_header
+
+\begin_body
+
+\begin_layout Title
+
+\family typewriter
+MARS Manual
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subtitle
+Multiversion Asynchronous Replicated Storage
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/earth-mars-transfer.fig
+ width 70col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Author
+Thomas Schöbel-Theuer (
+\family typewriter
+tst@1und1.de
+\family default
+)
+\end_layout
+
+\begin_layout Date
+Version 0.1a-72
+\end_layout
+
+\begin_layout Lowertitleback
+\noindent
+Copyright (C) 2013-16 Thomas Schöbel-Theuer
+\begin_inset Newline newline
+\end_inset
+
+Copyright (C) 2013-16 1&1 Internet AG (see
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+http://www.1und1.de
+\end_layout
+
+\end_inset
+
+ shortly called 1&1 in the following).
+\begin_inset Newline newline
+\end_inset
+
+
+\size footnotesize
+Permission is granted to copy, distribute and/or modify this document under
+ the terms of the GNU Free Documentation License, Version 1.3 or any later
+ version published by the Free Software Foundation; with no Invariant Sections,
+ no Front-Cover Texts, and no Back-Cover Texts.
+ A copy of the license is included in the section entitled
+\begin_inset Quotes eld
+\end_inset
+
+
+\begin_inset CommandInset ref
+LatexCommand nameref
+reference "chap:GNU-FDL"
+
+\end_inset
+
+
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Abstract
+
+\family typewriter
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+sloppy
+\end_layout
+
+\end_inset
+
+ MARS
+\family default
+ is a block-level storage replication system for long distances / flaky
+ networks under GPL.
+ It runs as a Linux kernel module.
+ The sysadmin interface is similar to DRBD
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Registered trademarks are the property of their respective owner.
+\end_layout
+
+\end_inset
+
+, but its internal engine is completely different from DRBD: it works with
+
+\series bold
+transaction logging
+\series default
+, similar to some database systems.
+\end_layout
+
+\begin_layout Abstract
+Therefore, MARS can provide stronger
+\series bold
+consistency guarantees
+\series default
+.
+ Even in case of network bottlenecks / problems / failures, the secondaries
+ may become outdated (reflect an older state), but never become inconsistent.
+ In contrast to DRBD, MARS preserves the
+\series bold
+order of write operations
+\series default
+ even when the network is flaky (
+\series bold
+Anytime Consistency
+\series default
+).
+\end_layout
+
+\begin_layout Abstract
+The current version of MARS supports
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas and works
+\series bold
+asynchronously
+\series default
+.
+ Therefore, application performance is completely decoupled from any network
+ problems.
+ Future versions are planned to also support synchronous or near-synchronous
+ modes.
+\end_layout
+
+\begin_layout Abstract
+MARS supports a new method for building Cloud Storage / Software Defined
+ Storage, called
+\series bold
+LV Football
+\series default
+.
+\end_layout
+
+\begin_layout Abstract
+It comes with some automation scripts, leading to functionality similar
+ to that of Kubernetes, but devoted to stateful LVs over
+\series bold
+virtual LVM pools
+\series default
+ in the petabytes range.
+\end_layout
+
+\begin_layout Abstract
+\paragraph_spacing double
+\noindent
+\begin_inset space ~
+\end_inset
+
+
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Box Frameless
+position "c"
+hor_pos "c"
+has_inner_box 1
+inner_pos "c"
+use_parbox 0
+use_makebox 1
+width "100col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/earth-mars-transfer.fig
+ width 70col%
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset CommandInset toc
+LatexCommand tableofcontents
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Chapter
+Architectures of Cloud Storage / Software Defined Storage / Big Data
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Cloud-Storage"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Datacenter architects have no easy job.
+ Building up some petabytes of data in the wrong way can easily endanger
+ a company, as will be shown later.
+ There are some architectural laws to know and some rules to follow.
+\end_layout
+
+\begin_layout Standard
+First, we need to take a look at the most general possibilities how storage
+ can be architecturally designed:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/storage-classification.fig
+ width 80col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The topmost question is: do we always need to access bigger masses of (typically
+ unstructured) data over a network?
+\end_layout
+
+\begin_layout Standard
+There is a common belief that both reliability and scalability could only
+ be achieved this way.
+ In the past, local storage has often been viewed as
+\begin_inset Quotes eld
+\end_inset
+
+too simple
+\begin_inset Quotes erd
+\end_inset
+
+ to provide both enterprise-grade reliability and scalability.
+ In the past, this was sometimes true.
+\end_layout
+
+\begin_layout Standard
+However, this picture has changed with the advent of a new
+\series bold
+load balancing
+\series default
+ method called
+\series bold
+LV Football
+\series default
+, see chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:LV-Football"
+
+\end_inset
+
+.
+ We will later review what level of reliability and scalability can be achieved
+ with each of the fundamental models mentioned here.
+\end_layout
+
+\begin_layout Section
+What is Architecture
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:What-is-Architecture"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+From
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/Software_architecture
+\end_layout
+
+\end_inset
+
+:
+\end_layout
+
+\begin_layout Quote
+Software architecture refers to the
+\series bold
+high level structures
+\series default
+ of a software system and the
+\series bold
+discipline
+\series default
+ of creating such structures and systems.
+\end_layout
+
+\begin_layout Standard
+Throughout this paper, the term
+\begin_inset Quotes eld
+\end_inset
+
+architecture
+\begin_inset Quotes erd
+\end_inset
+
+ is strictly separated from
+\begin_inset Quotes eld
+\end_inset
+
+implementations
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Any of
+\begin_inset Quotes eld
+\end_inset
+
+architecture
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+implementation
+\begin_inset Quotes erd
+\end_inset
+
+ can relate to both hard- and software in general.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Confusion of
+\begin_inset Quotes eld
+\end_inset
+
+architecture
+\begin_inset Quotes erd
+\end_inset
+
+ with
+\begin_inset Quotes eld
+\end_inset
+
+implementation
+\begin_inset Quotes erd
+\end_inset
+
+ is a major source of ill-designs, which then often cause major product
+ flaws and/or operational problems.
+ Be sure to understand the difference.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Another source of costly ill-designs is starting with a particular implementatio
+n in mind, and not sufficiently reasoning about its fundamental architecture.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Recommended best practice is to (1) look at the
+\series bold
+problem space
+\series default
+, then (2) consider a
+\emph on
+set
+\emph default
+ of
+\series bold
+architectural solution classes
+\series default
+, and (3) look at the
+\series bold
+mappings
+\series default
+ between them.
+ This means: start with
+\series bold
+architectural requirements
+\series default
+ for a particular
+\series bold
+application area
+\series default
+ (typically covering
+\emph on
+multiple
+\emph default
+ use cases), then look at
+\series bold
+multiple solution architectures
+\series default
+, and finally go down to a
+\series bold
+\emph on
+set
+\series default
+\emph default
+ of potential implementations, but only
+\emph on
+after
+\emph default
+ the former has been understood.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Starting with a particular single solution in mind is almost a
+\emph on
+guarantee
+\emph default
+ for a non-optimum solution, or even a failed project, or even a disaster
+ at company level when
+\series bold
+enterprise-critical mass data
+\series default
+ is involved.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Nevertheless, don't think in waterfall models.
+ Always work
+\series bold
+iteratively
+\series default
+ and
+\series bold
+evolutionarily
+\series default
+, but nevertheless obey the principle that a bug in an architectural ill-design
+ cannot be fixed even by the best implementation in the world.
+ Be sure to understand the fundamental difference between architecture and
+ its (multiple / alternative) implementations by their respective
+\series bold
+reach
+\series default
+.
+\end_layout
+
+\begin_layout Section
+What is
+\emph on
+Cloud Storage
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Requirements-for-Cloud"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+According to a popular definition from
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/Cloud_storage
+\end_layout
+
+\end_inset
+
+ (retrieved June 2018), cloud storage is
+\end_layout
+
+\begin_layout Description
+(1) Made up of many
+\series bold
+distributed resources
+\series default
+, but still
+\series bold
+acts as one
+\series default
+.
+\end_layout
+
+\begin_layout Description
+(2) Highly
+\series bold
+fault tolerant
+\series default
+ through redundancy and distribution of data.
+\end_layout
+
+\begin_layout Description
+(3) Highly
+\series bold
+durable
+\series default
+ through the creation of versioned copies.
+\end_layout
+
+\begin_layout Description
+(4) Typically
+\series bold
+eventually consistent
+\series default
+ with regard to data replicas.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that the term
+\begin_inset Quotes eld
+\end_inset
+
+network
+\begin_inset Quotes erd
+\end_inset
+
+ does not occur in this definition.
+ However, the term
+\begin_inset Quotes eld
+\end_inset
+
+distributed resources
+\begin_inset Quotes erd
+\end_inset
+
+ is implying
+\emph on
+some(!)
+\emph default
+ kind of network.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Important! The definition does
+\emph on
+not
+\emph default
+ imply some
+\emph on
+specific
+\emph default
+ type of network, such as a
+\series bold
+storage network
+\series default
+ which must be capable of transporting masses of IO operations in
+\series bold
+realtime
+\series default
+.
+ We are free to use other types of networks, such as
+\series bold
+replication networks
+\series default
+, which need not be dimensioned for realtime IO traffic, but are usable
+ for
+\series bold
+background data migration
+\series default
+, and even over long distances, where the network typically has some bottlenecks.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that the definition says nothing about the
+\series bold
+time scale
+\series default
+ of operations
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice: go down to a time scale of microseconds.
+ You will then notice that typical IO operations will require several hundreds
+ of machine instructions between IO request
+\emph on
+submission
+\emph default
+ and the corresponding IO request
+\emph on
+completion
+\emph default
+.
+ This is not only true for local IO.
+ In network clusters like Ceph, it will even involve creation of network
+ packets, and lead to additional IO latencies implied by the network packet
+ transfer latencies.
+\end_layout
+
+\end_inset
+
+.
+ We are free to implement certain operations, such as background data migrations
+, in a rather long timescale (from a human point of view).
+ Example: increasing the number of replicas in an operational Ceph cluster,
+ already containing a few hundreds of terabytes of data, will not only require
+ additional storage hardware, but also take a rather long time, implied
+ by the very nature of such reorganisational tasks.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+The famous CAP theorem is one of the motivations behind requirement (4)
+
+\begin_inset Quotes eld
+\end_inset
+
+eventually consistent
+\begin_inset Quotes erd
+\end_inset
+
+.
+ This is not an accident.
+ There is a
+\emph on
+reason
+\emph default
+ for it, although it is not a
+\emph on
+hard
+\emph default
+ requirement.
+ Strict consistency is not needed for many applications running on top of
+ cloud storage.
+ In addition, the CAP theorem and some other theorems cited at
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/CAP_theorem
+\end_layout
+
+\end_inset
+
+ are telling us that Strict Consistency would be
+\series bold
+ difficult and expensive
+\series default
+ to achieve at global level in a bigger Distributed System, and at the cost
+ of other properties.
+ More detailed explanations are in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Explanation-via-CAP"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+There are some consequences from this definition of Cloud Storage, for each
+ of our high-level storage architectures:
+\end_layout
+
+\begin_layout Description
+Distributed
+\begin_inset space ~
+\end_inset
+
+Storage, in particular
+\family typewriter
+BigCluster
+\family default
+ architectures (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Distributed-vs-Local:"
+
+\end_inset
+
+): many of them (with few exceptions) are conforming to all of these requirement
+s.
+ Typical granularities are objects, or chunks, or other relatively small units
+ of data.
+\end_layout
+
+\begin_layout Description
+Centralized
+\begin_inset space ~
+\end_inset
+
+Storage: does not conform to (1) and to (4) by definition
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that sharding on top of CentralStorage is no longer a CentralStorage
+ model by definition, but a RemoteSharding model according to section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Variants-of-Sharding"
+
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+.
+ By introduction of synchronous or asynchronous replication, it can be made
+ to
+\emph on
+almost
+\emph default
+ conform, except for (1) where some concept mismatches remain (probably
+ resolvable by going to a RemoteSharding model on top of CentralStorage,
+ where CentralStorage is only a
+\emph on
+sub-component
+\emph default
+).
+ Typical granularity is replication of whole internal storage pools, or
+ of LVs, or of filesystem instances.
+\end_layout
+
+\begin_layout Description
+LocalStorage, and some further models like
+\family typewriter
+RemoteSharding
+\family default
+ (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Variants-of-Sharding"
+
+\end_inset
+
+):
+\end_layout
+
+\begin_deeper
+\begin_layout Description
+(1) can be achieved at LV granularity with Football (see chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:LV-Football"
+
+\end_inset
+
+), which creates a
+\series bold
+Big Virtual LVM Pool
+\series default
+.
+\end_layout
+
+\begin_layout Description
+(2) can be achieved at disk granularity with local RAID, and at LV granularity
+ with DRBD or MARS.
+\end_layout
+
+\begin_layout Description
+(3) can be achieved at LV granularity with LVM snapshots, and/or ZFS (or
+ other filesystem) snapshots, and/or above filesystem layer by addition
+ of classical backup.
+\end_layout
+
+\begin_layout Description
+(4) at least
+\family typewriter
+Eventually Consistent
+\family default
+ or better can be alternatively achieved by
+\end_layout
+
+\begin_deeper
+\begin_layout Description
+(4a)
+\series bold
+DRBD
+\series default
+, which provides
+\family typewriter
+Strict Consistency
+\family default
+ during
+\family typewriter
+connected
+\family default
+ state, but works only reliably with passive crossover cables over
+\series bold
+short distances
+\series default
+ (see CAP theorem in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Explanation-via-CAP"
+
+\end_inset
+
+).
+\begin_inset Newline newline
+\end_inset
+
+Notice: DRBD violates any type of consistency within your
+\emph on
+replicas
+\emph default
+ during (automatic) re-sync, and thus does not
+\emph on
+fully
+\emph default
+ comply with the above definition of cloud storage in a
+\emph on
+strong
+\emph default
+ sense.
+ But you can argue at a coarse time granularity level in order to fix this.
+\end_layout
+
+\begin_layout Description
+(4b)
+\series bold
+MARS
+\series default
+, which works over
+\series bold
+long distances
+\series default
+ and provides two different consistency guarantees at different levels,
+
+\emph on
+both at the same time
+\emph default
+:
+\end_layout
+
+\begin_deeper
+\begin_layout Description
+locally:
+\family typewriter
+ Strict Consistency
+\family default
+ at local LV granularity, also
+\emph on
+within
+\emph default
+ each of the LV replicas.
+\end_layout
+
+\begin_layout Description
+globally:
+\family typewriter
+Eventually Consistent
+\family default
+
+\emph on
+between
+\emph default
+ different LV replicas (global level).
+\begin_inset Newline newline
+\end_inset
+
+The CAP theorem (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Explanation-via-CAP"
+
+\end_inset
+
+) says that
+\family typewriter
+Strict Consistency
+\family default
+ is
+\series bold
+not possible
+\series default
+ in general at
+\emph on
+unplanned failover
+\emph default
+ during long-distance network outages (P = Partitioning Tolerance), when
+ A = Availability is also a requirement.
+\begin_inset Newline newline
+\end_inset
+
+However, in case of a
+\emph on
+planned handover
+\emph default
+, MARS is also
+\family typewriter
+Strictly Consistent
+\family default
+ at a global level, but may need some extra time for catching up.
+\begin_inset Newline newline
+\end_inset
+
+Notice: global
+\family typewriter
+Strict Consistency
+\family default
+ is also possible at a
+\emph on
+coarse timescale
+\emph default
+, in accordance with the CAP theorem, if you decide to sacrifice A = Availabilit
+y during such a network incident by simply
+\emph on
+not
+\emph default
+ doing a failover action.
+ Just wait until the network outage is gone, and MARS will automatically
+ resume
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+This automatic MARS behaviour is similar to the behaviour of DRBD in such
+ situations, when DRBD can automatically go to
+\family typewriter
+disconnected
+\family default
+-like state, and you are later manually or automatically resuming the DRBD
+ connection for an incremental re-sync.
+ MARS does everything automatically because it has no firmly built-in assumption
+s about the actual duration of any network communication.
+\end_layout
+
+\end_inset
+
+ everything ASAP, and thus you are using MARS
+\emph on
+only
+\emph default
+ as a protection against
+\series bold
+fatal
+\series default
+ storage failures / unplanned
+\series bold
+disasters
+\series default
+.
+\begin_inset Newline newline
+\end_inset
+
+Notice: A = Availability is
+\emph on
+not generally
+\emph default
+ required by the above definition of cloud storage, because from a user's
+ perspective it would not generally make sense in the global internet where
+ connection loss may anyway occur at any time.
+ Thus it is a valid operational strategy to
+\emph on
+not
+\emph default
+ fail-over your LVs during certain major network outages.
+\begin_inset Newline newline
+\end_inset
+
+Notice: long-term
+\series bold
+disaster tolerance
+\series default
+ (e.g.
+ perpetual loss of some storage nodes during an earthquake) is
+\emph on
+not
+\emph default
+ modeled by the CAP theorem, but is more or less required by (2) and (3)
+ from the above definition of cloud storage.
+\end_layout
+
+\end_deeper
+\end_deeper
+\end_deeper
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice:
+\family typewriter
+BigCluster
+\family default
+ architectures are creating
+\emph on
+virtual
+\emph default
+ storage pools out of physically distributed storage servers.
+ For fairness reasons, creation of a big virtual LVM pool must be considered
+ as
+\emph on
+another
+\emph default
+ valid Cloud Storage
+\emph on
+model
+\emph default
+, matching the above definition of Cloud Storage.
+ The main architectural difference is granularity, as explained in section
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Granularity-at-Architecture"
+
+\end_inset
+
+, and the stacking order of sub-components.
+ Notice that Football is creating
+\series bold
+location transparency
+\series default
+ inside of the distributed virtual LVM pool.
+ This is an important (though not always required) basic property of any
+ type of clusters and/or grids.
+\end_layout
+
+\begin_layout Section
+Granularity at Architecture
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Granularity-at-Architecture"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Here are the most important architectural differences between object-based
+ storages and LV-based (Logical Volume) storages:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Objects
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+LVs
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Granularity
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+small (typically KiB)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+huge (several TiB)
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Number of instances
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+very high
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+low to medium
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Typical access
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+random keys
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+named
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Update in place
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Resize during operation
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Object support
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+native
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+on top of
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+LV support
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+on top of
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+native
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Filesystem support
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+on top of
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+on top of
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Scalable
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+at cluster
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+both cluster and grid
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Location distances
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+per datacenter / on campus
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+long distances possible
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Centralized pool management
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+per cluster
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Football uniting clusters
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Easy sharding support
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+cumbersome
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Replication vs Backup
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Replication-vs-Backup"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Intuitively, data backup and data replication are two different solution
+ classes, addressing different problems.
+\end_layout
+
+\begin_layout Standard
+However, there exist descriptions where both solution classes are overlapping,
+ as well as their corresponding problem classes.
+ For example, backup as explained in
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/Backup
+\end_layout
+
+\end_inset
+
+ could be seen as also encompassing some types of storage replications explained
+ in
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/Replication_(computing)
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+For a rough comparison of
+\emph on
+typical
+\emph default
+ implementations, see the following
+\emph on
+typical
+\emph default
+ differences:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Backup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Replication
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Fast handover (planned)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no, or cumbersome
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Fast failover (unplanned)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no, or cumbersome
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Protect against physical failures
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Protect against logical data corruption
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes (partly)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+typically no
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Disaster Recovery Time (MTTR)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+typically (very) slow
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+fast
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Because of these typical differences, enterprise-critical data typically
+ deserves
+\emph on
+both
+\emph default
+ solution classes.
+\end_layout
+
+\begin_layout Standard
+Confusion of solution classes and/or their corresponding problem classes
+ / properties can be harmful to enterprises and to careers of responsible
+ persons.
+\end_layout
+
+\begin_layout Subsection
+Example: Point-in-time Replication via ZFS Snapshots
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Example:-ZFS-Replication"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Some ZFS advocates believe that ZFS snapshots, which were originally designed
+ for backup-like use cases, are also appropriate solutions for achieving
+ geo-redundancy.
+ The basic idea is to run incremental ZFS snapshots in an endless loop,
+ e.g.
+ via some simple scripts, and expediting to another host where the snapshots
+ are then applied to another ZFS instance.
+ When there is less data to be expedited, loop cycle times can go down to
+ a few seconds.
+ When much data is written at the primary site, loop cycle times will rise
+ up.
+\end_layout
+
+\begin_layout Standard
+The following table tries to explain why geo-redundancy is not as simple
+ to achieve as believed, at least not without adding sophisticated additional
+ means
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+ZFS advocates often argue with many features which aren't present at other
+ filesystem types.
+ The above table shows some dimensions not dealing with properties of local
+ filesystems, but with
+\emph on
+problems / tasks
+\emph default
+ arising in long-distance distributed systems involving masses of enterprise-cri
+tical storage.
+\end_layout
+
+\end_inset
+
+:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+OpenSource Component
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+DRBD
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+MARS
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+ZFS
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Synchronicity (on average)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+delay
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+delay * 1.5
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Generic solution
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+FS-specific
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Granularity
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+LVs
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+LVs
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+subvolumes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Built-in snapshots
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Long distances
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Replication parallelism (per gran.)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $1$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\geq2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $1$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Built-in primary/secondary roles
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Built-in handover (planned)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+mostly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Built-in failover (unplanned)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Built-in data overflow handling
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+unnecessary
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no, missing
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Unnoticed data loss due to overflow
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+possible
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Split-brain awareness
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Execute split-brain resolution
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Protect against illegal data modification
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+no
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The last item means that ZFS by itself does not protect against amok-running
+ applications modifying the secondary (backup) side in parallel to the
+ replication process (at least not by default).
+ Workarounds may be possible, but are not easy to create and to test for
+ enterprise-critical applications.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that zfs snapshots can be combined with DRBD or MARS, because zfs
+ snapshots are residing at
+\emph on
+filesystem
+\emph default
+ layer, while DRBD / MARS replicas are located at
+\emph on
+block
+\emph default
+ layer.
+ Just create your zpools at the
+\emph on
+top
+\emph default
+ of DRBD or MARS virtual devices, and import / export them
+\emph on
+individually
+\emph default
+ upon handover / failover of each LV.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ There is a
+\series bold
+\emph on
+fundamental
+\series default
+\emph default
+ difference between zpools and classical RAID / LVM stacked architectures.
+ Some zfs advocates are promoting zpools as a replacement for both RAID
+ and LVM.
+ However, there is a
+\series bold
+massive difference
+\series default
+ in architecture, as explained in the following example (10 logical resources
+ over 48 physical spindles), achieving practically the
+\series bold
+\emph on
+same
+\series default
+ zfs snapshot functionality
+\emph default
+ from a user's perspective, but in a different way:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/raid-lvm-architecture.fig
+ height 6cm
+
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/zpool-architecture.fig
+ height 6cm
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+When RAID functionality is executed by zfs, it will be located at the
+\emph on
+top
+\emph default
+ of the hierarchy.
+ On one hand, this easily allows for different RAID levels for each of the
+ 10 different logical resources.
+ On the other hand, this
+\emph on
+exposes
+\emph default
+ the
+\series bold
+physical spindle configuration
+\series default
+ to the topmost filesystem layer (48 spindles in this example).
+ There is no easy way for replication of these
+\emph on
+physical properties
+\emph default
+ in a larger / heterogeneous distributed system, e.g.
+ when some hardware components are replaced over a longer period of time
+ (hardware lifecycle, or LV Football as explained in chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:LV-Football"
+
+\end_inset
+
+).
+ Essentially, only replication of
+\emph on
+logical
+\emph default
+ structures like snapshots remains as a reasonable option, with its
+ drawbacks as explained above.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ There is another argument: zfs tries to
+\emph on
+hide
+\emph default
+ its internal structures and interfaces from the sysadmins, forming a more
+ or less
+\series bold
+monolithic
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Some sysadmins acting as zfs advocates are claiming this as an advantage,
+ because they need to understand only a single tool for managing
+\begin_inset Quotes eld
+\end_inset
+
+everything
+\begin_inset Quotes erd
+\end_inset
+
+.
+ However, this is a short-sighted argument when it comes to
+\emph on
+true
+\emph default
+ flexibility as offered by a component-based system, where multiple types
+ of hardware / software RAID, multiple types of LVM functionality, and much
+ more can be almost orthogonally combined in a very flexible way.
+\end_layout
+
+\end_inset
+
+ architecture
+\series default
+ as seen from outside.
+ This violates the classical
+\emph on
+layering rules
+\emph default
+ from Dijkstra.
+ In contrast, classical LVM-based configurations are
+\series bold
+component oriented
+\series default
+, according to the
+\series bold
+Unix philosophy
+\series default
+.
+\end_layout
+
+\begin_layout Section
+Local vs Centralized Storage
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Local-vs-Centralized"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+There is some old-fashioned belief that only centralized storage systems,
+ as typically sold by commercial storage vendors, could achieve a high degree
+ of reliability, while local storage were inferior by far.
+ In the following, we will see that this is only true for an
+\series bold
+\emph on
+unfair
+\series default
+\emph default
+ comparison involving different classes of storage systems.
+\end_layout
+
+\begin_layout Subsection
+Internal Redundancy Degree
+\end_layout
+
+\begin_layout Standard
+Centralized commercial storage systems are typically built up from highly
+ redundant
+\emph on
+internal
+\emph default
+ components:
+\end_layout
+
+\begin_layout Enumerate
+Redundant power supplies with UPS.
+\end_layout
+
+\begin_layout Enumerate
+Redundancy at the storage HDDs / SSDs.
+\end_layout
+
+\begin_layout Enumerate
+Redundancy at internal transport busses.
+\end_layout
+
+\begin_layout Enumerate
+Redundant RAM / SSD caches.
+\end_layout
+
+\begin_layout Enumerate
+Redundant network interfaces.
+\end_layout
+
+\begin_layout Enumerate
+Redundant compute heads.
+\end_layout
+
+\begin_layout Enumerate
+Redundancy at control heads / management interfaces.
+\end_layout
+
+\begin_layout Standard
+What about local hardware RAID controllers? Many people think that these
+ relatively cheap units were massively inferior at practically each of these
+ points.
+ However, please take a
+\emph on
+really deep
+\emph default
+ look at what classical RAID chip manufacturers like LSI / Avago / Broadcom
+ and their competitors are offering as configuration variants of their top
+ notch models.
+ The following enumeration is in the same order as above (item by item):
+\end_layout
+
+\begin_layout Enumerate
+Redundant hardware RAID cards with BBU caches, each with local goldcaps
+ surviving power outages, their BBU caches cross-coupled via high-speed
+ interconnects.
+\end_layout
+
+\begin_layout Enumerate
+HDD / SSD redundancy: almost any RAID level you can think of.
+\end_layout
+
+\begin_layout Enumerate
+Redundant SAS cross-cabling: any head can access any device.
+\end_layout
+
+\begin_layout Enumerate
+BBU caches are redundant and cross-coupled, similarly to RDMA.
+ When SSD caches are added to both cards, you also get redundancy there.
+\end_layout
+
+\begin_layout Enumerate
+When using cross-coupled redundant cards, you automatically get redundant
+ host bus interfaces (HBAs).
+\end_layout
+
+\begin_layout Enumerate
+The same story: you also get two independent RAID controller instances which
+ can do RAID computations independently from each other.
+ Some implementations do this even in hardware (ASICs).
+\end_layout
+
+\begin_layout Enumerate
+Ditto: both cards may be plugged into two different servers, thereby creating
+ redundancy at control level.
+ As a side effect, you may also get functionality similar to DRBD.
+\end_layout
+
+\begin_layout Standard
+If you compare typical prices for both competing systems, you will notice
+ a huge difference.
+ See also section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Cost-Arguments-from"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Capacity Differences
+\end_layout
+
+\begin_layout Standard
+There is another die-hard myth: commercial storage would provide higher
+ capacity.
+ Please read the data sheets.
+ It is
+\emph on
+possible
+\emph default
+ (but not generally recommended) to put several hundreds of spindles into
+ several external HDD enclosures, and then connect them to a redundant cross-cou
+pled pair of RAID controllers via several types of SAS busses.
+ By filling a rack this way, you can easily reach similar, if not higher
+ capacities than commercial storage boxes, for a
+\emph on
+fraction
+\emph default
+ of the price.
+
+\end_layout
+
+\begin_layout Standard
+However, this is not the recommended way for general use cases (but could
+ be an option for low demands like archiving).
+ The big advantage of RAID-based local storage is
+\series bold
+massive scale-out by sharding,
+\series default
+ as explained in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Distributed-vs-Local:"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Caching Differences
+\end_layout
+
+\begin_layout Standard
+A frequent argument is that centralized storage systems had bigger caches
+ than local RAID systems.
+ While this argument is often true, it neglects an important point:
+\end_layout
+
+\begin_layout Standard
+Local RAID systems often
+\emph on
+don't need
+\emph default
+ bigger caches, because they are typically located at the
+\emph on
+bottom
+\emph default
+ of a cache hierarchy, playing only a
+\emph on
+particular
+\emph default
+ role in that hierarchy.
+ There exist
+\emph on
+further
+\emph default
+ caches which are
+\series bold
+erroneously not considered
+\series default
+ by such an argument!
+\end_layout
+
+\begin_layout Standard
+Example, see also section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Performance-Arguments-from"
+
+\end_inset
+
+ for more details: At 1&1 Shared Hosting Linux (ShaHoLin), a typical LXC
+ container containing several thousands to tens of thousands of customer home
+ directories, creates a long-term
+\emph on
+average(!)
+\emph default
+ IOPS load at block layer of about 70 IOPS.
+ No, this isn't a typo.
+ It is not 70,000 IOPS.
+ It is only 70 IOPS.
+
+\end_layout
+
+\begin_layout Standard
+Linux kernel experts know why I am not kidding.
+ The standard Linux kernel has two main caches, the Page Cache for file
+ content, and the Dentry Cache (plus Inode slave cache) for metadata.
+ Both caches are residing in
+\series bold
+RAM
+\series default
+, which is the
+\emph on
+fastest
+\emph default
+ type of cache you can get.
+\end_layout
+
+\begin_layout Standard
+Nowadays, typical servers have several hundreds of gigabytes of RAM, sometimes
+ even up to terabytes, resulting in an incredible caching behaviour which
+ can be measured by those people who know how to do it (caution: it can
+ be easily done wrongly).
+\end_layout
+
+\begin_layout Standard
+Many people are neglecting these caches, sometimes not knowing of their
+ existence, and are falsely assuming that 1 application 
+\family typewriter
+read()
+\family default
+ or
+\family typewriter
+write()
+\family default
+ operation will also lead to 1 IOPS at block layer.
+ As a consequence, they are demanding 50,000 IOPS or 100,000 or even 1,000,000
+ IOPS.
+\end_layout
+
+\begin_layout Standard
+Some (but not all) commercial storage systems can deliver similar IOPS rates,
+ because they have internal RAM caches in the same order of magnitude.
+ People who are buying such systems are typically falling into some of the
+ following classes (list is probably incomplete):
+\end_layout
+
+\begin_layout Itemize
+some people know this, but price does not matter - the more caches, the
+ better.
+ Wasted money for doubled caches does not count for them, or is even viewed
+ as an advantage to them (personally).
+ Original citation of an anonymous person:
+\begin_inset Quotes eld
+\end_inset
+
+only the best and the most expensive storage is good enough for us
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+using NFS, which has extremely poor filesystem caching behaviour because
+ the Linux nfs client implementation does not take full advantage of the
+ dentry cache.
+ Sometimes people know this, sometimes not.
+ It seems that few people have read an important paper on the Linux implementati
+on of nfs.
+ Please search the internet for
+\begin_inset Quotes eld
+\end_inset
+
+Why nfs sucks
+\begin_inset Quotes erd
+\end_inset
+
+ from Olaf Kirch (who is one of the original Linux nfs implementors), and
+
+\emph on
+read
+\emph default
+ it.
+ Your opinion about nfs might change.
+\end_layout
+
+\begin_layout Itemize
+have transactional databases, where high IOPS may be
+\emph on
+really
+\emph default
+ needed, but
+\series bold
+\emph on
+exceptionally
+\series default
+\emph default
+(!) for this class of application.
+ For very big enterprise databases like big SAP installations, there may
+ be a very valid justification for big RAM caches at storage layers.
+ However: smaller transactional loads, as in webhosting, are
+\emph on
+often
+\emph default
+ (not always) hammering a
+\emph on
+low
+\emph default
+ number of
+\series bold
+hot spots
+\series default
+, where
+\emph on
+big
+\emph default
+ caches are not really needed.
+ Relatively small BBU caches of RAID cards will do it also.
+ Often people don't notice this because they don't measure the
+\series bold
+workingset behaviour
+\series default
+ of their application, as could be done for example with
+\family typewriter
+blkreplay
+\family default
+ (see
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://blkreplay.org
+\end_layout
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Itemize
+do not notice that
+\emph on
+well-tuned
+\emph default
+ filesystem caches over iSCSI are typically demanding far fewer IOPS, sometimes
+ by several orders of magnitude, and are wasting money with caches at commercial
+ boxes they don't need (classical
+\series bold
+over-engineering
+\series default
+).
+\end_layout
+
+\begin_layout Standard
+Anyway, local storage can be augmented with various types of local caches
+ with various dimensioning.
+\end_layout
+
+\begin_layout Standard
+However, there is no point in accessing the fastest possible type of RAM
+ cache remotely over a network.
+ Even expensive hardware-based RDMA (e.g.
+ over Infiniband) cannot deliver the same performance as
+\series bold
+directly caching
+\series default
+ your data in the
+\series bold
+\emph on
+same
+\emph default
+ RAM
+\series default
+ where your application is running.
+ The Dentry Cache in the Linux kernel provides highly optimized
+\series bold
+shared metadata
+\series default
+ in SMP and NUMA systems (nowadays scaling to more than 100 processor cores),
+ while the Page Cache provides
+\series bold
+shared memory
+\series default
+ via hardware MMU.
+ This is crucial for the performance of classical local filesystems.
+\end_layout
+
+\begin_layout Standard
+The physical laws of Einstein and others are telling us that neither this
+ type of caching, nor its shared memory behaviour, can be transported over
+ whatever type of network without causing performance degradation.
+\end_layout
+
+\begin_layout Subsection
+Latencies and Throughput
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Latencies-and-Throughput"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+First of all: today there exist only a small number of HDD manufacturers
+ in the world.
+ The number of SSD manufacturers will likely decline in the long run.
+ Essentially, commercial storage vendors are more or less selling you the
+ same HDDs or SSDs as you could buy and deploy yourself.
+ If at all, there are only some minor technical differences.
+\end_layout
+
+\begin_layout Standard
+In the meantime, many people agree with a Google paper that the 
+\emph on
+ratio
+\emph default
+ of market prices (price per terabyte) between HDDs and SSDs is unlikely
+ to change in a fundamental
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In folklore, there exists a
+\series bold
+fundamental empirical law
+\series default
+, fuzzily called
+\begin_inset Quotes eld
+\end_inset
+
+Storage Pyramid
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+Memory Hierarchy Law
+\begin_inset Quotes erd
+\end_inset
+
+ or similar, which is well-known at least in German OS academic circles.
+ The empirical law (extrapolated from
+\series bold
+observations
+\series default
+, similarly to Moore's law) tells us that faster storage technology is always
+
+\series bold
+more expensive
+\series default
+ than slower storage technology, and that capacities of faster storage are
+ typically smaller than capacities of slower storage.
+ This observation has been roughly valid for more than 50 years now.
+ You can find it in several German lecture scripts.
+ Unfortunately, the Wikipedia article
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/Memory_hierarchy
+\end_layout
+
+\end_inset
+
+ (retrieved in June 2018) does not cite this very important fundamental
+ law about
+\series bold
+costs
+\series default
+.
+ In contrast, the German article
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://de.wikipedia.org/wiki/Speicherhierarchie
+\end_layout
+
+\end_inset
+
+ about roughly the same subject is mentioning
+\begin_inset Quotes eld
+\end_inset
+
+Kosten
+\begin_inset Quotes erd
+\end_inset
+
+ which means
+\begin_inset Quotes eld
+\end_inset
+
+cost
+\begin_inset Quotes erd
+\end_inset
+
+, and
+\begin_inset Quotes eld
+\end_inset
+
+teuer
+\begin_inset Quotes erd
+\end_inset
+
+ which means
+\begin_inset Quotes eld
+\end_inset
+
+expensive
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+ way during the next 10 years.
+ Thus, most large-capacity enterprise storage systems are built on top of
+ HDDs.
+\end_layout
+
+\begin_layout Standard
+Typically, HDDs and their mechanics are forming the overall bottleneck.
+\end_layout
+
+\begin_layout Itemize
+by construction, a
+\emph on
+local
+\emph default
+ HDD attached via HBAs or a hardware RAID controller will show the least
+
+\emph on
+additional
+\emph default
+ overhead in terms of
+\emph on
+additional
+\emph default
+ latencies and throughput degradation caused by the attachment.
+\end_layout
+
+\begin_layout Itemize
+When the
+\emph on
+same
+\emph default
+ HDD is
+\emph on
+indirectly
+\emph default
+ attached via Ethernet or Infiniband or another rack-to-rack transport,
+ both latencies and throughput will become worse.
+ Depending on further factors and influences, the overall bottleneck may
+ shift to the network.
+\end_layout
+
+\begin_layout Standard
+The laws of information transfer are telling us: with increasing distance,
+ both latencies (laws of Einstein) and throughput (laws of energy needed
+ for compensation of SNR = signal to noise ratio) are becoming worse.
+ Distance matters.
+ And the number of intermediate components, like routers / switches and
+ their
+\series bold
+queuing
+\series default
+, matters too.
+\end_layout
+
+\begin_layout Standard
+This means that local storage has
+\emph on
+always
+\emph default
+ an advantage over any attachment via network.
+ Centralized storages are bound to some network, and thus suffer from disadvanta
+ges in terms of latencies and throughput.
+\end_layout
+
+\begin_layout Standard
+What is the expected long-term future? Will additional latencies and throughput
+ of centralized storages become better over time?
+\end_layout
+
+\begin_layout Standard
+It is difficult to predict the future.
+ Let us first look at the past evolution.
+ The following graphic has taken its numbers from Wikipedia articles 
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/List_of_device_bit_rates
+\end_layout
+
+\end_inset
+
+ and
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/History_of_hard_disk_drives
+\end_layout
+
+\end_inset
+
+, showing that HDD capacities have grown
+\series bold
+over-proportionally
+\series default
+ by about 2 orders of magnitude over about 30 years, when compared to the
+ relative growth of network bandwidth.
+\end_layout
+
+\begin_layout Standard
+In the following graphics, effects caused by decreasing form factors have
+ been neglected, which would even
+\emph on
+amplify
+\emph default
+ the trend.
+ For fairness, bundling of parallel disks or parallel communication channels
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+It is easy to see that the slopes of
+\family typewriter
+HDD.capacity
+\family default
+ vs
+\family typewriter
+Infiniband.rates
+\family default
+ are different.
+ Parallelizing by bundling of Infiniband wires will only lift the line a
+ little upwards, but will not alter its slope in logarithmic scale.
+ For extrapolated time
+\begin_inset Formula $t\rightarrow\infty$
+\end_inset
+
+, the extrapolated empirical long-term behaviour is rather striking.
+\end_layout
+
+\end_inset
+
+ have been ignored.
+ All comparisons are in logarithmic y axis scale:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename BitRates/Capacity-BitRate-Comparison.pdf
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+What does this mean when extrapolated into the future?
+\end_layout
+
+\begin_layout Standard
+It means that concentrating more and more capacity into a single rack due
+ to increasing data density will likely lead to more problems in future.
+ Accessing more and more data over the network will become increasingly
+ more difficult when concentrating high-capacity HDDs or SSDs
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+It is difficult to compare the space density of contemporary SSDs in a fair
+ way.
+ There are too many different form factors.
+ For example, M2 cards are typically consuming even less
+\begin_inset Formula $cm^{3}/TB$
+\end_inset
+
+ than classical 2.5 inch form factors.
+ This trend is likely to continue in future.
+\end_layout
+
+\end_inset
+
+ into the same space volume as before.
+\end_layout
+
+\begin_layout Standard
+In other words: centralized storages are already not a good idea today, and will likely
+ become an even worse idea in the future.
+
+\end_layout
+
+\begin_layout Standard
+Example: there was a major incident at a German web hosting company at the
+ beginning of the 2000s.
+ Their entire webhosting main business was running on a single proprietary
+ highly redundant CentralStorage solution, which failed.
+ Restore from backup took way too long from the viewpoint of a huge number
+ of customers, leading to major press attention.
+ Before this incident, they were the #1 webhoster in Germany.
+ A few years later, 1&1 was the #1 instead.
+ You can speculate whether this has to do with the incident.
+ But anyway, the later geo-redundancy strategy of 1&1 based on a sharding
+ model (originally using DRBD, later MARS) was motivated by conclusions
+ drawn from this incident.
+\end_layout
+
+\begin_layout Standard
+Another example: in the 1980s, a CentralStorage
+\begin_inset Quotes eld
+\end_inset
+
+dinosaur
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+With the advent of NVME, SSDs are almost directly driven by DMA.
+ Accessing any high-speed DMA devices by default via network is a foolish
+ idea, as foolish as playing games via an expensive high-end gamer
+ graphics card which is then 
+\emph on
+indirectly
+\emph default
+ attached via RDMA, or even via Ethernet.
+ Probably no serious gamer would ever
+\emph on
+try
+\emph default
+ to do that.
+ But some storage vendors do, for strategic reasons.
+ Probably for their own survival, their customers are to be misguided to
+ overlook the blinking red indicators that centralized SSD storage is likely
+ nothing but an expensive dead end in the history of dinosaur architectures.
+\end_layout
+
+\end_inset
+
+
+\begin_inset Quotes erd
+\end_inset
+
+ architecture called SLED = Single Large Expensive Disk was propagated with
+ huge marketing noise and effort, but its historic fate was predictable
+ for real experts not bound to particular interests: SLED finally lost against
+ its contemporary RAID competition.
+ Nowadays, many people don't even remember the term SLED.
+\end_layout
+
+\begin_layout Standard
+Today's future is likely dominated by
+\series bold
+scaling-out architectures
+\series default
+ like sharding, as explained in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Distributed-vs-Local:"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Reliability Differences CentralStorage vs Sharding
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Reliability-Differences-CentralStorage"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In this section, we look at
+\emph on
+fatal
+\emph default
+ failures only, ignoring temporary failures.
+ A fatal failure of a storage is an incident which needs to be corrected
+ by
+\series bold
+restore from backup
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+By definition, even a
+\emph on
+highly redundant
+\emph default
+ CentralStorage is
+\emph on
+nevertheless
+\emph default
+ a SPOF = Single Point of Failure.
+ This also applies to fatal failures.
+\end_layout
+
+\begin_layout Standard
+Some people are incorrectly arguing with redundancy.
+ However, the problem is that
+\emph on
+any
+\emph default
+ system, even a highly redundant one, can fail fatally.
+ There exists no perfect system on earth.
+ One of the biggest known sources of fatal failure is
+\series bold
+human error
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+In contrast, sharded storage (for example the LocalSharding model, see also
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Variants-of-Sharding"
+
+\end_inset
+
+) has MPOF = Multiple Points Of Failure.
+ It is unlikely that many shards are failing fatally at the same time, because
+ shards are
+\emph on
+independent
+\emph default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+When all shards are residing in the same datacenter, there exists a SPOF
+ by power loss or other impacts onto the whole datacenter.
+ However, this applies to both the CentralStorage and to the LocalSharding
+ model.
+ In contrast to CentralStorage, LocalSharding can be more easily distributed
+ over multiple datacenters.
+\end_layout
+
+\end_inset
+
+ from each other by definition (cf paragraph
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "par:Definition-of-Sharding"
+
+\end_inset
+
+ for disambiguation of terms
+\begin_inset Quotes eld
+\end_inset
+
+sharding
+\begin_inset Quotes erd
+\end_inset
+
+ and
+\begin_inset Quotes eld
+\end_inset
+
+shared-nothing
+\begin_inset Quotes erd
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+What is the difference from the viewpoint of customers of the services?
+\end_layout
+
+\begin_layout Standard
+When a CentralStorage fails fatally, a
+\emph on
+huge
+\emph default
+ number of customers will be affected for a
+\emph on
+long
+\emph default
+ time (see the example German webhoster mentioned in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Latencies-and-Throughput"
+
+\end_inset
+
+).
+ Reason: restore from backup will take extremely long because huge masses
+ of data have to be restored.
+ MTBF = Mean Time Between Failures is (hopefully) longer thanks to redundancy,
+ but MTTR = Mean Time To Repair is also very long.
+\end_layout
+
+\begin_layout Standard
+With (Local)Sharding, the risk of
+\emph on
+some
+\emph default
+ fatal incident
+\emph on
+somewhere
+\emph default
+ in the sharding pool is higher, but the
+\series bold
+\emph on
+size
+\series default
+\emph default
+ of such an incident is smaller in three dimensions at the same time:
+\end_layout
+
+\begin_layout Enumerate
+There are far 
+\series bold
+fewer customers affected
+\series default
+ (typically only
+\begin_inset Formula $1$
+\end_inset
+
+ shard out of
+\begin_inset Formula $n$
+\end_inset
+
+ shards).
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+MTTR
+\series default
+ = Mean Time To Repair is typically much better because there is much less
+ data to be restored.
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Residual risk
+\series default
+ plus resulting fatal damage by
+\series bold
+un-repairable problems
+\series default
+ is thus lower.
+\end_layout
+
+\begin_layout Standard
+What does this mean from the viewpoint of an investor of a big
+\begin_inset Quotes eld
+\end_inset
+
+global player
+\begin_inset Quotes erd
+\end_inset
+
+ company?
+\end_layout
+
+\begin_layout Standard
+As is promised by the vendors, let us assume that failure of CentralStorage
+ might be occurring less frequently.
+ But
+\emph on
+when
+\emph default
+ it happens on
+\series bold
+enterprise-critical mass data
+\series default
+, the stock exchange value of the affected company will be exposed to a
+
+\series bold
+hazard
+\series default
+.
+ This is not bearable from the viewpoint of an investor.
+\end_layout
+
+\begin_layout Standard
+In contrast, the (Local)Sharding model is
+\emph on
+distributing
+\emph default
+ the
+\series bold
+unavoidable incidents
+\series default
+ (because
+\series bold
+perfect systems do not exist
+\series default
+, and
+\series bold
+perfect humans do not exist
+\series default
+) to a lower number of customers with higher frequency, such that the
+\series bold
+total impact onto the business
+\series default
+ becomes bearable.
+\end_layout
+
+\begin_layout Standard
+Risk analysis of enterprise-critical use cases is summarized in the following
+ table:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+CentralStorage
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(Local)Sharding
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Probability of
+\emph on
+some
+\emph default
+ fatal incident
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+lower
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+higher
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+# Customers affected
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+very high
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+very low
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+MTBF per storage
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+higher
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+lower
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+MTTR per storage
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+higher
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+lower
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Unrepairable residual risk
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+higher
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+lower
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Total impact
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+higher
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+lower
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Investor's risk
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\series bold
+unbearable
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+stock exchange compatible
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Summary: CentralStorage is something for
+\end_layout
+
+\begin_layout Itemize
+\noindent
+Small to medium-sized companies which don't have the
+\series bold
+manpower
+\series default
+ and the
+\series bold
+skills
+\series default
+ for professionally building and operating a (Local)Sharding (or similar)
+ system for their enterprise-critical mass data their business is relying
+ upon.
+\end_layout
+
+\begin_layout Itemize
+
+\series bold
+\emph on
+Monolithic
+\emph default
+ enterprise applications
+\series default
+ like classical SAP which are anyway bound to a specific vendor, where you
+ cannot select a different solution (so-called
+\series bold
+Vendor Lock-In
+\series default
+).
+\end_layout
+
+\begin_layout Itemize
+When your application
+\series bold
+is neither shardable
+\series default
+ by construction (c.f.
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Distributed-vs-Local:"
+
+\end_inset
+
+), or when doing so would require too much effort, 
+\series bold
+nor going to BigCluster
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Theoretically, BigCluster can be used to create 1 single huge remote LV
+ (or 1 single huge remote FS instance) out of a pool of storage machines.
+ Double-check, better triple-check that such a
+\series bold
+big
+\emph on
+logical
+\emph default
+ SPOF
+\series default
+ is
+\emph on
+really
+\emph default
+ needed, and cannot be circumvented by any means.
+ Only in such a case, the current version of MARS cannot help (yet), because
+ its
+\emph on
+current
+\emph default
+
+\emph on
+focus
+\emph default
+ is on a big number of machines each having relatively small LVs.
+ At 1&1 ShaHoLin, the biggest LVs are 40TiB at the moment, running for years
+ now, and bigger ones are certainly possible.
+ Only when current local RAID technology with external enclosures cannot
+ easily create a single LV in the petabyte scale, BigCluster is probably
+ the better solution (c.f.
+ section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+).
+\end_layout
+
+\end_inset
+
+
+\series default
+ (e.g.
+ Ceph / Swift / etc, see section 
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+) is an option.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+If you have an
+\emph on
+ already sharded
+\emph default
+ system, e.g.
+ in webhosting, don't convert it to a non-shardable one, and don't introduce
+ SPOFs needlessly.
+ You will introduce
+\series bold
+technical debts
+\series default
+ which are likely to come back and hurt you sometime in the future!
+\end_layout
+
+\begin_layout Standard
+As a really big 
+\begin_inset Quotes eld
+\end_inset
+
+global player
+\begin_inset Quotes erd
+\end_inset
+
+, or as a company being part of such a structure, you should be careful
+ when listening to
+\begin_inset Quotes eld
+\end_inset
+
+marketing drones
+\begin_inset Quotes erd
+\end_inset
+
+ of proprietary CentralStorage vendors.
+ Always check your
+\emph on
+concrete
+\emph default
+ use case.
+ Never believe in wrongly generalized claims, which are only valid in some
+ specific context, but do not really apply to your use case.
+ It could be about your
+\emph on
+life
+\emph default
+.
+\end_layout
+
+\begin_layout Subsection
+Proprietary vs OpenSource
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Proprietary-vs-OpenSource"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In theory, the following dimensions are orthogonal to each other:
+\end_layout
+
+\begin_layout Description
+Architecture: LocalStorage vs CentralStorage vs DistributedStorage
+\end_layout
+
+\begin_layout Description
+Licensing: Proprietary vs OpenSource
+\end_layout
+
+\begin_layout Standard
+In practice, however, many vendors of proprietary storage systems are selecting
+ the CentralStorage model.
+ This way, they can avoid inter-operability with their competitors.
+ This opens the door for the so-called
+\series bold
+Vendor Lock-In
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+In contrast, the OpenSource community is based on
+\emph on
+cooperation
+\emph default
+.
+ Opting for OpenSource means that you can
+\series bold
+combine and exchange
+\series default
+ numerous
+\series bold
+components
+\series default
+ with each other.
+
+\end_layout
+
+\begin_layout Standard
+Key OpenSource players are
+\emph on
+basing
+\emph default
+ their business on the
+\series bold
+usefulness
+\series default
+ of their software components for you, their customer.
+ Please search the internet for further explanations from Eric S.
+ Raymond.
+\end_layout
+
+\begin_layout Standard
+Therefore
+\series bold
+interoperability
+\series default
+ is a
+\emph on
+must
+\emph default
+ in the opensource business.
+ For example, you can relatively easily migrate between DRBD and MARS, back
+ and forth, see section 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Setup-Primary-and"
+
+\end_inset
+
+.
+ The
+\emph on
+generic
+\emph default
+ block devices provided by both DRBD and MARS (and by the kernel LVM2 implementa
+tion, and many others
+\begin_inset Formula $\ldots$
+\end_inset
+
+) can interact with zillions of filesystems, VMs, applications, and so forth.
+\end_layout
+
+\begin_layout Standard
+Summary:
+\series bold
+genericity
+\series default
+ is a highly desired property in OpenSource communities, while proprietary
+ products often try to control their usage by limiting either technical
+ interoperability at certain layers, and/or legally by contracts.
+ Trying to do so with OpenSource would make no sense, because
+\emph on
+you
+\emph default
+, the customer, are the
+\emph on
+real
+\emph default
+ king who can
+\emph on
+really
+\emph default
+ select and combine components.
+ You can form a
+\series bold
+really customized system
+\series default
+ to your
+\series bold
+\emph on
+real needs
+\series default
+\emph default
+, not as just promised but not always actually delivered by so-called
+\begin_inset Quotes eld
+\end_inset
+
+marketing drones
+\begin_inset Quotes erd
+\end_inset
+
+ from commercial vendors who are actually preferring the needs of their employer
+ over yours.
+\end_layout
+
+\begin_layout Standard
+There is another fundamental difference between proprietary software and
+ OpenSource: the former is bound to some company, which may
+\emph on
+vanish
+\emph default
+ from the market.
+ Commercial storage systems may be
+\series bold
+discontinued
+\series default
+.
+
+\end_layout
+
+\begin_layout Standard
+This can be a serious threat to your business relying on the value of your
+ data.
+ In particular, buying storage systems from
+\emph on
+small
+\emph default
+ vendors may increase this risk
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+There is a risk of a
+\emph on
+domino effect
+\emph default
+: once there is a critical incident on highly redundant CentralStorage boxes
+ from a particular (smaller) vendor, this may lead to major public media
+ attention.
+ This may form the
+\emph on
+root cause
+\emph default
+ for such a vendor to vanish from the market.
+ Thus you may be left alone with a buggy system, even if you aren't the
+ victim of the concrete incident.
+\end_layout
+
+\begin_layout Plain Layout
+In contrast, bugs in an OpenSource component can be fixed by a larger community
+ of interested people, or by yourself if you hire somebody for this.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+OpenSource is different: it cannot die, even if the individual, or the (small)
+ company which produced it, no longer exists.
+ The sourcecode is in the
+\series bold
+public
+\series default
+.
+ It just could get
+\emph on
+outdated
+\emph default
+ over time.
+ However, as long as there is enough public interest, you will always find
+ somebody who is willing to adapt and to
+\emph on
+maintain
+\emph default
+ it.
+ Even if you would be the only one having such an interest, you can
+\emph on
+hire
+\emph default
+ a maintainer for it, specifically for your needs.
+ You aren't
+\series bold
+helpless
+\series default
+.
+\end_layout
+
+\begin_layout Section
+Distributed vs Local: Scalability Arguments from Architecture
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Distributed-vs-Local:"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Datacenters aren't usually operated for fun or for hobby.
+ Scalability of an
+\emph on
+architecture
+\emph default
+ (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:What-is-Architecture"
+
+\end_inset
+
+) is very important, because it can seriously limit your business.
+ Overcoming architectural ill-designs can grow extremely cumbersome and
+ costly.
+\end_layout
+
+\begin_layout Standard
+Many enterprise system architects are starting with a particular architecture
+ in mind, called
+\begin_inset Quotes eld
+\end_inset
+
+Big Cluster
+\begin_inset Quotes erd
+\end_inset
+
+.
+ There is a common belief that otherwise
+\series bold
+scalability
+\series default
+ could not be achieved:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/Architecure_Big_Cluster.pdf
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The crucial point is the
+\series bold
+storage network
+\series default
+ here:
+\begin_inset Formula $n$
+\end_inset
+
+ storage servers are interconnected with 
+\begin_inset Formula $m=O(n)$
+\end_inset
+
+ frontend servers, in order to achieve properties like scalability, failure
+ tolerance, etc.
+\end_layout
+
+\begin_layout Standard
+Since
+\emph on
+any
+\emph default
+ of the
+\begin_inset Formula $m$
+\end_inset
+
+ frontends must be able to access
+\emph on
+any
+\emph default
+ of the
+\begin_inset Formula $n$
+\end_inset
+
+ storages in realtime, the storage network must be dimensioned for
+\begin_inset Formula $O(n\cdot m)=O(n^{2})$
+\end_inset
+
+ network connections running in parallel.
+ Even if the total network throughput is scaling only with
+\begin_inset Formula $O(n)$
+\end_inset
+
+, nevertheless
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ network connections have to be maintained at connection oriented protocols
+ and at various layers of the operating software.
+ The network has to
+\emph on
+switch
+\emph default
+ the packets from
+\begin_inset Formula $n$
+\end_inset
+
+ sources to
+\begin_inset Formula $m$
+\end_inset
+
+ destinations (and their opposite way back) in
+\series bold
+realtime
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+This
+\series bold
+cross-bar functionality
+\series default
+ in realtime makes the storage network complicated and expensive.
+ Some further factors are increasing the costs of storage networks:
+\end_layout
+
+\begin_layout Itemize
+In order to limit error propagation from other networks, the storage network
+ is often built as a
+\emph on
+physically separate
+\emph default
+ =
+\emph on
+dedicated
+\emph default
+ network.
+
+\end_layout
+
+\begin_layout Itemize
+Because storage networks are heavily reacting to high latencies and packet
+ loss, they often need to be dimensioned for the
+\series bold
+worst case
+\series default
+ (load peaks, packet storms, etc), needing one of the best = typically most
+ expensive components for reducing latency and increasing throughput.
+ Dimensioning to the worst case instead of an average case plus some safety
+ margins is nothing but an expensive
+\series bold
+overdimensioning
+\series default
+ /
+\series bold
+over-engineering
+\series default
+.
+\end_layout
+
+\begin_layout Itemize
+When
+\series bold
+multipathing
+\series default
+ is required for improving fault tolerance of the storage network itself,
+ these efforts will even
+\emph on
+double
+\emph default
+.
+\end_layout
+
+\begin_layout Itemize
+When geo-redundancy is required, the total effort may easily more than double
+ another time because in cases of disasters like terrorist attacks the backup
+ datacenter must be prepared for taking over for multiple days or weeks.
+\end_layout
+
+\begin_layout Standard
+Fortunately, there is an alternative called
+\begin_inset Quotes eld
+\end_inset
+
+
+\series bold
+Sharding Architecture
+\series default
+
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+
+\series bold
+Shared-nothing Architecture
+\series default
+
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Paragraph
+Definition of Sharding
+\begin_inset CommandInset label
+LatexCommand label
+name "par:Definition-of-Sharding"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Notice that the term
+\begin_inset Quotes eld
+\end_inset
+
+Sharding
+\begin_inset Quotes erd
+\end_inset
+
+ originates from database architecture
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/Shard_(database_architecture)
+\end_layout
+
+\end_inset
+
+ where it has a slightly different meaning than used here.
+ Our usage of the term
+\begin_inset Quotes eld
+\end_inset
+
+sharding
+\begin_inset Quotes erd
+\end_inset
+
+ reflects slightly different situations in some webhosting companies
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+According to
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/Shared-nothing_architecture
+\end_layout
+
+\end_inset
+
+, Google also uses the term
+\begin_inset Quotes eld
+\end_inset
+
+sharding
+\begin_inset Quotes erd
+\end_inset
+
+ for a particular
+\begin_inset Quotes eld
+\end_inset
+
+shared-nothing architecture
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Although our above definition of
+\begin_inset Quotes eld
+\end_inset
+
+sharding
+\begin_inset Quotes erd
+\end_inset
+
+ does not fully comply with its original meaning, a similar usage by Google
+ probably means that our usage of the term is not completely uncommon.
+\end_layout
+
+\end_inset
+
+, and can be certainly transferred to some more application areas.
+ Our more specific use of the term
+\begin_inset Quotes eld
+\end_inset
+
+sharding
+\begin_inset Quotes erd
+\end_inset
+
+ has the following properties,
+\emph on
+all at the same time:
+\end_layout
+
+\begin_layout Enumerate
+User / customer data is
+\series bold
+partitioned
+\series default
+.
+ This is very similar to database sharding.
+ However, the original database term also allows
+\emph on
+some
+\emph default
+ data to remain unpartitioned.
+ In webhosting, such data may also exist, but typically only for 
+\emph on
+system data,
+\emph default
+ like OS images, including large parts of their configuration data.
+ Suchalike system data is typically
+\emph on
+replicated
+\emph default
+ from a central
+\begin_inset Quotes eld
+\end_inset
+
+golden image
+\begin_inset Quotes erd
+\end_inset
+
+ in an
+\emph on
+offline
+\emph default
+ fashion, e.g.
+ via regular
+\family typewriter
+rsync
+\family default
+ cron jobs, etc.
+ Typically, it comprises only a few gigabytes per instance and is mostly
+ read-only with a slow change rate, while total customer data is typically
+ in the range of some petabytes with a higher total change rate.
+\end_layout
+
+\begin_layout Enumerate
+Servers have
+\series bold
+no single point of contention
+\series default
+, and thus are
+\series bold
+completely independent
+\series default
+ from each other, like in
+\series bold
+shared-nothing
+\series default
+ architectures
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/Shared-nothing_architecture
+\end_layout
+
+\end_inset
+
+.
+ However, the original term
+\begin_inset Quotes eld
+\end_inset
+
+shared-nothing
+\begin_inset Quotes erd
+\end_inset
+
+ has also been used for describing
+\emph on
+replicas
+\emph default
+, e.g.
+ DRBD mirrors.
+ In our context of
+\begin_inset Quotes eld
+\end_inset
+
+sharding
+\begin_inset Quotes erd
+\end_inset
+
+, the shared-nothing principle
+\emph on
+only
+\emph default
+ refers to the
+\begin_inset Quotes eld
+\end_inset
+
+
+\series bold
+no single point of contention
+\series default
+
+\begin_inset Quotes erd
+\end_inset
+
+ principle at
+\emph on
+partitioning
+\emph default
+ level, which means it
+\emph on
+only
+\emph default
+ refers to the 
+\emph on
+partitioning
+\emph default
+ of the user data, but
+\emph on
+not
+\emph default
+ to their replicas.
+ Shared-nothing replicas in the sense of DRBD may be also present (and in
+ fact they are at 1&1 Shared Hosting Linux), but these replicas are
+\emph on
+not
+\emph default
+ meant by our usage of the term
+\begin_inset Quotes eld
+\end_inset
+
+sharding
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Customer data replicas form an
+\emph on
+independent
+\emph default
+ dimension called
+\begin_inset Quotes eld
+\end_inset
+
+replication layer
+\begin_inset Quotes erd
+\end_inset
+
+.
+ The replication layer also obeys the shared-nothing principle in its original
+ sense, but it is
+\emph on
+not
+\emph default
+ meant by our term
+\begin_inset Quotes eld
+\end_inset
+
+sharding
+\begin_inset Quotes erd
+\end_inset
+
+ in order to avoid confusion
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that typically
+\family typewriter
+BigCluster
+\family default
+ architectures are also abstracting away their replicas when talking about
+ their architecture.
+\end_layout
+
+\end_inset
+
+ between these two independent dimensions.
+\end_layout
+
+\begin_layout Standard
+Our sharding model does not need a dedicated storage network at all, at
+ least when built and dimensioned properly.
+ Instead, it
+\emph on
+should have
+\emph default
+ (but not always needs) a so-called
+\series bold
+replication network
+\series default
+ which can, when present, be dimensioned much smaller because it needs neither
+ realtime operations nor scalability to 
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/Architecure_Sharding.pdf
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Sharding architectures are extremely well suited when both the input traffic
+ and the data are 
+\series bold
+already partitioned
+\series default
+.
+ For example, when several thousands or even millions of customers are operating
+ on disjoint data sets, like in web hosting where each webspace is residing
+ in its own home directory, or when each of millions of mySQL database instances
+ has to be isolated from its neighbour.
+ Masses of customers are also appearing at cloud storage applications like
+ Cloud Filesystems (e.g.
+ Dropbox or similar).
+\end_layout
+
+\begin_layout Standard
+Even in cases when any customer may potentially access any of the data items
+ residing in the whole storage pool (e.g.
+ like in a search engine), sharding can be often applied.
+ The trick is to create some relatively simple content-based dynamic switching
+ or redirect mechanism in the input network traffic, similar to HTTP load
+ balancers or redirectors.
+\end_layout
+
+\begin_layout Standard
+Only when partitioning of input traffic plus data is not possible in a reasonabl
+e way, big cluster architectures as implemented for example in Ceph or Swift
+ (and partly even possible with MARS when restricted to the block layer)
+ have a very clear use case.
+\end_layout
+
+\begin_layout Standard
+In the following sections, we will see: when sharding is possible, it is
+ the preferred model due to reliability and cost and performance reasons.
+ Another good explanation can be found at
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architectur
+e/
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Variants of Sharding
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Variants-of-Sharding"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Description
+LocalSharding The simplest possible sharding architecture is simply putting
+ both the storage and the compute CPU power onto the same iron.
+\begin_inset Newline newline
+\end_inset
+
+Example: at 1&1 Shared Hosting Linux (ShaHoLin), we have dimensioned several
+ variants of this.
+ (a) we are using 1U pizza boxes with local hardware RAID controllers with
+ fast hardware BBU cache and up to 10 local disks for the majority of LXC container
+ instances where the
+\begin_inset Quotes eld
+\end_inset
+
+small-sized
+\begin_inset Quotes erd
+\end_inset
+
+ customers (up to ~100 GB webspace per customer) are residing.
+ Since most customers have very small home directories with extremely many
+ but small files, this is a very cost-efficient model.
+ (b) less than 1 permille of all customers have > 250 GB (up to 2TB) per
+ home directory.
+ For these few customers we are using another dimensioning variant of the
+ same architecture: 4U servers with 48 high-capacity spindles on 3 RAID
+ sets, delivering a total PV capacity of ~300 TB, which are then cut down
+ to ~10 LXC containers of ~30 TB each.
+\begin_inset Newline newline
+\end_inset
+
+In order to operate this model at a bigger scale, you should consider the
+
+\begin_inset Quotes eld
+\end_inset
+
+container football
+\begin_inset Quotes erd
+\end_inset
+
+ method as described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Principle-of-Background"
+
+\end_inset
+
+ and in chapter
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "chap:LV-Football"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Description
+RemoteSharding This variant needs a (possibly dedicated) storage network,
+ which is however only
+\begin_inset Formula $O(n)$
+\end_inset
+
+ in total.
+ Each storage server exports a block device over iSCSI (or over another
+ transport) to at most
+\begin_inset Formula $O(k)$
+\end_inset
+
+ dedicated compute nodes where
+\begin_inset Formula $k$
+\end_inset
+
+ is some
+\series bold
+constant
+\series default
+.
+\begin_inset Newline newline
+\end_inset
+
+Hint 1: it is advisable to build this type of storage network with
+\series bold
+local switches
+\series default
+ and no routers in between, in order to avoid 
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+-style network architectures and traffic.
+ This reduces error propagation upon network failures.
+ Keep the storage and the compute nodes locally close to each other, e.g.
+ in the same datacenter room, or even in the same rack.
+\begin_inset Newline newline
+\end_inset
+
+Hint 2: additionally, you can provide some (low-dimensioned) backbone for
+
+\series bold
+exceptional(!)
+\series default
+ cross-traffic between the local storage switches.
+ Don't plan to use any realtime cross-traffic
+\emph on
+regularly
+\emph default
+, but only in clear cases of emergency!
+\begin_inset Newline newline
+\end_inset
+
+Notice: in this model, a shard typically consists of one storage node plus
+
+\begin_inset Formula $k+1$
+\end_inset
+
+ or
+\begin_inset Formula $k+2$
+\end_inset
+
+ compute servers, introducing some additional failure redundancy
+\emph on
+within
+\emph default
+ such a shard, while retaining the
+\begin_inset Quotes eld
+\end_inset
+
+no single point of contention
+\begin_inset Quotes erd
+\end_inset
+
+ property
+\emph on
+between
+\emph default
+ the shards (according to the definition
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "par:Definition-of-Sharding"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Description
+FlexibleSharding This is a dynamic combination of LocalSharding and RemoteShardi
+ng, dynamically re-configurable, as explained below.
+\end_layout
+
+\begin_layout Description
+BigClusterSharding The sharding model can also be placed
+\series bold
+on top of
+\series default
+ a BigCluster model, or possibly
+\begin_inset Quotes eld
+\end_inset
+
+internally
+\begin_inset Quotes erd
+\end_inset
+
+ in such a model, leading to a similar effect.
+ Whether this makes sense needs some discussion.
+ It can be used to reduce the
+\emph on
+logical
+\emph default
+ BigCluster size from
+\begin_inset Formula $O(n)$
+\end_inset
+
+ to some
+\begin_inset Formula $O(k)$
+\end_inset
+
+, such that it is no longer a
+\begin_inset Quotes eld
+\end_inset
+
+big cluster
+\begin_inset Quotes erd
+\end_inset
+
+ but a
+\begin_inset Quotes eld
+\end_inset
+
+small cluster
+\begin_inset Quotes erd
+\end_inset
+
+, and thus reducing the serious problems described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+ to some degree.
+ This could make sense in the following use cases:
+\end_layout
+
+\begin_deeper
+\begin_layout Itemize
+When you
+\series bold
+already have
+\series default
+ invested into a big cluster, e.g.
+ Ceph or Swift, which does not really scale and/or does not really deliver
+ the expected reliability.
+ Some possible reasons for this are explained in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+When you really need a
+\emph on
+single
+\emph default
+ LV which is necessarily
+\series bold
+bigger
+\series default
+ than can be reasonably built on top of local LVM.
+ This means, you are likely claiming that you really need
+\series bold
+strict consistency
+\series default
+ as provided by a block device on more than 1 PB with current technology
+ (2018).
+ Examples are very
+\series bold
+big enterprise databases
+\series default
+ like classical SAP (c.f.
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Local-vs-Centralized"
+
+\end_inset
+
+), or if you really need
+\series bold
+POSIX-compliance
+\series default
+ on a single big filesystem instance.
+ Be conscious when you think this is the only solution to your problem.
+ Double-check or triple-check whether there is
+\emph on
+really
+\emph default
+ no other solution than creating such a huge block device and/or such a
+ huge filesystem instance.
+ Such huge SPOFs tend to create problems similar to those described in
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+ for similar reasons.
+\end_layout
+
+\end_deeper
+\begin_layout Standard
+When building a
+\series bold
+new
+\series default
+ storage system, be sure to check the following use cases.
+ You should seriously consider a LocalSharding / RemoteSharding / FlexibleShardi
+ng model in favor of BigClusterSharding when ...
+\end_layout
+
+\begin_layout Itemize
+...
+ when more than 1 LV instance would be placed onto your
+\begin_inset Quotes eld
+\end_inset
+
+small cluster
+\begin_inset Quotes erd
+\end_inset
+
+ shards.
+ Then a
+\series bold
+{Local,Remote,Flexible}Sharding
+\series default
+ model could likely be used instead.
+ Then the total overhead (
+\series bold
+total cost of ownership
+\series default
+) introduced by a BigCluster
+\emph on
+model
+\emph default
+ but actually stripped down to a
+\begin_inset Quotes eld
+\end_inset
+
+SmallCluster
+\begin_inset Quotes erd
+\end_inset
+
+
+\emph on
+implementation / configuration
+\emph default
+ should be examined separately.
+ Does it really pay off?
+\end_layout
+
+\begin_layout Itemize
+...
+ when there are
+\series bold
+legal requirements
+\series default
+ that you can tell at any time where your data is.
+ Typically, this is anything but easy on a BigCluster model, even when stripped
+ down to SmallCluster size.
+\end_layout
+
+\begin_layout Subsection
+FlexibleSharding
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:FlexibleSharding"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that MARS' new remote device feature from the 0.2 branch series (which
+ is a kind of replacement for iSCSI) 
+\emph on
+could
+\emph default
+ be used for implementing some sort of
+\begin_inset Quotes eld
+\end_inset
+
+big cluster
+\begin_inset Quotes erd
+\end_inset
+
+ model at block layer.
+\end_layout
+
+\begin_layout Standard
+Nevertheless, such models re-introducing some kind of
+\begin_inset Quotes eld
+\end_inset
+
+big dedicated storage network
+\begin_inset Quotes erd
+\end_inset
+
+ into MARS operations are not the preferred model.
+ The following is a super-model which combines both the 
+\begin_inset Quotes eld
+\end_inset
+
+big cluster
+\begin_inset Quotes erd
+\end_inset
+
+ and sharding model at block layer in a very flexible way.
+ The following example shows only two servers from a pool consisting of
+ hundreds or thousands of servers:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/MARS_Cluster_on_Demand.pdf
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The idea is to use iSCSI or the MARS remote device
+\emph on
+only where necessary
+\emph default
+.
+ Preferably, local storage is divided into multiple Logical Volumes (LVs)
+ via LVM, which are
+\emph on
+directly
+\emph default
+ used
+\emph on
+locally
+\emph default
+ by Virtual Machines (VMs), such as KVM or filesystem-based variants like
+ LXC containers.
+\end_layout
+
+\begin_layout Standard
+In the above example, the left machine has relatively less CPU power or
+ RAM than storage capacity.
+ Therefore, not
+\emph on
+all
+\emph default
+ LVs could be instantiated locally at the same time without causing operational
+ problems, but
+\emph on
+some
+\emph default
+ of them can be run locally.
+ The example solution is to
+\emph on
+exceptionally(!)
+\emph default
+ export LV3 to the right server, which has some otherwise unused CPU and
+ RAM capacity.
+\end_layout
+
+\begin_layout Standard
+Notice that local operation of VMs doesn't produce any storage network
+ traffic at all.
+ Therefore, this is the preferred runtime configuration.
+\end_layout
+
+\begin_layout Standard
+Only in cases of resource imbalance, such as (transient) CPU or RAM peaks
+ (e.g.
+ caused by DDOS attacks),
+\emph on
+some
+\emph default
+ VMs or containers may be run somewhere else over the network.
+ In a well-balanced and well-dimensioned system, this will be the
+\series bold
+vast minority
+\series default
+, and should only be used for dealing with temporary load peaks etc.
+\end_layout
+
+\begin_layout Standard
+Running VMs directly on the same servers as their storage is a
+\series bold
+major cost reducer.
+\end_layout
+
+\begin_layout Standard
+You simply don't need to buy and operate
+\begin_inset Formula $n+m$
+\end_inset
+
+ servers, but only about
+\begin_inset Formula $\max(n,m)+m\cdot\epsilon$
+\end_inset
+
+ servers, where
+\begin_inset Formula $\epsilon$
+\end_inset
+
+ corresponds to some relatively small extra resources needed by MARS.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In addition to this and to reduced networking costs, there are further cost
+ savings at power consumption, air conditioning, Height Units (HUs), number
+ of HDDs, operating costs, etc as explained below in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Cost-Arguments-from"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Principle of Background Migration
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Principle-of-Background"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The sharding model needs a different approach to load balancing of storage
+ space than the big cluster model.
+ There are several possibilities at different layers, each addressing different
+
+\series bold
+granularities
+\series default
+:
+\end_layout
+
+\begin_layout Itemize
+Moving customer data at filesystem or database level via
+\family typewriter
+rsync
+\family default
+ or
+\family typewriter
+mysqldump
+\family default
+ or similar.
+
+\begin_inset Newline newline
+\end_inset
+
+Example: at 1&1 Shared Hosting Linux, we have about 9 million customer
+ home directories.
+ We also have a script
+\family typewriter
+movespace.pl
+\family default
+ using incremental
+\family typewriter
+tar
+\family default
+ for their moves.
+ Now, if we would try to move around
+\emph on
+all
+\emph default
+ of them this way, it could easily take years or even decades for millions
+ of extremely small home directories, due to overhead like DNS updates etc.
+ However, there exist a small handful of large customer home directories
+ in the terabyte range.
+ For these, and only for these, it is a clever idea to use
+\family typewriter
+movespace.pl
+\family default
+ because thereby the size of a LV can be regulated more fine grained than
+ at LV level.
+\end_layout
+
+\begin_layout Itemize
+Dynamically growing the sizes of LVs during operations:
+\family typewriter
+lvresize
+\family default
+ followed by
+\family typewriter
+marsadm resize
+\family default
+ followed by
+\family typewriter
+xfs_growfs
+\family default
+ or similar operations.
+\end_layout
+
+\begin_layout Itemize
+Moving whole LVs via MARS, as shown in the following example:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/MARS_Background_Migration.pdf
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The idea is to dynamically create
+\emph on
+additional
+\emph default
+ LV replicas for the sake of
+\series bold
+background migration
+\series default
+.
+ Examples:
+\end_layout
+
+\begin_layout Itemize
+In case you had no redundancy at LV level before, you have
+\begin_inset Formula $k=1$
+\end_inset
+
+ replicas during ordinary operation.
+ If not yet done, you should transparently introduce MARS into your LVM-based
+ stack by using the so-called
+\begin_inset Quotes eld
+\end_inset
+
+standalone mode
+\begin_inset Quotes erd
+\end_inset
+
+ of MARS.
+ When necessary, create the first MARS replica with
+\family typewriter
+marsadm create-resource
+\family default
+ on your already-existing LV data, which is retained unmodified, and restart
+ your application again.
+ Now, for the sake of migration, you just create an additional replica at
+ another server via
+\family typewriter
+marsadm join-resource
+\family default
+ there and wait until the second mirror has been fully
+\series bold
+synced
+\series default
+ in background, while your application is running and while the contents
+ of the LV are modified 
+\emph on
+in parallel
+\emph default
+ by your ordinary applications.
+ Then you do a primary
+\series bold
+handover
+\series default
+ to your mirror.
+ This is usually a matter of minutes, or even seconds.
+ Once the application runs again at the new location, you can delete the
+ old replica via
+\family typewriter
+marsadm leave-resource
+\family default
+ and
+\family typewriter
+lvremove
+\family default
+.
+ Finally, you may re-use the freed-up space for something else (e.g.
+
+\family typewriter
+lvresize
+\family default
+ of
+\emph on
+another
+\emph default
+ LV followed by
+\family typewriter
+marsadm resize
+\family default
+ followed by
+\family typewriter
+xfs_growfs
+\family default
+ or similar).
+ For the sake of some hardware lifecycle, you may run a different strategy:
+ evacuate the original source server completely via the above MARS migration
+ method, and eventually decommission it.
+\end_layout
+
+\begin_layout Itemize
+In case you already have a redundant LV copy somewhere, you should run a
+ similar procedure, but starting with
+\begin_inset Formula $k=2$
+\end_inset
+
+ replicas, and temporarily increasing the number of replicas to either
+\begin_inset Formula $k'=3$
+\end_inset
+
+ when moving each replica step-by-step, or you may even directly go up to
+
+\begin_inset Formula $k'=4$
+\end_inset
+
+ when moving pairs at once.
+\begin_inset Newline newline
+\end_inset
+
+Example: see
+\family typewriter
+football.sh
+\family default
+ in the
+\family typewriter
+football/
+\family default
+ directory of MARS, which is a checkout of the Football sub-project (see
+ chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:LV-Football"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Itemize
+When already starting with
+\begin_inset Formula $k>2$
+\end_inset
+
+ LV replicas in the starting position, you can do the same analogously,
+ or you may then use a lesser variant.
+ For example, we have some mission-critical servers at 1&1 which are running
+
+\begin_inset Formula $k=4$
+\end_inset
+
+ replicas all the time on relatively small but important LVs for extremely
+ increased safety.
+ Only in such a case, you may have the freedom to temporarily decrease from
+
+\begin_inset Formula $k=4$
+\end_inset
+
+ to
+\begin_inset Formula $k'=3$
+\end_inset
+
+ and then go up to 
+\begin_inset Formula $k''=4$
+\end_inset
+
+ again.
+ This has the advantage of requiring less temporary storage space for
+\emph on
+swapping
+\emph default
+ some LVs.
+\end_layout
+
+\begin_layout Section
+Cost Arguments
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Cost-Arguments-from"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+A common pre-judgement is that 
+\begin_inset Quotes eld
+\end_inset
+
+big cluster
+\begin_inset Quotes erd
+\end_inset
+
+ is the cheapest scaling storage technology when built on so-called
+\begin_inset Quotes eld
+\end_inset
+
+commodity hardware
+\begin_inset Quotes erd
+\end_inset
+
+.
+ While this is very often true for the
+\begin_inset Quotes eld
+\end_inset
+
+commodity hardware
+\begin_inset Quotes erd
+\end_inset
+
+ part, it is often not true for the
+\begin_inset Quotes eld
+\end_inset
+
+big cluster
+\begin_inset Quotes erd
+\end_inset
+
+ part.
+ But let us first look at the
+\begin_inset Quotes eld
+\end_inset
+
+commodity
+\begin_inset Quotes erd
+\end_inset
+
+ part.
+\end_layout
+
+\begin_layout Subsection
+Cost Arguments from Technology
+\end_layout
+
+\begin_layout Standard
+Here are some rough market prices for basic storage as determined around
+ end of 2016 / start of 2017:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+Technology
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+Enterprise-Grade
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+Price in € / TB
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+Consumer SATA disks via on-board SATA controllers
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+no (small-scale)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+< 30 possible
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+SAS disks via SAS HBAs (e.g.
+ in external 14
+\begin_inset Quotes erd
+\end_inset
+
+ shelves)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+halfway
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+< 80
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+SAS disks via hardware RAID + LVM (+DRBD/MARS)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+80 to 150
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+Commercial storage appliances via iSCSI
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+around 1000
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+Cloud storage, S3 over 5 years lifetime
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size small
+3000 to 8000
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+You can see that any self-built and self-administered storage (whose price
+ varies with slower high-capacity disks versus faster low-capacity disks)
+ is much cheaper than any commercial offering by about a factor of 10 or
+ even more.
+ If you need to operate several petabytes of data, self-built storage is
+ always cheaper than a commercial one, even if additional manpower is needed
+ for commissioning and operating.
+ You don't have to pay the shareholders of the storage provider.
+ Here we just assume that the storage is needed permanently for at least
+ 5 years, as is the case in web hosting, databases, backup / archival systems,
+ and many other application areas.
+\end_layout
+
+\begin_layout Standard
+Commercial offerings of cloud storage are way too much hyped.
+ Some people apparently don't know that the generic term
+\begin_inset Quotes eld
+\end_inset
+
+Cloud Storage
+\begin_inset Quotes erd
+\end_inset
+
+ refers to a
+\emph on
+storage class
+\emph default
+, not to a particular
+\emph on
+instance
+\emph default
+ like original Amazon S3, and that it is possible to build and operate almost
+ any instance of any storage class yourself.
+ From a commercial perspective,
+\series bold
+outsourcing
+\series default
+ of
+\emph on
+huge masses
+\emph default
+of enterprise-critical storage (to whatever class of storage) usually pays
+ off
+\series bold
+only when
+\series default
+ your storage demands are either
+\emph on
+relatively low
+\emph default
+, or are
+\emph on
+extremely
+\emph default
+ varying over time, and/or when you need some
+\emph on
+extra
+\emph default
+ capacity only
+\emph on
+temporarily
+\emph default
+ for a
+\emph on
+very
+\emph default
+ short time.
+\end_layout
+
+\begin_layout Subsection
+Cost Arguments from Architecture
+\end_layout
+
+\begin_layout Standard
+In addition to basic storage prices, many further factors come into play
+ when roughly comparing big cluster architectures versus sharding.
+ The following table bears the
+\emph on
+unrealistic assumption
+\emph default
+ that BigCluster can be reliably operated with 2 replicas (
+\family roman
+\series medium
+\shape up
+\size normal
+\emph off
+\bar no
+\strikeout off
+\uuline off
+\uwave off
+\noun off
+\color none
+the suffix
+\begin_inset Formula $\times2$
+\end_inset
+
+
+\family default
+\series default
+\shape default
+\size default
+\emph default
+\bar default
+\strikeout default
+\uuline default
+\uwave default
+\noun default
+\color inherit
+ means with additional geo-redundancy):
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+BC
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+SHA
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+BC
+\begin_inset Formula $\times2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+SHA
+\begin_inset Formula $\times2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+# of Disks
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+>200%
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+<120%
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+>400%
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+<240%
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+# of Servers
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times1.1$
+\end_inset
+
+ possible
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times4$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2.2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Power Consumption
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times1.1$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times4$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2.2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+HU Consumption
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times1.1$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times4$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2.2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+As shown in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+, two replicas are typically not sufficient for BigCluster.
+ Even addicts of BigCluster are typically recommending 3 replicas in some
+ so-called
+\begin_inset Quotes eld
+\end_inset
+
+best practices
+\begin_inset Quotes erd
+\end_inset
+
+, leading to the following more realistic table:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+BC
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+SHA
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+BC
+\begin_inset Formula $\times2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+SHA
+\begin_inset Formula $\times2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+# of Disks
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+>300%
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+<120%
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+>600%
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+<240%
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+# of Servers
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times3$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times1.1$
+\end_inset
+
+ possible
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times6$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2.2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Power Consumption
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times3$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times1.1$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times6$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2.2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+HU Consumption
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times3$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times1.1$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times6$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Formula $\approx\times2.2$
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The crucial point is not only the number of extra servers needed for dedicated
+ storage boxes, but also the total number of HDDs.
+ While big cluster implementations like Ceph or Swift can
+\emph on
+theoretically
+\emph default
+ use some erasure encoding for avoiding full object replicas, their
+\emph on
+practice
+\emph default
+ as seen in internal 1&1 Ceph clusters is similar to RAID-10, but just on
+ objects instead of block-based sectors.
+\end_layout
+
+\begin_layout Standard
+Therefore a big cluster typically needs >300% disks to reach the same net
+ capacity as a simple sharded cluster.
+ The latter can typically take advantage of hardware RAID-60 with a significantl
+y smaller disk overhead, while providing sufficient failure tolerance at
+ disk level.
+\end_layout
+
+\begin_layout Standard
+There is a surprising consequence from this: geo-redundancy is not as expensive
+ as many people believe.
+ It just needs to be built with the proper architecture.
+ A sharded geo-redundant pool based on hardware RAID-60 (last column
+\begin_inset Quotes eld
+\end_inset
+
+SHA
+\begin_inset Formula $\times2$
+\end_inset
+
+
+\begin_inset Quotes erd
+\end_inset
+
+) costs typically
+\emph on
+less
+\emph default
+ than a non-georedundant big cluster with typically needed / recommended
+ number of replicas (column
+\begin_inset Quotes eld
+\end_inset
+
+BC
+\begin_inset Quotes erd
+\end_inset
+
+).
+ A geo-redundant sharded pool provides even better failure compensation
+ (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+Notice that geo-redundancy implies by definition that an unforeseeable
+\series bold
+full datacenter loss
+\series default
+ (e.g.
+ caused by
+\series bold
+disasters
+\series default
+ like a terrorist attack or an earthquake) must be compensated for
+\series bold
+several days or weeks
+\series default
+.
+ Therefore it is
+\emph on
+not
+\emph default
+ sufficient to take a big cluster and just spread it to two different locations.
+\end_layout
+
+\begin_layout Standard
+In any case, a MARS-based geo-redundant sharding pool is cheaper than using
+ commercial storage appliances which are much more expensive by their nature.
+\end_layout
+
+\begin_layout Section
+Reliability Arguments from Architecture
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Reliability-Arguments-from"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+A contemporary common belief is that big clusters and their random replication
+ methods would provide better reliability than anything else.
+ There are some practical observations at 1&1 and its daughter companies
+ which cannot confirm this.
+\end_layout
+
+\begin_layout Standard
+Similar experiences are part of a USENIX paper about copysets, see
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://www.usenix.org/system/files/conference/atc13/atc13-cidon.pdf
+\end_layout
+
+\end_inset
+
+.
+ Their proposed solution is different from the solution proposed here, but
+ interestingly their
+\emph on
+problem analysis
+\emph default
+ part contains not only similar observations, but also comes to similar
+ conclusions about random replication.
+ Citation from the abstract:
+\end_layout
+
+\begin_layout Quote
+However, random replication is
+\series bold
+almost guaranteed
+\series default
+ to lose data in the common scenario of simultaneous node failures due to
+ cluster-wide power outages.
+
+\size footnotesize
+ [emphasis added by me]
+\end_layout
+
+\begin_layout Standard
+Stimulated by our practical experiences even in truly less disastrous scenarios
+ than mass power outage, theoretical explanations were sought.
+ Surprisingly, they show that LocalSharding is superior to true big clusters
+ under practically important preconditions.
+ Here is an intuitive explanation.
+ A detailed mathematical description of the model can be found in appendix
+
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "chap:Mathematical-Model-of"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Storage Server Node Failures
+\end_layout
+
+\begin_layout Subsubsection
+Simple intuitive explanation
+\end_layout
+
+\begin_layout Standard
+Block-level replication systems like DRBD are constructed for failover in
+ local redundancy scenarios.
+ Or, when using MARS, even for geo-redundant failover scenarios.
+ They are traditionally dealing with
+\series bold
+pairs
+\series default
+ of servers, or with triples, etc.
+ In order to get a storage incident with them,
+\emph on
+both
+\emph default
+ sides of a DRBD or MARS small-cluster (also called
+\series bold
+shard
+\series default
+ in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "par:Definition-of-Sharding"
+
+\end_inset
+
+) must have an incident
+\emph on
+at the same time
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+In contrast, big clusters are conceptually spreading their objects over
+ a huge number of nodes
+\begin_inset Formula $O(n)$
+\end_inset
+
+, with some redundancy degree
+\begin_inset Formula $k$
+\end_inset
+
+ denoting the number of replicas.
+ As a consequence,
+\emph on
+any
+\emph default
+
+\begin_inset Formula $k$
+\end_inset
+
+ node failures out of
+\begin_inset Formula $O(n)$
+\end_inset
+
+ will produce an incident.
+ For example, when
+\begin_inset Formula $k=2$
+\end_inset
+
+ and
+\begin_inset Formula $n$
+\end_inset
+
+ is equal for both models, then
+\emph on
+any
+\emph default
+ combination of two node failures occurring at the same time will lead to
+ an incident:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/Incident_Probabilities.pdf
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Intuitively, it is easy to see that hitting both members of the same pair
+ at the same time is less likely than hitting
+\emph on
+any
+\emph default
+ two nodes of a big cluster.
+\end_layout
+
+\begin_layout Standard
+If you are curious about some concrete numbers, read on.
+\end_layout
+
+\begin_layout Subsubsection
+Detailed explanation
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Detailed-explanation"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+For the sake of simplicity, the following more detailed explanation is based
+ on the following assumptions:
+\end_layout
+
+\begin_layout Itemize
+We are looking at
+\series bold
+storage node
+\series default
+ failures only.
+\end_layout
+
+\begin_layout Itemize
+Disk failures are regarded as already solved (e.g.
+ by local RAID-6 or by the well-known compensation mechanisms of big clusters).
+ Only in case they don't work, they are mapped to node failures, and are
+ already included in the probability of storage node failures.
+\end_layout
+
+\begin_layout Itemize
+We only look at
+\series bold
+data replication
+\series default
+ with a redundancy degree of a relatively small
+\begin_inset Formula $k$
+\end_inset
+
+.
+ CRC methods are not used across storage nodes, but may be present
+\emph on
+internally
+\emph default
+ at some storage nodes, e.g.
+ RAID-5 or RAID-6 or similar methods.
+ Notice that CRC methods generally involve very high overhead, and won't
+ even work in realtime across long distances (geo-redundancy).
+\end_layout
+
+\begin_layout Itemize
+We restrict ourselves to temporary /
+\series bold
+transient
+\series default
+ failures, without regarding permanent data loss.
+ Otherwise, the differences between local-storage sharding architectures
+ and big clusters would become even worse.
+ When losing some physical storage nodes forever in a big cluster, it is
+ typically anything but easy to determine which data of which application
+ instances / customers have been affected, and which will need a restore
+ from backup.
+\end_layout
+
+\begin_layout Itemize
+Storage network failures (as a whole) are ignored.
+ Otherwise a fair comparison between the architectures would become difficult.
+ If they were taken into account, the advantages of LocalSharding would
+ become even bigger.
+\end_layout
+
+\begin_layout Itemize
+We assume that the storage network (when present) forms no bottleneck.
+ Network implementations like TCP/IP versus Infiniband or similar are thus
+ ignored.
+\end_layout
+
+\begin_layout Itemize
+Software failures / bugs are also ignored.
+ We only compare
+\emph on
+architectures
+\emph default
+ here, not their various implementations.
+\end_layout
+
+\begin_layout Itemize
+The x axis shows the number of basic storage units
+\begin_inset Formula $n$
+\end_inset
+
+ from an
+\emph on
+application
+\emph default
+ perspective, meaning
+\begin_inset Quotes eld
+\end_inset
+
+usable storage
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+net amount of storage
+\begin_inset Quotes erd
+\end_inset
+
+.
+ For simplicity of the model, one basic application storage unit is equal
+ to the total disk space provided by one physical storage node in the special
+ case of
+\begin_inset Formula $k=1$
+\end_inset
+
+ replicas.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Attention! when increasing the number of replicas
+\begin_inset Formula $k$
+\end_inset
+
+, the total number of storage nodes needs to be
+\series bold
+increased accordingly
+\series default
+.
+ Typically, you will need to deploy
+\begin_inset Formula $k\cdot n$
+\end_inset
+
+ physical storage nodes in order to get
+\begin_inset Formula $n$
+\end_inset
+
+ net storage units from a user's perspective.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Attention!
+\begin_inset space ~
+\end_inset
+
+
+\begin_inset Formula $k$
+\end_inset
+
+ has a strong influence on the
+\series bold
+price tag
+\series default
+ of any of the competing architectures.
+ You cannot assume an
+\begin_inset Quotes eld
+\end_inset
+
+infinite amount of money
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Therefore, only relatively small
+\begin_inset Formula $k$
+\end_inset
+
+ are bearable for business cases.
+\end_layout
+
+\begin_layout Itemize
+We assume that the number of application instances is linearly scaling with
+
+\begin_inset Formula $n$
+\end_inset
+
+.
+ For simplicity, we assume that the number of applications running on the
+ whole pool is exactly
+\begin_inset Formula $n$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+We assume that the storage nodes are (almost completely) filled with data
+ (sectors with RAID, and/or objects with BigCluster).
+\end_layout
+
+\begin_layout Itemize
+We assume that the number of sectors / objects per storage node is
+\begin_inset Quotes eld
+\end_inset
+
+very large
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Some examples: a logical volume of 4 TB has 1,000,000,000 sectors or objects,
+ each 4 KB in size.
+ A physical storage node providing 40 TB of storage will then provide 10
+ billion sectors / objects.
+\end_layout
+
+\begin_layout Itemize
+For the BigCluster architecture, we assume that all objects are always distribut
+ed to
+\begin_inset Formula $O(n)$
+\end_inset
+
+ nodes.
+ For simplicity of the model, we assume a distribution via a
+\emph on
+uniform
+\emph default
+ hash function.
+ When other hash functions were used (e.g.
+ distributing only to a constant number of nodes), it would no longer be
+ a big cluster architecture in our sense.
+\begin_inset Newline newline
+\end_inset
+
+In the following example, we assume a uniform object distribution to exactly
+
+\begin_inset Formula $n$
+\end_inset
+
+ nodes.
+ Notice that any other
+\begin_inset Formula $n'=O(n)$
+\end_inset
+
+ with
+\begin_inset Formula $n'
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+LocalSharding
+\size tiny
+(DRBDorMARS)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+A up
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+A down
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+B up
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+0
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+B down
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+BigCluster
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+A up
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+A down
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+B up
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+0
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+B down
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+What is the heart of the difference? While a node failure at LocalSharding
+ (DRBDorMARS) will tear down only the local application, the teardown produced
+ by BigCluster will spread to
+\emph on
+all
+\emph default
+ of the
+\begin_inset Formula $n=2$
+\end_inset
+
+ application units, because of the uniform hashing and because we have only
+
+\begin_inset Formula $k=1$
+\end_inset
+
+ replica.
+\end_layout
+
+\begin_layout Standard
+Would it help to increase both
+\begin_inset Formula $n$
+\end_inset
+
+ and
+\begin_inset Formula $k$
+\end_inset
+
+ to larger values?
+\end_layout
+
+\begin_layout Standard
+In the following graphics, the thick red line shows the behaviour for
+\begin_inset Formula $k=1$
+\end_inset
+
+ PlainServers (which is the same as
+\begin_inset Formula $k=1$
+\end_inset
+
+ DRBDorMARS) with increasing number of storage units
+\begin_inset Formula $n,$
+\end_inset
+
+ ranging from 1 to 10,000 storage units = number of servers for
+\begin_inset Formula $k=1$
+\end_inset
+
+.
+ Higher values of
+\begin_inset Formula $k\in[1,4]$
+\end_inset
+
+ are also displayed.
+ All lines corresponding to the same
+\begin_inset Formula $k$
+\end_inset
+
+ are drawn in the same color.
+ Notice that both the x and y axis are logscale:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/SERVICE_Comparison_of_Reversible_StorageNode_Failures.pdf
+ lyxscale 200
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+When you look at the thin solid BigCluster lines for
+\begin_inset Formula $k=2,\ldots$
+\end_inset
+
+ drawn in different colors, you may wonder why they are all converging
+ to the thin red BigCluster line, which corresponds to
+\begin_inset Formula $k=1$
+\end_inset
+
+ BigCluster.
+ And they also converge towards the grey dotted topmost line indicating
+ the total possible uptime of all applications (depending on x).
+ It can be explained as follows:
+\end_layout
+
+\begin_layout Standard
+The x axis shows the number of basic storage units.
+ When you have to create 10,000 storage units with a replication degree
+ of
+\begin_inset Formula $k=2$
+\end_inset
+
+ replicas, then you will have to deploy
+\begin_inset Formula $k*10,000=20,000$
+\end_inset
+
+ servers in total.
+ When operating a pool of 20,000 servers, on statistical average 2 of these
+ servers will be down at any given point in time.
+ However, 2 is the same number as the replication degree
+\begin_inset Formula $k.$
+\end_inset
+
+ Because our BigCluster model as defined above will distribute
+\emph on
+all
+\emph default
+ objects to
+\emph on
+all
+\emph default
+ servers uniformly, there will almost always
+\emph on
+exist
+\emph default
+ some objects for which no replica is available at any given point in time.
+ This means, you will almost always have a
+\series bold
+permanent incident
+\series default
+ involving the same number of nodes as your replication degree
+\begin_inset Formula $k$
+\end_inset
+
+, and in turn
+\emph on
+some
+\emph default
+ of your objects will not be accessible at all.
+ This means, at
+\begin_inset Formula $x=10,000$
+\end_inset
+
+ storage units you will lose almost any advantage from increasing the number
+ of replicas.
+ Adding more replicas will no longer help at
+\begin_inset Formula $x\geq10,000$
+\end_inset
+
+ storage units.
+\end_layout
+
+\begin_layout Standard
+Notice that the
+\emph on
+solid
+\emph default
+ lines are showing the probability of
+\emph on
+some
+\emph default
+ incident, disregarding the
+\series bold
+size of the incident
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+What about the
+\emph on
+dashed
+\emph default
+ lines showing much better behaviour for BigCluster?
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Under some further preconditions, it would be possible to argue with the
+
+\emph on
+size
+\emph default
+ of incidents.
+ However, now a big fat warning.
+ When you are
+\series bold
+responsible
+\series default
+ for operations of thousands of servers, you should be very conscious about
+ these preconditions.
+ Otherwise you could risk your career.
+ In short:
+\end_layout
+
+\begin_layout Itemize
+When your application, e.g.
+ a smartphone app, consists of accessing only 1 object at all during a reasonabl
+y long timeframe, you can safely
+\series bold
+assume that there is no interdependency
+\series default
+ between all of your objects.
+ In addition, you have to assume (and you should check) that your cluster
+ operating software as a whole does not introduce any further
+\series bold
+hidden / internal interdependencies
+\series default
+.
+ Only in this case, and only then, you can take the dashed lines arguing
+ with the number of inaccessible objects instead of with the number of basic
+ storage units.
+\end_layout
+
+\begin_layout Itemize
+Whenever your application uses
+\series bold
+bigger structured logical objects
+\series default
+, such as filesystems or block devices or whole VMs / containers, then you
+ likely will get
+\series bold
+interdependent objects
+\series default
+ at your big cluster storage layer.
+\begin_inset Newline newline
+\end_inset
+
+Practical example: experienced sysadmins will confirm that even a data loss
+ rate of only 1/1,000,000 of blocks in a classical Linux filesystem like
+
+\family typewriter
+xfs
+\family default
+ or
+\family typewriter
+ext4
+\family default
+ will likely imply the need for an offline filesystem check (
+\family typewriter
+fsck
+\family default
+), which is a major incident for the affected filesystem instances.
+\begin_inset Newline newline
+\end_inset
+
+Theoretical explanation: servers are running for a very long time, and filesyste
+ms are typically also mounted for a long time.
+ Notice that the probability of hitting any vital filesystem data roughly
+ equals the probability of hitting any other data.
+ Sooner or later, any defective sector in the metadata structures or in
+ freespace management etc will stop your whole filesystem, and in turn will
+ stop your application instance(s) running on top of it.
+\begin_inset Newline newline
+\end_inset
+
+Similar arguments hold for transient failures: most classical filesystems
+ are not constructed for compensation of hanging IO, typically leading to
+
+\series bold
+system hangs
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Blindly taking the dashed lines will expose you to a high risk of error.
+ Practical experience shows that there are often
+\series bold
+hidden dependencies
+\series default
+ in many applications, often also at application level.
+ You cannot necessarily see them when inspecting their data structures!
+ You will only notice some of them by analyzing their
+\series bold
+runtime behaviour
+\series default
+, e.g.
+ with tools like
+\family typewriter
+strace
+\family default
+.
+ Notice that in general the runtime behaviour of an arbitrary program is
+
+\series bold
+undecidable
+\series default
+.
+ Be cautious when drawing assumptions out of thin air!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Conversely, the assumption that
+\emph on
+any
+\emph default
+ inaccessible object may halt your application might be too strong for
+
+\emph on
+some
+\emph default
+ use cases.
+ Therefore, some practical behaviour may be in between the solid thin lines
+ and the dashed lines of some given color.
+ Be extremely careful when constructing such an intermediate case.
+ The above example of a loss rate of 1/1,000,000 of sectors in a classical
+ filesystem should not be extended to lower values like 1/1,000,000,000
+ without knowing exactly how the filesystem works, and how it will react
+
+\emph on
+in detail
+\emph default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In general, it is insufficient to analyze the logical dependencies inside
+ of a filesystem instance, such as which inode contains some pointers to
+ which other filesystem objects, etc.
+ There exist further
+\series bold
+runtime dependencies
+\series default
+, such as
+\family typewriter
+nr_requests
+\family default
+ block-layer restrictions on IO queue depths, and/or capabilities / limitations
+ of the hardware, and so on.
+ Trying to model all of these influences in a reasonable way could be a
+
+\emph on
+major
+\emph default
+ research undertaking outside the scope of this MARS manual.
+\end_layout
+
+\end_inset
+
+.
+ The grey zone between the extreme cases thin solid vs dashed is a
+\series bold
+dangerous zone
+\series default
+!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+If you want to stay on the
+\series bold
+safe side
+\series default
+, simply obey the fundamental law as explained in the next section:
+\end_layout
+
+\begin_layout Subsection
+Optimum Reliability from Architecture
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Optimum-Reliability-from"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Another argument could be: don't distribute the BigCluster objects to exactly
+
+\begin_inset Formula $n$
+\end_inset
+
+ nodes, but to fewer nodes.
+ Would the result be better than DRBDorMARS LocalSharding?
+\end_layout
+
+\begin_layout Standard
+When distributing to
+\begin_inset Formula $O(k')$
+\end_inset
+
+ nodes with some constant
+\begin_inset Formula $k'$
+\end_inset
+
+, we have no longer a BigCluster architecture, but a mixed BigClusterSharding
+ form.
+\end_layout
+
+\begin_layout Standard
+As can be generalized from the above tables, the reliability of
+\series bold
+any
+\series default
+ BigCluster on
+\begin_inset Formula $k'>k$
+\end_inset
+
+ nodes is
+\series bold
+always
+\series default
+ worse than that of LocalSharding on exactly
+\begin_inset Formula $k$
+\end_inset
+
+ nodes, where
+\begin_inset Formula $k$
+\end_inset
+
+ is also the redundancy degree.
+ In general:
+\end_layout
+
+\begin_layout Quote
+
+\series bold
+\size large
+The LocalSharding model is the optimum model for reliability of operation,
+ compared to any other model truly distributing its data and operations
+ over truly more nodes, like RemoteSharding or BigClusterSharding or BigCluster
+ does.
+\end_layout
+
+\begin_layout Standard
+There exists no better model because shards consisting of exactly
+\begin_inset Formula $k$
+\end_inset
+
+ nodes where
+\begin_inset Formula $k$
+\end_inset
+
+ is the redundancy degree are already the
+\emph on
+smallest possible shards
+\emph default
+ under the assumptions of section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Detailed-explanation"
+
+\end_inset
+
+.
+ Any other model truly involving
+\begin_inset Formula $k'>k$
+\end_inset
+
+ nodes for distribution of objects at any shard is
+\series bold
+always
+\series default
+ worse in the dimension of reliability.
+ Thus the above sentence follows by induction.
+\end_layout
+
+\begin_layout Standard
+The above sentence is formulating a
+\series bold
+fundamental law of storage systems
+\series default
+.
+\end_layout
+
+\begin_layout Subsection
+Error Propagation to Client Mountpoints
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Error-Propagation-to"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The following is only applicable when filesystems (or their objectstore
+ counterparts) are exported over a storage network, in order to be mounted
+ in parallel at
+\begin_inset Formula $O(n)$
+\end_inset
+
+ mountpoints each.
+\end_layout
+
+\begin_layout Standard
+In such a scenario, any problem / incident inside of your storage pool for
+ the filesystem instances will be spread to
+\begin_inset Formula $O(n)$
+\end_inset
+
+ clients, leading to an increase of the incident size by a factor of
+\begin_inset Formula $O(n)$
+\end_inset
+
+ when measured in number of affected mountpoints:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/MOUNTPOINTS_Comparison_of_Reversible_StorageNode_Failures.pdf
+ lyxscale 200
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+As a result, we now have a total of
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ mountpoints = our new basic application units.
+ Such
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ architectures are quickly becoming even worse than before.
+ Thus a clear warning: don't try to build systems in such a way.
+\end_layout
+
+\begin_layout Standard
+Notice: DRBD or MARS are traditionally used for running the application
+ on the same box as the storage.
+ Thus they are not vulnerable to these kinds of failure propagation over
+ network.
+ Even with traditional iSCSI exports over DRBD or MARS, you won't have such
+ problems.
+ Your only chance to increase the error propagation is
+\begin_inset Formula $O(n)$
+\end_inset
+
+ NFS or
+\family typewriter
+glusterfs
+\family default
+ exports to
+\begin_inset Formula $O(n)$
+\end_inset
+
+ clients leading to a total number of
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ mountpoints, or similar setups.
+\end_layout
+
+\begin_layout Standard
+Clear advice: don't do that.
+ It's a bad idea.
+\end_layout
+
+\begin_layout Subsection
+Similarities and Differences to Copysets
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Similarities-and-differences"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This section is mostly of academic interest.
+ You can skip it when looking for practical advice.
+\end_layout
+
+\begin_layout Standard
+The USENIX paper about copysets (see
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://www.usenix.org/system/files/conference/atc13/atc13-cidon.pdf
+\end_layout
+
+\end_inset
+
+) relates to the Sharding model in the following way:
+\end_layout
+
+\begin_layout Paragraph
+Similarities
+\end_layout
+
+\begin_layout Standard
+The concept of Random Replication of the storage data to a large number of
+ machines will reduce reliability.
+ When choosing too big sets of storage machines, then the storage system
+ as a whole will become practically unusable.
+ This is common ground between the USENIX paper and the Sharding Approach
+ as advocated here.
+\end_layout
+
+\begin_layout Paragraph
+Differences
+\end_layout
+
+\begin_layout Standard
+The USENIX paper and many other Cloud Storage approaches are
+\emph on
+presuming
+\emph default
+ that there exists a storage network, allowing real-time distribution of
+ replicas over this kind of network.
+\end_layout
+
+\begin_layout Standard
+In contrast, the Sharding Approach to Cloud Storage tries to
+\emph on
+avoid
+\emph default
+ real-time storage networks
+\emph on
+as much as possible
+\emph default
+.
+ Notice that RemoteSharding and further variants (including future improvements)
+ do
+\emph on
+not
+\emph default
+ preclude it, but are trying to
+\emph on
+avoid
+\emph default
+ real-time storage network traffic.
+ Instead, the load-balancing problem is addressed via
+\series bold
+background data migration
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+This changes the
+\emph on
+temporal granularity
+\emph default
+ of data access: many real-time accesses are
+\emph on
+shifted over
+\emph default
+ to migration processes, which in turn are weakening the requirements on
+ the network.
+\end_layout
+
+\begin_layout Standard
+In detail, there are some more differences to the USENIX paper.
+ Some examples:
+\end_layout
+
+\begin_layout Itemize
+Terminology: the scatter width
+\begin_inset Formula $S$
+\end_inset
+
+ is defined (see page 39 of the paper) as: each node's data is split
+\emph on
+uniformly
+\emph default
+ across a group of
+\begin_inset Formula $S$
+\end_inset
+
+
+\emph on
+other
+\emph default
+ nodes.
+ In contrast, we neither assume uniformity, nor do we require the data
+ to be distributed to
+\emph on
+other
+\emph default
+ nodes.
+ By using the term
+\begin_inset Quotes eld
+\end_inset
+
+other
+\begin_inset Quotes erd
+\end_inset
+
+, the USENIX paper (as well as many other BigCluster approaches) is probably
+ presuming something like a distinction between
+\begin_inset Quotes eld
+\end_inset
+
+client
+\begin_inset Quotes erd
+\end_inset
+
+ and
+\begin_inset Quotes eld
+\end_inset
+
+server
+\begin_inset Quotes erd
+\end_inset
+
+ machines: while data processing is done on a
+\begin_inset Quotes eld
+\end_inset
+
+client
+\begin_inset Quotes erd
+\end_inset
+
+, data storage is on a
+\begin_inset Quotes eld
+\end_inset
+
+server
+\begin_inset Quotes erd
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Itemize
+We don't disallow this in variants like RemoteSharding or FlexibleSharding
+ and so on, but we gave some arguments why we are trying to
+\emph on
+avoid
+\emph default
+ this.
+\end_layout
+
+\begin_layout Itemize
+It seems that some definitions in the USENIX paper may implicitly relate
+ to
+\begin_inset Quotes eld
+\end_inset
+
+each chunk
+\begin_inset Quotes erd
+\end_inset
+
+.
+ In contrast, the Sharding Approach typically relates to LVs (logical volumes),
+ which could however be viewed as a special case of
+\begin_inset Quotes eld
+\end_inset
+
+chunk
+\begin_inset Quotes erd
+\end_inset
+
+, e.g.
+ by minimizing the number of chunks in a system.
+ However notice: there exist definitions of
+\begin_inset Quotes eld
+\end_inset
+
+chunk
+\begin_inset Quotes erd
+\end_inset
+
+ where it is the basic transfer unit.
+ An LV has the fundamental property that small-granularity
+\series bold
+update in place
+\series default
+ (at any offset inside the LV) can be executed.
+\end_layout
+
+\begin_layout Itemize
+Notice: we do not preclude further fine-grained distribution of LV data,
+ but this is something which should be
+\emph on
+avoided
+\emph default
+ if not absolutely necessary.
+ Preferred method in typical practical use cases: some storage servers may
+ have some spare RAID slots to be populated later, by resizing the PVs =
+ Physical Volumes before resizing LVs.
+\end_layout
+
+\begin_layout Itemize
+Notice that a typical local RAID system
+\emph on
+is also
+\emph default
+ a Distributed System, according to some reasonable definition.
+ Typical RAID implementations just involve SAS cables instead of Ethernet
+ cables or Infiniband cables.
+ Notice that this also applies to many
+\begin_inset Quotes eld
+\end_inset
+
+Commodity Hardware
+\begin_inset Quotes erd
+\end_inset
+
+ approaches, like Ceph storage nodes driving dozens of local HDDs connected
+ over SAS or SATA.
+ The main difference is just that instead of a hardware RAID controller,
+ a hardware HBA = Host Bus Adapter is used.
+ Instead of Ethernet switches, SAS multiplexers in backplanes are used.
+ Anyway, this forms a locally distributed sub-system.
+\end_layout
+
+\begin_layout Itemize
+Future variants of the Sharding Approach might extend this already present
+ locally Distributed System to a somewhat wider one.
+ For example, creation of a local LV (called
+\begin_inset Quotes eld
+\end_inset
+
+disk
+\begin_inset Quotes erd
+\end_inset
+
+ in MARS terminology) could be implemented by a subordinate DRBD instance
+ implementing a future RAID-10 mode over local Infiniband or crossover Ethernet
+ cables, avoiding local switches.
+ While DRBD would essentially create the
+\begin_inset Quotes eld
+\end_inset
+
+local
+\begin_inset Quotes erd
+\end_inset
+
+ LV, the higher-level MARS instance would then be responsible for its wide-dista
+nce replication.
+ See chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Use-Cases-for"
+
+\end_inset
+
+ about use cases of MARS vs DRBD.
+ Potential future use cases could be
+\emph on
+extremely huge
+\emph default
+ LVs where external SAS disk shelves are no longer sufficient to get the
+ desired capacity.
+\end_layout
+
+\begin_layout Itemize
+The USENIX paper needs to treat the following parameters as more or less
+ fixed (or only slowly changeable)
+\series bold
+constants
+\series default
+, given by the system designer: the replication degree
+\begin_inset Formula $R$
+\end_inset
+
+, and the scatter width
+\begin_inset Formula $S$
+\end_inset
+
+.
+ In contrast, the replication degree
+\begin_inset Formula $k$
+\end_inset
+
+ of our Sharding Approach is not necessarily firmly given by the system,
+ but can be
+\series bold
+dynamically changed
+\series default
+ at runtime on a per-LV basis.
+ For example, during background migration via MARS the command
+\family typewriter
+marsadm join-resource
+\family default
+ is used for creating additional per-LV replicas.
+ However notice: this freedom is limited by the total number of deployed
+ hardware nodes.
+ If you want
+\begin_inset Formula $k=3$
+\end_inset
+
+ replicas at the
+\emph on
+whole
+\emph default
+ pool, then you will need to (dynamically) deploy at least about
+\begin_inset Formula $k*x$
+\end_inset
+
+ nodes in general.
+\end_layout
+
+\begin_layout Itemize
+The USENIX paper defines its copysets on a per-chunk basis.
+ Similarly to before, we can transfer this definition to a Sharding Approach
+ by relating it to a per-LV basis.
+ As a side effect, a copyset can then trivially become identical to
+\begin_inset Formula $S$
+\end_inset
+
+ when the definition of
+\begin_inset Formula $S$
+\end_inset
+
+ is also changed to a per-LV basis, analogously.
+ In the Sharding Approach, a distinction is not absolutely necessary, while
+ the USENIX paper has to invest some effort into clarifying the relationship
+ between
+\begin_inset Formula $S$
+\end_inset
+
+ and copysets as defined on a BigCluster model.
+\end_layout
+
+\begin_layout Itemize
+Neglecting the mentioned differences, we see our typical use case (LocalSharding
+) roughly equivalent to
+\begin_inset Formula $S=R$
+\end_inset
+
+ in the terminology of the USENIX paper, or to
+\begin_inset Formula $S=k$
+\end_inset
+
+ (our number of replicas) in our terminology.
+\end_layout
+
+\begin_layout Itemize
+This means: we try to minimize the
+\emph on
+size
+\emph default
+ of
+\begin_inset Formula $S$
+\end_inset
+
+ for any given per-LV
+\begin_inset Formula $k$
+\end_inset
+
+, which will lead to the best possible reliability (under the conditions
+ described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Detailed-explanation"
+
+\end_inset
+
+) as has been shown in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Optimum-Reliability-from"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+Performance Arguments from Architecture
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Performance-Arguments-from"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Some people think that replication is easily done at filesystem layer.
+ There exist lots of cluster filesystems and other filesystem-layer solutions
+ which claim to be able to replicate your data, sometimes even over long
+ distances.
+\end_layout
+
+\begin_layout Standard
+Trying to replicate several petabytes of data, or some billions of inodes,
+ is however a much bigger challenge than many people can imagine.
+\end_layout
+
+\begin_layout Standard
+Choosing the wrong layer for
+\series bold
+mass data replication
+\series default
+ may get you into trouble.
+ Here is an architectural-level (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:What-is-Architecture"
+
+\end_inset
+
+) explanation why replication at the block layer is easier and less
+ error-prone:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/Layers.pdf
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The picture shows the main components of a standalone Unix / Linux system.
+ In the late 1970s / early 1980s, a so-called
+\emph on
+Buffer Cache
+\emph default
+ had been introduced into the architecture of Unix.
+ Today's Linux has refined the concept to various internal caches such as
+ the
+\series bold
+Page Cache
+\series default
+ (for data) and the
+\series bold
+Dentry Cache
+\series default
+ (for metadata).
+\end_layout
+
+\begin_layout Standard
+All these caches serve one main purpose
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Another important purpose is
+\series bold
+providing shared memory
+\series default
+.
+\end_layout
+
+\end_inset
+
+: they are reducing the load onto the storage by exploitation of fast RAM.
+ A well-tuned cache can yield high cache hit ratios, typically 99%.
+ In some cases (as observed in practice) even more than 99.9%.
+\end_layout
+
+\begin_layout Standard
+Now start distributing the system over long distances.
+ There are potential cut points A and B and C
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In theory, there is another cut point D by implementing a generically distribute
+d cache.
+ There exists some academic research on this, but practically usable enterprise-
+grade systems are rare and not wide-spread.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Cut point A is application specific, and can have advantages because it
+ has knowledge of the application.
+ For example, replication of mail queues can be controlled much more fine-graine
+d than at filesystem or block layer.
+\end_layout
+
+\begin_layout Standard
+Cut points B and C are
+\emph on
+generic
+\emph default
+, supporting a wide variety of applications, without altering them.
+ Cutting at B means replication at filesystem level.
+ C means replication at block level.
+\end_layout
+
+\begin_layout Standard
+When replicating at B, you will notice that the caches are
+\emph on
+below
+\emph default
+ your cut point.
+ Thus you will have to re-implement
+\series bold
+distributed caches
+\series default
+, and you will have to
+\series bold
+maintain cache coherence
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+When replicating at C, the Linux caches are
+\emph on
+above
+\emph default
+ your cut point.
+ Thus you will receive much less traffic, typically already reduced by a
+ factor of 100, or even more.
+ This is much easier to cope with.
+ You will also profit from
+\series bold
+journalling filesystems
+\series default
+ like
+\family typewriter
+ext4
+\family default
+ or
+\family typewriter
+xfs
+\family default
+.
+ In contrast,
+\emph on
+truly distributed
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In this context,
+\begin_inset Quotes eld
+\end_inset
+
+truly
+\begin_inset Quotes erd
+\end_inset
+
+ means that the POSIX semantics would be always guaranteed cluster-wide,
+ and even in case of partial failures.
+ In practice, some distributed filesystems like NFS don't even obey the
+ POSIX standard
+\emph on
+locally
+\emph default
+ on 1 standalone client.
+ We know of projects which have
+\emph on
+failed
+\emph default
+ right because of this.
+\end_layout
+
+\end_inset
+
+
+\emph default
+ journalling is typically not available with distributed cluster filesystems.
+\end_layout
+
+\begin_layout Standard
+A
+\emph on
+potential
+\emph default
+ drawback of block layer replication is that you are typically limited to
+ active-passive replication.
+ An active-active operation is not impossible at block layer (see combinations
+ of DRBD with
+\family typewriter
+ocfs2
+\family default
+), but less common, and less safe to operate.
+\end_layout
+
+\begin_layout Standard
+This limitation isn't necessarily caused by the choice of layer.
+ It is simply caused by the
+\series bold
+laws of physics
+\series default
+: communication is always limited by the speed of light.
+ A distributed filesystem is nothing else but a logically
+\series bold
+distributed shared memory
+\series default
+ (DSM).
+\end_layout
+
+\begin_layout Standard
+Some decades of research on DSM have shown that there exist applications
+ / workloads where the DSM model is
+\emph on
+inferior
+\emph default
+ to the direct communication paradigm.
+ Even in short-distance / cluster scenarios.
+ Long-distance DSM is extremely cumbersome.
+\end_layout
+
+\begin_layout Standard
+Therefore: you simply shouldn't try to solve long-distance communication
+ needs via communication over filesystems.
+ Even simple producer-consumer scenarios (one-way communication) are less
+ performant (e.g.
+ when compared to plain TCP/IP) when it comes to distributed POSIX semantics.
+ There is simply too much
+\series bold
+synchronisation overhead at metadata level
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+If you have a need for mixed operations at different locations in parallel:
+ just split your data set into disjoint filesystem instances (or database
+ / VM instances, etc).
+ All you need is careful thought about the
+\emph on
+appropriate
+\emph default
+
+\emph on
+granularity
+\emph default
+ of your data sets (such as well-chosen
+\emph on
+sets
+\emph default
+ of user homedirectory subtrees, or database sets logically belonging together,
+ etc).
+\end_layout
+
+\begin_layout Standard
+Replication at filesystem level is often at single-file granularity.
+ If you have several millions or even billions of inodes, you may easily
+ find yourself in a snakepit.
+\end_layout
+
+\begin_layout Standard
+Conclusion: active-passive operation over long distances (such as between
+ continents) is even an advantage.
+ It keeps you from trying bad / almost impossible things.
+\end_layout
+
+\begin_layout Section
+Scalability Arguments from Architecture
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Scalability-Arguments-from"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Some people are talking about scalability by (1) looking at a relatively
+ small example cluster
+\emph on
+implementation
+\emph default
+ of their respective (pre-)chosen
+\emph on
+architecture
+\emph default
+ having
+\begin_inset Formula $n$
+\end_inset
+
+ machines or
+\begin_inset Formula $n$
+\end_inset
+
+ network components or running
+\begin_inset Formula $n$
+\end_inset
+
+ application instances, and then (2) extrapolating its behaviour to bigger
+
+\begin_inset Formula $n$
+\end_inset
+
+.
+ They think if it runs with small
+\begin_inset Formula $n$
+\end_inset
+
+, it will also run for bigger
+\begin_inset Formula $n$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This way of thinking and acting is completely broken, and can endanger both
+ companies and careers.
+\end_layout
+
+\begin_layout Standard
+This is not only because of confusion of
+\begin_inset Quotes eld
+\end_inset
+
+architecture
+\begin_inset Quotes erd
+\end_inset
+
+ with
+\begin_inset Quotes eld
+\end_inset
+
+implementation
+\begin_inset Quotes erd
+\end_inset
+
+, cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:What-is-Architecture"
+
+\end_inset
+
+.
+ It is also fundamentally broken because it assumes some
+\begin_inset Quotes eld
+\end_inset
+
+linearity
+\begin_inset Quotes erd
+\end_inset
+
+ in a field which is non-linear
+\emph on
+by definition
+\emph default
+.
+ If scalability were linear, the term would not be useful at all, because
+ there would be
+\emph on
+no limit
+\emph default
+.
+ However, limits exist in practice, and the term
+\begin_inset Quotes eld
+\end_inset
+
+scalability
+\begin_inset Quotes erd
+\end_inset
+
+ is the
+\emph on
+means
+\emph default
+ for describing the behaviour at or around the limit.
+\end_layout
+
+\begin_layout Standard
+Another
+\emph on
+incorrect
+\emph default
+ way of ill-defining the term
+\begin_inset Quotes eld
+\end_inset
+
+scalability
+\begin_inset Quotes erd
+\end_inset
+
+ is looking at some relatively big
+\emph on
+example
+\emph default
+ cluster, which is working in practice.
+ Arguing with an example of a working system is wrong by construction.
+\end_layout
+
+\begin_layout Standard
+
+\emph on
+Every
+\emph default
+ storage system on this globe has
+\emph on
+always
+\emph default
+ some scalability limit, somewhere.
+ Scalability is
+\emph on
+always
+\emph default
+ a
+\series bold
+non-linear
+\series default
+ behaviour.
+ In order to find the practical limit, you must
+\emph on
+reach
+\emph default
+ it.
+\end_layout
+
+\begin_layout Standard
+Therefore, examples are principally insufficient for proving scalability,
+ as well as for comparing the scalability of architectures and/or of certain
+ implementations.
+ Examples can be only used for
+\emph on
+disproving
+\emph default
+ scalability.
+\end_layout
+
+\begin_layout Subsection
+Example Failures of Scalability
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Example-Failures-of"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The following description is a
+\series bold
+must read
+\series default
+ for sysadmins and system architects, and also for managers who are
+\series bold
+responsible
+\series default
+.
+ The numbers and some details are from my memory, thus they need not be 100%
+ accurate in all places.
+\end_layout
+
+\begin_layout Standard
+It is about an operation environment for a
+\emph on
+new
+\emph default
+ product, which was a proprietary web page editor running under a complicated
+ variant of a LAMP stack.
+\end_layout
+
+\begin_layout Standard
+The setup started with a
+\family typewriter
+BigCluster
+\family default
+
+\emph on
+architecture
+\emph default
+, but actually sized as a
+\family typewriter
+
+\begin_inset Quotes eld
+\end_inset
+
+SmallCluster
+\begin_inset Quotes erd
+\end_inset
+
+
+\family default
+ implementation.
+\end_layout
+
+\begin_layout Paragraph
+Setup 1 (NFS)
+\end_layout
+
+\begin_layout Standard
+The first setup consisted of
+\begin_inset Formula $n=6$
+\end_inset
+
+ storage servers, each replicated to another datacenter via DRBD.
+ Each were exporting their filesystems via NFS to about the same number
+ of client servers, where Apache/PHP was supposed to serve the HTTP requests
+ from the customers, which were entering the client cluster via a HTTP load
+ balancer.
+ The load balancer was supposed to spread the HTTP load to the client servers
+ in a
+\series bold
+round-robin
+\series default
+ fashion.
+\end_layout
+
+\begin_layout Standard
+
+\color lightgray
+At this point, eager readers may notice some similarity with the error propagati
+on problem treated in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Error-Propagation-to"
+
+\end_inset
+
+.
+ Notice that this is about
+\emph on
+scalability
+\emph default
+ instead, but you should compare with that, to find some similarities.
+\end_layout
+
+\begin_layout Standard
+After the complicated system was built up and was working well enough, the
+ new product was launched via a marketing campaign with free trial accounts,
+ limited to some time.
+\end_layout
+
+\begin_layout Standard
+So the number of customers was ramping up from 0 to about 20,000 within
+ a few weeks.
+ When about 20,000 customers were running on the client machines, system
+ hangs were noticed, also from a customer's perspective.
+ When too many customers were pressing the
+\begin_inset Quotes eld
+\end_inset
+
+save
+\begin_inset Quotes erd
+\end_inset
+
+ button in parallel on reasonably large web page projects, a big number
+ of small files, including a huge bunch of small image files, was generated
+ over a short period of time.
+ A few customers were pressing the
+\begin_inset Quotes eld
+\end_inset
+
+save
+\begin_inset Quotes erd
+\end_inset
+
+ button several times a minute, each time re-creating all of these files
+ again and again from the proprietary web page generator.
+ Result: the system appeared to hang.
+\end_layout
+
+\begin_layout Standard
+However, all of the servers, including the storage servers, were almost
+
+\emph on
+idle
+\emph default
+ with respect to CPU consumption.
+ RAM sizes were also no problem.
+\end_layout
+
+\begin_layout Standard
+After investigating the problem for a while, it was noticed that the
+\series bold
+\emph on
+network
+\series default
+\emph default
+ was the bottleneck, but not in terms of throughput.
+ The internal sockets were forming some
+\series bold
+queues
+\series default
+ which were
+\emph on
+delaying
+\emph default
+ the NFS requests in some
+\series bold
+ping-pong
+\series default
+ like fashion, almost resulting in a
+\begin_inset Quotes eld
+\end_inset
+
+deadlock
+\begin_inset Quotes erd
+\end_inset
+
+ from a customer's perspective (a better term would be
+\series bold
+distributed livelock
+\series default
+ or
+\series bold
+distributed thrashing
+\series default
+).
+\end_layout
+
+\begin_layout Paragraph
+Setup 2 (
+\family typewriter
+ocfs2
+\family default
+)
+\end_layout
+
+\begin_layout Standard
+Due to some external investigations and recommendations, the system was
+ converted from NFS to
+\family typewriter
+ocfs2
+\family default
+.
+ Now DRBD was operated in active-active mode.
+ Only one system software component was replaced with another one, without
+ altering the
+\family typewriter
+BigCluster
+\family default
+ architecture, and without changing the number of servers, which remained
+ a stripped-down
+\family typewriter
+SmallCluster
+\family default
+ implementation.
+\end_layout
+
+\begin_layout Standard
+Result: the problem with the
+\begin_inset Quotes eld
+\end_inset
+
+hangs
+\begin_inset Quotes erd
+\end_inset
+
+ disappeared.
+\end_layout
+
+\begin_layout Standard
+However, after the number of customers had exceeded the
+\series bold
+next scalability limit
+\series default
+ of about 30,000 customers, the
+\begin_inset Quotes eld
+\end_inset
+
+hang
+\begin_inset Quotes erd
+\end_inset
+
+ problem appeared once again, in a similar way.
+ The system showed systematic incidents again.
+\end_layout
+
+\begin_layout Paragraph
+Setup 3 (
+\family typewriter
+glusterfs
+\family default
+ as a substitute for NFS)
+\end_layout
+
+\begin_layout Standard
+After investigating the network queueing behaviour and the lock contention
+ problems of
+\family typewriter
+ocfs2
+\family default
+, the next solution was
+\family typewriter
+glusterfs
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+However, when the number of customers exceeded the
+\series bold
+\emph on
+next
+\emph default
+ scalability limit
+\series default
+, which was about 50,000 customers, some of them hammering the cluster with
+ their
+\begin_inset Quotes eld
+\end_inset
+
+save
+\begin_inset Quotes erd
+\end_inset
+
+ button, the
+\begin_inset Quotes eld
+\end_inset
+
+hangs
+\begin_inset Quotes erd
+\end_inset
+
+ appeared again.
+\end_layout
+
+\begin_layout Paragraph
+Setup 4 (
+\family typewriter
+glusterfs
+\family default
+ replication as a substitute for DRBD)
+\end_layout
+
+\begin_layout Standard
+After analyzing the problem once again, it was discovered by accident that
+
+\family typewriter
+drbdadm disconnect
+\family default
+
+\emph on
+appeared
+\emph default
+ to
+\begin_inset Quotes eld
+\end_inset
+
+solve
+\begin_inset Quotes erd
+\end_inset
+
+ the problem.
+\end_layout
+
+\begin_layout Standard
+Therefore DRBD was replaced with
+\family typewriter
+glusterfs
+\family default
+ replication.
+ There exists a
+\family typewriter
+glusterfs
+\family default
+ feature allowing replication of files at filesystem level.
+\end_layout
+
+\begin_layout Standard
+This attempt was
+\emph on
+immediately
+\emph default
+ resulting in an almost fatal disaster, and thus was stopped immediately:
+ the cluster completely broke down.
+ Almost nothing was working anymore.
+\end_layout
+
+\begin_layout Standard
+The problem was even worse: switching off the
+\family typewriter
+glusterfs
+\family default
+ replication and rollback to DRBD did not work.
+ The system remained unusable.
+\end_layout
+
+\begin_layout Standard
+As a temporary workaround,
+\family typewriter
+drbdadm disconnect
+\family default
+ was improving the situation enough for some limping operation.
+\end_layout
+
+\begin_layout Standard
+Retrospective explanation: some of the reasons can be found in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Behaviour-of-DRBD"
+
+\end_inset
+
+.
+
+\family typewriter
+glusterfs
+\family default
+ replication does not scale at all because it stores its replication information
+ at
+\series bold
+per-inode granularity
+\series default
+ in EAs (extended attributes), which must
+\emph on
+necessarily
+\emph default
+ be worse than DRBD, because there were some hundreds of millions of them
+ in total as reported by
+\family typewriter
+df -i
+\family default
+ (see the cut point discussion in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Performance-Arguments-from"
+
+\end_inset
+
+).
+ Overnight in some cron jobs, these EAs had to be deleted in reasonably
+ sized batches in order to become more or less
+\begin_inset Quotes eld
+\end_inset
+
+operable
+\begin_inset Quotes erd
+\end_inset
+
+ again.
+\end_layout
+
+\begin_layout Paragraph
+Setup 5 (Sharding on top of DRBD)
+\end_layout
+
+\begin_layout Standard
+After the almost fatal incident had been resolved to a less critical one,
+ the responsibility for setup was taken over by another person.
+ After the
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ behaviour from section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Distributed-vs-Local:"
+
+\end_inset
+
+ had been understood, and after it was clear that sharding is only
+\begin_inset Formula $O(k)$
+\end_inset
+
+ from a customer's perspective, it was the final solution.
+ Now the problem was resolved at
+\series bold
+\emph on
+architectural level
+\series default
+\emph default
+, no longer by just replacing some components with some others.
+\end_layout
+
+\begin_layout Standard
+The system was converted to a variant of a
+\family typewriter
+RemoteSharding
+\family default
+ model (see section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Variants-of-Sharding"
+
+\end_inset
+
+), and some
+\family typewriter
+migrate
+\family default
+ scripts were introduced for load balancing of customer homedirectories
+ and databases between shards.
+\end_layout
+
+\begin_layout Standard
+As a side effect, the load balancer got a new role: instead of spreading
+
+\emph on
+all
+\emph default
+ of the HTTP requests to
+\emph on
+all
+\emph default
+ of the client servers in a round-robin fashion, it now acted as a redirection
+ mechanism at
+\emph on
+shard granularity
+\emph default
+, e.g.
+ when one of the client servers was handed over to another one for maintenance.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Retrospective explanation: DRBD was definitely
+\emph on
+not
+\emph default
+ the real reason for the critical incident.
+ The replication traffic per shard is so low on average that until today,
+ no replacement by MARS was absolutely necessary
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Many sysadmins are running a conservative strategy: never touch a running
+ system...
+\end_layout
+
+\end_inset
+
+, although the distance is over 50 km.
+ If you wonder why such low write traffic demands can cause such a big incident:
+ look at the
+\series bold
+cache reduction
+\series default
+ graphics in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Performance-Arguments-from"
+
+\end_inset
+
+.
+ Today, the
+\begin_inset Quotes eld
+\end_inset
+
+save
+\begin_inset Quotes erd
+\end_inset
+
+ buttons of the customers are just triggering some
+\emph on
+extra
+\emph default
+
+\series bold
+writebacks
+\series default
+ from the Page Cache of the kernel into the block layer, after some
+\emph on
+delay
+\emph default
+.
+ These writebacks are not performance critical in reality, because the Page
+ Cache is running them
+\series bold
+\emph on
+asynchronously in background
+\series default
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ In contrast, distributed filesystems like
+\family typewriter
+NFS
+\family default
+ or
+\family typewriter
+ocfs2
+\family default
+ or
+\family typewriter
+glusterfs
+\family default
+ are not working asynchronously in many places, but will often schedule
+ their requests
+\emph on
+synchronously
+\emph default
+ into ordinary network queues, which form a
+\series bold
+sequential bottleneck
+\series default
+, competing with other high-frequent filesystem operations.
+ In addition, the
+\begin_inset Quotes eld
+\end_inset
+
+save
+\begin_inset Quotes erd
+\end_inset
+
+ button triggers masses of metadata / inode updates in a short time, often
+ residing in the same directory.
+ Such a directory may thus form a
+\begin_inset Quotes eld
+\end_inset
+
+global
+\begin_inset Quotes erd
+\end_inset
+
+ bottleneck.
+ When suchalike competing
+\series bold
+metadata updates
+\series default
+ are distributed via a round-robin load balancer, the problem can easily
+ become critical by the
+\series bold
+cache coherence problem
+\series default
+.
+ While local filesystems can smoothen such application behaviour via the
+ Dentry Cache plus Inode Cache, which also show some asynchronous writeback
+ behaviour, network filesystems are often unable to deal with this performantly.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Although DRBD has a similar sequential bottleneck at the low-frequency
+ block layer by its write-through strategy into its replica, this does not
+ really matter: all other writebacks from the Page Cache are
+\emph on
+also
+\emph default
+ started asynchronously, and triggered low-frequently, and are occurring
+ after some
+\emph on
+delay
+\emph default
+ (which in turn will smoothen the
+\series bold
+spikes
+\series default
+ caused by
+\series bold
+mass dirtification
+\series default
+ of many small files and inodes in a short time as caused by the
+\begin_inset Quotes eld
+\end_inset
+
+save
+\begin_inset Quotes erd
+\end_inset
+
+ button), and thus are not really performance critical for this particular
+ use case.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ This is a striking example why careful
+\series bold
+selection of granularity level
+\series default
+ (filesystem vs block layer) is essential.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ This is also a striking example why asynchronous operations can form a
+ huge advantage in certain use cases.
+\end_layout
+
+\begin_layout Standard
+The sharding setup is working until today, scaling up to the current number
+ of customers, which is more than an order of magnitude higher, in the range
+ of about a million customers.
+ Of course, the number of shards had to be increased, but this is just what
+ sharding is about.
+\end_layout
+
+\begin_layout Subsection
+Properties of Storage Scalability
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Properties-Scalability"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Influence Factors at Scalability
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Influence-Factors-Scalability"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In general, scalability of storage systems depends on the following factors
+ (list may be incomplete):
+\end_layout
+
+\begin_layout Enumerate
+The
+\series bold
+application class
+\series default
+, in particular its principal
+\series bold
+workingset behaviour
+\series default
+ (in both dimensions: timely and locality).
+ More explanations about workingsets can be found at
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+http://blkreplay.org
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Enumerate
+The
+\series bold
+size
+\series default
+
+\begin_inset Formula $x$
+\end_inset
+
+ of the application data and/or the
+\series bold
+number of application instances
+\series default
+ (possibly also denoted by
+\begin_inset Formula $x$
+\end_inset
+
+), and the amount of storage needed for it (could be also termed
+\begin_inset Formula $x$
+\end_inset
+
+).
+ Besides the data itself, the corresponding
+\series bold
+metadata
+\series default
+ (inodes, indexes, etc) can form an important factor, or can even
+\emph on
+dominate
+\emph default
+ the whole story.
+ Typically, critical datacenter application data is tremendously differently
+ sized from workstation data.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Caution! Many people think erroneously that scalability would be 
+\emph on
+linearly
+\emph default
+ depending on
+\begin_inset Formula $x$
+\end_inset
+
+.
+ However, as is known at least since the 1960s (read some ancient papers
+ from Saltzer and/or from Denning), scalability is
+\series bold
+never linear
+\series default
+, but sometimes even
+\series bold
+\emph on
+disruptive
+\series default
+\emph default
+, in particular when RAM size is the bottleneck.
+ IO queues and/or networking queues are often also reacting to overload
+ in a disruptive fashion.
+ This means: after exceeding the
+\series bold
+scalability limit
+\series default
+ of a particular system for its particular class of applications, the system
+ will always
+\series bold
+break down
+\series default
+ from a customer's perspective, sometimes almost completely, and sometimes
+ even
+\series bold
+\emph on
+fatally
+\series default
+\emph default
+.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ On the other hand, some other systems are reacting with
+\series bold
+graceful degradation
+\series default
+.
+ Whether a particular system reacts to a particular type of (over)load,
+ either with graceful degradation, or with fatal disruption, or with some
+ intermediate behaviour, is some sort of
+\begin_inset Quotes eld
+\end_inset
+
+quality property
+\begin_inset Quotes erd
+\end_inset
+
+ of the system and/or of the application.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ EVERY SYSTEM, even sharded systems, and even the internet as a whole, has
+
+\emph on
+always
+\emph default
+ some scalability limit
+\emph on
+somewhere
+\emph default
+.
+ There exists
+\series bold
+no
+\begin_inset Quotes eld
+\end_inset
+
+infinitely scaling
+\begin_inset Quotes erd
+\end_inset
+
+ system
+\series default
+ on earth!
+\end_layout
+
+\begin_layout Enumerate
+The
+\series bold
+\emph on
+distribution
+\series default
+\emph default
+ of the application behaviour in both
+\series bold
+timely
+\series default
+ and
+\series bold
+locality
+\series default
+ dimensions.
+ Depending on the application class, this is often an
+\emph on
+exponential
+\emph default
+ distribution according to Zipf's law.
+ By falsely
+\emph on
+assuming
+\emph default
+ an equal distribution (or a Gaussian distribution) instead of actually
+ measuring the distribution in both dimensions, you can easily induce zillions
+ of costly problems for big
+\begin_inset Formula $x$
+\end_inset
+
+, or even fatal failure of the whole system / project.
+\end_layout
+
+\begin_layout Enumerate
+The
+\series bold
+transformation
+\series default
+ of the application workingset behaviour at architectural level, sometimes
+ caused by certain components resp their specific implementation or parameteriza
+tion.
+ Examples are intermediate virtualization layers, e.g.
+ vmware
+\family typewriter
+*.vmdk
+\family default
+ or KVM
+\family typewriter
+*.qcow2
+\family default
+ container formats which can completely change the game, not only in extreme
+ cases.
+ Another example is
+\series bold
+random distribution
+\series default
+ to object stores, which can turn some uncomplicated sequential workloads
+ into highly problematic
+\emph on
+random IO
+\emph default
+ workloads.
+ Don't overlook such potential pitfalls!
+\end_layout
+
+\begin_layout Enumerate
+The storage
+\series bold
+architecture
+\series default
+ to be chosen, such as
+\family typewriter
+CentralStorage
+\family default
+ vs
+\family typewriter
+BigCluster
+\family default
+ vs
+\family typewriter
+*Sharding
+\family default
+.
+ Choice of the wrong architecture can be fatal for big
+\begin_inset Formula $n$
+\end_inset
+
+ and/or for certain timely / spatial application behaviour.
+ Changing an architecture during operations on some petabytes of data and/or
+ some billions of inodes can be almost impossible, and/or can consume a
+ lot of time and money.
+\end_layout
+
+\begin_layout Enumerate
+The
+\series bold
+number
+\series default
+ of storage
+\series bold
+nodes
+\series default
+
+\begin_inset Formula $n$
+\end_inset
+
+.
+ In some architectures, addition of more nodes can make the system
+\emph on
+worse
+\emph default
+ instead of better, c.f.
+ section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Enumerate
+In case of architectures relying on a storage network: choice of
+\series bold
+layer
+\series default
+ for cut point, e.g.
+ filesystem layer vs block layer, see section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Performance-Arguments-from"
+
+\end_inset
+
+, and/or introduction of an additional intermediate object storage layer
+ (which can result in major degradation from an architectural view).
+ Due to fundamental differences in distributed vs local
+\series bold
+cache coherence
+\series default
+, suchalike can have a
+\emph on
+tremendous
+\emph default
+ effect on scalability.
+\end_layout
+
+\begin_layout Enumerate
+The
+\series bold
+implementation
+\series default
+ of the architecture.
+ Be sure to understand the difference between an
+\emph on
+architecture
+\emph default
+ and an
+\emph on
+implementation
+\emph default
+ of that architecture.
+\end_layout
+
+\begin_layout Enumerate
+The size and types / properties of various
+\series bold
+caches
+\series default
+ at various layers.
+ You need to know the general properties of
+\series bold
+inclusive
+\series default
+ vs
+\series bold
+exclusive
+\series default
+ cache architecture.
+ You absolutely need to know what
+\series bold
+thrashing
+\series default
+ is, and under which conditions it can occur.
+\begin_inset Newline newline
+\end_inset
+
+It is advantageous for system architects to know
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Reading a few Wikipedia articles does not count as
+\begin_inset Quotes eld
+\end_inset
+
+knowledge
+\begin_inset Quotes erd
+\end_inset
+
+.
+ You need to be able to
+\emph on
+apply
+\emph default
+ your knowledge to enterprise level systems (as opposed to workstation-sized
+ systems),
+\emph on
+sustainable
+\emph default
+ and
+\emph on
+reproducible
+\emph default
+.
+ Therefore you need to have
+\emph on
+actually worked
+\emph default
+ in the matter and gained some extraordinary experiences, on top of deep
+ understanding of the matter.
+\end_layout
+
+\end_inset
+
+ pre-loading strategies, as well as replacement strategies.
+ It is advantageous to know what
+\family typewriter
+LRU
+\family default
+ or
+\family typewriter
+MFU
+\family default
+ means, what their induced
+\emph on
+overhead
+\emph default
+ is, and how they
+\emph on
+really
+\emph default
+ work on
+\emph on
+actual
+\emph default
+ data, not just on some artificial lab data.
+ You also should know what an
+\series bold
+anomaly
+\series default
+ is, and how it can be produced not only by
+\family typewriter
+FIFO
+\family default
+ strategies, but also by certain types of ill-designed multi-layer caching.
+ Beware: there are places where
+\family typewriter
+FIFO
+\family default
+-like behaviour is almost impossible to avoid, such as networks.
+ All of this is outside the scope of this MARS manual.
+ You should
+\emph on
+measure
+\emph default
+, when possible, the
+\series bold
+overhead
+\series default
+ of cache implementations.
+ I know of
+\emph on
+examples
+\emph default
+ where caching is
+\emph on
+ counter-productive
+\emph default
+.
+ For example, certain types and implementations of SSD caches are over-hyped.
+ Removing a certain cache will then
+\emph on
+improve
+\emph default
+ the situation.
+ Notice: caches are conceptually based on some type of
+\series bold
+associative memory
+\series default
+, which is either very costly when directly implemented in hardware, or
+ can suffer from tremendous performance penalties when implemented inappropriate
+ly in software.
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Hardware dimensioning
+\series default
+ of the implementation: choice of storage hardware, for each storage node.
+ This includes SSDs vs HDDs, their attachment (e.g.
+ SAS multiplexing bottlenecks), RAID level, and controller limitations,
+ etc.
+\end_layout
+
+\begin_layout Enumerate
+Only for architectures relying on a storage network: network
+\series bold
+throughput
+\series default
+ and network
+\series bold
+latencies
+\series default
+, and network
+\series bold
+bottlenecks
+\series default
+, including the
+\series bold
+queueing
+\series default
+ behaviour / congestion control /
+\series bold
+packet loss
+\series default
+ behaviour upon overload.
+ The latter is often neglected, leading to unexpected behaviour at load
+ peaks, and/or leading to costly over-engineering (examples see section
+
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Example-Failures-of"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+\emph on
+Hidden
+\emph default
+ bottlenecks
+\series default
+ of various types.
+ A complete enumeration is almost impossible, because there are too many
+
+\begin_inset Quotes eld
+\end_inset
+
+opportunities
+\begin_inset Quotes erd
+\end_inset
+
+.
+ To reduce the latter, my general advice is to try to build bigger systems
+ as
+\emph on
+simple
+\emph default
+ as possible.
+ This is why you should involve some
+\emph on
+real
+\emph default
+ experts in storage systems, at least on critical enterprise data.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\emph on
+Any
+\emph default
+ of these factors can be dangerous when not carefully thought about and
+ treated, depending on your use case.
+\end_layout
+
+\begin_layout Subsubsection
+Example Scalability Scenario
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Example-Scalability-Scenario"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+To get an impression what
+\begin_inset Quotes eld
+\end_inset
+
+enterprise critical data
+\begin_inset Quotes erd
+\end_inset
+
+ can mean in a concrete example, here are some characteristic numbers on
+ 1&1 ShaHoLin (Shared Hosting Linux) around spring 2018, which would be
+ the
+\emph on
+input parameters
+\emph default
+ for
+\emph on
+any
+\emph default
+ potential solution architecture
+\family typewriter
+CentralStorage
+\family default
+ vs
+\family typewriter
+BigCluster
+\family default
+ vs
+\family typewriter
+Sharding
+\family default
+:
+\end_layout
+
+\begin_layout Itemize
+About 9 million customer home directories.
+\end_layout
+
+\begin_layout Itemize
+About 10 billion inodes, with daily incremental backup.
+\end_layout
+
+\begin_layout Itemize
+More than 4 petabytes of
+\emph on
+net
+\emph default
+ data (total
+\family typewriter
+df
+\family default
+ filling level) in spring 2018, with a growth rate of 21% per year.
+\end_layout
+
+\begin_layout Itemize
+All of this permanently replicated into a second datacenter.
+\end_layout
+
+\begin_layout Itemize
+Webhosting very close to 24/7/365.
+ For maintenance, any resource must be switchable to the other datacenter
+ at any time, independently from other resources; while in catastrophic failure
+ scenarios
+\emph on
+all
+\emph default
+ resources must be switchable within a short time.
+\end_layout
+
+\begin_layout Standard
+For simplicity of our sandbox game, we assume that all of this is in one
+ campus.
+ In reality, about 30% is residing in another continent.
+ Introducing this as an additional input parameter would not fundamentally
+ change the game.
+ Many other factors, like dependencies from existing infrastructure, are
+ also neglected.
+\end_layout
+
+\begin_layout Paragraph
+Theoretical Solution:
+\family typewriter
+CentralStorage
+\end_layout
+
+\begin_layout Standard
+Let us assume somebody would try to operate this on classical
+\family typewriter
+CentralStorage
+\family default
+, and let us assume that migration of this amount of data including billions
+ of inodes would be no technical problem.
+ What would be the outcome?
+\end_layout
+
+\begin_layout Standard
+With current technology, finding a single
+\family typewriter
+CentralStorage
+\family default
+ appliance would be anything but easy.
+ Dimensioning would be needed for the
+\emph on
+lifetime
+\emph default
+ of such a solution, which is at least 5 years.
+ In five years, the data would grow by a factor of about
+\begin_inset Formula $1.21^{5}=2.6$
+\end_inset
+
+, which is then about
+\begin_inset Formula $10.5$
+\end_inset
+
+ petabytes.
+ This is only the
+\emph on
+net
+\emph default
+ capacity; at hardware layer much more is needed for spare space and for
+ local redundancy.
+ The single
+\family typewriter
+CentralStorage
+\family default
+ instance will need to scale up to at least this number, in one datacenter
+ (under the simplified game assumptions).
+\end_layout
+
+\begin_layout Standard
+The current number of client LXC containers is about
+\begin_inset Formula $2600$
+\end_inset
+
+, independent from location.
+ You will have to support growth in number of them.
+ For maintenance, any of these need to be switchable to a different location
+ at any time.
+ The number of bare metal servers running them can vary with hardware architectu
+re / hardware lifecycle, and with growth.
+ You will need to dimension a dedicated storage network for all of this.
+\end_layout
+
+\begin_layout Standard
+If you find a solution which can do this with current
+\family typewriter
+CentralStorage
+\family default
+ technology for the next 5 years, then you will have to ensure that restore
+ from backup
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Local snapshots, whether LVM or via some COW filesystem, do not count as
+ backups! You need a
+\emph on
+logical
+\emph default
+ copy, not a
+\emph on
+physical
+\emph default
+ one, in case your production filesystem instance gets damaged.
+\end_layout
+
+\end_inset
+
+ can be done in less than 1 day in case of a fatal disaster, see also treatment
+ of
+\family typewriter
+CentralStorage
+\family default
+ reliability in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Reliability-Differences-CentralStorage"
+
+\end_inset
+
+.
+ Notice that the current self-built backup solution for a total of 15 billions
+ of inodes is based on a sharding model; converting this to some more or
+ less centralized solution would turn out as another challenge.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Attention! Buying 10 or 50 or 100 CentralStorage instances does not count
+ as a
+\family typewriter
+CentralStorage
+\family default
+ architecture.
+ By definition, suchalike would be
+\family typewriter
+RemoteSharding
+\family default
+ instead.
+ Notice that the current 1&1 solution is already a mixture of
+\family typewriter
+LocalSharding
+\family default
+ and
+\family typewriter
+RemoteSharding
+\family default
+, so you would win
+\emph on
+nothing
+\emph default
+ at architectural level.
+
+\end_layout
+
+\begin_layout Standard
+In your business case, you would need to justify the price difference between
+ the current component-based hardware solution (horizontally extensible
+ by
+\emph on
+scale-out
+\emph default
+) and
+\family typewriter
+CentralStorage
+\family default
+, which is about a factor of 10 per terabyte according to the table in section
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Cost-Arguments-from"
+
+\end_inset
+
+.
+ Even if you manage to find a vendor who is willing to subsidize to a factor
+ of only 3, this is not all you need.
+ You need to add the costs for the dedicated storage network.
+ On top of this, you need to account for the
+\emph on
+migration costs
+\emph default
+ after the lifetime of 5 years has passed, where the full data set needs
+ to be migrated to a successor storage system.
+\end_layout
+
+\begin_layout Standard
+Notice that classical argumentations with
+\series bold
+\emph on
+manpower
+\series default
+\emph default
+ will not work.
+ The current operating team is about 10 persons, with no dedicated storage
+ admin.
+ This relatively small team is not only operating a total of more than 6,000
+ shared boxes in all datacenters, but also some tens of thousands of managed
+ dedicated servers, running essentially the same software stack, with practicall
+y fully automated mass deployment.
+ Most of their tasks are related to central software installation, which
+ is then automatically distributed, and to operation / monitoring / troubleshoot
+ing of masses of client servers.
+ Storage administration tasks in isolation are costing only a
+\emph on
+fraction
+\emph default
+ of this.
+ Typical claims that
+\family typewriter
+CentralStorage
+\family default
+ would require less manpower will not work here.
+ Almost everything which is needed for
+\emph on
+mass automation
+\emph default
+ is already automated.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Neglecting the tens of thousands of managed dedicated servers would be a
+ catastrophic ill-design.
+ Their hardware is already given, by existing customer contracts, some of
+ them decades old.
+ You simply cannot fundamentally change the hardware of these customers
+ including their
+\emph on
+dedicated
+\emph default
+ local disks, which is their
+\emph on
+main selling point
+\emph default
+.
+ You cannot simply convert them to a shared
+\family typewriter
+CentralStorage
+\family default
+, even if it would be technically possible, and if it would deliver similar
+ IOPS rates as tens of thousands of local spindles (and if you could reach
+ the bundled performance of local SSDs from newer contracts), and even if
+ you would introduce some interesting
+\series bold
+storage classes
+\series default
+ for all of this.
+ A dedicated server on top of a shared storage is no longer a dedicated
+ one.
+ You would have to migrate these customers to another product, with all
+ of its consequences.
+ Alone for these machines,
+\emph on
+most
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Only a few out of >1000 self-built or customized Debian packages are dealing
+ with MARS and/or with the clustermanager
+\family typewriter
+cm3
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\emph default
+ of the current automation of
+\family typewriter
+LocalStorage
+\family default
+ is needed
+\emph on
+anyway
+\emph default
+, although they are not geo-redundant at current stage.
+\end_layout
+
+\begin_layout Standard
+Conclusion:
+\family typewriter
+CentralStorage
+\family default
+ is simply
+\emph on
+unrealistic
+\emph default
+.
+\end_layout
+
+\begin_layout Paragraph
+Theoretical Solution:
+\family typewriter
+BigCluster
+\end_layout
+
+\begin_layout Standard
+The main problem of
+\family typewriter
+BigCluster
+\family default
+ is
+\series bold
+reliability
+\series default
+, as explained intuitively in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+ and mathematically in appendix
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "chap:Mathematical-Model-of"
+
+\end_inset
+
+, and as observed in numerous installations not working as expected.
+\end_layout
+
+\begin_layout Standard
+Let us assume that all of these massive technical problems were solved,
+ somehow.
+ Then the business case would have to deal with the following:
+\end_layout
+
+\begin_layout Standard
+The total number of servers would need to be roughly
+\emph on
+doubled
+\emph default
+.
+ Not only their CAPEX, but also the corresponding OPEX (electrical power,
+ rackspace, manpower) would increase.
+ Alone their current electrical power cost, including cooling, is more than
+ the current sysadmin manpower cost.
+ Datacenter operations would also increase.
+ On top, a dedicated storage network and its administration would also be
+ needed.
+\end_layout
+
+\begin_layout Standard
+With respect to the tens of thousands of managed dedicated servers and their
+ customer contracts, a similar argument as above holds.
+ You simply cannot convert them to
+\family typewriter
+BigCluster
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+Conclusion:
+\family typewriter
+BigCluster
+\family default
+ is also
+\emph on
+unrealistic
+\emph default
+.
+ There is nothing to win, but a lot to lose.
+\end_layout
+
+\begin_layout Paragraph
+Current Solution:
+\family typewriter
+LocalSharding
+\family default
+, sometimes
+\family typewriter
+RemoteSharding
+\end_layout
+
+\begin_layout Standard
+Short story: it has been working for decades, and is both cheap and robust
+ since geo-redundancy was added around 2010.
+\end_layout
+
+\begin_layout Standard
+With the advent of Football (see chapter
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "chap:LV-Football"
+
+\end_inset
+
+), the
+\family typewriter
+LocalSharding
+\family default
+ architecture is rising to be on par with the most important management abilities
+ of
+\family typewriter
+CentralStorage
+\family default
+ and
+\family typewriter
+BigCluster
+\family default
+ / Software Defined Storage.
+\end_layout
+
+\begin_layout Standard
+The story with the tens of thousands of managed dedicated servers is arguing
+ vice versa: without the traditional ShaHoLin sharding architecture and
+ all of its automation, including the newest addition called Football, the
+ product
+\begin_inset Quotes eld
+\end_inset
+
+managed dedicated servers
+\begin_inset Quotes erd
+\end_inset
+
+ would not be possible in this scale.
+
+\end_layout
+
+\begin_layout Standard
+Summary: the sharded
+\begin_inset Quotes eld
+\end_inset
+
+shared
+\begin_inset Quotes erd
+\end_inset
+
+ product enables another
+\begin_inset Quotes eld
+\end_inset
+
+dedicated
+\begin_inset Quotes erd
+\end_inset
+
+ product, which is sharded by definition, and it actually is known to scale
+ up by at least another order of magnitude (in terms of number of servers).
+\end_layout
+
+\begin_layout Subsection
+Scalability of Filesystem Layer vs Block Layer
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Filesystem-Layer-vs"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The following factors are responsible for better architectural (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:What-is-Architecture"
+
+\end_inset
+
+) scalability of the block layer vs the filesystem layer, at least in many
+ cases, with a few exceptions (list may be incomplete):
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Granularity
+\series default
+ of access:
+\series bold
+metadata
+\series default
+ is often smaller than the content data it refers to, but access to data
+ is typically not possible without accessing corresponding metadata
+\emph on
+first
+\emph default
+.
+ When masses of metadata are present (e.g.
+ some billions of inodes as in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Example-Scalability-Scenario"
+
+\end_inset
+
+), and when it is accessed
+\series bold
+more frequently
+\series default
+ than the corresponding data (e.g.
+ in stateless designs like Apache), it is likely to become the bottleneck.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Neglecting metadata and its access patterns is a major source of ill-designs.
+ I know of projects which have failed (in their original setup) because
+ of this.
+ Repair will typically involve some non-trivial architectural changes.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+By default, the block layer itself has almost no metadata at all (or only
+ tiny ones, such as describing a whole block device).
+ Therefore it has an
+\emph on
+inherent advantage
+\emph default
+ over the filesystem layer in such use cases.
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Caching
+\series default
+: shared memory caches in kernelspace (page cache + dentry cache) vs distributed
+ caches over network.
+ See the picture in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Performance-Arguments-from"
+
+\end_inset
+
+.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ There exist
+\emph on
+examples
+\emph default
+ where shared distributed caches do not work at all.
+ I know of
+\emph on
+several
+\emph default
+ projects which have failed.
+ A project other than the one mentioned in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Example-Failures-of"
+
+\end_inset
+
+ has failed because of violations of POSIX filesystem semantics.
+\end_layout
+
+\begin_layout Enumerate
+Only in distributed systems: the
+\series bold
+cache coherence problem
+\series default
+, both on metadata and on data.
+ Depending on load patterns, this can lead to tremendous performance degradation
+, see example in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Example-Failures-of"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Enumerate
+Dimensioning of the
+\series bold
+network
+\series default
+: throughput, latencies, queueing behaviour.
+\end_layout
+
+\begin_layout Standard
+There exist a few known exceptions (list may be incomplete, please report
+ further examples if you know some):
+\end_layout
+
+\begin_layout Itemize
+Databases: these are typically operating on specific container formats,
+ where no frequent
+\emph on
+external
+\emph default
+ metadata access is necessary, and where no sharing of the
+\emph on
+container as such
+\emph default
+ is necessary.
+ Typically, there is no big difference between storing them in block devices
+ vs local filesystems.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Exception from the exception: MyISAM is an old design from the 1980s, originall
+y based on DBASE data structures.
+ Don't try to access them over NFS or similar.
+ Or, better, try to avoid them at all if possible.
+\end_layout
+
+\begin_layout Itemize
+VM images: these are logical BLOBS, so there is typically no big difference
+ whether you have an intermediate
+\emph on
+true
+\emph default
+ filesystem layer, or not.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Filesystems on top of object stores are no true intermediate filesystems.
+ They are violating Dijkstra's important layering rules, as stated in his
+ famous articles on THE.
+ A similar argument holds for block devices on top of object stores.
+ Intermediate container formats like
+\family typewriter
+*.vmdk
+\family default
+ or
+\family typewriter
+*.qcow2
+\family default
+ can also act as game changers.
+ This does not mean that you have to avoid them at all.
+ However, be sure to
+\series bold
+check their influence
+\series default
+, and don't forget their
+\emph on
+workingset
+\emph default
+ and their
+\emph on
+caching behaviour
+\emph default
+ (which can go both into positive and into negative direction), in order
+ to really
+\emph on
+know what you are doing!
+\end_layout
+
+\begin_layout Standard
+There exist a few cases where a distributed filesystem, sometimes even actually
+ with
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ behaviour,
+\emph on
+must
+\emph default
+ be used, because there exists a
+\emph on
+requirement
+\emph default
+ for it.
+ Some examples (list is certainly incomplete):
+\end_layout
+
+\begin_layout Itemize
+HPC =
+\series bold
+High Performance Computing
+\series default
+ on modern supercomputers, consisting of a high number of
+\begin_inset Formula $n$
+\end_inset
+
+ compute nodes, are often requiring access to a shared persistent data pool,
+ where each of the
+\begin_inset Formula $n$
+\end_inset
+
+ nodes must be sometimes able to access the same persistent data, sometimes
+ both for reading and writing.
+ Therefore, several supercomputers are using cluster filesystems like Lustre.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Care must be taken that high-frequency / fine granularity communication
+ over the distributed filesystem and its dedicated storage network does
+ not take place, but instead occurs over the ordinary low-latency communication
+ fabrics each modern supercomputer is relying on.
+ True
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ storage access behaviour should be avoided as far as possible (given by
+ the problem to be solved).
+ When absolutely necessary, location transparency (as possible with cluster
+ filesystems like Lustre) as well as its DSM = Distributed Shared Memory
+ model must be given up, and an
+\series bold
+explicit communication model
+\series default
+ must be used instead, which allows explicit control over replicas and their
+ communication paths (e.g.
+ propagation in a binary tree fashion), although it results in much more
+ work for the programmers.
+ Only low frequency / coarse granularity transfers of
+\emph on
+bulk data
+\emph default
+ with
+\emph on
+high locality
+\emph default
+ should run over distributed filesystems, preferably in streaming mode.
+ The total frequency of metadata access should be low, because metadata
+ consistency may form a bottleneck when updated too frequently.
+ The programmers of the distributed application software need to take care
+ of this.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that certain supercomputer workloads may be crying for a RemoteSharding
+ or FlexibleSharding storage architecture in place of a BigCluster architecture.
+ However, this is very application specific.
+\end_layout
+
+\begin_layout Itemize
+Student pools at universities, or location-independent workplaces at companies.
+ This is just the usecase which NFS was originally constructed for.
+ Typically,
+\series bold
+workstation workloads
+\series default
+ are neither performance critical, nor prone to actual
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ behaviour (although the network infrastructure would
+\emph on
+allow
+\emph default
+ for it), because each user has her own home directory which is typically
+
+\emph on
+not shared
+\emph default
+ with others, and she cannot split herself and sit in front of multiple
+ workstations at the same time.
+ Thus the
+\emph on
+local per-workstation
+\emph default
+ NFS caching strategies have a good chance to hide much of the network latencies
+, and thus the actual total network workload is typically only
+\begin_inset Formula $O(n).$
+\end_inset
+
+
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ This can lead to a dangerous misinterpretation: because it apparently works
+ even for a few thousands of workstations, people conclude
+\emph on
+wrongly
+\emph default
+ that the network filesystem
+\begin_inset Quotes eld
+\end_inset
+
+must be scalable
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Some people are then applying their experience to completely different
+ usecases, where much higher metadata traffic by several orders of magnitudes
+ is occurring (such as in webhosting), or even where true
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ runtime behaviour is occurring (see example of a failed scalability scenario
+ in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Example-Failures-of"
+plural "false"
+caps "false"
+noprefix "false"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ In general: when something works for usecase A, this
+\series bold
+does
+\emph on
+not
+\emph default
+ prove
+\series default
+ that it will also work for another usecase B.
+\end_layout
+
+\begin_layout Section
+Recommendations for Design and Operation of Storage Systems
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Recommendations-for-Designing"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Recommendations for Managers
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Recommendations-for-Managers"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+When you are responsible for
+\series bold
+masses of enterprise-critical data
+\series default
+, the most important point is to get people with
+\series bold
+the right skills
+\series default
+, in
+\emph on
+addition(!) to
+\emph default
+ the
+\emph on
+right mindset
+\emph default
+, and to assign the right roles to them.
+\end_layout
+
+\begin_layout Standard
+Practical observation from many groups in many companies: which storage
+ systems / architectures are in use, and how much they are
+\emph on
+really
+\emph default
+ failure resistant and reliable, and how much they are
+\emph on
+really
+\emph default
+ scalable for their workload, and what is their TCO (Total Cost of Ownership),
+ does often
+\emph on
+not
+\emph default
+ depend on real knowledge and facts.
+ It often depends on
+\series bold
+personal habits
+\series default
+ and
+\series bold
+pre-judgement
+\series default
+ of staff
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+\noindent
+This can be seen in a bigger company (e.g.
+ after mergers etc) when very different architectures have been built by
+ different teams for very similar usecases, although they are sometimes
+ even roughly comparable in size and workload.
+\end_layout
+
+\end_inset
+
+.
+ In essence, this results in a gambling game how safe / cost-effective etc
+ your critical data
+\emph on
+really
+\emph default
+ is.
+\end_layout
+
+\begin_layout Standard
+As just explained in the previous section, there are so many pitfalls, and
+ there are only a few people who know them, because more people are working
+ in small-scale systems than in large-scale enterprise ones.
+ There are lots of people on the market who
+\emph on
+claim
+\emph default
+ to have some experience, but in reality they don't know what they don't
+ know (
+\series bold
+second-order ignorance
+\series default
+).
+\end_layout
+
+\begin_layout Standard
+Second-order ignorance is very dangerous, even for affected people themselves,
+ because they are in good faith about their own skills, and that they would
+ be able to control everything (sometimes they really want to control literally
+
+\emph on
+everything
+\emph default
+, even other people who have more real experience and knowledge).
+ See for example wrong assumptions and
+\begin_inset Quotes eld
+\end_inset
+
+false proofs
+\begin_inset Quotes erd
+\end_inset
+
+ about scalability, derived from different usecases (or in extreme cases
+ even from workstations workloads), or the failed scalability scenario in
+ section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Example-Failures-of"
+plural "false"
+caps "false"
+noprefix "false"
+
+\end_inset
+
+ where some freelancers were consulted as
+\begin_inset Quotes eld
+\end_inset
+
+external experts
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Quotation
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Check your information sources! There is a
+\emph on
+systematic reason
+\emph default
+ for ill-informed
+\begin_inset Quotes eld
+\end_inset
+
+experts
+\begin_inset Quotes erd
+\end_inset
+
+.
+ On the internet, you can find a lot of so-called
+\begin_inset Quotes eld
+\end_inset
+
+best practices
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Many of them are propagating badly scaling storage architectures for enterprise
+ workloads, sometimes even
+\emph on
+generally
+\emph default
+ claiming they would
+\begin_inset Quotes eld
+\end_inset
+
+scale very well
+\begin_inset Quotes erd
+\end_inset
+
+, which is however often based on
+\emph on
+assumptions
+\emph default
+ instead of knowledge (and almost never based on
+\emph on
+measurements
+\emph default
+ at the right measurement points for deriving substantial knowledge about
+ your real application behaviour).
+ Literally
+\emph on
+anyone
+\emph default
+ can post falsely generalized
+\begin_inset Quotes eld
+\end_inset
+
+best practices
+\begin_inset Quotes erd
+\end_inset
+
+ to the internet.
+ Together with second-order ignorance about the non-transferability of
+\begin_inset Quotes eld
+\end_inset
+
+success stories
+\begin_inset Quotes erd
+\end_inset
+
+ from usecase A to usecase B (resulting in
+\emph on
+false
+\begin_inset Quotes eld
+\end_inset
+
+proofs
+\emph default
+
+\begin_inset Quotes erd
+\end_inset
+
+), the internet is creating
+\series bold
+information bubbles
+\series default
+.
+
+\end_layout
+
+\begin_layout Quotation
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Real knowledge originates from evaluated sources, such as
+\series bold
+scientific publications
+\series default
+ which have undergone at least some minimum
+\emph on
+quality check
+\emph default
+, and which are trying to describe their preconditions and operating environment
+s as precisely
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+\noindent
+Therefore, chances are better to get a real expert when he has some (higher)
+ academic degrees, and was working in the area for a longer time.
+\end_layout
+
+\end_inset
+
+ as possible.
+\end_layout
+
+\begin_layout Quotation
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Real experts will tell you when they don't know something.
+ In addition, they will tell you
+\emph on
+multiple
+\emph default
+ ways for obtaining such information, such as measurements, simulation,
+ etc.
+\end_layout
+
+\begin_layout Standard
+If you don't have anyone in your teams who knows how
+\series bold
+caching
+\series default
+
+\emph on
+really
+\emph default
+ works, or if it is a single guy who cannot withstand the pressure from
+ a whole group of
+\begin_inset Quotes eld
+\end_inset
+
+alpha animals
+\begin_inset Quotes erd
+\end_inset
+
+, you are running an
+\series bold
+increased risk
+\series default
+ of unnecessary expenses
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+I know of cases which have produced unnecessary
+\emph on
+direct
+\emph default
+ cost of at least € 20 million.
+\end_layout
+
+\end_inset
+
+, worse services (indirect costs), failed projects, and sometimes even resulting
+ in loss of market share and/or of stock exchange value.
+\end_layout
+
+\begin_layout Standard
+The problem is that it
+\emph on
+looks so easy
+\emph default
+, as if everyone could build a larger storage system, with ease.
+ For example, just
+\begin_inset Quotes eld
+\end_inset
+
+spend some more money
+\begin_inset Quotes erd
+\end_inset
+
+, that's all you would need.
+ Unfortunately, both
+\begin_inset Quotes eld
+\end_inset
+
+marketing drones
+\begin_inset Quotes erd
+\end_inset
+
+ from commercial storage vendors, and even a few OpenSource advocates, are
+ propagating this
+\series bold
+dangerous mindset
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+As a responsible manager, how can you detect dangerous partial knowledge?
+ Good indicators are wrong usage of the term
+\begin_inset Quotes eld
+\end_inset
+
+architecture
+\begin_inset Quotes erd
+\end_inset
+
+ (see definition in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:What-is-Architecture"
+plural "false"
+caps "false"
+noprefix "false"
+
+\end_inset
+
+), and/or
+\series bold
+ confusion of architecture with implementation
+\series default
+.
+ When somebody confuses
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that there exist people who use the term
+\begin_inset Quotes eld
+\end_inset
+
+architecture
+\begin_inset Quotes erd
+\end_inset
+
+ inadvertently.
+ They don't even know that they are confusing architecture with implementat
+ion.
+ Pure usage of a certain term is no clear indicator that somebody is really
+ an expert.
+\end_layout
+
+\end_inset
+
+ this, he does not really have an overview of different architectural solution
+ classes.
+ Instead, such people are tending to propagate their random
+\begin_inset Quotes eld
+\end_inset
+
+favourite product
+\begin_inset Quotes erd
+\end_inset
+
+.
+ For a responsible manager, this increases the risk of getting a non-optimum or
+ even bad / dangerous solution.
+\end_layout
+
+\begin_layout Standard
+Not everything which works in a garage, or in a student pool, or in the
+ testlab (whether it's yours or from a commercial storage vendor), or in
+ a PoC with some
+\begin_inset Quotes eld
+\end_inset
+
+friendly customers
+\begin_inset Quotes erd
+\end_inset
+
+, is well-suited for large enterprises and their critical data (measured
+ in petabytes / billions of files / etc), or is the optimum solution for
+ TCO.
+ Some rules of thumb, out of experience and observation:
+\end_layout
+
+\begin_layout Itemize
+For each 1 or 2 orders of magnitude of the
+\series bold
+size
+\series default
+ of your data, you need better methods for safe construction and operation.
+ At least for each 3 to 4 orders of magnitude (sometimes even for less),
+ you need
+\series bold
+better architectures
+\series default
+, and people who can deal with them.
+\end_layout
+
+\begin_layout Itemize
+For each 1 or 2 orders of magnitude of
+\series bold
+criticality
+\series default
+ of your data (measured by
+\emph on
+losses
+\emph default
+ in case of certain incidents), you will also need better architecture,
+ not just better components.
+\end_layout
+
+\begin_layout Subsection
+Recommendations for Architects and Sysadmins
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Recommendations-for-Architects"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In order of precedence, do the following:
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Fix and/or limit and/or tune the
+\emph on
+application
+\series default
+\emph default
+.
+\begin_inset Newline newline
+\end_inset
+
+Some extreme examples:
+\end_layout
+
+\begin_deeper
+\begin_layout Itemize
+When you encounter a classical Unix
+\series bold
+fork bomb
+\series default
+, you have no chance against it.
+ Even the
+\begin_inset Quotes eld
+\end_inset
+
+best and the most expensive hardware
+\begin_inset Quotes erd
+\end_inset
+
+ is unable to successfully run a fork bomb.
+ The only countermeasure is
+\emph on
+limitation of resources
+\emph default
+.
+ Reason: unlimited resources do not exist on earth.
+\end_layout
+
+\begin_layout Itemize
+If you think that this were only of academic interest: several types of
+ internet
+\series bold
+DDOS attacks
+\series default
+ are acting like a fork bomb, and
+\series bold
+Apache
+\series default
+ is also acting similar to a fork bomb when not configured properly.
+ This is not about academics, it is about
+\emph on
+your survival
+\emph default
+ (in the sense of Darwin).
+\end_layout
+
+\begin_layout Itemize
+If you think it cannot hurt you because you are running
+\family typewriter
+fast-cgi
+\family default
+ or another application scheme where forks are not part of the game (e.g.
+ databases and many others): please notice that
+\series bold
+network queues
+\series default
+ are often acting as a replacement for processes.
+ Overflow of queues can have a similar effect as fork bombs from the viewpoint
+ of customers: they simply don't get the service they are expecting.
+\end_layout
+
+\begin_layout Itemize
+Real-life example: some percentage of
+\family typewriter
+WordPress
+\family default
+ customers are typically and
+\emph on
+systematically
+\emph default
+
+\series bold
+misconfiguring
+\series default
+ their
+\family typewriter
+wp-cron
+\family default
+ cron jobs.
+ They create backups of their website, which
+\emph on
+include
+\emph default
+ their old backups.
+ Result: in each generation of the backups, the needed disk space will roughly
+
+\emph on
+double
+\emph default
+.
+ Even if you had
+\begin_inset Quotes eld
+\end_inset
+
+unlimited storage
+\begin_inset Quotes erd
+\end_inset
+
+ on top of the
+\begin_inset Quotes eld
+\end_inset
+
+best and the most expensive storage system
+\begin_inset Quotes erd
+\end_inset
+
+, and even if you would like to give
+\begin_inset Quotes eld
+\end_inset
+
+unlimited storage
+\begin_inset Quotes erd
+\end_inset
+
+ to your customers, it simply cannot work at all.
+ Exponential growth is exponential growth.
+ After a few months of this kind of daily backup, you would need more storage
+ than atoms exist in the whole universe.
+ You
+\emph on
+must
+\emph default
+ introduce some quota limits somewhere.
+ And you
+\emph on
+must
+\emph default
+ ensure that the
+\family typewriter
+wp-cron
+\family default
+ misconfiguration is fixed, whoever is responsible for fixing it.
+\end_layout
+
+\begin_layout Itemize
+Another
+\family typewriter
+WordPress
+\family default
+ example: the
+\family typewriter
+wp-cron
+\family default
+ configuration syntax is not easily understandable by laymen.
+ It is easy to
+\series bold
+misconfigure
+\series default
+ such that a backup is created
+\emph on
+once per minute
+\emph default
+.
+ As long as the website is very small, this will not even be noticed by
+ sysadmins.
+ However, for bigger websites (and they are typically growing over time),
+ the IO load may increase to a point until even asynchronous replication
+ over 10Gig interfaces cannot catch up.
+ Even worse: the next run of
+\family typewriter
+wp-cron
+\family default
+ may start before the old one has finished within a minute.
+ Again, there is no chance except fixing the
+\emph on
+root cause
+\emph default
+ at application level.
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+
+\series bold
+Choose the right
+\emph on
+overall
+\emph default
+ architecture
+\series default
+ (not limited to storage).
+\begin_inset Newline newline
+\end_inset
+
+An impressive example for architectural (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:What-is-Architecture"
+
+\end_inset
+
+) ill-design can be found in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Example-Failures-of"
+
+\end_inset
+
+.
+ Important explanations are in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Properties-Scalability"
+
+\end_inset
+
+, in particular subsection
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Influence-Factors-Scalability"
+
+\end_inset
+
+, and section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Filesystem-Layer-vs"
+
+\end_inset
+
+.
+ A strategic example is in subsection
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Example-Scalability-Scenario"
+
+\end_inset
+
+.
+ It is absolutely necessary to know the standard cache hierarchy of Unix
+ (similarly also found in Windows) from section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Performance-Arguments-from"
+
+\end_inset
+
+.
+ More explanations are in this manual at many places.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ In general, major ill-designs of overall architectures (end-to-end) cannot
+ be fixed at component level.
+ Even the
+\begin_inset Quotes eld
+\end_inset
+
+best tuning of the world
+\begin_inset Quotes erd
+\end_inset
+
+ executed by the
+\begin_inset Quotes eld
+\end_inset
+
+best tuning expert
+\begin_inset Quotes erd
+\end_inset
+
+ on top of the
+\begin_inset Quotes eld
+\end_inset
+
+best and most expensive storage
+\emph on
+components
+\emph default
+ and the best storage
+\emph on
+network
+\emph default
+ of the world
+\begin_inset Quotes erd
+\end_inset
+
+ cannot compensate for major ill-designs, such as
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ behaviour.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Similarly for reliability: if you have problems with too many and/or too
+ large incidents affecting too many customers, read sections
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Reliability-Differences-CentralStorage"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Choice and tuning of components
+\series default
+.
+\begin_inset Newline newline
+\end_inset
+
+No further explanations necessary, because most people already know this.
+ In case you think this is the only way: no, it is typically the
+\emph on
+worst
+\emph default
+ and typically only the
+\emph on
+last resort
+\emph default
+ when compared to the previous enumeration items.
+\begin_inset Newline newline
+\end_inset
+
+Exception: choice of wrong components with insufficient properties for your
+ particular application / use case.
+ But this is an
+\emph on
+architectural
+\emph default
+ problem in reality.
+\end_layout
+
+\begin_layout Chapter
+Use Cases for MARS vs DRBD
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Use-Cases-for"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+DRBD has a long history of successfully providing HA features to many users
+ of Linux.
+ With the advent of MARS, many people are wondering what the difference
+ is.
+ They ask for recommendations.
+ In which use cases should DRBD be recommended, and in which other cases
+ is MARS the better choice?
+\end_layout
+
+\begin_layout Standard
+The following table is a short guide to the most important cases where the
+ decision is rather clear:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
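+\begin_inset Tabular
+<lyxtabular version="3" rows="6" columns="2">
+<features tabularvalignment="middle">
+<column alignment="left" valignment="top">
+<column alignment="left" valignment="top">
+<row>
+<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Use Case
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Recommendation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+server pairs, each directly connected via 
+\series bold
+crossover cables
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+DRBD
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\series bold
+active-active
+\series default
+ / dual-primary, e.g.
+ 
+\family typewriter
+\series bold
+gfs2
+\family default
+\series default
+, 
+\family typewriter
+\series bold
+ocfs2
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+DRBD
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+distance 
+\series bold
+> 50km
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+MARS
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\series bold
+> 100 server pairs
+\series default
+ over a short-distance 
+\series bold
+shared
+\series default
+ line
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+MARS
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+all else / intermediate cases
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+read the following details
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset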
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+There exist some use cases where DRBD is clearly better than MARS.
+ 1&1 has a long history of experiences with DRBD where it works very fine,
+ in particular coupling Linux devices rack-to-rack via crossover cables.
+ DRBD is just
+\emph on
+constructed
+\emph default
+ for that use case (RAID-1 over network).
+ In such a scenario, DRBD is better than MARS because it uses up less disk
+ space resources.
+ In addition, newer DRBD versions can run over high-speed but short-distance
+ interconnects like Infiniband (via the SDP protocol).
+ Another use case for DRBD is active-active / dual-primary mode, e.g.
+
+\family typewriter
+ocfs2
+\family default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that
+\family typewriter
+ocfs2
+\family default
+ is apparently not constructed for long distances.
+ 1&1 has some experiences on a specific short distance cluster where the
+
+\family typewriter
+ocfs2
+\family default
+ /
+\family typewriter
+DRBD
+\family default
+ combination scaled a little bit better than
+\family typewriter
+NFS
+\family default
+, but worse than
+\family typewriter
+glusterfs
+\family default
+ (using 2 clients in both cases – notice that
+\family typewriter
+glusterfs
+\family default
+ showed extremely bad performance when trying to enable active-active
+\family typewriter
+glusterfs
+\family default
+ replication between 2 server instances, therefore we ended up using active-pass
+ive DRBD replication below a single
+\family typewriter
+glusterfs
+\family default
+ server).
+ Conclusion:
+\family typewriter
+NFS
+\family default
+ <
+\family typewriter
+ocfs2
+\family default
+ <
+\family typewriter
+glusterfs
+\family default
+ < sharding.
+ We found that
+\family typewriter
+glusterfs
+\family default
+ on top of active-passive DRBD scalability was about 2 times better than
+
+\family typewriter
+NFS
+\family default
+ on top of active-passive DRBD, while
+\family typewriter
+ocfs2
+\family default
+ on top of
+\family typewriter
+DRBD
+\family default
+ in active-active mode was somewhere in between.
+ All cluster comparisons were made with an increasing workload over time (measured
+ as number of customers which could be safely operated).
+ Each system was replaced by the next one when the respective scalability
+ was at its respective end, each time leading to operational problems.
+ The ultimate solution was to replace all of these clustering concepts by
+ the general concept of
+\series bold
+sharding
+\series default
+.
+\end_layout
+
+\end_inset
+
+ over short
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Active-active won't work over long distances at all because of high network
+ latencies (cf chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Cloud-Storage"
+
+\end_inset
+
+).
+ Probably, for replication of whole clusters over long distances DRBD and
+ MARS could be stacked: using DRBD on top of MARS for active-active clustering
+ of
+\family typewriter
+gfs2
+\family default
+ or
+\family typewriter
+ocfs2
+\family default
+, and a MARS instance
+\emph on
+below
+\emph default
+ for failover of
+\emph on
+one
+\emph default
+ of the DRBD replicas over long distances.
+\end_layout
+
+\end_inset
+
+ distances.
+\end_layout
+
+\begin_layout Standard
+On the other hand, there exist other use cases where DRBD did not work as
+ expected, leading to incidents and other operational problems.
+ We analyzed them for our specific use cases.
+ The later author of MARS came to the conclusion that they could only be
+ resolved by fundamental changes in the overall architecture of DRBD.
+ The development of MARS started at the personal initiative of the author,
+ first in form of a personal project during holidays, but later picked up
+ by 1&1 as an official project.
+\end_layout
+
+\begin_layout Standard
+MARS and DRBD simply have
+\series bold
+different application areas
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+In the following, we will discuss the pros and cons of each system in particular
+ situations and contexts, and we shed some light on their conceptual and
+ operational differences.
+\end_layout
+
+\begin_layout Section
+Network Bottlenecks
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Network-Bottlenecks"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Behaviour of DRBD
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Behaviour-of-DRBD"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In order to describe the most important problem we found when DRBD was used
+ to couple whole datacenters (each encompassing thousands of servers) over
+ metro distances, we strip down that complicated real-life scenario to a
+ simplified laboratory scenario in order to demonstrate the effect with
+ minimal means.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that the following DRBD effect does not appear at crossover cables.
+ The following scenario covers a non-standard case of DRBD.
+ DRBD works fine when no network bottleneck appears!
+\end_layout
+
+\begin_layout Standard
+The following picture illustrates an effect which has been observed in 1&1
+ datacenters when running masses of DRBD instances through a single network
+ bottleneck.
+ In addition, the effect is also reproducible by an older version of the
+ MARS test suite
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The effect has been demonstrated some years ago with DRBD version 8.3.13.
+ By construction, it is independent from any of the DRBD series 8.3.x, 8.4.x,
+ or 9.0.x.
+\end_layout
+
+\end_inset
+
+:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/network-bottleneck-drbd.fig
+ width 80col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The simplified scenario is the following:
+\end_layout
+
+\begin_layout Enumerate
+DRBD is loaded with a low to medium, but constant rate of write operations
+ for the sake of simplicity of the scenario.
+\end_layout
+
+\begin_layout Enumerate
+The network has some throughput bottleneck, depicted as a red line.
+ For the sake of simplicity, we just linearly decrease it over time, starting
+ from full throughput, down to zero.
+ The decrease is very slow over time (some minutes, or even hours).
+\end_layout
+
+\begin_layout Standard
+What will happen in this scenario?
+\end_layout
+
+\begin_layout Standard
+As long as the actual DRBD write throughput is lower than the network bandwidth
+ (left part of the horizontal blue line), DRBD works as expected.
+\end_layout
+
+\begin_layout Standard
+Once the maximum network throughput (red line) starts to fall short of the
+ required application throughput (first blue dotted line), we get into trouble.
+ By its very nature, DRBD works
+\series bold
+synchronously
+\series default
+.
+ Therefore, it
+\emph on
+must
+\emph default
+ transfer all your application writes through the bottleneck, but now it
+ is impossible
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+This is independent from the DRBD protocols A through C, because it just
+ depends on an information-theoretic argument independently from any protocol.
+ We have a fundamental conflict between network capabilities and application
+ demands here, which cannot be circumvented due to the
+\series bold
+synchronous
+\series default
+ nature of DRBD.
+\end_layout
+
+\end_inset
+
+ due to the bottleneck.
+ As a consequence, the application running on top of DRBD will see increasingly
+ higher IO latencies and/or stalls / hangs.
+ We found practical cases (at least with former versions of DRBD) where
+ IO latencies exceeded practical monitoring limits such as
+\begin_inset Formula $5$
+\end_inset
+
+ s by far, up to the range of
+\emph on
+minutes
+\emph default
+.
+ As an experienced sysadmin, you know what happens next: your application
+ will run into an incident, and your customers will be dissatisfied.
+\end_layout
+
+\begin_layout Standard
+In order to deal with such situations, DRBD has lots of tuning parameters.
+ In particular, the
+\family typewriter
+timeout
+\family default
+ parameter and/or the
+\family typewriter
+ping-timeout
+\family default
+ parameter will determine when DRBD will give up in such a situation and
+ simply drop the network connection as an emergency measure.
+ Dropping the network connection is roughly equivalent to an automatic
+\family typewriter
+disconnect
+\family default
+, followed by an automatic re-connect attempt after
+\family typewriter
+connect-int
+\family default
+ seconds.
+ During the dropped connection, the incident will appear as being resolved,
+ but at some hidden cost
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+By appropriately tuning various DRBD parameters, such as
+\family typewriter
+timeout
+\family default
+ and/or
+\family typewriter
+ping-timeout
+\family default
+, you can keep the impact of the incident below some viable limit.
+ However, the automatic disconnect will then happen earlier and more often
+ in practice.
+ Flaky or overloaded networks may easily lead to an enormous number of automatic
+ disconnects.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+What happens next in our scenario? During the
+\family typewriter
+disconnect
+\family default
+, DRBD will record all positions of writes in its bitmap and/or in its activity
+ log.
+ As soon as the automatic re-connect succeeds after
+\family typewriter
+connect-int
+\family default
+ seconds, DRBD has to do a partial re-sync of those blocks which were marked
+ dirty in the meantime.
+ This leads to an
+\emph on
+additional
+\emph default
+ bandwidth demand
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+DRBD parameters
+\family typewriter
+sync-rate
+\family default
+ resp
+\family typewriter
+resync-rate
+\family default
+ may be used to tune the height of the additional demand.
+ In addition, the newer parameters
+\family typewriter
+c-plan-ahead
+\family default
+,
+\family typewriter
+c-fill-target
+\family default
+,
+\family typewriter
+c-delay-target
+\family default
+,
+\family typewriter
+c-min-rate
+\family default
+,
+\family typewriter
+c-max-rate
+\family default
+ and friends may be used to dynamically adapt to
+\emph on
+some
+\emph default
+ situations where the application throughput
+\emph on
+could
+\emph default
+ fit through the bottleneck.
+ These newer parameters were developed in a cooperation between 1&1 and
+ Linbit, the maker of DRBD.
+\end_layout
+
+\begin_layout Plain Layout
+Please note that lowering / dynamically adapting the resync rates may help
+ in lowering the
+\emph on
+probability
+\emph default
+ of occurrences of the above problems in practical scenarios where the bottlenec
+k would recover to viable limits after some time.
+ However, lowering the rates will also increase the
+\emph on
+duration
+\emph default
+ of re-sync operations accordingly.
+ The
+\emph on
+total amount of re-sync data
+\emph default
+ simply does not decrease when lowering
+\family typewriter
+resync-rate
+\family default
+; it even tends to increase over time when new requests arrive.
+ Therefore, the
+\emph on
+expectancy value
+\emph default
+ of problems caused by
+\emph on
+strong
+\emph default
+ network bottlenecks (i.e.
+ when not even the ordinary application rate is fitting through) is
+\emph on
+not
+\emph default
+ improved by lowering or adapting
+\family typewriter
+resync-rate
+\family default
+, but rather the expectancy value mostly depends on the
+\emph on
+relation
+\emph default
+ between the amount of holdback data versus the amount of application write
+ data, both measured for the duration of some given strong bottleneck.
+\end_layout
+
+\end_inset
+
+ as indicated by the upper dotted blue box.
+\end_layout
+
+\begin_layout Standard
+Of course, there is
+\emph on
+absolutely no chance
+\emph default
+ to get the increased amount of data through our bottleneck, since not even
+ the ordinary application load (lower dotted lines) could be transferred.
+\end_layout
+
+\begin_layout Standard
+Therefore, you run a
+\series bold
+very high risk
+\series default
+ that the re-sync cannot finish before the next
+\family typewriter
+timeout
+\family default
+ /
+\family typewriter
+ping-timeout
+\family default
+ cycle will drop the network connection again.
+\end_layout
+
+\begin_layout Standard
+What will be the final result when that risk becomes true? Simply, your
+ secondary site will be
+\emph on
+permanently
+\emph default
+ in state
+\family typewriter
+inconsistent
+\family default
+.
+ This means, you have lost your redundancy.
+ In our scenario, there is no chance at all to become consistent again,
+ because the network bottleneck declines more and more, slowly.
+ It is simply
+\emph on
+hopeless
+\emph default
+, by construction.
+\end_layout
+
+\begin_layout Standard
+In case you lose your primary site now, you are completely lost.
+\end_layout
+
+\begin_layout Standard
+Some people may argue that the probability for a similar scenario were low.
+ We don't agree with such an argumentation.
+ Not only because it really happens in practice, and it may even last some
+ days until problems are fixed.
+ In case of
+\series bold
+rolling disasters
+\series default
+, the network is very likely to become flaky and/or overloaded shortly before
+ the final damage.
+ Even in other cases, you can easily end up with inconsistent secondaries.
+ It occurs not only in the lab, but also in practice if you operate some
+ hundreds or even thousands of DRBD instances.
+\end_layout
+
+\begin_layout Standard
+The point is that you can produce an ill behaviour
+\emph on
+systematically
+\emph default
+ just by overloading the network a bit for some sufficient duration.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ When coupling whole datacenters via some thousands of DRBD connections,
+ any (short) network loss will almost certainly increase the re-sync network
+ load each time the outage appears to be over.
+ As a consequence, overload may be
+\emph on
+provoked
+\emph default
+ by the re-sync repair attempts.
+ This may easily lead to self-amplifying
+\series bold
+throughput storms
+\series default
+ in some resonance frequency (similar to self-destruction of a bridge when
+ an army is marching over it in lockstep).
+\end_layout
+
+\begin_layout Standard
+The only way for reliable prevention of loss of secondaries is to start
+ any re-connect
+\emph on
+only
+\emph default
+ in such situations where you can
+\emph on
+predict in advance
+\emph default
+ that the re-sync is
+\emph on
+guaranteed
+\emph default
+ to finish before any network bottleneck / loss will cause an automatic
+ disconnect again.
+ We don't know of any method which can reliably predict the future behaviour
+ of a complex network.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Conclusion: in the presence of network bottlenecks, you run a considerable
+ risk that your DRBD mirrors get destroyed just in that moment when you
+ desperately need them.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that crossover cables usually never show a behaviour like depicted
+ by the red line.
+ Crossover cables are
+\emph on
+passive components
+\emph default
+ which normally
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Exceptions might be mechanical jiggling of plugs, or electromagnetic
+ interference.
+ We never noticed any of them.
+\end_layout
+
+\end_inset
+
+ either work, or not.
+ The binary connect / disconnect behaviour of DRBD has no problems to cope
+ with that.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+or
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Linbit recommends a
+\series bold
+workaround
+\series default
+ for the inconsistencies during re-sync: LVM snapshots.
+ We tried it, but found a
+\emph on
+performance penalty
+\emph default
+ which made it prohibitive for our concrete application.
+ A problem seems to be the cost of destroying snapshots.
+ LVM uses by default a BOW strategy (Backup On Write, which is the counterpart
+ of COW = Copy On Write).
+ BOW increases IO latencies during ordinary operation.
+ Retaining snapshots is cheap, but reverting them may be very costly, depending
+ on workload.
+ We didn't fully investigate that effect, and our experience is a few years
+ old.
+ You might come to a different conclusion for a different workload, for
+ newer versions of system software, or for a different strategy if you carefully
+ investigate the field.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ DRBD problems usually arise
+\emph on
+only
+\emph default
+ when the network throughput shows some
+\begin_inset Quotes eld
+\end_inset
+
+awkward
+\begin_inset Quotes erd
+\end_inset
+
+ analog behaviour, such as overload, or as occasionally produced by various
+ switches / routers / transmitters, or other potential sources of packet
+ loss.
+\end_layout
+
+\begin_layout Subsection
+Behaviour of MARS
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Behaviour-of-MARS"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The behaviour of MARS in the above scenario:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/network-bottleneck-mars.fig
+ width 80col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+When the network is restrained, an asynchronous system like MARS will continue
+ to serve the user IO requests (dotted green line) without any impact /
+ incident while the actual network throughput (solid green line) follows
+ the red line.
+ In the meantime, all changes to the block device are recorded at the transactio
+n logfiles.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Here is one point in favour of DRBD: MARS stores its transaction logs on
+ the filesystem
+\family typewriter
+/mars/
+\family default
+.
+ When the network bottleneck is lasting very long (some days or even some
+ weeks), the filesystem will eventually run out of space some day.
+ Section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Defending-Overflow"
+
+\end_inset
+
+ discusses countermeasures against that in detail.
+ In contrast to MARS, DRBD allocates its bitmap
+\emph on
+statically
+\emph default
+ at resource creation time.
+ It uses up less space, and you don't have to monitor it for (potential)
+ overflows.
+ The space for transaction logs is the price you have to pay if you want
+ or need anytime consistency, or asynchronous replication in general.
+\end_layout
+
+\begin_layout Standard
+In order to really grasp the
+\emph on
+heart
+\emph default
+ of the difference between synchronous and asynchronous replication, we
+ look at the following modified scenario:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/network-flaky-mars.fig
+ width 80col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+This time, the network throughput (red line) is varying
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In real life, many long-distance lines or even some heavily used metro lines
+ usually show fluctuations of their network bandwidth by an order of magnitude,
+ or even higher.
+ We have measured them.
+ The overall behaviour can be characterized as
+\begin_inset Quotes eld
+\end_inset
+
+
+\series bold
+chaotic
+\series default
+
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+ in some unpredictable way.
+ As before, the application throughput served by MARS is assumed to be constant
+ (dotted green line, often superseded by the solid green line).
+ The actual replication network throughput is depicted by the solid green
+ line.
+\end_layout
+
+\begin_layout Standard
+As you can see, a network dropdown undershooting the application demand
+ has no impact on the application throughput, but only on the replication
+ network throughput.
+ Whenever the network throughput is held back due to the flaky network,
+ it simply catches up as soon as possible by overshooting the application
+ throughput.
+ The amount of lag-behind is visualized as shaded area: downward shading
+ (below the application throughput) means an increase of the lag-behind,
+ while the upwards shaded areas (beyond the application throughput) indicate
+ a decrease of the lag-behind (catch-up).
+ Once the lag-behind has been fully caught up, the network throughput suddenly
+ jumps back to the application throughput (here visible in two cases).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Note that the existence of lag-behind areas is roughly corresponding to
+ DRBD disconnect states, and in turn to DRBD inconsistent states of the
+ secondary as long as the lag-behind has not been fully caught up.
+ The very rough
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Of course, this visualization is not exact.
+ On one hand, the DRBD inconsistency phase may start later than depicted here,
+ because it only starts
+\emph on
+after
+\emph default
+ the first automatic disconnect, upon the first automatic re-connect.
+ In addition, the amount of resync data may be smaller than the amount of
+ corresponding MARS transaction logfile data, because the DRBD bitmap will
+ coalesce multiple writes to the same block into one single transfer.
+ On the other hand, DRBD will transfer no data at all during its disconnected
+ state, while MARS continues doing its best.
+ This leads to a prolongation of the DRBD inconsistent phase.
+ Depending on properties of the workload and of the network, the real duration
+ of the inconsistency phase may be either shorter or longer.
+\end_layout
+
+\end_inset
+
+ duration of the corresponding DRBD inconsistency phase is visualized as
+ magenta line at the time scale.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+MARS utilizes the existing network bandwidth as best as possible in order
+ to pipe through as much data as possible, provided that there exists some
+ data requiring expedition.
+ Conceptually, there exists no better way due to information theoretic limits
+ (besides data compression).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Note that
+\emph on
+on average
+\emph default
+ during a longer period of time, the network must have enough capacity for
+ transporting all of your data.
+ MARS cannot magically break through information-theoretic limits.
+ It cannot magically transport gigabytes of data over modem lines.
+ Only
+\emph on
+relatively short
+\emph default
+ network problems / packet loss can be compensated.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In case of lag-behind, the version of the data replicated to the secondary
+ site corresponds to some time in the past.
+ Since the data is always transferred in the same order as originally submitted
+ at the primary site, the secondary never gets inconsistent.
+ Your mirror always remains usable.
+ Your only potential problem could be the outdated state, corresponding
+ to some state in the past.
+ However, the
+\begin_inset Quotes eld
+\end_inset
+
+as-best-as-possible
+\begin_inset Quotes erd
+\end_inset
+
+ approach to the network transfer ensures that your version is always
+\emph on
+as up-to-date as possible
+\emph default
+ even under ill-behaving network bottlenecks.
+
+\series bold
+There is simply no better way to do it.
+
+\series default
+ In presence of temporary network bottlenecks such as network congestion,
+ there exists no better method than prescribed by the information theoretic
+ limit (red line, neglecting data compression).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ In order to get all of your data through the line, at some point the network
+ must be healthy again.
+ Otherwise, data will be recorded until the capacity of the
+\family typewriter
+/mars/
+\family default
+ filesystem is exhausted, leading to an emergency mode (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Resolution-of-Emergency"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+MARS' property of never sacrificing local data consistency (at the possible
+ cost of actuality, as long as you have enough capacity in
+\family typewriter
+/mars/
+\family default
+) is called
+\series bold
+Anytime Consistency
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Even when the capacity of
+\family typewriter
+/mars/
+\family default
+ is exhausted and when emergency mode is entered, the replicas will not
+ become inconsistent by themselves.
+ However, when the emergency mode is later
+\emph on
+cleaned up
+\emph default
+ for a replica, it will become temporarily inconsistent during the fast
+ full sync.
+ Details are in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Resolution-of-Emergency"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Conclusion: you can even use
+\series bold
+traffic shaping
+\series default
+ on MARS' TCP connections in order to globally balance your network throughput
+ (of course at the cost of actuality, but without sacrificing local data
+ consistency).
+ If you would try to do the same with DRBD, you could easily provoke a disaster.
+ MARS simply tolerates any network problems, provided that there is enough
+ disk space for transaction logfiles.
+ Even in case of completely filling up your disk with transaction logfiles
+ after some days or weeks, you will not lose local consistency anywhere
+ (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Defending-Overflow"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+Finally, here is yet another scenario where MARS can cope with the situation:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/network-constant-mars.fig
+ width 80col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+This time, the network throughput limit (solid red line) is assumed to be
+ constant.
+ However, the application workload (dotted green line) shows some heavy
+ peaks.
+ We know from our 1&1 datacenters that such an application behaviour is
+ very common (e.g.
+ in case of certain kinds of DDOS attacks etc).
+\end_layout
+
+\begin_layout Standard
+When the peaks are exceeding the network capacities for some short time,
+ the replication network throughput (solid green line) will be limited for
+ a short time, stay a little bit longer at the limit, and finally drop down
+ again to the normal workload.
+ In other words, you get a flexible buffering behaviour, coping with the
+ peaks.
+\end_layout
+
+\begin_layout Standard
+Similar scenarios (where both the application workload has peaks and the
+ network is flaky to some degree) are rather common.
+ If you used DRBD there, you would be likely to run into regular application
+ performance problems and/or frequent automatic disconnect cycles, depending
+ on the height and on the duration of the peaks, and on network resources.
+\end_layout
+
+\begin_layout Section
+Long Distances / High Latencies
+\end_layout
+
+\begin_layout Standard
+In general and in some theories, latencies are conceptually independent
+ from throughput, at least to some degree.
+ There exist all 4 possible combinations:
+\end_layout
+
+\begin_layout Enumerate
+There exist communication lines with high latencies but also high throughput.
+ Examples are raw fibre cables at the ground of the Atlantic.
+\end_layout
+
+\begin_layout Enumerate
+High latencies on low-throughput lines are very easy to achieve.
+ If you never saw it, you never ran interactive
+\family typewriter
+vi
+\family default
+ over
+\family typewriter
+ssh
+\family default
+ in parallel to downloads on your old-fashioned modem line.
+\end_layout
+
+\begin_layout Enumerate
+Low latencies need not be incompatible with high throughput.
+ See Myrinet, InfiniBand or high-speed point-to-point interconnects, such
+ as modern RAM busses.
+\end_layout
+
+\begin_layout Enumerate
+Low latency combined with low throughput is also possible: in an ATM system
+ (or another pre-reservation system for bandwidth), just increase the multiplex
+ factor on low-capacity but short lines, which is only possible at the cost
+ of assigned bandwidth.
+\end_layout
+
+\begin_layout Standard
+In the
+\emph on
+internet
+\emph default
+ practice, however, it is very likely that high latencies will also lead
+ to worse throughput, because of the
+\emph on
+congestion control algorithms
+\emph default
+ running all over the world.
+\end_layout
+
+\begin_layout Standard
+We have experimented with extremely large TCP send/receive buffers plus
+ various window sizes and congestion control algorithms over long-distance
+ lines between the USA and Europe.
+ Yes, it is possible to improve the behaviour to some degree.
+ But magic does not happen.
+ Natural laws will always hold.
+ You simply cannot travel faster than the speed of light.
+\end_layout
+
+\begin_layout Standard
+Our experience leads to the following rule of thumb, not formally proven
+ by anything, but just observed in practice:
+\end_layout
+
+\begin_layout Quotation
+In general
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+We have heard of cases where even less than 50 km were not working with
+ DRBD.
+ It depends on application workload, on properties of the line, and on congestio
+n caused by other traffic.
+ Some other people told us that according to
+\emph on
+their
+\emph default
+ experience, much shorter distances should be considered operable, only in
+ the range of a few single kilometers.
+ However, they agree that DRBD is rock stable when used on crossover cables.
+\end_layout
+
+\end_inset
+
+, synchronous data replication (not limited to applications of DRBD) works
+ reliably only over distances
+\begin_inset Formula $<50$
+\end_inset
+
+ km, or sometimes even less.
+\end_layout
+
+\begin_layout Standard
+There may be some exceptions, e.g.
+ when dealing with low-end workstation loads.
+ But when you are responsible for a whole datacenter and/or some centralized
+ storage units, don't waste your time by trying (almost) impossible things.
+ We recommend using MARS in such use cases.
+\end_layout
+
+\begin_layout Section
+Explanation via CAP Theorem
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Explanation-via-CAP"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/cap-theorem.fig
+ width 60col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The famous CAP theorem, also called Brewer's theorem, is important for a
+ deeper understanding of the differences between DRBD and MARS.
+ A good explanation can be found at
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+https://en.wikipedia.org/wiki/CAP_theorem
+\end_layout
+
+\end_inset
+
+ (retrieved July 2018).
+\end_layout
+
+\begin_layout Standard
+The CAP theorem states that only 2 out of 3 properties can be achieved at
+ the same time, when a Distributed System is under pressure: C = Consistency
+ means
+\series bold
+\emph on
+Strict
+\series default
+\emph default
+ Consistency at the level of the
+\emph on
+distributed
+\emph default
+ system (which is
+\emph on
+not
+\emph default
+ the same as strict consistency
+\emph on
+inside
+\emph default
+ of one of the
+\emph on
+local
+\emph default
+ systems), A = Availability = intuitively clear from a user's perspective,
+ and P = Partitioning Tolerance = the network may have its own outages at
+ any time (which is a negative criterion).
+\end_layout
+
+\begin_layout Standard
+As explained in the Wikipedia article, the P = Partitioning Tolerance is
+ a property which is important at least in
+\emph on
+wide-distance
+\emph default
+ data replication scenarios, and possibly in some other scenarios.
+\end_layout
+
+\begin_layout Subsection
+CAP Differences between DRBD and MARS
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:CAP-Differences"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+If you are considering only short distances like passive crossover cables
+ between racks,
+\emph on
+then
+\emph default
+ (and
+\emph on
+only then
+\emph default
+) you may
+\emph on
+assume(!)
+\emph default
+ that P is not required.
+ Then, and only then, you can get both A and C at the same time, without
+ sacrificing P, because P is already for free by assumption.
+ In such a crossover cable scenario, getting all three C and A and P is
+ possible, similarly to an explanation in the Wikipedia article.
+\end_layout
+
+\begin_layout Standard
+This is the classical use case for DRBD: when both DRBD replicas are always
+ staying physically connected via a passive crossover cable (which is
+\emph on
+assumed
+\emph default
+ to never break down), you can get both strict global consistency and availabili
+ty, even in cases where one of the DRBD nodes is failing
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In addition, you will need some further components like Pacemaker, iSCSI
+ failover, etc.
+\end_layout
+
+\end_inset
+
+.
+ Both C and A are provided by DRBD during
+\family typewriter
+connected
+\family default
+ state, while P is assumed to be provided by a passive component.
+ By addition of iSCSI failover, A can be achieved even in case of single
+ storage node failures, while retaining C from the viewpoint
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice: the CAP theorem does not deal with node failures, only with
+\emph on
+network
+\emph default
+ failures.
+ Node failures would always violate C by some
+\begin_inset Quotes eld
+\end_inset
+
+strong
+\begin_inset Quotes erd
+\end_inset
+
+ definition.
+ By some
+\begin_inset Quotes eld
+\end_inset
+
+weaker
+\begin_inset Quotes erd
+\end_inset
+
+ definition, the downtime plus recovery time (e.g.
+ DRBD re-sync) can be taken out of the game.
+ Notice: while a node can always
+\begin_inset Quotes eld
+\end_inset
+
+know
+\begin_inset Quotes erd
+\end_inset
+
+ whether it has failed (at least after reboot), network failures cannot
+ be distinguished from failures of remote nodes in general.
+ Therefore node failures and network failures are fundamentally different
+ by their nature.
+\end_layout
+
+\end_inset
+
+ of the application.
+\end_layout
+
+\begin_layout Standard
+This is explained by the thick line in the following variant of the graphics,
+ which is only valid for crossover cables where P need not be guaranteed
+ by the replication because it is already assumed for free:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/cap-drbd-operational.fig
+ width 60col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Now look at the case of a truly Distributed System, where P cannot be assumed
+ as for free.
+ For example, try to use DRBD in a long-distance replication scenario.
+ There we cannot assume P as already given.
+ We
+\series bold
+must
+\emph on
+tolerate
+\series default
+\emph default
+ replication network outages.
+ DRBD is reacting to this differently in two different modes.
+\end_layout
+
+\begin_layout Standard
+First we look at the (short) time interval
+\emph on
+before
+\emph default
+ DRBD recognizes the replication network incident, and before it leaves
+ the
+\family typewriter
+connected
+\family default
+ state.
+ During this phase, the application IO will
+\series bold
+hang
+\series default
+ for some time, indicating the (temporary) sacrifice (from a user's perspective)
+ by a red X:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/cap-drbd-connected.fig
+ width 60col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Because Availability is one of the highest goods of enterprise-critical
+ IT operations, you will typically configure DRBD such that it automatically
+ switches to some variant of a
+\family typewriter
+disconnected
+\family default
+ state after some timeout, thereby giving up consistency between both replicas.
+ The red X indicates not only loss of global strict consistency in the sense
+ of the CAP theorem, but also that your replica will become
+\family typewriter
+Inconsistent
+\family default
+ during the following re-sync:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/cap-drbd-disconnected.fig
+ width 60col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+You may wonder what the difference to MARS is.
+ As explained in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Requirements-for-Cloud"
+
+\end_inset
+
+, MARS is not only intended for wide distances, but also for
+\series bold
+Cloud Storage
+\series default
+ where no strict consistency is required at global level by definition,
+ but instead
+\series bold
+Eventually Consistent
+\series default
+ is the preferred model for the Distributed System.
+ Therefore,
+\emph on
+strict
+\emph default
+ consistency (in the sense of the CAP theorem) is
+\emph on
+not required by definition
+\emph default
+.
+ Therefore, the red X is not present in the following graphics, showing
+ the state where MARS is remaining
+\emph on
+locally consistent
+\emph default
+ all the time
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that the
+\emph on
+initial
+\emph default
+ full sync is not considered here, neither for DRBD, nor for MARS.
+
+\emph on
+Setup
+\emph default
+ of the Distributed System is its own scenario, not considered here.
+
+\emph on
+Repair
+\emph default
+ of a
+\emph on
+damaged
+\emph default
+ system is also a different scenario, also not considered here.
+ Notice that MARS' emergency mode also belongs to the class of
+\begin_inset Quotes eld
+\end_inset
+
+damages
+\begin_inset Quotes erd
+\end_inset
+
+, as well as DRBD's disk failure modes, where it has some additional functionalit
+y compared to the current version of MARS.
+\end_layout
+
+\end_inset
+
+, even when a network outage occurs:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/cap-mars.fig
+ width 60col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice: MARS does not guarantee strict consistency
+\emph on
+between
+\emph default
+ LV replicas at the level of the Distributed System, but only Eventually
+ Consistent.
+ However,
+\emph on
+at the same time
+\emph default
+ it
+\emph on
+also
+\emph default
+ guarantees strict consistency
+\emph on
+locally
+\emph default
+, and even at
+\emph on
+each
+\emph default
+ of the passive replicas, each by each.
+ Don't confuse these different levels.
+ There are different consistency guarantees at different levels, at the
+ same time.
+ This might be confusing if you are not looking at the system at different
+ levels: (1) overall Distributed System versus (2) each of the local system
+ instances.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Why does MARS do this? Because a better way is not possible at all.
+ The CAP theorem tells us that there exists no better way when A has
+ to be guaranteed (as almost everywhere in enterprise-critical IT operations),
+ and P has to be ensured in datacenter disaster scenarios or some other
+ scenarios.
+ Similarly to natural laws like Einstein's laws of the speed of light, there
+
+\emph on
+does not exist
+\emph default
+ a better way!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Conclusion from the CAP theorem: when P is a
+\emph on
+hard
+\emph default
+
+\emph on
+requirement
+\emph default
+, don't use DRBD (or other
+\emph on
+synchronous
+\emph default
+ replication implementations) for long-distance and/or Cloud Storage scenarios.
+ The red X is in particular problematic during re-sync, after the network
+ has become healthy again (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Behaviour-of-DRBD"
+
+\end_inset
+
+).
+ MARS has no red X at C because of its
+\series bold
+Anytime Consistency
+\series default
+, which refers to
+\emph on
+local
+\emph default
+ consistency, and which is violated by DRBD during certain important phases
+ of its regular operation.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Another conclusion from the CAP theorem: when A+C is a
+\emph on
+hard requirement
+\emph default
+, and when P can be faithfully assumed as already given by passive crossover
+ cables, then don't use the current version of MARS.
+ Use DRBD instead.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ If you think that you require all three properties C+A+P, but you don't
+ have passive crossover cables over short distances, you are requiring something
+ which is
+\series bold
+impossible
+\series default
+.
+ There exists no solution, with whatever component, or from whatever commercial
+ storage vendor.
+ The CAP theorem is as hard as Einstein's natural laws are.
+ Rethink your complete concept, from end to end.
+ Something is wrong, somewhere.
+ Ignoring this on enterprise-critical use cases can endanger a company and/or
+ your career.
+\end_layout
+
+\begin_layout Subsection
+CAP Commonalities between DRBD and MARS
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:CAP-Commonalities"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In this subsection, we look at the case that P is not for free, but has
+ to be ensured by the Distributed Storage system.
+\end_layout
+
+\begin_layout Standard
+You may have noticed that MARS' ordinary CAP behaviour is similar to DRBD's
+ CAP picture in
+\family typewriter
+disconnected
+\family default
+ state, or during similar states when the replication network is interrupted.
+\end_layout
+
+\begin_layout Standard
+Replication network interruption is also known as
+\begin_inset Quotes eld
+\end_inset
+
+Network Partitioning
+\begin_inset Quotes erd
+\end_inset
+
+.
+ This is where property P = Partitioning Tolerance comes into play.
+\end_layout
+
+\begin_layout Standard
+When a network partition has
+\emph on
+actually occurred
+\emph default
+, both DRBD and MARS allow you to do the same: you may
+\series bold
+forcefully switch
+\series default
+ the
+\family typewriter
+primary
+\family default
+ role, which means activation of a former
+\family typewriter
+secondary
+\family default
+ node.
+ In such a situation, you can issue commands like
+\family typewriter
+ drbdadm primary --force
+\family default
+ or
+\family typewriter
+marsadm primary --force
+\family default
+.
+ It is no accident that both commands are looking similar to each other.
+\end_layout
+
+\begin_layout Standard
+The outcome will be the same: you will most likely get a
+\family typewriter
+\series bold
+SplitBrain
+\family default
+\series default
+ situation.
+\end_layout
+
+\begin_layout Standard
+The possibility of getting a split brain is no specific property of either
+ DRBD or MARS.
+ It will also happen with any other replication system, whether synchronous
+ or asynchronous.
+\end_layout
+
+\begin_layout Standard
+It is one of the consequences from the CAP theorem when (1a) P has to be
+ assured, and (1b) a network partition has
+\emph on
+actually occurred
+\emph default
+, and (2) when A = Availability is enforced at both sides of the network
+ partition.
+ The result is that C = global Consistency is violated, by creation of two
+ or more versions of the data.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Careful: at least for some application classes, it is a bad idea to systematica
+lly create split brain via automatic cluster managers, e.g.
+ Pacemaker or similar.
+ As explained in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Inappropriate-Clustermanger"
+
+\end_inset
+
+, some cluster managers were originally constructed for truly shared disk
+ scenarios, where no split brain can occur by construction.
+ Using them in masses on versioned data in truly distributed systems can
+ result in existential surprises, once a bigger network partition and/or
+ a flaky replication network triggers them in masses, and at some moments
+ where you didn't really want to do what they now are doing automatically,
+ and in masses.
+ Split brain should not be provoked when not
+\emph on
+absolutely
+\emph default
+ necessary.
+\end_layout
+
+\begin_layout Standard
+Split brain resolution is anything but easy in general.
+ When the data is in a generic block device, you typically will have no
+ general means for merging both versions.
+ This means, split brain resolution is typically only possible by
+\series bold
+throwing away
+\series default
+ some of the versions.
+\end_layout
+
+\begin_layout Standard
+This kind of split brain resolution problem is no specific property of DRBD
+ or of MARS.
+ It is a fundamental property of generic block devices.
+\end_layout
+
+\begin_layout Standard
+DRBD and MARS have some commands like
+\family typewriter
+drbdadm invalidate
+\family default
+ or
+\family typewriter
+marsadm invalidate
+\family default
+ for this.
+ Again, the similarity is no accident.
+\end_layout
+
+\begin_layout Standard
+Notice that classical filesystems aren't typically better than raw block
+ devices.
+ There are even more possibilities for tricky types of
+\series bold
+conflicts
+\series default
+ (e.g.
+ on path names in addition to file content).
+\end_layout
+
+\begin_layout Standard
+Similarly, BigCluster object stores are often suffering from similar (or
+ even worse) problems, because higher application layers may have some hidden
+ internal dependencies between object versions, while the object store itself
+ is agnostic of version dependencies in general
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+There exist lots of types of potential dependencies between objects.
+ Temporal ones are easy to capture, but this is not sufficient in general
+ for everything.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ When stacking block devices or filesystems (or something else) on top of
+ some BigCluster object store, the latter will not magically resolve any
+ split brain for you.
+ Check whether your favorite object store implementation has some kind of
+ equivalent of a
+\family typewriter
+primary --force
+\family default
+ command, and some equivalent
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice: BigCluster architectures are typically discriminating between
+ client servers and storage servers.
+ This will typically introduce some more possibilities into the game, such
+ as forced client failover, independently from forced storage failover.
+\end_layout
+
+\end_inset
+
+ of an
+\family typewriter
+invalidate
+\family default
+ command.
+ If it doesn't have one, or only a restricted one, you should be
+\emph on
+alerted
+\emph default
+.
+ In case of a long-lasting storage network partition, you might need suchalike
+
+\emph on
+desperately
+\emph default
+ for ensuring A, even at the cost of C.
+ Check: whether you need this is heavily depending on the
+\series bold
+\emph on
+application class
+\series default
+\emph default
+ (see also the Cloud Storage definition in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Requirements-for-Cloud"
+
+\end_inset
+
+, or look at webhosting, etc).
+ When you
+\emph on
+would
+\emph default
+ need it, but you are
+\series bold
+not prepared for suchalike scenarios at your enterprise-critical data
+\series default
+, it could cost you a lot of money and/or reputation and/or even your existence.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Notice: the
+\emph on
+concept
+\emph default
+ of
+\family typewriter
+SplitBrain
+\family default
+ is occurring almost everywhere in truly Distributed Systems when C can
+ be violated in favour of A+P.
+ It is a very general consequence
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+There exist only few opportunities for generic conflict resolution, even
+ in classical databases where
+\emph on
+some
+\emph default
+ knowledge about the structure of the data is available.
+ Typically, there are some more hidden dependencies.
+ Lossless
+\family typewriter
+SplitBrain
+\family default
+ resolution will thus need to be implemented at application layer, if it
+ is possible at all.
+\end_layout
+
+\end_inset
+
+ of the CAP theorem.
+\end_layout
+
+\begin_layout Standard
+The only reliable way for avoiding split brain in truly distributed systems
+ would be: don't insist on A = Availability.
+ Notice that there exist some application classes, like certain types of
+ banking, where C is typically a higher good than A.
+\end_layout
+
+\begin_layout Standard
+Notice that both DRBD and MARS are supporting this also: just don't add
+ the option
+\family typewriter
+--force
+\family default
+ to the
+\family typewriter
+primary
+\family default
+ switch command.
+\end_layout
+
+\begin_layout Standard
+However: even in banking, some
+\emph on
+extremely extraordinary
+\emph default
+ scenarios might occur, where sacrifice of C in favour of A could be necessary
+ (e.g.
+ when
+\emph on
+manual cleanup
+\emph default
+ of C is cheaper than long-lasting violations of A).
+ Good to know that both DRBD and MARS have some emergency measure for killing
+ C in favour of A!
+\end_layout
+
+\begin_layout Section
+Higher Consistency Guarantees vs Actuality
+\end_layout
+
+\begin_layout Standard
+We already saw in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Network-Bottlenecks"
+
+\end_inset
+
+ that certain types of network bottlenecks can easily (and reproducibly)
+ destroy the consistency of your DRBD secondary, while MARS will preserve
+ local consistency at the cost of actuality (
+\series bold
+anytime consistency
+\series default
+).
+\end_layout
+
+\begin_layout Standard
+Some people, often located at database operations, are obtrusively arguing
+ that actuality is such a high good that it must not be sacrificed under
+ any circumstances.
+\end_layout
+
+\begin_layout Standard
+Anyone arguing this way has at least the following choices (list may be
+ incomplete):
+\end_layout
+
+\begin_layout Enumerate
+None of the above use cases for MARS apply.
+ For instance, short distance replication over crossover cables is sufficient
+ (which occurs very often), or the network is reliable enough such that
+ bottlenecks can never occur (e.g.
+ because the total load is extremely low, or conversely the network is extremely
+ overengineered / expensive), or the occurrence of bottlenecks can
+\emph on
+provably
+\emph default
+ be taken into account.
+ In such cases, DRBD is clearly the better solution than MARS, because it
+ provides better actuality than the current version of MARS, and it uses
+ up less disk resources.
+\end_layout
+
+\begin_layout Enumerate
+In the presence of network bottlenecks, people didn't notice and/or didn't
+ understand and/or did under-estimate the risk of accidental invalidation
+ of their DRBD secondaries.
+ They should carefully check that risk.
+ They should convince themselves that the risk is
+\emph on
+really
+\emph default
+ bearable.
+ Once they are hit by a systematic chain of events which
+\emph on
+reproducibly
+\emph default
+ provoke the bad effect, it is too late
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Some people seem to need a bad experience before they get the difference
+ between risk caused by reproducible effects and inverted luck.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Enumerate
+In the presence of network bottlenecks, people found a solution such that
+ DRBD does not automatically re-connect after the connection has been dropped
+ due to network problems (c.f.
+
+\family typewriter
+ko-count
+\family default
+ parameter).
+ So the risk of inconsistency
+\emph on
+appears
+\emph default
+ to have vanished.
+ In some cases, people did not notice that the risk has
+\emph on
+not completely
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Hint: what's the
+\emph on
+conceptual
+\emph default
+ difference between an automatic and a manual re-connect? Yes, you can try
+ to
+\emph on
+lower
+\emph default
+ the risk in some cases by transferring risks to human analysis and human
+ decisions, but did you take into account the possibility of human errors?
+\end_layout
+
+\end_inset
+
+
+\emph default
+ vanished, and/or they did not notice that now the actuality produced by
+ DRBD is even drastically worse than that of MARS (in the same situation).
+ It is true that DRBD provides better actuality in
+\family typewriter
+connected
+\family default
+ state, but for a full picture the actuality in
+\family typewriter
+disconnected
+\family default
+ state should not be neglected
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Hint: a potential hurdle may be the fact that the current format of
+\family typewriter
+/proc/drbd
+\family default
+ does neither display the timestamp of the first
+\emph on
+relevant
+\emph default
+ network drop nor the total amount of lag-behind user data (which is
+\emph on
+not
+\emph default
+ the same as the number of dirty bits in the bitmap), while
+\family typewriter
+marsadm view
+\family default
+ can display it.
+ So it is difficult to judge the risks.
+ Possibly a chance is inspection of DRBD messages in the syslog, but quantificat
+ion could remain hard.
+\end_layout
+
+\end_inset
+
+.
+ So they didn't notice that their argumentation on the importance of actuality
+ may be fundamentally wrong.
+ A possible way to overcome that may be re-reading section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Behaviour-of-MARS"
+
+\end_inset
+
+ and comparing its outcome with the corresponding outcome of DRBD in the
+ same situation.
+\end_layout
+
+\begin_layout Enumerate
+People are stuck in contradictory requirements because the current version
+ of MARS does not yet support synchronous or pseudo-synchronous operation
+ modes.
+ This should be resolved some day.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+A common misunderstanding is about the actuality guarantees provided by
+ filesystems.
+ The buffer cache / page cache uses by default a
+\series bold
+writeback strategy
+\series default
+ for performance reasons.
+ Even modern journalling filesystems will (by default) provide only consistency
+ guarantees, but no strong actuality guarantee.
+ In case of power loss, some transactions may be even
+\emph on
+rolled back
+\emph default
+ in order to restore consistency.
+ According to POSIX
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The above argumentation also applies to Windows filesystems in analogous
+ way.
+\end_layout
+
+\end_inset
+
+ and other standards, the only
+\emph on
+reliable
+\emph default
+ way to achieve actuality is usage of system calls like
+\family typewriter
+sync()
+\family default
+,
+\family typewriter
+fsync()
+\family default
+,
+\family typewriter
+fdatasync()
+\family default
+, flags like
+\family typewriter
+O_DIRECT
+\family default
+, or similar.
+ For performance reasons, the
+\emph on
+vast majority of applications
+\emph default
+ don't use them at all, or use them only sparingly!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ It makes no sense to require strong actuality guarantees from any block
+ layer replication (whether DRBD or future versions of MARS) while higher
+ layers such as filesystems or even applications are already sacrificing
+ them!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In summary, the
+\series bold
+anytime consistency
+\series default
+ provided by MARS is an argument you should consider, even if you need an
+ extra hard disk for transaction logfiles.
+\end_layout
+
+\begin_layout Chapter
+Quick Start Guide
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Quick-Start-Guide"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This chapter is for impatient but experienced sysadmins who already know
+ DRBD.
+ For more complete information, refer to chapter
+\begin_inset CommandInset ref
+LatexCommand nameref
+reference "chap:The-Sysadmin-Interface"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+Preparation: What you Need
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Preparation:-What-you"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Typically, you will use MARS at servers in a datacenter for replication
+ of big masses of data.
+\end_layout
+
+\begin_layout Standard
+Typically, you will use MARS for replication
+\emph on
+between
+\emph default
+ multiple datacenters, when the distances are greater than
+\begin_inset Formula $\approx50$
+\end_inset
+
+ km.
+ Many other solutions, even from commercial storage vendors, will not work
+ reliably over large distances when your network is not
+\emph on
+extremely
+\emph default
+ reliable, or when you try to push huge masses of data from high-performance
+ applications through a network bottleneck.
+ If you ever encountered suchalike problems (or try to avoid them in advance),
+ MARS is for you.
+\end_layout
+
+\begin_layout Standard
+You can use MARS both at dedicated storage servers (e.g.
+ for serving Windows clients), or at standalone Linux servers where CPU
+ and storage are not separated.
+\end_layout
+
+\begin_layout Standard
+In order to protect your data from low-level disk failures, you should use
+ a hardware RAID controller with BBU.
+ Software RAID is explicitly
+\emph on
+not
+\emph default
+ recommended, because it generally provides worse performance due to the
+ lack of a hardware BBU (for some benchmark comparisons with/out BBU, see
+
+\begin_inset Flex URL
+status collapsed
+
+\begin_layout Plain Layout
+
+https://github.com/schoebel/blkreplay/raw/master/doc/blkreplay.pdf
+\end_layout
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Don't set your hardware BBU cache to
+\begin_inset Quotes eld
+\end_inset
+
+writethrough
+\begin_inset Quotes erd
+\end_inset
+
+ mode.
+ This may lead to tremendous performance degradation.
+ Use the
+\begin_inset Quotes eld
+\end_inset
+
+writeback
+\begin_inset Quotes erd
+\end_inset
+
+ strategy instead.
+ It should be operationally safe, because in case of power loss the BBU
+ cache content will be preserved thanks to the battery, and/or thanks to
+ goldcaps for saving the cache content into some flash chips.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+For better performance, use newer MARS versions from branch
+\family typewriter
+mars0.1a.y
+\family default
+ or later.
+ Check the tips and tricks from sections 
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:IO-Performance-Tuning"
+plural "false"
+caps "false"
+noprefix "false"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Tuning-Network-Performance"
+plural "false"
+caps "false"
+noprefix "false"
+
+\end_inset
+
+.
+ You may also play around with
+\family typewriter
+/proc/sys/mars/aio_sync_mode
+\family default
+ when actuality is less important.
+ Further tuning of
+\family typewriter
+/proc/sys/mars/io_tuning/
+\family default
+ and many more tunables is currently only recommended for experts.
+ Future versions of MARS are planned to provide better performance with
+ software RAID.
+\end_layout
+
+\begin_layout Standard
+Typically, you will need more than one RAID set
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+For low-cost storage, RAID-5 is no longer regarded safe for today's typical
+ storage sizes, because the error rate is regarded too high.
+ Therefore, use RAID-6.
+ If you need more than 15 disks in total, create multiple RAID sets (each
+ having at most 15 disks, better about 12 disks) and stripe them via LVM
+ (or via your hardware RAID controller if it supports RAID-60).
+\end_layout
+
+\end_inset
+
+ for big masses of data.
+ Therefore, use of LVM is also recommended
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+You may also combine MARS with commercial storage boxes connected via Fibrechann
+el or iSCSI, but we do not yet have operational experience at 1&1 with such
+ setups.
+\end_layout
+
+\end_inset
+
+ for your data.
+\end_layout
+
+\begin_layout Standard
+MARS' tolerance of networking problems comes with some cost.
+ You will need some extra space for the transaction logfiles of MARS, residing
+ at the
+\family typewriter
+/mars/
+\family default
+ filesystem.
+\end_layout
+
+\begin_layout Standard
+The exact space requirements for
+\family typewriter
+/mars/
+\family default
+ depend on the
+\emph on
+average write rate
+\emph default
+ of your application, not on the size of your data.
+ We found that only few applications are writing more than 1 TB per day.
+ Most are writing even less than 100 GB per day.
+ Usually, you want to dimension
+\family typewriter
+/mars/
+\family default
+ such that you can survive a network loss lasting 3 days / about one weekend.
+ This can be achieved with current technology rather easily: as a simple
+ rule of thumb, just use one
+\series bold
+dedicated disk
+\series default
+ having a capacity of 4 TB or more.
+ Typically, that will provide you with plenty of headroom even for bigger
+ networking incidents.
+\end_layout
+
+\begin_layout Standard
+Dedicated disks for
+\family typewriter
+/mars/
+\family default
+ have another advantage: their mechanical head movement is completely independen
+t from your data head movements.
+ For best performance, attach that dedicated disk to your hardware RAID
+ controller with BBU, building a separate RAID set (even if it consists
+ only of a single disk – notice that the
+\series bold
+hardware BBU
+\series default
+ is the crucial point).
+\end_layout
+
+\begin_layout Standard
+If you are concerned about reliability, use two disks switched together
+ as a relatively small RAID-1 set.
+ For extremely high performance demands, you may consider (and check) RAID-10.
+\end_layout
+
+\begin_layout Standard
+Since the transaction logfiles are highly sequential in their access pattern,
+ a cheap but high-capacity SATA disk (or nearline-SAS disk) is usually sufficien
+t.
+ At the time of this writing, standard SATA SSDs have shown to be
+\emph on
+not
+\emph default
+ (yet) preferable.
+ Although they offer high random IOPS rate, their sequential throughput
+ is worse, and their long-term stability is questioned by many people at
+ the time of this writing.
+ However, as technology evolves and becomes more mature, this could change
+ in future.
+\end_layout
+
+\begin_layout Standard
+Use
+\family typewriter
+ext4
+\family default
+ for
+\family typewriter
+/mars/
+\family default
+.
+ Avoid
+\family typewriter
+ext3
+\family default
+, and don't use
+\family typewriter
+xfs
+\family default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+It seems that the late internal resource allocation strategy of
+\family typewriter
+xfs
+\family default
+ (or another currently unknown reason) could be the reason for some resource
+ deadlocks which appear only with
+\family typewriter
+xfs
+\family default
+ and only under
+\emph on
+extremely
+\emph default
+ high IO load in combination with high memory pressure.
+\end_layout
+
+\end_inset
+
+ at all.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Notice that the filesystem
+\family typewriter
+/mars/
+\family default
+ has nothing to do with an ordinary filesystem.
+ It is completely reserved for MARS internal purposes, namely as a
+\series bold
+storage container
+\series default
+ for MARS' persistent data.
+ It does not obey any userspace rules like FHS (filesystem hierarchy standard),
+ and it should not be accessed by any userspace tool except the official
+
+\family typewriter
+marsadm
+\family default
+ tool.
+ Its internal data format should be regarded as a 
+\series bold
+blackbox
+\series default
+ by you.
+ The internal data format may change in future, or the complete
+\family typewriter
+/mars/
+\family default
+ filesystem may be even replaced by a totally different container format,
+ while the official
+\family typewriter
+marsadm
+\family default
+ interface is supposed to remain stable.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+That said, you might look into its contents
+\emph on
+by hand
+\emph default
+ for curiosity or for
+\emph on
+debugging purposes
+\emph default
+, and only as root.
+ But don't program any tools / monitoring scripts / etc bypassing the official
+
+\family typewriter
+marsadm
+\family default
+ tool.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Like DRBD, the current version of MARS has
+\series bold
+no security
+\series default
+ built in.
+ MARS assumes that it is running in a
+\series bold
+trusted network
+\series default
+.
+ Anyone who can connect to the MARS ports (default 7777 to 7779) can potentially
+ breach in and become root! Therefore, you
+\series bold
+must
+\series default
+ protect your network by appropriate means, such as firewalling and/or encrypted
+ VPN.
+\end_layout
+
+\begin_layout Standard
+Currently, MARS provides no shared secret like DRBD, because a simple shared
+ secret is way too weak to provide any real security (potentially misleading
+ people about the real level of security).
+ Future versions of MARS should provide at least 2-factor authentication,
+ and encryption via dynamic session keys.
+ Until that is implemented, use a secured VPN instead! And don't forget
+ to
+\emph on
+audit
+\emph default
+ it for security holes!
+\end_layout
+
+\begin_layout Section
+Setup Primary and Secondary Cluster Nodes
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Setup-Primary-and"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+If you already use DRBD, you may migrate to MARS (or even back from MARS
+ to DRBD) if you use
+\emph on
+external
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\emph on
+Internal
+\emph default
+ DRBD metadata should also work as long as the filesystem inside your block
+ device / disk already exists and is not re-created.
+ The latter would destroy the DRBD metadata, but even that will not hurt
+ you really: you can always switch back to DRBD using
+\emph on
+external
+\emph default
+ metadata, as long as you have some small spare space somewhere.
+\end_layout
+
+\end_inset
+
+
+\emph default
+ DRBD metadata (which is not touched by MARS).
+
+\end_layout
+
+\begin_layout Subsection
+Kernel and MARS Module
+\end_layout
+
+\begin_layout Standard
+The MARS kernel module should be available or can be built via one of the
+ following methods:
+\end_layout
+
+\begin_layout Enumerate
+As an external Debian or rpm kernel module, as provided by a package contributor
+ (or hopefully by standard distros in the future).
+\end_layout
+
+\begin_layout Enumerate
+As a separate kernel module, only for experienced
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+You should be familiar with the problems arising from orthogonal combination
+ of different kernel versions with different MARS module versions and with
+ different
+\family typewriter
+marsadm
+\family default
+ userspace tool versions at the package management level.
+ Hint:
+\family typewriter
+modinfo
+\family default
+ is your friend.
+\end_layout
+
+\end_inset
+
+ sysadmins: see file
+\family typewriter
+Makefile.dist
+\family default
+ (tested with some older versions of Debian; may need some extra work with
+ other distros).
+\end_layout
+
+\begin_layout Enumerate
+Build for senior sysadmins or developers, inplace in the kernel source tree:
+ first apply
+\family typewriter
+0001-mars-minimum-pre-patch-for-mars.patch
+\family default
+ and
+\family typewriter
+0001-mars-SPECIAL-for-in-tree-build.patch
+\family default
+ or similar, then
+\family typewriter
+cd block/ && git clone --recurse-submodules https://github.com/schoebel/mars
+\family default
+.
+ Then
+\family typewriter
+cd ..
+
+\family default
+ and build your kernel as usual.
+ Config options for MARS should appear under
+\begin_inset Quotes eld
+\end_inset
+
+Enable the block layer
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Just activate MARS as a
+\series bold
+kernel module
+\series default
+ via
+\begin_inset Quotes eld
+\end_inset
+
+m
+\begin_inset Quotes erd
+\end_inset
+
+ (don't try a fixed compile-in), and leave all other MARS config options
+ at the default (unless you know what you are doing).
+\end_layout
+
+\begin_layout Standard
+Further / more accurate / latest instructions can be found in
+\family typewriter
+README
+\family default
+ and in
+\family typewriter
+INSTALL
+\family default
+.
+ You must not only install the kernel and the
+\family typewriter
+mars.ko
+\family default
+ kernel module to all of your cluster nodes, but also the
+\family typewriter
+marsadm
+\family default
+ userspace tool.
+\end_layout
+
+\begin_layout Standard
+Starting with
+\family typewriter
+mars0.1stable38
+\family default
+ and other branches having merged this feature, a prepatch for vanilla kernels
+ 3.2 through 4.4 is no longer needed.
+ However,
+\series bold
+IO performance
+\series default
+ is currently somewhat worse when the pre-patch is not applied.
+ This will be addressed in a later release.
+\end_layout
+
+\begin_layout Standard
+Therefore, application of the pre-patch to the kernel is
+\emph on
+recommended
+\emph default
+ for large-scale production systems for now.
+\end_layout
+
+\begin_layout Standard
+Kernel pre-patches can be found in the
+\family typewriter
+pre-patches/
+\family default
+ subdirectory of the MARS source tree.
+ Following are the types of pre-patches:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+0001-mars-minimum-pre-patch-for-mars.patch
+\family default
+ or similar.
+ Please prefer this one (when present for your kernel version) in front
+ of
+\family typewriter
+0001-mars-generic-pre-patch-for-mars.patch
+\family default
+ or similar.
+ The latter should not be used anymore, except for testing or as an emergency
+ fallback.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+0001-mars-SPECIAL-for-in-tree-build.patch
+\family default
+ or similar.
+ This is
+\emph on
+only
+\emph default
+ needed when building the MARS kernel module together with all other kernel
+ modules in a single
+\family typewriter
+make
+\family default
+ pass.
+ For separate external module builds, this patch
+\emph on
+must not
+\emph default
+ be applied (but the pre-patch
+\emph on
+should
+\emph default
+ when possible).
+ When using this patch, please apply the aforementioned pre-patch also,
+ because your kernel is patched anyway.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Starting from version
+\family typewriter
+mars0.1stable56
+\family default
+ or
+\family typewriter
+mars0.1abeta8
+\family default
+,
+\series bold
+submodules
+\series default
+ have been added to the github repo of MARS.
+ If you have an old checkout, please say
+\family typewriter
+git pull --recurse-submodules=yes
+\family default
+ or similar.
+ Otherwise you may be missing an important future part of the MARS release,
+ without notice (depending on your local
+\family typewriter
+git
+\family default
+ version and its local configuration).
+\end_layout
+
+\begin_layout Subsection
+Setup your Cluster Nodes
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Setup-your-Cluster"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+For your cluster, you need at least two nodes.
+ In the following, they will be called A and B.
+ In the beginning, A will have the
+\family typewriter
+primary
+\family default
+ role, while B will be your initial
+\family typewriter
+secondary
+\family default
+.
+ The roles may change later.
+\end_layout
+
+\begin_layout Enumerate
+You must be
+\family typewriter
+root
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+On each of A and B, create the
+\family typewriter
+/mars/
+\family default
+ mountpoint.
+\end_layout
+
+\begin_layout Enumerate
+On each node, create an
+\family typewriter
+ext4
+\family default
+ filesystem on your separate disk / RAID set via
+\family typewriter
+mkfs.ext4
+\family default
+ (for requirements on size etc see section
+\begin_inset CommandInset ref
+LatexCommand nameref
+reference "sec:Preparation:-What-you"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+On each node, mount that filesystem to
+\family typewriter
+/mars/
+\family default
+.
+ It is advisable to add an entry to
+\family typewriter
+/etc/fstab
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+For security reasons, execute
+\family typewriter
+chmod 0700 /mars
+\family default
+ everywhere after 
+\family typewriter
+/mars/
+\family default
+ has been mounted.
+ If you forget this step, any following
+\family typewriter
+marsadm
+\family default
+ command will drop you a warning, but will fix the problem for you.
+\end_layout
+
+\begin_layout Enumerate
+On node A, say
+\family typewriter
+marsadm create-cluster
+\family default
+.
+\begin_inset Newline newline
+\end_inset
+
+This must be done
+\emph on
+exactly once
+\emph default
+, on exactly one node of your cluster.
+ Never do this twice or on different nodes, because that would create two
+ different clusters which would have nothing to do with each other.
+ The
+\family typewriter
+marsadm
+\family default
+ tool protects you against accidentally joining / merging two different
+ clusters.
+ If you accidentally created two different clusters, just umount that
+\family typewriter
+/mars/
+\family default
+ partition and start over with step 3 at that node.
+\end_layout
+
+\begin_layout Enumerate
+On node B, you must have a working
+\family typewriter
+ssh
+\family default
+ connection to node A (as
+\family typewriter
+root
+\family default
+).
+ Test it by saying
+\family typewriter
+ssh A w
+\family default
+ on node B.
+ It should work without entering a password (otherwise, use
+\family typewriter
+ssh-agent
+\family default
+ to achieve that).
+ In addition,
+\family typewriter
+rsync
+\family default
+ must be installed.
+\end_layout
+
+\begin_layout Enumerate
+On node B, say
+\family typewriter
+marsadm join-cluster A
+\end_layout
+
+\begin_layout Enumerate
+Only
+\emph on
+after
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In fact, you may already
+\family typewriter
+modprobe mars
+\family default
+ at node A after the
+\family typewriter
+marsadm create-cluster
+\family default
+.
+ Just don't do any of the
+\family typewriter
+*-cluster
+\family default
+ operations when the kernel module is loaded.
+ All other operations should have no such restriction.
+\end_layout
+
+\end_inset
+
+
+\emph default
+ that, do
+\family typewriter
+modprobe mars
+\family default
+ on each node.
+\end_layout
+
+\begin_layout Section
+Creating and Maintaining Resources
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Creating-and-Maintaining"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In the following example session, a block device
+\family typewriter
+/dev/lv-x/mydata
+\family default
+ (shortly called
+\emph on
+disk
+\emph default
+) must already exist on both nodes A and B, respectively, having the same
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Actually, the disk at the initially secondary side may be larger than that
+ at the initially primary side.
+ This will waste space and is therefore not recommended.
+\end_layout
+
+\end_inset
+
+ size.
+ For the sake of simplicity, the disk (underlying block device) as well
+ as its later logical resource name as well as its later virtual device
+ name will all be named uniformly by the same suffix
+\family typewriter
+mydata
+\family default
+.
+ In general, you might name each of them differently, but that is not recommende
+d since it may easily lead to confusion in larger installations.
+\end_layout
+
+\begin_layout Standard
+You may have already some data inside your disk
+\family typewriter
+/dev/lv-x/mydata
+\family default
+ at the initially primary side A.
+ Before using it for MARS, it must be unused for any other purpose (such
+ as being mounted, or used by DRBD, etc).
+ MARS will require
+\series bold
+exclusive access
+\series default
+ to it.
+\end_layout
+
+\begin_layout Enumerate
+On node A, say
+\family typewriter
+marsadm create-resource mydata /dev/lv-x/mydata
+\family default
+.
+\begin_inset Newline newline
+\end_inset
+
+As a result, a directory
+\family typewriter
+/mars/resource-mydata/
+\family default
+ will be created on node A, containing some symlinks.
+ Node A will automatically start in the primary role for this resource.
+ Therefore, a new pseudo-device
+\family typewriter
+/dev/mars/mydata
+\family default
+ will also appear after a few seconds.
+\begin_inset Newline newline
+\end_inset
+
+Note that the initial contents of
+\family typewriter
+/dev/mars/mydata
+\family default
+ will be exactly the same as in your pre-existing disk
+\family typewriter
+/dev/lv-x/mydata
+\family default
+.
+\begin_inset Newline newline
+\end_inset
+
+If you like, you may already use
+\family typewriter
+/dev/mars/mydata
+\family default
+ for mounting your already pre-existing data, or for creating a fresh filesystem
+, or for exporting via iSCSI, and so on.
+ You may even do so before any other cluster node has joined the resource
+ (so-called
+\begin_inset Quotes eld
+\end_inset
+
+standalone mode
+\begin_inset Quotes erd
+\end_inset
+
+).
+ But you can also do so later after setup of (one or many) secondaries.
+\end_layout
+
+\begin_layout Enumerate
+Wait a few seconds until the directory
+\family typewriter
+/mars/resource-mydata/
+\family default
+ and its symlink contents also appear on cluster node B.
+ The command
+\family typewriter
+marsadm wait-cluster
+\family default
+ may be helpful.
+\end_layout
+
+\begin_layout Enumerate
+On node B, say
+\family typewriter
+marsadm join-resource mydata /dev/lv-x/mydata
+\family default
+.
+\begin_inset Newline newline
+\end_inset
+
+As a result, the initial full-sync from node A to node B should start automatica
+lly.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Of course, your old contents of your disk
+\family typewriter
+/dev/lv-x/mydata
+\family default
+ at side B (and
+\emph on
+only
+\emph default
+ there!) is overwritten by the version from side A.
+ Since you are an experienced sysadmin, you knew that, and it was just the
+ effect you deliberately wanted to achieve.
+ If you didn't check that your old contents didn't contain any valuable
+ data (or if you accidentally provided a wrong disk device argument), it
+ is too late now.
+ The
+\family typewriter
+marsadm
+\family default
+ command checks that the disk device argument is really a block device,
+ and that exclusive access to it is possible (as well as some further safety
+ checks, e.g.
+ matching sizes).
+ However, MARS cannot know the
+\emph on
+purpose
+\emph default
+ of your generic block device.
+ MARS (as well as DRBD) is completely ignorant of the
+\emph on
+contents
+\emph default
+ of a generic block device; it does not interpret it in any way.
+ Therefore, you may use MARS (as well as DRBD) for mirroring Windows filesystems
+, or raw devices from databases, or virtual machines, or whatever.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Check that state
+\family typewriter
+Orphan
+\family default
+ is left after a while on B.
+ Notice that
+\family typewriter
+join-resource
+\family default
+ is only
+\emph on
+starting
+\emph default
+ a new replica, but does not wait for its completion.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Hint: by default, MARS uses the so-called
+\begin_inset Quotes eld
+\end_inset
+
+fast fullsync
+\begin_inset Quotes erd
+\end_inset
+
+ algorithm.
+ It works similarly to
+\family typewriter
+rsync
+\family default
+, first reading the data on both sides and computing an md5 checksum for
+ each block.
+ Heavy-weight data is only transferred over the long-distance network upon
+ checksum mismatch.
+ This is extremely fast if your data is already (almost) identical on both
+ sides.
+ Conversely, if you know in advance that your initial data is completely
+ different on both sides, you may choose to switch off the fast fullsync
+ algorithm via
+\family typewriter
+echo 0 > /proc/sys/mars/do_fast_fullsync
+\family default
+ in order to save the additional IO overhead and network latencies introduced
+ by the separate checksum comparison steps.
+\end_layout
+
+\begin_layout Enumerate
+Optionally, only for experienced sysadmins who
+\emph on
+really
+\emph default
+ know what they are doing: if you will create a
+\emph on
+new
+\emph default
+ filesystem on
+\family typewriter
+/dev/mars/mydata
+\family default
+
+\emph on
+after(!)
+\emph default
+ having created the MARS resource as well as
+\emph on
+after
+\emph default
+ having already joined it on every replica, you may abandon the fast fullsync
+ phase
+\emph on
+before
+\emph default
+ creating the fresh filesystem, because the old content of
+\family typewriter
+/dev/mars/mydata
+\family default
+ will then be just garbage not used by the freshly created filesystem
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+It is
+\emph on
+vital
+\emph default
+ that the transaction logfile contents created by
+\family typewriter
+mkfs
+\family default
+ is
+\emph on
+fully
+\emph default
+ propagated to the secondaries and then replayed there.
+\end_layout
+
+\begin_layout Plain Layout
+Analogously, another exception is also possible, but at your own risk (be
+ careful, really!): when migrating your data from DRBD to MARS, and you
+ have ensured that (1) at the end of using DRBD both your replicas were
+ really equal (you should have checked that), and (2) before and after setting
+ up any side of MARS (
+\family typewriter
+create-resource
+\family default
+ as well as
+\family typewriter
+join-resource
+\family default
+) nothing has been written at all to it (i.e.
+ no usage, neither of
+\family typewriter
+/dev/lv-x/mydata
+\family default
+ nor of
+\family typewriter
+/dev/mars/mydata
+\family default
+ has occurred in any way), the first transaction logfile
+\family typewriter
+/mars/resource-mydata/log-000000001-$primary
+\family default
+ created by MARS will be empty.
+ Check whether this is really true! Then, and only then, you may also issue
+ a
+\family typewriter
+fake-sync
+\family default
+.
+\end_layout
+
+\end_inset
+
+.
+ Then, and only then, you may say
+\family typewriter
+marsadm fake-sync mydata
+\family default
+ in order to abort the sync operation.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Never do a
+\family typewriter
+fake-sync
+\family default
+ unless you are
+\series bold
+absolutely sure
+\series default
+ that you really don't need to sync the data! Otherwise, you are
+\emph on
+guaranteed
+\emph default
+ to have produced harmful inconsistencies.
+ If you accidentally issued
+\family typewriter
+fake-sync
+\family default
+, you may start over the fast full sync at your secondary side by saying
+
+\family typewriter
+marsadm invalidate mydata
+\family default
+ (analogously to the corresponding DRBD command).
+\end_layout
+
+\begin_layout Section
+Keeping Resources Operational
+\end_layout
+
+\begin_layout Subsection
+Logfile Rotation / Deletion
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Logfile-Rotation"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+As explained in section
+\begin_inset CommandInset ref
+LatexCommand nameref
+reference "sec:The-Transaction-Logger"
+
+\end_inset
+
+, all changes to your resource data are recorded in transaction logfiles
+ residing on the
+\family typewriter
+/mars/
+\family default
+ filesystem.
+ These files are always growing over time.
+ In order to avoid filesystem overflow, the following must be done in regular
+ time intervals:
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm log-rotate all
+\family default
+
+\begin_inset Newline newline
+\end_inset
+
+This starts appending to a new logfile on all of your resources.
+ The logfiles are automatically numbered by an increasing 9-digit logfile
+ number.
+ This will suffice for many centuries even if you would logrotate once a
+ minute.
+ Practical frequencies for logfile rotation are more like once an hour,
+ or every 10 minutes when having highly-loaded storage servers.
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm log-delete-all all
+\family default
+
+\begin_inset Newline newline
+\end_inset
+
+This determines all logfiles from all resources which are no longer needed
+ (i.e.
+ which are
+\emph on
+fully
+\emph default
+ replayed, on
+\emph on
+all
+\emph default
+ relevant secondaries).
+ All superfluous logfiles are then deleted, including all copies on all
+ secondaries.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ The current version of MARS deletes either
+\emph on
+all
+\emph default
+ replicas of a logfile everywhere, or
+\emph on
+none
+\emph default
+ of the replicas.
+ This is a simple rule, but has the drawback that one node may hinder other
+ nodes from freeing space in
+\family typewriter
+/mars/
+\family default
+.
+ In particular, the command
+\family typewriter
+marsadm pause-replay $res
+\family default
+ (as well as
+\family typewriter
+marsadm disconnect $res
+\family default
+) will freeze the space reclamation in the whole cluster when the pause
+ is lasting very long.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ During such space accumulation, also the number of so-called deletions
+ will accumulate in /mars/todo-global/ and sibling directories.
+ In very big installations consisting of thousands of nodes, it is a good
+ idea to regularly monitor the number of deletions similarly to the following:
+
+\family typewriter
+$(find /mars/ -name
+\begin_inset Quotes eld
+\end_inset
+
+delete-*
+\begin_inset Quotes erd
+\end_inset
+
+ | wc -l)
+\family default
+ should not exceed a limit of ~150 entries.
+\end_layout
+
+\begin_layout Standard
+Please prefer the short form
+\family typewriter
+marsadm cron
+\family default
+ as an equivalent to scripting two separate commands
+\family typewriter
+marsadm log-rotate all
+\family default
+ and
+\family typewriter
+marsadm log-delete-all all
+\family default
+.
+ The short form is not only easier to remember, but also future-proof in
+ case some new MARS features should be implemented in future.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Best practice is to run
+\family typewriter
+marsadm cron
+\family default
+ in a
+\family typewriter
+cron
+\family default
+ job, such as
+\family typewriter
+/etc/cron.d/mars
+\family default
+.
+ An example cronjob can be found in the
+\family typewriter
+userspace/cron.d/
+\family default
+ subdirectory of the git repo.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In addition, you should establish some regular monitoring of the free space
+ present in the
+\family typewriter
+/mars/
+\family default
+ filesystem.
+\end_layout
+
+\begin_layout Standard
+More detailed information about avoidance of
+\family typewriter
+/mars/
+\family default
+ overflow is in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Defending-Overflow"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Switch Primary / Secondary Roles
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/switching.fig
+ width 90col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+In contrast to DRBD, MARS distinguishes between
+\emph on
+intended
+\emph default
+ and
+\emph on
+forced
+\emph default
+ switching.
+ This distinction is necessary due to differences in the communication architect
+ure (asynchronous communication vs synchronous communication, see sections
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Lamport-Clock"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Symlink-Tree"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+Asynchronous communication means that (in worst case) a message may take
+ (almost) arbitrary time in a distorted network to propagate to another
+ node.
+ As a consequence, the risk for accidentally creating an (unintended) split
+ brain is increased (compared to a synchronous system like DRBD).
+\end_layout
+
+\begin_layout Standard
+In order to minimize this risk, MARS has invested a lot of effort into an
+ internal handover protocol when you start an
+\emph on
+intended
+\emph default
+ primary switch.
+\end_layout
+
+\begin_layout Subsubsection
+Intended Switching / Planned Handover
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Intended-Switching"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Before starting a planned handover from your old primary
+\family typewriter
+A
+\family default
+ to a new primary
+\family typewriter
+B
+\family default
+, you should check the replication of the resource.
+ As a human, use
+\family typewriter
+marsadm view mydata
+\family default
+.
+ For scripting, use the macros from section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Predefined-Trivial-Macros"
+
+\end_inset
+
+ (see also section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Scripting-HOWTO"
+
+\end_inset
+
+; an example can be found in
+\begin_inset Flex URL
+status collapsed
+
+\begin_layout Plain Layout
+
+contrib/example-scripts/check-mars-switchable.sh
+\end_layout
+
+\end_inset
+
+).
+ The network should be OK, and the amount of replication delay should be
+ as low as possible.
+ Otherwise, handover may take a very long time.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Best practice is to
+\series bold
+prepare a planned handover
+\series default
+ by the following steps:
+\end_layout
+
+\begin_layout Enumerate
+Check the network and the replication lag.
+ It should be low (a few hundred megabytes, or a low number of gigabytes
+ - see also the rough time forecast shown by
+\family typewriter
+marsadm view mydata
+\family default
+ when there is a larger replication delay, or directly access the forecast
+ by
+\family typewriter
+marsadm view-replinfo
+\family default
+).
+\end_layout
+
+\begin_layout Enumerate
+Only when the
+\family typewriter
+systemd
+\family default
+ method from section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+ is
+\emph on
+not
+\emph default
+ used: stop your application, then umount
+\family typewriter
+/dev/mars/mydata
+\family default
+ on host
+\family typewriter
+A
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+Optionally: when the
+\family typewriter
+systemd
+\family default
+ method from section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+ is
+\emph on
+not
+\emph default
+ used, and when scripting something else, or when typing extremely fast
+ by hand, or for better safety: say
+\family typewriter
+marsadm wait-umount mydata
+\family default
+ on host
+\family typewriter
+B
+\family default
+.
+ When your network is OK, the propagation of the device usage state
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that the usage check for
+\family typewriter
+/dev/mars/mydata
+\family default
+ on host
+\family typewriter
+B
+\family default
+ is based on the
+\emph on
+open count
+\emph default
+ transferred from
+\emph on
+another
+\emph default
+ node
+\family typewriter
+A
+\family default
+.
+ Since MARS is operating asynchronously (in contrast to DRBD), it may take
+ some time until our node
+\family typewriter
+B
+\family default
+ knows that the device is no longer used at
+\family typewriter
+A
+\family default
+.
+ This can lead to a race condition if you automate an intended takeover
+ with a script like
+\family typewriter
+ssh root@A
+\begin_inset Quotes eld
+\end_inset
+
+umount /dev/mars/mydata
+\begin_inset Quotes erd
+\end_inset
+
+; ssh root@B
+\begin_inset Quotes eld
+\end_inset
+
+marsadm primary mydata
+\begin_inset Quotes erd
+\end_inset
+
+
+\family default
+ because your second ssh command may be faster than the internal MARS symlink
+ tree propagation (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Symlink-Tree"
+
+\end_inset
+
+).
+ In order to prevent such races, you are strongly advised to use the command
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm wait-umount mydata
+\end_layout
+
+\begin_layout Plain Layout
+on node
+\family typewriter
+B
+\family default
+ before trying to become primary.
+ See also section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Scripting-HOWTO"
+
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+ should take only a few seconds.
+ Otherwise, check for any network problems or any other problems.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+This step is not really necessary, because
+\family typewriter
+marsadm primary
+\family default
+ will also wait for the
+\family typewriter
+umount
+\family default
+ before it will proceed.
+ However, scripting this intermediate step gives you some more options:
+ if the
+\family typewriter
+umount
+\family default
+ takes too long, you may program a different action, like re-starting at
+ the old primary, or its contrary, some forced umount, or even continuing
+ with a forceful failover instead (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Forced-Switching"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+Optionally, and when the
+\family typewriter
+systemd
+\family default
+ method from section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+ is
+\emph on
+not
+\emph default
+ used: on host
+\family typewriter
+B
+\family default
+, wait until
+\family typewriter
+marsadm view mydata
+\family default
+ (or
+\family typewriter
+view-diskstate
+\family default
+) shows
+\family typewriter
+UpToDate
+\family default
+.
+ It is possible to omit this step, but then you have no control over the duration
+ of the handover, and in case of any transfer problems, disk space problems,
+ etc you potentially risk producing a split brain (although
+\family typewriter
+marsadm
+\family default
+ will do its best to avoid it).
+ Doing the wait by yourself,
+\emph on
+before
+\emph default
+ starting
+\family typewriter
+marsadm primary
+\family default
+, has a big advantage: you can abort the handover cycle at any time, just
+ by re-mounting the device
+\family typewriter
+/dev/mars/mydata
+\family default
+ at the old primary
+\family typewriter
+A
+\family default
+ again, and by re-starting your application.
+ Once you have started
+\family typewriter
+marsadm primary
+\family default
+ on host
+\family typewriter
+B
+\family default
+, you might have to switch back, or possibly even via
+\family typewriter
+primary --force
+\family default
+ (see sections
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Forced-Switching"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+Switching the roles is very similar to DRBD: just issue the command
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm primary mydata
+\end_layout
+
+\begin_layout Standard
+on your formerly secondary node
+\family typewriter
+B
+\family default
+.
+ In combination with a properly set-up
+\family typewriter
+systemd
+\family default
+ method (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+), this will even automatically start your application at the new site.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+The most important difference to DRBD: don't use an intermediate
+\family typewriter
+marsadm secondary mydata
+\family default
+ anywhere.
+ Although it would be possible, it has some
+\emph on
+disadvantages
+\emph default
+.
+ Always switch
+\emph on
+directly
+\emph default
+!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In contrast to DRBD, MARS remembers the designated primary, even when your
+ system crashes and reboots.
+ While in case of a crash you have to re-setup DRBD with commands like
+\family typewriter
+drbdadm up
+\begin_inset Formula $\ldots$
+\end_inset
+
+; drbdadm primary
+\begin_inset Formula $\ldots$
+\end_inset
+
+
+\family default
+, MARS will automatically resume its former roles just by saying
+\family typewriter
+modprobe mars
+\family default
+.
+ In combination with a properly set-up
+\family typewriter
+systemd
+\family default
+ method (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+), this will even automatically re-start your application.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Another fundamental difference to DRBD: when the network is healthy, there
+ can only exist
+\emph on
+one
+\emph default
+ designated primary at a time (modulo some communication delays caused by
+ the
+\begin_inset Quotes eld
+\end_inset
+
+eventually consistent
+\begin_inset Quotes erd
+\end_inset
+
+ communication model, see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Lamport-Clock"
+
+\end_inset
+
+).
+ By saying
+\family typewriter
+marsadm primary mydata
+\family default
+ on host
+\family typewriter
+B
+\family default
+,
+\series bold
+all other
+\series default
+ hosts (including
+\family typewriter
+A
+\family default
+) will
+\series bold
+automatically go into secondary role
+\series default
+ after a while!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+You simply
+\emph on
+don't need
+\emph default
+ an intermediate
+\family typewriter
+marsadm secondary mydata
+\family default
+ for planned handover!
+\end_layout
+
+\begin_layout Standard
+Precondition for a plain
+\family typewriter
+marsadm primary
+\family default
+ (without
+\family typewriter
+systemd
+\family default
+) is that you are up, that means in attached and connected state (cf.
+
+\family typewriter
+marsadm up
+\family default
+), that you are no sync target anymore, and (only when
+\family typewriter
+systemd
+\family default
+ isn't configured to automatically stop the application at the old site)
+ that any old primary (in this case
+\family typewriter
+A
+\family default
+) does not use its
+\family typewriter
+/dev/mars/mydata
+\family default
+ device any longer, and that the network is healthy.
+ If some (parts of) logfiles are not yet (fully) transferred to the new
+ primary, you will need enough space on
+\family typewriter
+/mars/
+\family default
+ at the target side.
+ If one of the preconditions described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Operation-of-the"
+
+\end_inset
+
+ is violated,
+\family typewriter
+marsadm primary
+\family default
+ may refuse to start.
+\end_layout
+
+\begin_layout Standard
+These preconditions try to protect you from doing silly things, such as
+ accidentally provoking a split brain error state.
+ We try to avoid split brain as best as we can.
+ Therefore, we distinguish between
+\emph on
+intended
+\emph default
+ and
+\emph on
+emergency
+\emph default
+ switching.
+ Intended switching will try to avoid split brain
+\emph on
+as best as it can
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Don't
+\emph on
+rely
+\emph default
+ on split brain avoidance, in particular when scripting any higher-level
+ applications such as cluster managers (cf.
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Scripting-HOWTO"
+
+\end_inset
+
+).
+
+\family typewriter
+marsadm
+\family default
+ does its best, but at least in case of (unnoticed) network outages / partitions
+ (or
+\emph on
+extremely, really extremely
+\emph default
+ slow / overloaded networks), an attempt to become
+\family typewriter
+UpToDate
+\family default
+ may fail.
+ If you want to
+\emph on
+ensure
+\emph default
+ that no split brain can result from intended primary switching, please
+ obey the best practices from above, and please give the
+\family typewriter
+primary
+\family default
+ command only after your secondary is
+\emph on
+known
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+As noted in many places in this manual, checking this cannot be done by
+ looking at the local state of a single cluster node.
+ You have to check several nodes.
+
+\family typewriter
+marsadm
+\family default
+ can only check the
+\emph on
+local
+\emph default
+ node reliably!
+\end_layout
+
+\end_inset
+
+
+\emph default
+ to be
+\emph on
+really
+\emph default
+
+\family typewriter
+UpToDate
+\family default
+ (see
+\family typewriter
+marsadm wait-cluster
+\family default
+ and
+\family typewriter
+marsadm view
+\family default
+ and other macros described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Inspecting-the-State"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ A
+\emph on
+very rough
+\emph default
+ estimation of the time to become
+\family typewriter
+UpToDate
+\family default
+ is displayed by
+\family typewriter
+marsadm view mydata
+\family default
+ or other macros (e.g.
+
+\family typewriter
+view-replinfo
+\family default
+).
+ However, on very flaky networks, the estimation may not only flicker much,
+ but also be inaccurate.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Planned handover is refused
+\emph on
+by default
+\emph default
+ when some sync is running somewhere.
+ By adding the option
+\family typewriter
+--ignore-sync
+\family default
+, you are no longer protected by this
+\emph on
+safety measure
+\emph default
+, and you are willing to accept that any already running syncs will restart
+ from point 0, in order to ensure consistency.
+\end_layout
+
+\begin_layout Subsubsection
+Forced Switching
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Forced-Switching"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In case the connection to the old primary is lost for whatever reason, we
+ just don't know anything about its
+\emph on
+current
+\emph default
+ state (which may deviate from its
+\emph on
+last known
+\emph default
+ state).
+ The following command sequence will skip many checks (essentially you just
+ need to be attached and you must not be a current sync target) and tell
+ your node to become primary forcefully:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm pause-fetch mydata
+\family default
+
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ notice that this is similar to
+\family typewriter
+drbdadm disconnect mydata
+\family default
+ as you are probably used to from DRBD.
+ For better compatibility with DRBD, you may use the alternate syntax
+\family typewriter
+marsadm disconnect mydata
+\family default
+ instead.
+ However, there is a subtle difference to DRBD: DRBD will drop
+\emph on
+both
+\emph default
+ sides of its single bi-directional connection and no longer try to re-connect
+ from any of both sides, while
+\family typewriter
+pause-fetch
+\family default
+ is equivalent to
+\family typewriter
+pause-fetch-local
+\family default
+, which instructs only the
+\emph on
+local
+\emph default
+ host to stop fetching logfiles.
+ Other members of the cluster, including the former primary, are
+\emph on
+not
+\emph default
+ instructed to do so.
+ They may continue fetching logfiles over their own private TCP connections,
+ potentially using many connections in parallel, and potentially even from
+ any
+\emph on
+other
+\emph default
+ member of the resource, if they think they can get the data from there.
+ In order to instruct
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that not all such instructions may arrive at all sites when the network
+ is interrupted (or extremely slow).
+\end_layout
+
+\end_inset
+
+
+\emph on
+all
+\emph default
+ members of the resource to stop fetching logfiles, you may use
+\family typewriter
+marsadm pause-fetch-global mydata
+\family default
+ instead (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Operation-of-the"
+
+\end_inset
+
+).
+\end_layout
+
+\end_deeper
+\begin_layout Itemize
+
+\family typewriter
+marsadm primary mydata --force
+\family default
+
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ this is the forceful failover.
+ Depending on the current replication lag, you may lose some data.
+ Use
+\family typewriter
+--force
+\family default
+ only if you know what you are doing!
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ When
+\family typewriter
+systemd
+\family default
+ is configured properly (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+), your application will start automatically at the new primary site.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ when the network is interrupted, the old primary site cannot know this,
+ and will continue running.
+ Once the metadata exchange is working again (by default on port 7777),
+ the old site will be automatically shut down by its local
+\family typewriter
+systemd
+\family default
+ configuration, when configured properly (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+).
+ In contrast to the
+\emph on
+planned
+\emph default
+ handover from section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Intended-Switching"
+
+\end_inset
+
+, this may happen much later.
+ In case of long-lasting network outages, even days or weeks!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Running both sites in parallel for a long time may seriously damage your
+ business.
+ Ensure that any
+\series bold
+customer traffic
+\series default
+ cannot go to the old site! Be sure to configure your BGP in a proper way,
+ such that
+\emph on
+only
+\emph default
+, and
+\emph on
+only
+\emph default
+ the new site will receive any customer traffic from both inside and outside
+ networks, like the internet.
+\end_layout
+
+\end_deeper
+\begin_layout Itemize
+
+\family typewriter
+marsadm resume-fetch mydata
+\family default
+
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Standard
+As such, the new primary does not really need this, because primaries are
+ producing their own logfiles without need for fetching.
+ This is only to undo the previous
+\family typewriter
+pause-fetch
+\family default
+, in order to avoid future surprises when the new primary will at some point
+ change to secondary mode again (in the far-distant future), and you have
+ forgotten the fact that fetching had been switched off.
+
+\end_layout
+
+\end_deeper
+\begin_layout Standard
+When using
+\family typewriter
+--force
+\family default
+, many precondition checks and other internal checks are skipped, and in
+ particular the internal handover protocol for split brain avoidance.
+\end_layout
+
+\begin_layout Standard
+Therefore, use of
+\family typewriter
+--force
+\family default
+ is
+\emph on
+likely
+\emph default
+ to
+\series bold
+provoke a split brain
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\series bold
+Split brain
+\series default
+ is always an
+\series bold
+erroneous state
+\series default
+ which should be never entered deliberately! Once you have entered it accidental
+ly, you
+\series bold
+must
+\series default
+ resolve it ASAP (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+), otherwise you cannot operate your resource in the long term.
+\end_layout
+
+\begin_layout Standard
+In order to impede you from giving an accidental
+\family typewriter
+--force
+\family default
+, the precondition is different:
+\family typewriter
+--force
+\family default
+ works only in
+\emph on
+locally disconnected
+\emph default
+ state.
+ This is similar to DRBD.
+\end_layout
+
+\begin_layout Standard
+Remember:
+\family typewriter
+marsadm primary
+\family default
+ without
+\family typewriter
+--force
+\family default
+ tries to prevent split brain as best as it can.
+ Use of the
+\family typewriter
+--force
+\family default
+ option will almost
+\emph on
+certainly
+\emph default
+ provoke a split brain, at least if the old primary continues to operate
+ on its local
+\family typewriter
+/dev/mars/mydata
+\family default
+ device.
+ Therefore, you are
+\series bold
+strongly advised
+\series default
+ to do this
+\series bold
+only
+\series default
+ after
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm primary
+\family default
+ without
+\family typewriter
+--force
+\family default
+ has failed
+\emph on
+for no good reason
+\emph default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Most reasons will be displayed by
+\family typewriter
+marsadm
+\family default
+ when it is rejecting the planned handover.
+\end_layout
+
+\end_inset
+
+, and
+\end_layout
+
+\begin_layout Enumerate
+You are sure you
+\emph on
+really
+\emph default
+ want to switch, even when that eventually leads to a split brain.
+ You also declare that you are willing to do
+\emph on
+manual
+\emph default
+ split-brain resolution as described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+, or even destruction / reconstruction of a damaged node as described in
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Final-Destroy-of"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Notice: in case of
+\emph on
+connection loss
+\emph default
+ (e.g.
+ networking problems / network partitions), you may not be able to reliably
+ detect whether a split brain actually resulted, or not.
+\end_layout
+
+\begin_layout Paragraph
+Some Background
+\end_layout
+
+\begin_layout Standard
+In contrast to DRBD, split brain situations are handled differently by MARS.
+ When two primaries are accidentally active at the same time, each of them
+ writes into different logfiles
+\family typewriter
+/mars/resource-mydata/log-000000001-A
+\family default
+ and
+\family typewriter
+/mars/resource-mydata/log-000000001-B
+\family default
+ where the
+\emph on
+origin
+\emph default
+ host is always recorded in the filename.
+ Therefore, both nodes
+\emph on
+can theoretically
+\emph default
+ run in primary mode independently from each other, at least for some time.
+ They
+\emph on
+might
+\emph default
+ even
+\family typewriter
+log-rotate
+\family default
+ independently from each other.
+ However, this is really not a good idea.
+ The replication to third nodes will likely get stuck, and your
+\family typewriter
+/mars/
+\family default
+ filesystem(s) will eventually run out of space.
+ Any further secondary node (when having
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas) will certainly get into serious problems: it simply does not
+ know which split-brain version it should follow.
+ Therefore, you will certainly lose the actuality of your redundancy.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\family typewriter
+marsadm secondary
+\family default
+ is
+\emph on
+strongly discouraged
+\emph default
+.
+ It tells the whole cluster that
+\emph on
+nobody
+\emph default
+ is designated as primary any more.
+
+\emph on
+All
+\emph default
+ nodes should go into secondary mode, globally.
+ In the current version of MARS, the secondaries will no longer fetch any
+ logfiles, since they don't know which version is the
+\begin_inset Quotes eld
+\end_inset
+
+right
+\begin_inset Quotes erd
+\end_inset
+
+ one.
+ Syncing is also not possible.
+ When the device
+\family typewriter
+/dev/mars/mydata
+\family default
+ is in use somewhere, it will remain in
+\emph on
+actual
+\emph default
+ primary mode during that time.
+ As soon as the local
+\family typewriter
+/dev/mars/mydata
+\family default
+ is released, the node will
+\emph on
+actually
+\emph default
+ go into secondary mode if it is no longer designated as primary.
+ You should avoid it in advance by always
+\emph on
+directly
+\emph default
+ switching over from one primary to another one, without intermediate
+\family typewriter
+secondary
+\family default
+ command.
+ This is different from DRBD.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Split brain situations are detected
+\emph on
+passively
+\emph default
+ by secondaries.
+ Whenever a secondary detects that somewhere a split brain has happened,
+ it refuses to replay any logfiles behind the split point (and also to fetch
+ them when possible), or anywhere where something appears suspect or ambiguous.
+ This tries to keep its local disk state always being consistent, but outdated
+ with respect to any of the split brain versions.
+ As a consequence, becoming primary may be impossible, because it cannot
+ always know which logfiles are the correct ones to replay before
+\family typewriter
+/dev/mars/mydata
+\family default
+ can appear.
+ The ambiguity must be resolved first.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ If you
+\emph on
+really
+\emph default
+ need the local device
+\family typewriter
+/dev/mars/mydata
+\family default
+ to disappear
+\emph on
+everywhere
+\emph default
+ in a split brain situation, you don't need a
+\emph on
+strongly discouraged
+\emph default
+
+\family typewriter
+marsadm secondary
+\family default
+ command for this.
+
+\family typewriter
+marsadm detach
+\family default
+ or
+\family typewriter
+marsadm down
+\family default
+ can do it also, without destroying knowledge about the former designated
+ primary.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\family typewriter
+marsadm primary --force
+\family default
+ is rejected in newer
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Beware: older versions before
+\family typewriter
+mars0.1stable52
+\family default
+ did deliberately skip this check because a few years ago somebody at 1&1
+ did place a
+\emph on
+requirement
+\emph default
+ on this.
+ Fortunately, the requirement now has gone, so a safer behaviour could
+ be implemented.
+ The new behaviour is for your safety, to prevent you from doing
+\begin_inset Quotes eld
+\end_inset
+
+silly
+\begin_inset Quotes erd
+\end_inset
+
+ things in case you are under pressure during an incident (try to safeguard
+ human error as best as possible).
+\end_layout
+
+\end_inset
+
+ marsadm versions if your replica is a current sync target.
+ This is not a bug: it should prevent you from forcing an inconsistent replica
+ into primary mode, which will
+\emph on
+certainly
+\emph default
+ lead to inconsistent data.
+ However, in extremely rare cases of severe damage of
+\emph on
+all
+\emph default
+ of your replicas, you may be desperate.
+ Only in such a rare case, and only then, you might decide to force any
+ of your replicas (e.g.
+ based on their last sync progress bar) into primary role although none
+ of the re-syncs had finished before.
+ In such a case, and only if you really know what you are doing, you may
+ use
+\family typewriter
+marsadm fake-sync
+\family default
+ to first mark your inconsistent replica as UpToDate (which is a
+\series bold
+lie
+\series default
+) and then force it to primary as explained above.
+ Afterwards, you will certainly need an
+\family typewriter
+fsck
+\family default
+ or similar repair before you can restart your application.
+ Good luck! And don't forget to check the size of
+\family typewriter
+lost+found
+\family default
+ afterwards.
+ This is really your
+\emph on
+very last
+\emph default
+ chance if nothing else had succeeded before.
+\end_layout
+
+\begin_layout Subsection
+Split Brain Resolution
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Split brain can naturally occur during a long-lasting network outage (aka
+ network partition) when you (forcefully) switch primaries in between, or
+ due to final loss of your old primary node (fatal node crash) when not
+ all logfile data had been transferred immediately before the final crash.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Remember that split brain is an
+\series bold
+erroneous state
+\series default
+ which must be resolved as soon as possible!
+\end_layout
+
+\begin_layout Standard
+Whenever split brain occurs for whatever reason, you have two choices for
+ resolution: either destroy one of your versions, or retain it under a different
+ resource name.
+\end_layout
+
+\begin_layout Standard
+In either case, do the following steps ASAP:
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Manually
+\series default
+ check which (surviving) version is the
+\begin_inset Quotes eld
+\end_inset
+
+right
+\begin_inset Quotes erd
+\end_inset
+
+ one.
+ Any error is up to you: destroying the wrong version is
+\emph on
+your
+\emph default
+ fault, not the fault of MARS.
+\end_layout
+
+\begin_layout Enumerate
+If you did not already switch your primary to the final destination determined
+ in the previous step, do it now (see description in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Forced-Switching"
+
+\end_inset
+
+).
+ Don't use an intermediate
+\family typewriter
+marsadm secondary
+\family default
+ command (as known from DRBD):
+\emph on
+directly
+\emph default
+ switch to the new designated primary!
+\end_layout
+
+\begin_layout Enumerate
+Unless
+\family typewriter
+systemd
+\family default
+ is configured properly (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+), do the following manually: on each non-right version (which you don't
+ want to retain) which had been primary before, umount your
+\family typewriter
+/dev/mars/mydata
+\family default
+ or otherwise stop using it (e.g.
+ stop iSCSI or other users of the device).
+ Wait until each of them has actually left primary state and until their
+ local logfile(s) have been fully written back to the underlying disk.
+\end_layout
+
+\begin_layout Enumerate
+Wait until the network works again.
+ All your (surviving) cluster nodes
+\emph on
+must
+\emph default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+If you are a MARS expert and you really know what you are doing (in particular,
+ you can anticipate the effects of the Lamport clock and of the symlink
+ update protocol including the
+\begin_inset Quotes eld
+\end_inset
+
+eventually consistent
+\begin_inset Quotes erd
+\end_inset
+
+ behaviour including the not-yet-consistent intermediate states, see sections
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Lamport-Clock"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Symlink-Tree"
+
+\end_inset
+
+), you may deviate from this requirement.
+\end_layout
+
+\end_inset
+
+ be able to communicate with each other.
+ If that is not possible, or if it takes too long, you may fall back to
+ the method described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Final-Destroy-of"
+
+\end_inset
+
+, but do this only as far as necessary.
+\end_layout
+
+\begin_layout Standard
+The next steps are different for different use cases:
+\end_layout
+
+\begin_layout Paragraph
+Destroying a Wrong Split Brain Version
+\end_layout
+
+\begin_layout Standard
+Continue with the following steps, each on those cluster node(s) where you
+ do not want to retain its split-brain version.
+ In preference, start with the old
+\begin_inset Quotes eld
+\end_inset
+
+wrong
+\begin_inset Quotes erd
+\end_inset
+
+ primaries first (see advice at the end of this section):
+\end_layout
+
+\begin_layout Enumerate-Resume
+
+\family typewriter
+marsadm invalidate mydata
+\family default
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+setcounter{enumi}{4}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+When no split brain is reported anymore after that (via
+\family typewriter
+marsadm view all
+\family default
+), you are done.
+ You need to repeat this on other secondaries only when necessary.
+\end_layout
+
+\begin_layout Standard
+In very rare cases when things are screwed up very heavily (e.g.
+ a partly destroyed
+\family typewriter
+/mars/
+\family default
+ partition), you may try an alternate method described in appendix
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Alternative-Methods-for"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Check that state
+\family typewriter
+Orphan
+\family default
+ is left after a while.
+ Notice that
+\family typewriter
+invalidate
+\family default
+ is only
+\emph on
+restarting
+\emph default
+ an existing replica, but does not wait for its completion.
+\end_layout
+
+\begin_layout Paragraph
+Keeping a Split Brain Version
+\end_layout
+
+\begin_layout Standard
+On those cluster node(s) where you want to retain the version (e.g.
+ for inspection purposes):
+\end_layout
+
+\begin_layout Enumerate-Resume
+
+\family typewriter
+marsadm leave-resource mydata
+\end_layout
+
+\begin_layout Enumerate-Resume
+After having done this on
+\emph on
+all
+\emph default
+ those cluster nodes, check that the split brain is gone (e.g.
+ by saying
+\family typewriter
+marsadm view mydata
+\family default
+), as documented above.
+ In very rare cases, you might also need a
+\family typewriter
+log-purge-all
+\family default
+ (see page
+\begin_inset CommandInset ref
+LatexCommand pageref
+reference "log-purge-all$res"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate-Resume
+Rename the underlying local disk
+\family typewriter
+/dev/lv-x/mydata
+\family default
+ into something like
+\family typewriter
+/dev/lv-x/mynewdata
+\family default
+ (see
+\family typewriter
+man lvrename
+\family default
+). This is
+\emph on
+extremely
+\emph default
+ recommended to avoid confusion with the old resource name!
+\end_layout
+
+\begin_layout Enumerate-Resume
+Check that each underlying local disk
+\family typewriter
+/dev/lv-x/mynewdata
+\family default
+ is really usable afterwards, e.g.
+ by test-mounting it (or
+\family typewriter
+fsck
+\family default
+ if you can afford it).
+ If all is OK, don't forget to umount it before proceeding with the next
+ step.
+\end_layout
+
+\begin_layout Enumerate-Resume
+Create a completely new MARS resource out of the underlying disk
+\family typewriter
+/dev/lv-x/mynewdata
+\family default
+ having a different name, best is
+\family typewriter
+mynewdata
+\family default
+ (see description in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Creating-and-Maintaining"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Generally:
+\series bold
+ best practice
+\series default
+ is to always keep your LV names equal to your MARS resource names.
+ This can avoid a
+\emph on
+lot
+\emph default
+ of unnecessary confusion.
+\end_layout
+
+\begin_layout Paragraph
+Keeping a Good Version
+\end_layout
+
+\begin_layout Standard
+When you had a secondary which did not participate in the split brain, but
+ just got confused and therefore stopped replaying logfiles immediately
+ before the split-brain point, it may very well happen
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In general, such a
+\begin_inset Quotes eld
+\end_inset
+
+good
+\begin_inset Quotes erd
+\end_inset
+
+ behaviour cannot be guaranteed for all secondaries.
+ Race conditions in complex networks may asynchronously transfer
+\begin_inset Quotes eld
+\end_inset
+
+wrong
+\begin_inset Quotes erd
+\end_inset
+
+ logfile data to a secondary much earlier than conflicting
+\begin_inset Quotes eld
+\end_inset
+
+good
+\begin_inset Quotes erd
+\end_inset
+
+ logfile data which will be marked
+\begin_inset Quotes eld
+\end_inset
+
+good
+\begin_inset Quotes erd
+\end_inset
+
+ only in the
+\emph on
+future.
+
+\emph default
+ It is impossible to predict this in advance.
+\end_layout
+
+\end_inset
+
+ that you don't need to do any action for it.
+ When all wrong versions have disappeared from the cluster (by
+\family typewriter
+invalidate
+\family default
+ or
+\family typewriter
+leave-resource
+\family default
+ as described before), the confusion should be over, and the secondary should
+ automatically resume tracking of the new unique version.
+\end_layout
+
+\begin_layout Standard
+Please check that
+\emph on
+all
+\emph default
+ of your secondaries are no longer stuck.
+ You need to execute split brain resolution only for
+\emph on
+stuck
+\emph default
+ nodes.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Hint / advice for
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas: it is a good idea to start split brain resolution
+\emph on
+first
+\emph default
+ with those (few) nodes which had been (accidentally) primary before, but
+ are not the new designated primary.
+ Usually, you had 2 primaries during split brain, so this will apply only
+ to
+\emph on
+one
+\emph default
+ of them.
+ Leave the other one intact, by not umounting
+\family typewriter
+/dev/mars/mydata
+\family default
+ at all, and keeping your applications running.
+ Even during emergency mode, see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Emergency-Mode"
+
+\end_inset
+
+.
+
+\emph on
+First
+\emph default
+ resolve the problem of the
+\begin_inset Quotes eld
+\end_inset
+
+wrong
+\begin_inset Quotes erd
+\end_inset
+
+ primary(s) via
+\family typewriter
+invalidate
+\family default
+ or
+\family typewriter
+leave-resource
+\family default
+.
+ Wait for a short while.
+ Then check the rest of your secondaries, whether they now are already following
+ the new (unique) primary, and finally check whether the split brain warning
+ reported by
+\family typewriter
+marsadm view all
+\family default
+ is gone everywhere.
+ This way, you can often skip unnecessary invalidations of replicas.
+\end_layout
+
+\begin_layout Subsection
+Final Destruction of a Damaged Node
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Final-Destroy-of"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+When a node has eventually died, do the following steps ASAP:
+\end_layout
+
+\begin_layout Enumerate
+
+\emph on
+Physically
+\emph default
+ remove the dead node from your network.
+ Unplug all network cables! Failing to do so might provoke a disaster in
+ case it somehow resurrects in an uncontrolled manner, such as a partly-damaged
+
+\family typewriter
+/mars/
+\family default
+ filesystem, a half-defective kernel, RAM / kernel memory corruption, disk
+ corruption, or whatever.
+ Don't risk any such unpredictable behaviour!
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Manually
+\series default
+ check which of the surviving versions will be the
+\begin_inset Quotes eld
+\end_inset
+
+right
+\begin_inset Quotes erd
+\end_inset
+
+ one.
+ Any error is up to you: resurrecting an unnecessarily old / outdated version
+ and/or destroying the newest / best version is
+\emph on
+your
+\emph default
+ fault, not the fault of MARS.
+\end_layout
+
+\begin_layout Enumerate
+If you did not already switch your primary to the final destination determined
+ in the previous step, do it now (see description in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Forced-Switching"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+On a surviving node, but preferably
+\emph on
+not
+\emph default
+ the new designated primary, give the following commands:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+
+\family typewriter
+marsadm --host=your-damaged-host down mydata
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm --host=your-damaged-host leave-resource mydata
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Check for misspellings, in particular the hostname of the dead node, and
+ check the command syntax before typing return! Otherwise, you may forcefully
+ destroy the wrong
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+That said, MARS is rather tolerant of human error.
+ Once a sysadmin accidentally destroyed a cluster while it was continuously
+ running as primary.
+ Fortunately, the problem was detected early enough for a correction without
+ causing any extraordinary customer downtime outside of accepted tolerances,
+ and no data loss at all.
+\end_layout
+
+\end_inset
+
+ node!
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+In case any of the previous commands should fail (which is rather likely),
+ repeat it with an additional
+\family typewriter
+--force
+\family default
+ option.
+ Don't use
+\family typewriter
+--force
+\family default
+ in the first place, always try first without it!
+\end_layout
+
+\begin_layout Enumerate
+Repeat the same with
+\emph on
+all
+\emph default
+ resources which were formerly present at
+\family typewriter
+your-damaged-host
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+Finally, say
+\family typewriter
+marsadm --host=your-damaged-host leave-cluster
+\family default
+ (optionally augmented with
+\family typewriter
+--force
+\family default
+).
+\end_layout
+
+\begin_layout Standard
+Now your surviving nodes should
+\emph on
+believe
+\emph default
+ that the old node
+\family typewriter
+your-damaged-host
+\family default
+ no longer exists, and that it no longer participates in any resource.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Even if your dead node comes to life again in some way: always ensure that
+ the mars kernel module cannot run any more.
+
+\emph on
+Never
+\emph default
+ do a
+\family typewriter
+modprobe mars
+\family default
+ on a node marked as dead this way!
+\end_layout
+
+\begin_layout Standard
+Further instructions for complicated cases are in appendix
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Alternative-De--and"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Cleanup-in-case"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Online Resizing during Operation
+\end_layout
+
+\begin_layout Standard
+You should have LVM or some other means of increasing the physical size
+ of your disk (e.g.
+ via firmware of some RAID controllers).
+ The network must be healthy.
+ Do the following steps:
+\end_layout
+
+\begin_layout Enumerate
+Increase your local disks (usually
+\family typewriter
+/dev/vg/mydata
+\family default
+)
+\emph on
+everywhere
+\emph default
+ in the whole cluster.
+ In order to avoid wasting space, increase them
+\emph on
+uniformly
+\emph default
+ to the same size (when possible).
+ The
+\family typewriter
+lvresize
+\family default
+ tool is documented elsewhere.
+\end_layout
+
+\begin_layout Enumerate
+Check that all MARS switches are on.
+ If not, say
+\family typewriter
+marsadm up mydata
+\family default
+ everywhere.
+\end_layout
+
+\begin_layout Enumerate
+At the primary:
+\family typewriter
+marsadm resize mydata
+\end_layout
+
+\begin_layout Enumerate
+If you have intermediate layers such as iSCSI, you may need some
+\family typewriter
+iscsiadm
+\family default
+ update or other command.
+\end_layout
+
+\begin_layout Enumerate
+Now you may increase your filesystem.
+ This is specific for the filesystem type and documented elsewhere.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Hint: the secondaries will start syncing the increased new part of the underlyin
+g primary disk.
+ In many cases, this is not really needed, because the new junk data just
+ does not matter.
+ If you are sure and if you know what you are doing, you may use
+\family typewriter
+marsadm fake-sync mydata
+\family default
+ to abort such unnecessary traffic.
+\end_layout
+
+\begin_layout Section
+The State of MARS
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:The-State-of"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In general, MARS tries to
+\emph on
+hide
+\emph default
+ any network failures from you as best as it can.
+ After a network problem, any internal low-level socket connections are
+
+\emph on
+transparently
+\emph default
+ tried to re-open ASAP, without need for sysadmin intervention.
+ In contrast to DRBD, network failures will
+\emph on
+not
+\emph default
+ automatically alter the state of MARS, such as switching to
+\family typewriter
+disconnected
+\family default
+ after a
+\family typewriter
+ko_timeout
+\family default
+ or similar.
+ From a high-level sysadmin viewpoint, communication may just take a very
+ long time to succeed.
+\end_layout
+
+\begin_layout Standard
+When the behaviour of MARS is different from DRBD, it is usually intended
+ as a feature.
+\end_layout
+
+\begin_layout Standard
+MARS is not only an
+\series bold
+asynchronous
+\series default
+ system at block IO level, but also
+\series bold
+at control level
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+This is
+\emph on
+necessary
+\emph default
+ because in a widely distributed long-distance system running on slow or
+ even temporarily failing networks, actions may take a long time, and there
+ may be many actions
+\series bold
+started in parallel
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Synchronous concepts are generally not sufficient for expressing that.
+ Because of inherent asynchronicity and of dynamic creation / joining of
+ resources, it is neither possible to comprehensively depict a complex distribut
+ed MARS system, nor a comprehensive standalone snippet of MARS, as a finite
+ state transition diagram
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Probably it could be possible to formally model MARS as a Petri net.
+ However, complete Petri nets tend to become very complex, and to
+ describe lots of low-level details.
+ Expressing hierarchy, in a top-down fashion, is cumbersome.
+ We see no point in trying to do so.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Although MARS tries to
+\emph on
+approximate
+\emph default
+ /
+\emph on
+emulate
+\emph default
+ the synchronous control behaviour of DRBD at the interface level (
+\family typewriter
+marsadm
+\family default
+) in many situations as best as it can, the
+\emph on
+internal
+\emph default
+ control model is necessarily asynchronous.
+ As an experienced sysadmin, you will be curious how it works in principle.
+ When you know something about it, you will no longer be surprised when
+ some (detail) behaviour is different from DRBD.
+\end_layout
+
+\begin_layout Standard
+The general principle is an asynchronous 2-edge handshake protocol, which
+ is used almost everywhere in MARS:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/handshake.fig
+ width 80col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+We have a binary todo switch, which can be either in state
+\begin_inset Quotes eld
+\end_inset
+
+on
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+off
+\begin_inset Quotes erd
+\end_inset
+
+.
+ In addition, we have an actual response indicator, which is similar to
+ an LED indicating the actual status.
+ In our example, we imagine that both are used for controlling a big ventilator,
+ having a huge inert mass.
+ Imagine a big machine from a power plant, which is as tall as a human.
+\end_layout
+
+\begin_layout Standard
+We start in a situation where the binary switch is off, and the ventilator
+ is stopped.
+ At point 1, we turn on the switch.
+ At that moment, a big contactor will sound like
+\begin_inset Quotes eld
+\end_inset
+
+zonggg
+\begin_inset Quotes erd
+\end_inset
+
+, and a big motor will start to hum.
+ At first you won't hear anything else.
+ It will take a while, say 1 minute, until the big wheel will have reached
+ its final operating RPM, due to the huge inert mass.
+ During that spin-up, the lights in your room will become slightly darker.
+ When having reached the full RPM at point 2, your workplace will then be
+ noisier, but in exchange your room lights will be back at ordinary strength,
+ and the actual response LED will light up in order to indicate that
+ the big fan is now operational.
+\end_layout
+
+\begin_layout Standard
+Assume we want to turn the system off.
+ When turning the todo switch to
+\begin_inset Quotes eld
+\end_inset
+
+off
+\begin_inset Quotes erd
+\end_inset
+
+ at point 3, first nothing will seem to happen at all.
+ The big wheel will keep spinning due to its heavy inert mass, and the RPM
+ as well as the sound will go down only slowly.
+ During spin-down, the actual response LED will stay illuminated, in order
+ to warn you that you should not touch the wheel, otherwise you may get
+ injured
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that it is only safe to access the wheel when
+\emph on
+both
+\emph default
+ the switch and the LED are off.
+ Conversely, if at least one of them is on, something is going on inside
+ the machine.
+ Transferred to MARS: always look at
+\emph on
+both
+\emph default
+ the todo switch and the corresponding actual indicator in order to not miss
+ something.
+\end_layout
+
+\end_inset
+
+.
+ The LED will only go off after, say, 2 minutes, when the wheel has actually
+ stopped at point 4.
+ After that, the cycle may potentially start over again.
+\end_layout
+
+\begin_layout Standard
+As you can see, all four possible cartesian product combinations between
+ two boolean values are occurring in the diagram.
+\end_layout
+
+\begin_layout Standard
+The same handshake protocol is used in MARS for communication between userspace
+ and kernelspace, as well as for communication in the widely distributed
+ system.
+\end_layout
+
+\begin_layout Section
+Inspecting the State of MARS
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Inspecting-the-State"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The main command for viewing the current state of MARS is
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm view mydata
+\end_layout
+
+\begin_layout Standard
+or its more specialized variant
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm view-
+\emph on
+$macroname
+\emph default
+ mydata
+\end_layout
+
+\begin_layout Standard
+where
+\family typewriter
+\emph on
+$macroname
+\family default
+\emph default
+ is one of the macros described in chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:The-Macro-Processor"
+
+\end_inset
+
+, or a macro which has been written by yourself.
+\end_layout
+
+\begin_layout Standard
+As always, you may replace the resource name
+\family typewriter
+mydata
+\family default
+ with the special keyword
+\family typewriter
+all
+\family default
+ in order to get the state of all locally joined resources, as well as a
+ list of all those resources.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+When using the variant
+\family typewriter
+marsadm view all
+\family default
+, additionally the global communication status will be displayed.
+ This helps humans in diagnosing problems.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Hint: use the compound command
+\family typewriter
+watch marsadm view all
+\family default
+ for continuous display of the current state of MARS.
+ When starting this side-by-side in
+\family typewriter
+ssh
+\family default
+ terminal windows for all your cluster nodes, you can easily watch what's
+ going on in the whole cluster.
+\end_layout
+
+\begin_layout Chapter
+Basic Working Principle
+\end_layout
+
+\begin_layout Standard
+Even if you are impatient, please read this chapter.
+ At the
+\emph on
+surface
+\emph default
+, MARS appears to be very similar to DRBD.
+ It looks almost like a drop-in replacement for DRBD.
+\end_layout
+
+\begin_layout Standard
+When taking this naïvely, you could easily step into some trivial pitfalls,
+ because the internal working principle of MARS is totally different from
+ DRBD.
+ Please forget (almost) anything you already know about the internal working
+ principles of DRBD, and look at the very different working principles of
+ MARS.
+\end_layout
+
+\begin_layout Section
+The Transaction Logger
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:The-Transaction-Logger"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/MARS_Data_Flow.pdf
+ lyxscale 60
+ width 100text%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The basic idea of MARS is to record all changes made to your block device
+ in a so-called
+\series bold
+transaction logfile
+\series default
+.
+
+\emph on
+Any
+\emph default
+ write request is treated like a transaction which changes the contents
+ of your block device.
+\end_layout
+
+\begin_layout Standard
+This is similar in concept to some database systems, but there exists no
+ separate
+\begin_inset Quotes eld
+\end_inset
+
+commit
+\begin_inset Quotes erd
+\end_inset
+
+ operation:
+\emph on
+any
+\emph default
+ write request is acting like a commit.
+\end_layout
+
+\begin_layout Standard
+The picture shows the flow of write requests.
+ Let's start with the primary node.
+\end_layout
+
+\begin_layout Standard
+Upon submission of a write request on
+\family typewriter
+/dev/mars/mydata
+\family default
+, it is first buffered in a
+\emph on
+temporary
+\emph default
+ memory buffer.
+\end_layout
+
+\begin_layout Standard
+The temporary memory buffer serves multiple purposes:
+\end_layout
+
+\begin_layout Itemize
+It keeps track of the order of write operations.
+\end_layout
+
+\begin_layout Itemize
+Additionally, it keeps track of the positions in the underlying disk
+\family typewriter
+/dev/lv-x/mydata
+\family default
+.
+ In particular, it detects when the same block is overwritten multiple times.
+\end_layout
+
+\begin_layout Itemize
+During pending write operation, any concurrent reads are served from the
+ memory buffer.
+\end_layout
+
+\begin_layout Standard
+After the write has been buffered in the temporary memory buffer, the main
+ logger thread of the transaction logger creates a so-called
+\emph on
+log entry
+\emph default
+ and starts an
+\begin_inset Quotes eld
+\end_inset
+
+append
+\begin_inset Quotes erd
+\end_inset
+
+ operation on the transaction logfile.
+ The log entry contains vital information such as the logical block number
+ in the underlying disk, the length of the data, a timestamp, some header
+ magic in order to detect corruption, the log entry sequence number, of
+ course the data itself, and optional information like a checksum or compression
+ information.
+\end_layout
+
+\begin_layout Standard
+Once the log entry has been written through to the
+\family typewriter
+/mars/
+\family default
+ filesystem via fsync(), the application waiting for the write operation
+ at
+\family typewriter
+/dev/mars/mydata
+\family default
+ is signalled that the write was successful.
+\end_layout
+
+\begin_layout Standard
+This may happen even
+\emph on
+before
+\emph default
+ the writeback to the underlying disk
+\family typewriter
+/dev/lv-x/mydata
+\family default
+ has started.
+ Even when you power off the system right now, the information is not lost:
+ it is present in the logfile, and can be reconstructed from there.
+\end_layout
+
+\begin_layout Standard
+Notice that the order of log records present in the transaction log defines
+ a total order among the write requests which is
+\emph on
+compatible
+\emph default
+ to the partial order of write requests issued on
+\family typewriter
+/dev/mars/mydata
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+Also notice that despite its sequential nature, the transaction logfile
+ is typically
+\emph on
+not
+\emph default
+ the performance bottleneck of the system: since appending to a logfile
+ is almost purely sequential IO, it runs much faster than random IO on typical
+ datacenter workloads.
+\end_layout
+
+\begin_layout Standard
+In order to reclaim the temporary memory buffer, its content must be written
+ back to the underlying disk
+\family typewriter
+/dev/lv-x/mydata
+\family default
+ at some later point in time.
+ After writeback, the temporary space is freed.
+ The writeback can do the following optimizations:
+\end_layout
+
+\begin_layout Enumerate
+writeback may be in
+\emph on
+any
+\emph default
+ order; in particular, it may be
+\emph on
+sorted
+\emph default
+ according to ascending sector numbers.
+ This will reduce the average seek distances of magnetic disks in general.
+\end_layout
+
+\begin_layout Enumerate
+when the same sector is overwritten multiple times, only the
+\begin_inset Quotes eld
+\end_inset
+
+last
+\begin_inset Quotes erd
+\end_inset
+
+ version needs to be written back, skipping some intermediate versions.
+\end_layout
+
+\begin_layout Standard
+In case the primary node crashes during writeback, it suffices to replay
+ the log entries from some point in the past until the end of the transaction
+ logfile.
+ It does no harm if you accidentally replay some log entries twice or even
+ more often: since the replay is in the original total order, any temporary
+ inconsistency is
+\emph on
+healed
+\emph default
+ by the logfile application.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In mathematics, the property that you can apply your logfile twice to your
+ data (or even as often as you want), is called
+\series bold
+idempotence
+\series default
+.
+ This is a very desirable property: it ensures that nothing goes wrong when
+ replaying
+\begin_inset Quotes eld
+\end_inset
+
+too much
+\begin_inset Quotes erd
+\end_inset
+
+ / starting your replay
+\begin_inset Quotes eld
+\end_inset
+
+too early
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Idempotence is even more beneficial: in case anything should go wrong with
+ your data on your disk (e.g.
+ IO errors), replaying your logfile once more often may
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Miracles cannot be guaranteed, but
+\emph on
+higher chances
+\emph default
+ and
+\emph on
+improvements
+\emph default
+ can be expected (e.g.
+ better chances for
+\family typewriter
+fsck
+\family default
+).
+\end_layout
+
+\end_inset
+
+ even
+\series bold
+heal
+\series default
+ some defects.
+ Good news for desperate sysadmins forced to work with flaky hardware!
+\end_layout
+
+\begin_layout Standard
+The basic idea of the asynchronous replication of MARS is rather simple:
+ just transfer the logfiles to your secondary nodes, and replay them onto
+ their copy of the disk data (also called
+\emph on
+mirror
+\emph default
+) in the same order as the total order defined by the primary.
+\end_layout
+
+\begin_layout Standard
+Therefore, a mirror of your data on any secondary may be outdated, but it
+ always corresponds to some version which was valid in the past.
+ This property is called
+\series bold
+anytime consistency
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Your secondary nodes are always consistent in themselves.
+ Notice that this kind of consistency is a
+\emph on
+local
+\emph default
+ consistency model.
+ There exists no global consistency in MARS.
+ Global consistency would be practically impossible in long-distance replication
+ where Einstein's law of the speed of light is limiting global consistency.
+ The front-cover pictures showing the planets Earth and Mars try to lead
+ your imagination away from global consistency models as used in
+\begin_inset Quotes eld
+\end_inset
+
+DRBD Think(tm)
+\begin_inset Quotes erd
+\end_inset
+
+, and try to prepare you mentally for local consistency as in
+\begin_inset Quotes eld
+\end_inset
+
+MARS Think(tm)
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+As you can see in the picture, the process of transferring the logfiles is
+
+\emph on
+independent
+\emph default
+ from the process which replays the logfiles onto the data at some secondary
+ site.
+ Both processes can be switched on / off separately (see commands
+\family typewriter
+marsadm {dis,}connect
+\family default
+ and
+\family typewriter
+marsadm {pause,resume}-replay
+\family default
+ in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Operation-of-the"
+
+\end_inset
+
+).
+ This may be
+\emph on
+exploited
+\emph default
+: for example, you may replicate your logfiles as soon as possible (to protect
+ against catastrophic failures), but deliberately wait one hour until it
+ is replayed (under regular circumstances).
+ If your data inside your filesystem
+\family typewriter
+/mydata/
+\family default
+ at the primary site is accidentally destroyed by
+\family typewriter
+rm -rf /mydata/
+\family default
+, you have an old copy at the secondary site.
+ This way, you can substitute
+\emph on
+some parts
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Please note that MARS cannot
+\emph on
+fully
+\emph default
+ substitute a backup system, because it can keep only
+\emph on
+physical
+\emph default
+ copies, and does not create logical copies.
+\end_layout
+
+\end_inset
+
+
+\emph default
+ of conventional backup functionality by MARS.
+ In case you need the current version, just replay in
+\begin_inset Quotes eld
+\end_inset
+
+fast-forward
+\begin_inset Quotes erd
+\end_inset
+
+ mode (similar to old-fashioned video tapes).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Future versions of MARS Full are planned to also allow
+\begin_inset Quotes eld
+\end_inset
+
+fast-backward
+\begin_inset Quotes erd
+\end_inset
+
+ rewinding, of course at some cost.
+\end_layout
+
+\begin_layout Section
+The Lamport Clock
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:The-Lamport-Clock"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+MARS is always
+\emph on
+asynchronously
+\emph default
+ communicating in the distributed system on
+\emph on
+any
+\emph default
+ topics, even strategic decisions.
+\end_layout
+
+\begin_layout Standard
+If there were a
+\emph on
+strict
+\emph default
+ global consistency model, which would be roughly equivalent to a standalone
+ model, we would need
+\emph on
+locking
+\emph default
+ in order to serialize conflicting requests.
+ It has been known for many decades that
+\emph on
+distributed locks
+\emph default
+ do not only suffer from performance problems, but they are also cumbersome
+ to get them working reliably in scenarios where nodes or network links
+ may fail at any time.
+\end_layout
+
+\begin_layout Standard
+Therefore, MARS uses a very different consistency model:
+\series bold
+Eventually Consistent
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that the network bottleneck problems described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Network-Bottlenecks"
+
+\end_inset
+
+ are
+\emph on
+demanding
+\emph default
+ an
+\begin_inset Quotes eld
+\end_inset
+
+eventually consistent
+\begin_inset Quotes erd
+\end_inset
+
+ model.
+ You have
+\series bold
+no chance
+\series default
+ against natural laws, like Einstein's laws.
+ In order to cope with the problem area, you have to
+\emph on
+invest some additional effort
+\emph default
+.
+ Unfortunately, asynchronous communication models are more tricky to program
+ and to debug than simple strictly consistent models.
+ In particular, you
+\emph on
+have to cope with
+\emph default
+ additional
+\series bold
+race conditions
+\series default
+
+\emph on
+inherent
+\emph default
+
+\emph on
+to
+\emph default
+ the
+\begin_inset Quotes eld
+\end_inset
+
+eventually consistent
+\begin_inset Quotes erd
+\end_inset
+
+ model.
+ In the face of the laws of the universe, motivate yourself by looking at
+ the graphics at the cover page: the planets are a
+\emph on
+symbol
+\emph default
+ for what you have to do!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Example: the asynchronous communication protocol of MARS leads to a different
+ behaviour from DRBD in case of
+\series bold
+network partitions
+\series default
+ (temporary interruption of communication between some cluster nodes), because
+ MARS
+\emph on
+remembers
+\emph default
+ the old state of remote nodes over long periods of time, while DRBD knows
+ absolutely nothing about its peers in disconnected state.
+ Sysadmins familiar with DRBD might find the following behaviour unusual:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+
+\size tiny
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+Event
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+DRBD Behaviour
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+MARS Behaviour
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+1.
+ the network partitions
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+automatic disconnect
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+nothing happens, but replication lags behind
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+2.
+ on A:
+\family typewriter
+umount $device
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+3.
+ on A:
+\family typewriter
+{drbd,mars}adm secondary
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+4.
+ on B:
+\family typewriter
+{drbd,mars}adm primary
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works, split brain happens
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\series bold
+\size tiny
+refused
+\series default
+ because B believes that A is primary
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+5.
+ the network resumes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+automatic connect attempt fails
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+communication automatically resumes
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+If you intentionally want to switch over (and to produce a split brain as
+ a side effect), the following variant must be used with MARS:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+
+\size tiny
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+Event
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+DRBD Behaviour
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+MARS Behaviour
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+1.
+ the network partitions
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+automatic disconnect
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+nothing happens, but replication lags behind
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+2.
+ on A:
+\family typewriter
+umount $device
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+3.
+ on A:
+\family typewriter
+{drbd,mars}adm secondary
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works (but
+\emph on
+not recommended!
+\emph default
+)
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+4.
+ on B:
+\family typewriter
+{drbd,mars}adm primary
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+split brain, but nobody knows
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\series bold
+\size tiny
+refused
+\series default
+ because B believes that A is primary
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+5.
+ on B:
+\family typewriter
+marsadm disconnect
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works, nothing happens
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+6.
+ on B:
+\family typewriter
+marsadm primary --force
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works, split brain happens on B, but A doesn't know
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+7.
+ on B:
+\family typewriter
+marsadm connect
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+works, nothing happens
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+8.
+ the network resumes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+automatic connect attempt fails
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size tiny
+communication resumes, A now detects the split brain
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+In order to implement the consistency model
+\begin_inset Quotes eld
+\end_inset
+
+eventually consistent
+\begin_inset Quotes erd
+\end_inset
+
+, MARS uses a so-called Lamport
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Published in the late 1970s by Leslie Lamport, also known as inventor of
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+LaTeX
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+ clock.
+ MARS uses a special variant called
+\begin_inset Quotes eld
+\end_inset
+
+physical Lamport clock
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+The physical Lamport clock is another almost-realtime clock which
+\emph on
+can
+\emph default
+ run independently from the Linux kernel system clock.
+ However, the Lamport clock tries to remain as near as possible to the system
+ clock.
+\end_layout
+
+\begin_layout Standard
+Both clocks can be queried at any time via
+\family typewriter
+cat /proc/sys/mars/lamport_clock
+\family default
+.
+ The result will show both clocks in parallel, in units of seconds since
+ the Unix epoch, with nanosecond resolution.
+\end_layout
+
+\begin_layout Standard
+When there are no network messages at all, both the system clock and the
+ Lamport clock will show almost the same time (except some minor differences
+ of a few nanoseconds resulting from the finite processor clock speed).
+\end_layout
+
+\begin_layout Standard
+The physical Lamport clock works rather simply:
+\emph on
+any
+\emph default
+ message on the network is augmented with a Lamport time stamp telling when
+ the message was
+\emph on
+sent
+\emph default
+ according to the local Lamport clock of the sender.
+ Whenever that message is received by some receiver, it checks whether the
+ time ordering relation would be violated: whenever the Lamport timestamp
+ in the message would claim that the sender had sent it
+\emph on
+after
+\emph default
+ it arrived at the receiver (according to drifts in their respective local
+ clocks), something must be wrong.
+ In this case, the local Lamport clock of the
+\emph on
+receiver
+\emph default
+ is advanced to shortly after the sender Lamport timestamp, such that the time
+ ordering relation is no longer violated.
+\end_layout
+
+\begin_layout Standard
+As a consequence, any local Lamport clock may run ahead of the corresponding
+ local system clock.
+ In order to avoid accumulation of deltas between the Lamport and the system
+ clock, the Lamport clock will run slower after that, possibly until it
+ reaches the system clock again (if no other message arrives which sets
+ it forward again).
+ After having reached the system clock, the Lamport clock will continue
+ with
+\begin_inset Quotes eld
+\end_inset
+
+normal
+\begin_inset Quotes erd
+\end_inset
+
+ speed.
+\end_layout
+
+\begin_layout Standard
+MARS uses the local Lamport clock for anything where other systems would
+ use the local system clock: for example, timestamp generation in the
+\family typewriter
+/mars/
+\family default
+ filesystem.
+ Even symlinks created there are timestamped according to the Lamport clock.
+ Both the kernel module and the userspace tool
+\family typewriter
+marsadm
+\family default
+ are always operating in the timescale of the Lamport clock.
+ Most importantly, all timestamp comparisons are always carried out with
+ respect to Lamport time.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Bigger differences between the Lamport and the system clock can be annoying
+ from a human point of view: when typing
+\family typewriter
+ls -l /mars/resource-mydata/
+\family default
+ many timestamps may appear as if they were created in the
+\begin_inset Quotes eld
+\end_inset
+
+future
+\begin_inset Quotes erd
+\end_inset
+
+, because the
+\family typewriter
+ls
+\family default
+ command compares the output formatting against the system clock (it does
+ not even know of the existence of the MARS Lamport clock).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Always use
+\family typewriter
+ntp
+\family default
+ (or another clock synchronization service) in order to pre-synchronize
+ your system clocks as close as possible.
+ Bigger differences are not only annoying, but may lead some people to wrong
+ conclusions and therefore even lead to bad human decisions!
+\end_layout
+
+\begin_layout Standard
+In a professional datacenter, you should use
+\family typewriter
+ntp
+\family default
+ anyway, and you should monitor its effectiveness anyway.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Hint: many internal logfiles produced by the MARS kernel module contain
+ Lamport timestamps written as numerical values.
+ In order to convert them into human-readable form, use the command
+\family typewriter
+marsadm cat /mars/5.total.status
+\family default
+ or similar.
+\end_layout
+
+\begin_layout Section
+The Symlink Tree
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:The-Symlink-Tree"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ The symlink tree as described here will be replaced by another representation
+ in future versions of MARS.
+ Therefore, don't do any scripting by directly accessing symlinks! Use the
+ primitive macros described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Predefined-Trivial-Macros"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+The current
+\family typewriter
+/mars/
+\family default
+ filesystem container format contains not only transaction logfiles, but
+ also acts as a generic storage for (persistent) state information.
+ Both configuration information and runtime state information are currently
+ stored in symlinks.
+ Symlinks are
+\begin_inset Quotes eld
+\end_inset
+
+misused
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+This means, the symlink targets need not be other files or directories,
+ but just any values like integers or strings.
+\end_layout
+
+\end_inset
+
+
+\begin_inset Quotes erd
+\end_inset
+
+ in order to represent some
+\family typewriter
+key -> value
+\family default
+ pairs.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+It is not yet clear / decided, but there is a
+\emph on
+chance
+\emph default
+ that the
+\emph on
+concept
+\emph default
+ of
+\family typewriter
+key -> value
+\family default
+ pairs will be retained in future versions of MARS.
+ Instead of being represented by symlinks, another representation will be
+ used, such that hopefully the
+\family typewriter
+key
+\family default
+ part will remain in the form of a pathname, even if there were no longer
+ a physical representation in an actual filesystem.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ A fundamentally different behaviour than DRBD: when your DRBD primary crashed
+ some time ago, and now comes up again, you have to setup DRBD again by
+ a sequence of commands like
+\family typewriter
+modprobe drbd; drbdadm up all; drbdadm primary all
+\family default
+ or similar.
+ In contrast, MARS needs only
+\family typewriter
+modprobe mars
+\family default
+ (after
+\family typewriter
+/mars/
+\family default
+ has been mounted by
+\family typewriter
+/etc/fstab
+\family default
+).
+ The
+\emph on
+persistence
+\emph default
+ of the symlinks residing in
+\family typewriter
+/mars/
+\family default
+ will automatically remember your previous state, even if some of your resources
+ were primary while others were secondary (mixed operations).
+ You don't need to do any actions in order to
+\begin_inset Quotes eld
+\end_inset
+
+restore
+\begin_inset Quotes erd
+\end_inset
+
+ a previous state, no matter how
+\begin_inset Quotes eld
+\end_inset
+
+complex
+\begin_inset Quotes erd
+\end_inset
+
+ it was.
+\end_layout
+
+\begin_layout Standard
+(Almost) all symlinks appearing in the
+\family typewriter
+/mars/
+\family default
+ directory tree are automatically replicated throughout the whole cluster,
+ provided that the cluster
+\family typewriter
+uuid
+\family default
+s are equal
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+This is protection against accidental
+\begin_inset Quotes eld
+\end_inset
+
+merging
+\begin_inset Quotes erd
+\end_inset
+
+ of two unrelated clusters which had been created at different times with
+ different
+\family typewriter
+uuids
+\family default
+.
+\end_layout
+
+\end_inset
+
+ at all sites.
+ Thus the
+\family typewriter
+/mars/
+\family default
+ directory forms some kind of
+\emph on
+global namespace
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+In order to avoid name clashes, each pathname created at node A follows
+ a convention: the node name A should be a suffix of the pathname.
+ Typically, internal MARS names follow the scheme
+\family typewriter
+/mars/
+\emph on
+something
+\emph default
+/myname-A
+\family default
+.
+ When using the expert command
+\family typewriter
+marsadm {get,set}-link
+\family default
+ (which will likely be replaced by something else in future MARS releases),
+ you should follow the best practice of systematically using pathnames like
+
+\family typewriter
+/mars/userspace/myname-A
+\family default
+ or similar.
+ As a result, each node will automatically get informed about the state
+ at any other node, like B when the corresponding information is recorded
+ on node B under the name
+\family typewriter
+/mars/userspace/myname-B
+\family default
+ (context-dependent names).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Experts only: the symlink replication works generically.
+ You might use the
+\family typewriter
+/mars/userspace/
+\family default
+ directory in order to place your own symlink there (for whatever purpose,
+ which need not have to do with MARS).
+ However, the symlinks are likely to disappear.
+ Use
+\family typewriter
+marsadm {get,set}-link
+\family default
+ instead.
+ There is a chance that these abstract commands (or variants thereof) will
+ be retained, by acting on the new data representation in future, even if
+ the old symlink format will vanish some day.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Important: the convention of placing the
+\series bold
+creator host name
+\series default
+ inside your pathnames should be used wherever possible.
+ The name part is a kind of
+\begin_inset Quotes eld
+\end_inset
+
+ownership indicator
+\begin_inset Quotes erd
+\end_inset
+
+.
+ It is crucial that no other host writes any symlink not
+\begin_inset Quotes eld
+\end_inset
+
+belonging
+\begin_inset Quotes erd
+\end_inset
+
+ to him.
+ Other hosts may read foreign information as often as they want, but never
+ modify it.
+ This way, your cluster nodes are able to
+\emph on
+communicate
+\emph default
+ with each other via symlink / information updates.
+\end_layout
+
+\begin_layout Standard
+Although experts might create (and change) the current symlinks with userspace
+ tools like
+\family typewriter
+ln -s
+\family default
+, you should use the following marsadm commands instead:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm set-link myvalue /mars/userspace/mykey-A
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm delete-file /mars/userspace/mykey-A
+\end_layout
+
+\begin_layout Standard
+There are many reasons for this: first, the
+\family typewriter
+marsadm set-link
+\family default
+ command will automatically use the Lamport clock for symlink creation,
+ and therefore will avoid any errors resulting from a
+\begin_inset Quotes eld
+\end_inset
+
+wrong
+\begin_inset Quotes erd
+\end_inset
+
+ system clock (as in
+\family typewriter
+ln -s
+\family default
+).
+ Second, the
+\family typewriter
+marsadm delete-file
+\family default
+ (which also deletes symlinks) works on the
+\emph on
+whole cluster
+\emph default
+.
+ And finally, there is a chance that this will work in future versions of
+ MARS even after the symlinks have vanished.
+\end_layout
+
+\begin_layout Standard
+What's the difference? If you would try to remove your symlink locally by
+ hand via
+\family typewriter
+rm -f
+\family default
+, you will be surprised: since the symlink has been replicated to the other
+ cluster nodes, it will be re-transferred from there and will be resurrected
+ locally after some short time.
+ This way, you cannot delete any object reliably, because your whole cluster
+ (which may consist of many nodes) remembers all your state information
+ and will
+\begin_inset Quotes eld
+\end_inset
+
+correct
+\begin_inset Quotes erd
+\end_inset
+
+ it whenever
+\begin_inset Quotes eld
+\end_inset
+
+necessary
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+In order to solve the deletion problem, MARS uses some internal deletion
+ protocol using auxiliary symlinks residing in
+\family typewriter
+/mars/todo-global/.
+
+\family default
+ The deletion protocol ensures that all replicas get deleted in the whole
+ cluster, and only thereafter the auxiliary symlinks in
+\family typewriter
+/mars/todo-global/
+\family default
+ are also deleted eventually.
+\end_layout
+
+\begin_layout Standard
+You may update your already existing symlink via
+\family typewriter
+marsadm set-link some-other-value /mars/userspace/mykey-A
+\family default
+ .
+ The new value will be propagated throughout the cluster according to a
+
+\series bold
+timestamp comparison protocol
+\series default
+: whenever node B notices that A has a
+\emph on
+newer
+\emph default
+ version of some symlink (according to the Lamport timestamp), it will replace
+ its older version by the newer one.
+ The opposite does
+\emph on
+not
+\emph default
+ work: if B notices that A has an older version, just nothing happens.
+ This way, the timestamps of symlinks can only progress in forward direction,
+ but never backwards in time.
+\end_layout
+
+\begin_layout Standard
+As a consequence, symlink updates made
+\begin_inset Quotes eld
+\end_inset
+
+by hand
+\begin_inset Quotes erd
+\end_inset
+
+ via
+\family typewriter
+ln -sf
+\family default
+ may get lost when the local system clock is much earlier than the
+ Lamport clock.
+\end_layout
+
+\begin_layout Standard
+When your cluster is fully connected by the network, the last timestamp
+ will finally win everywhere.
+ Only in case of network outages leading to
+\emph on
+network partitions
+\emph default
+, some information may be
+\emph on
+temporarily inconsistent
+\emph default
+, but only for the duration of the network outage.
+ The timestamp comparison protocol in combination with the Lamport clock
+ and with the persistence of the
+\family typewriter
+/mars/
+\family default
+ filesystem will automatically heal any temporary inconsistencies as soon
+ as possible, even in case of temporary node shutdown.
+\end_layout
+
+\begin_layout Standard
+The meaning of some internal MARS symlinks residing in
+\family typewriter
+/mars/
+\family default
+ will be hopefully documented in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Documentation-of-the"
+
+\end_inset
+
+ some day.
+\end_layout
+
+\begin_layout Section
+Defending Overflow of
+\family typewriter
+/mars/
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Defending-Overflow"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This section describes an important difference to DRBD.
+ The metadata of DRBD is allocated
+\emph on
+statically
+\emph default
+ at
+\emph on
+creation
+\emph default
+
+\emph on
+time
+\emph default
+ of the resource.
+ In contrast, the MARS transaction logfiles are allocated
+\emph on
+dynamically
+\emph default
+ at
+\emph on
+runtime
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+This leads to a potential risk from the perspective of a sysadmin: what
+ happens if the
+\family typewriter
+/mars/
+\family default
+ filesystem runs out of space?
+\end_layout
+
+\begin_layout Standard
+No risk, no fun.
+ If you want a system which survives long-lasting network outages while
+ keeping your replicas always consistent (anytime consistency), you
+\emph on
+need
+\emph default
+ dynamic memory for that.
+ It is
+\emph on
+impossible
+\emph default
+ to solve that problem using static memory
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The bitmaps used by DRBD don't preserve the
+\emph on
+order
+\emph default
+ of write operations.
+ They cannot do that, because their space is
+\begin_inset Formula $O(k)$
+\end_inset
+
+ for some constant
+\begin_inset Formula $k$
+\end_inset
+
+.
+ In contrast, MARS preserves the order.
+ Preserving the order as such (even when only
+\emph on
+facts
+\emph default
+ about the order were recorded without recording the actual data contents)
+ requires
+\begin_inset Formula $O(n)$
+\end_inset
+
+ space where
+\begin_inset Formula $n$
+\end_inset
+
+ is infinitely growing over time.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Therefore, DRBD and MARS have different application areas.
+ If you just want a simple system for mirroring your data over short distances
+ like a crossover cable, DRBD will be a suitable choice.
+ However, if you need to replicate over longer distances, or if you need
+ higher levels of reliability even when multiple failures may accumulate
+ (such as network loss during a
+\emph on
+re
+\emph default
+sync of DRBD), the transaction logs of MARS can solve that, but at some
+
+\emph on
+cost
+\emph default
+.
+\end_layout
+
+\begin_layout Subsection
+Countermeasures
+\end_layout
+
+\begin_layout Subsubsection
+Dimensioning of
+\family typewriter
+/mars/
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Dimensioning-of-/mars/"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The first (and most important) measure against overflow of
+\family typewriter
+/mars/
+\family default
+ is simply to dimension it large enough to survive longer-lasting problems,
+ at least one weekend.
+\end_layout
+
+\begin_layout Standard
+Recommended size is at least one dedicated disk, residing at a hardware
+ RAID controller with BBU (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Preparation:-What-you"
+
+\end_inset
+
+).
+ During normal operation, that size is needed only for a small fraction,
+ typically a few percent or even less than one percent.
+ However, it is your
+\series bold
+safety margin
+\series default
+.
+ Keep it high enough!
+\end_layout
+
+\begin_layout Subsubsection
+Monitoring
+\end_layout
+
+\begin_layout Standard
+The next (equally important) measure is
+\series bold
+monitoring in userspace
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+Following is a list of countermeasures both in userspace and in kernelspace,
+ in the order of
+\begin_inset Quotes eld
+\end_inset
+
+defensive walling
+\begin_inset Quotes erd
+\end_inset
+
+:
+\end_layout
+
+\begin_layout Enumerate
+Regular userspace monitoring must throw an INFO if a certain freespace limit
+
+\begin_inset Formula $l_{1}$
+\end_inset
+
+ of
+\family typewriter
+/mars/
+\family default
+ is undershot.
+ Typical values for
+\begin_inset Formula $l_{1}$
+\end_inset
+
+ are 30%.
+ Typical actions are automated calls of
+\family typewriter
+marsadm cron
+\family default
+ (or
+\family typewriter
+marsadm log-rotate all
+\family default
+ followed by
+\family typewriter
+marsadm log-delete-all all
+\family default
+).
+ You have to implement that yourself in sysadmin space.
+\end_layout
+
+\begin_layout Enumerate
+Regular userspace monitoring must throw a WARNING if a certain freespace
+ limit
+\begin_inset Formula $l_{2}$
+\end_inset
+
+ of
+\family typewriter
+/mars/
+\family default
+ is undershot.
+ Typical values for
+\begin_inset Formula $l_{2}$
+\end_inset
+
+ are 20%.
+ Typical actions are (in addition to
+\family typewriter
+log-rotate
+\family default
+ and
+\family typewriter
+log-delete-all
+\family default
+) alarming human supervisors via SMS and/or further stronger automated actions.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Frequently large space is occupied by files stemming from debugging output,
+ or from other programs or processes.
+ A hot candidate is
+\begin_inset Quotes eld
+\end_inset
+
+forgotten
+\begin_inset Quotes erd
+\end_inset
+
+ removal of debugging output to
+\family typewriter
+/mars/
+\family default
+.
+ Sometimes, an
+\family typewriter
+rm -rf $(find /mars/ -name
+\begin_inset Quotes eld
+\end_inset
+
+*.log
+\begin_inset Quotes erd
+\end_inset
+
+)
+\family default
+ can work miracles.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Another source of space hogging is a
+\begin_inset Quotes eld
+\end_inset
+
+forgotten
+\begin_inset Quotes erd
+\end_inset
+
+
+\family typewriter
+pause-sync
+\family default
+ or
+\family typewriter
+disconnect
+\family default
+.
+ Therefore, a simple
+\family typewriter
+marsadm connect-global all
+\family default
+ followed by
+\family typewriter
+marsadm resume-replay-global all
+\family default
+ may also work miracles (if you didn't want to freeze some mirror deliberately).
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+If you just wanted to freeze a mirror at an outdated state for a very long
+ time, you simply
+\emph on
+cannot
+\emph default
+ do that without causing infinite growth of space consumption in
+\family typewriter
+/mars/
+\family default
+.
+ Therefore, a
+\family typewriter
+marsadm leave-resource $res
+\family default
+ at
+\emph on
+exactly that(!)
+\emph default
+ secondary site where the mirror is frozen, can also work miracles.
+ If you want to automate this in userspace, be careful.
+ It is easy to get unintended effects when choosing the wrong site for
+\family typewriter
+leave-resource
+\family default
+.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Hint: you can / should start some of these measures even earlier at the
+ INFO level (see item 1), or even earlier.
+\end_layout
+
+\begin_layout Enumerate
+Regular userspace monitoring must throw an ERROR if a certain freespace
+ limit
+\begin_inset Formula $l_{3}$
+\end_inset
+
+ of
+\family typewriter
+/mars/
+\family default
+ is undershot.
+ Typical values for
+\begin_inset Formula $l_{3}$
+\end_inset
+
+ are 10%.
+ Typical actions are alarming the CEO via SMS and/or even stronger automated
+ actions.
+ For example, you may choose to automatically call
+\family typewriter
+marsadm leave-resource $res
+\family default
+ on some or all secondary nodes, such that the primary will be left alone
+ and now has a chance to really delete its logfiles because no one else
+ potentially needs them any longer.
+\end_layout
+
+\begin_layout Enumerate
+First-level kernelspace action, automatically executed when
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_4_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ +
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_3_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ +
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_2_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ +
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_1_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ is undershot:
+\begin_inset Newline newline
+\end_inset
+
+a warning will be issued.
+\end_layout
+
+\begin_layout Enumerate
+Second-level kernelspace action, automatically executed when
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_3_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ +
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_2_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ +
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_1_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ is undershot:
+\begin_inset Newline newline
+\end_inset
+
+all locally secondary resources will delete local copies of transaction
+ logfiles which are no longer needed locally.
+ This is a desperate action of the kernel module.
+\end_layout
+
+\begin_layout Enumerate
+Third-level kernelspace action, automatically executed when
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_2_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ +
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_1_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ is undershot:
+\begin_inset Newline newline
+\end_inset
+
+all locally secondary resources will stop fetching transaction logfiles.
+ This is a more desperate action of the kernel module.
+ You don't want to get there (except for testing).
+\end_layout
+
+\begin_layout Enumerate
+Last desperate kernelspace action when all else has failed and
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_free_space_1_gb
+\end_layout
+
+\end_inset
+
+
+\family default
+ is undershot:
+\begin_inset Newline newline
+\end_inset
+
+all locally primary resources will enter
+\series bold
+emergency mode
+\series default
+ (see description below in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Emergency-Mode"
+
+\end_inset
+
+).
+ This is the most desperate action of the kernel module.
+ You don't want to get there (except for testing).
+\end_layout
+
+\begin_layout Standard
+In addition, the kernel module obeys a general global limit
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/required_total_space_0_gb
+\end_layout
+
+\end_inset
+
+ +
+\family default
+ the sum of all of the above limits.
+ When the
+\emph on
+total size
+\emph default
+ of
+\family typewriter
+/mars/
+\family default
+ undershoots that sum, the kernel module refuses to start at all, because
+ it assumes that it is senseless to try to operate MARS on a system with
+ such low memory resources.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+The current level of emergency kernel actions may be viewed at any time
+ via
+\family typewriter
+
+\begin_inset Flex URL
+status collapsed
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/mars_emergency_mode
+\end_layout
+
+\end_inset
+
+
+\family default
+.
+\end_layout
+
+\begin_layout Subsubsection
+Throttling
+\end_layout
+
+\begin_layout Standard
+The last measure for defense against overflow is
+\series bold
+throttling your performance pigs
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+Motivation: in rare cases, some users with
+\family typewriter
+ssh
+\family default
+ access can do
+\emph on
+very
+\emph default
+ silly things.
+ For example, some of them are creating their own backups via user-cron
+ jobs, and they do it every 5 minutes.
+ Some example guy created a zip archive (almost 1GB) by regularly copying
+ his old zip archive into a new one, then appending deltas to the new one,
+ and finally deleting the old archive.
+ Every 5 minutes.
+ Yes, every 5 minutes, although almost never any new files were added to
+ the archive.
+ Essentially, he copied over his archive, for nothing.
+ This led to massive bulk write requests, for ridiculous reasons.
+\end_layout
+
+\begin_layout Standard
+In general, your hard disks (or even RAID systems) allow much higher write
+ IO rates than you can ever transport over a standard TCP network from your
+ primary site to your secondary, at least over longer distances (see use
+ cases for MARS in chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Use-Cases-for"
+
+\end_inset
+
+).
+ Therefore, it is easy to create such a high write load that it will be
+
+\emph on
+impossible
+\emph default
+ to replicate it over the network,
+\emph on
+by construction
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+Therefore, we
+\emph on
+need
+\emph default
+ some mechanism for throttling bulk writers whenever the network is weaker
+ than your IO subsystem.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Notice that DRBD will
+\emph on
+always
+\emph default
+ throttle your writes whenever the network forms a bottleneck, due to its
+ synchronous operation mode.
+ In contrast, MARS allows for buffering of performance peaks in the transaction
+ logfiles.
+
+\emph on
+Only when
+\emph default
+ your buffer in
+\family typewriter
+/mars/
+\family default
+ runs short (cf subsection
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Dimensioning-of-/mars/"
+
+\end_inset
+
+), MARS will start to throttle your application writes.
+\end_layout
+
+\begin_layout Standard
+There are a lot of tuning knobs named
+\family typewriter
+/proc/sys/mars/write_throttle_*
+\family default
+ with the following meaning:
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_start_percent
+\family default
+ Whenever the used space in
+\family typewriter
+/mars/
+\family default
+ is below this threshold, no throttling will occur at all.
+ Only when this threshold is exceeded, throttling will start
+\emph on
+slowly
+\emph default
+.
+ Typical values for this are 60%.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_end_percent
+\family default
+ Maximum throttling will occur once this space threshold is reached, i.e.
+ the throttling is now at its maximum effect.
+ Typical values for this are 90%.
+ When the actual space in
+\family typewriter
+/mars/
+\family default
+ lies between
+\family typewriter
+write_throttle_start_percent
+\family default
+ and
+\family typewriter
+write_throttle_end_percent
+\family default
+, the strength of throttling will be interpolated linearly between the extremes.
+ In practice, this should lead to an equilibrium between new input flow into
+
+\family typewriter
+/mars/
+\family default
+ and output flow over the network to secondaries.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_size_threshold_kb
+\family default
+ (readonly) This parameter shows the internal strength calculation of the
+ throttling.
+ Only write
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Read requests are never throttled at all.
+\end_layout
+
+\end_inset
+
+ requests exceeding this size (in KB) are throttled at all.
+ Typically, this will hurt the bulk performance pigs first, while leaving
+ ordinary users (issuing small requests) unaffected.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_ratelimit_kb
+\family default
+ Set the global IO rate in KB/s for those write requests which are throttled.
+ In case of strongest
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In case of lighter throttling, the input flow into
+\family typewriter
+/mars/
+\family default
+ may be higher because small requests are not throttled.
+\end_layout
+
+\end_inset
+
+ throttling, this parameter determines the input flow into
+\family typewriter
+/mars/
+\family default
+.
+ The default value is 5.000 KB/s.
+ Please adjust this value to your application needs and to your environment.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_rate_kb
+\family default
+ (readonly) Shows the current rate of exactly those requests which are actually
+ throttled (in contrast to
+\emph on
+all
+\emph default
+ requests).
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_cumul_kb
+\family default
+ (logically readonly) Same as before, but the cumulative sum of all throttled
+ requests since startup / reset.
+ This value can be reset from userspace in order to prevent integer overflow.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_count_ops
+\family default
+ (logically readonly) Shows the cumulative number of throttled requests.
+ This value can be reset from userspace in order to prevent integer overflow.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_maxdelay_ms
+\family default
+ Each request is delayed at most for this timespan.
+ Smaller values will improve the responsiveness of your userspace application,
+ but at the cost of potentially not delaying the requests sufficiently.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_minwindow_ms
+\family default
+ Set the minimum length of the measuring window.
+ The measuring window is the timespan for which the average (throughput)
+ rate is computed (see
+\family typewriter
+write_throttle_rate_kb
+\family default
+).
+ Lower values can increase the responsiveness of the controller algorithm,
+ but at the cost of accuracy.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+write_throttle_maxwindow_ms
+\family default
+ This parameter must be set sufficiently much greater than
+\family typewriter
+write_throttle_minwindow_ms
+\family default
+.
+ In case the flow of throttled operations pauses for some natural reason
+ (e.g.
+ switched off, low load, etc), this parameter determines when a completely
+ new rate calculation should be started over
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Motivation: if requests would pause for one hour, the measuring window could
+ become also an hour.
+ Of course, that would lead to completely meaningless results.
+ Two requests in one hour is
+\begin_inset Quotes eld
+\end_inset
+
+incorrect
+\begin_inset Quotes erd
+\end_inset
+
+ from a human point of view: we just have to ensure that averages are computed
+ with respect to a reasonable maximum time window in the magnitude of 10s.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Emergency Mode and its Resolution
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Emergency-Mode"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+When
+\family typewriter
+/mars/
+\family default
+ is almost full and there is really absolutely no chance of getting rid
+ of any local transaction logfile (or free some space in any other way),
+ there is only one exit strategy: stop creating new logfile data.
+\end_layout
+
+\begin_layout Standard
+This means that the ability for replication gets lost.
+\end_layout
+
+\begin_layout Standard
+When entering emergency mode, the kernel module will execute the following
+ steps for all resources where the affected host is acting as a primary:
+\end_layout
+
+\begin_layout Enumerate
+Do a kind of
+\begin_inset Quotes eld
+\end_inset
+
+logrotate
+\begin_inset Quotes erd
+\end_inset
+
+, but create a
+\emph on
+hole
+\emph default
+ in the sequence of transaction logfile numbers.
+ The
+\begin_inset Quotes eld
+\end_inset
+
+new
+\begin_inset Quotes erd
+\end_inset
+
+ logfile is left empty, i.e.
+ no data is written to it (for now).
+ The hole in the numbering will prevent any secondaries from replaying any
+ logfiles behind the hole (should they ever contain some data, e.g.
+ because the emergency mode has been left again).
+ This works because the secondaries are regularly checking the logfile numbers
+ for contiguity, and they will refuse to replay anything which is not contiguous.
+ As a result, the secondaries will be left in a consistent, but outdated
+ state (at least if they already were consistent before that).
+\end_layout
+
+\begin_layout Enumerate
+The kernel module writes back all data present in the temporary memory buffer
+ (see figure in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Transaction-Logger"
+
+\end_inset
+
+).
+ This may lead to a (short) delay of user write requests until that has
+ finished (typically fractions of a second or a few seconds).
+ The reason is that the temporary memory buffer must not be increased in
+ parallel during this phase (race conditions).
+\end_layout
+
+\begin_layout Enumerate
+After the temporary memory buffer is empty, all local IO requests (whether
+ reads or writes) are directly going to the underlying disk.
+ This has the same effect as if MARS would not be present anymore.
+ Transaction logging no longer takes place.
+\end_layout
+
+\begin_layout Enumerate
+Any sync from any secondary is stopped ASAP.
+ In case they are resuming their sync sometime later, they will start over
+ from the beginning (position
+\begin_inset Formula $0$
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+In order to leave emergency mode, the sysadmin should do the following steps:
+\end_layout
+
+\begin_layout Enumerate
+Free enough space.
+ For example, delete any foreign files on
+\family typewriter
+/mars/
+\family default
+ which have nothing to do with MARS, or resize the
+\family typewriter
+/mars/
+\family default
+ filesystem, or whatever.
+\end_layout
+
+\begin_layout Enumerate
+If
+\family typewriter
+
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+/proc/sys/mars/mars_reset_emergency
+\end_layout
+
+\end_inset
+
+
+\family default
+ is not set, now it is time to set it.
+ Normally, it should be already set.
+\end_layout
+
+\begin_layout Enumerate
+Notice: as long as not enough space has been freed, a message containing
+
+\family typewriter
+
+\begin_inset Quotes eld
+\end_inset
+
+EMERGENCY MODE HYSTERESIS
+\begin_inset Quotes erd
+\end_inset
+
+
+\family default
+ (or similar) will be displayed by
+\family typewriter
+marsadm view all
+\family default
+.
+ As a consequence, any sync will be automatically halted.
+ This applies to freshly invoked syncs also, for example created by
+\family typewriter
+invalidate
+\family default
+ or
+\family typewriter
+join-resource
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+On the secondaries, use
+\family typewriter
+marsadm invalidate $res
+\family default
+ in order to request updating your outdated mirrors.
+\end_layout
+
+\begin_layout Enumerate
+On the primary:
+\family typewriter
+marsadm log-delete-all all
+\end_layout
+
+\begin_layout Enumerate
+As soon as enough space has been freed everywhere to leave the
+\family typewriter
+EMERGENCY MODE HYSTERESIS
+\family default
+, sync should really start.
+ Until then, it remains halted.
+\end_layout
+
+\begin_layout Enumerate
+Recommendation: check at secondaries that state
+\family typewriter
+Orphan
+\family default
+ has been left after a while.
+\end_layout
+
+\begin_layout Standard
+Alternatively, there is another method by roughly following the instructions
+ from appendix
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Alternative-Methods-for"
+
+\end_inset
+
+, but in a slightly different order.
+ In this case, do
+\family typewriter
+leave-resource
+\family default
+ everywhere on
+\emph on
+all
+\emph default
+ secondaries, but
+\emph on
+don't
+\emph default
+ start the
+\family typewriter
+join-resource
+\family default
+ phase
+\emph on
+for now
+\emph default
+.
+ Then cleanup all your secondaries via
+\family typewriter
+log-purge-all
+\family default
+, and finally
+\family typewriter
+log-delete-all all
+\family default
+ at the primary, and wait until the emergency has vanished everywhere.
+ Only after that, re-
+\family typewriter
+join-resource
+\family default
+ your secondaries.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Expert advice for
+\begin_inset Formula $k=2$
+\end_inset
+
+ replicas: this means you had only 1 mirror per resource before the overflow
+ happened.
+ Provided that you have enough space on your LVMs and on
+\family typewriter
+/mars/
+\family default
+, and provided that transaction logging has automatically restarted after
+
+\family typewriter
+leave-resource
+\family default
+ and
+\family typewriter
+log-purge-all
+\family default
+, you can recover redundancy by creating a
+\emph on
+new
+\emph default
+ replica via
+\family typewriter
+marsadm join-resource $res
+\family default
+ on a
+\emph on
+third
+\emph default
+ node.
+ Only after the initial full sync has finished there, run
+\family typewriter
+join-resource
+\family default
+ at your original mirror.
+ This way, you will always retain at least one
+\series bold
+consistent mirror
+\series default
+ somewhere.
+ After all is up-to-date, you can delete the superfluous mirror by
+\family typewriter
+marsadm leave-resource $res
+\family default
+ and reclaim the disk space from its underlying LVM disk.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+If you already have
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas in total, it may be a wise idea to prefer the
+\family typewriter
+leave-resource ; log-purge-all ; join-resource
+\family default
+ method over
+\family typewriter
+invalidate
+\family default
+ because it does not invalidate
+\emph on
+all
+\emph default
+ your replicas at the same time (when handled properly in the right order).
+\end_layout
+
+\begin_layout Chapter
+The Macro Processor
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:The-Macro-Processor"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\family typewriter
+marsadm
+\family default
+ comes with a customizable macro processor.
+ It can be used for high-level complex display of the state of MARS (so-called
+
+\emph on
+complex macros
+\emph default
+), as well as for low-level display of lots of individual state values (so-calle
+d
+\emph on
+primitive macros
+\emph default
+).
+\end_layout
+
+\begin_layout Standard
+From the commandline, any macro can be called via
+\family typewriter
+marsadm view-
+\emph on
+$macroname
+\emph default
+ mydata
+\family default
+.
+ The short form
+\family typewriter
+marsadm view mydata
+\family default
+ is equivalent to
+\family typewriter
+marsadm view-default mydata
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In general, the command
+\family typewriter
+marsadm view-
+\emph on
+$macroname
+\emph default
+ all
+\family default
+ will first call the macro
+\family typewriter
+\emph on
+$macroname
+\family default
+\emph default
+ in a loop for
+\emph on
+all
+\emph default
+ resources we are a
+\emph on
+member locally
+\emph default
+.
+ Finally, a trailing macro
+\family typewriter
+\emph on
+$macroname
+\emph default
+-global
+\family default
+ will be called with an empty
+\family typewriter
+%{res}
+\family default
+ argument, provided that such a macro is defined.
+ This way, you can produce per-resource output followed by global output
+ which does not depend on a particular resource.
+\end_layout
+
+\begin_layout Section
+Predefined Macros
+\end_layout
+
+\begin_layout Standard
+The macro processor is a very flexible and versatile tool for
+\series bold
+customizing
+\series default
+.
+ You can create your own macros, but probably the rich set of predefined
+ macros is already sufficient for your needs.
+\end_layout
+
+\begin_layout Subsection
+Predefined Complex and High-Level Macros
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Predefined-Complex-and"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The following predefined complex macros try to address the information needs
+ of humans.
+ Use them in scripts only when you are prepared for the fact that the
+ output format may change during development of MARS.
+\end_layout
+
+\begin_layout Standard
+Notice: the definitions of predefined complex macros may be updated in the
+ course of the MARS project.
+ However, the primitive macros recursively called by the complex ones will
+ be hopefully rather stable in future (with the exception of bugfixes).
+ If you want to retain an old / outdated version of a complex macro, just
+ check it out from git, follow the instructions in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Creating-your-own"
+
+\end_inset
+
+, and preferably give it a different name in order to avoid confusion with
+ the newer version.
+ In general, it should be possible to use old macros with newer versions
+ of
+\family typewriter
+marsadm
+\family default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+You might need to check out also old versions of further macros and adapt
+ their names, whenever complex macros call each other.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+default
+\family default
+ This is equivalent to
+\family typewriter
+marsadm view mydata
+\family default
+ without
+\family typewriter
+\emph on
+-macroname
+\family default
+\emph default
+ suffix.
+ It shows a one-line status summary for each resource, optionally followed
+ by informational lines such as progress bars whenever a sync or a fetch
+ of logfiles is currently running.
+ The status line has the following fields:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+%{res}
+\family default
+ resource name.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+[
+\emph on
+this_count
+\emph default
+/
+\emph on
+total_count
+\emph default
+]
+\family default
+ total number of replicas of this resource, out of total number of cluster
+ members.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+%include{diskstate}
+\family default
+ see
+\family typewriter
+diskstate
+\family default
+ macro below.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+%include{replstate}
+\family default
+ see
+\family typewriter
+replstate
+\family default
+ macro below.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+%include{flags}
+\family default
+ see
+\family typewriter
+flags
+\family default
+ macro below.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+%include{role}
+\family default
+ see
+\family typewriter
+role
+\family default
+ macro below.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+%include{primarynode}
+\family default
+ see
+\family typewriter
+primarynode
+\family default
+ macro below.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+%include{commstate}
+\family default
+ see
+\family typewriter
+commstate
+\family default
+ macro below.
+\end_layout
+
+\end_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+\begin_inset space ~
+\end_inset
+
+ After that, optional lines such as progress bars are appearing only when
+ something unusual is happening.
+ These lines are subject to future changes.
+ For example, wasted disk space due to missing
+\family typewriter
+resize
+\family default
+ is reported when
+\family typewriter
+%{threshold}
+\family default
+ is exceeded.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+1and1
+\family default
+
+\begin_inset space ~
+\end_inset
+
+or
+\begin_inset space ~
+\end_inset
+
+
+\family typewriter
+default-1and1
+\family default
+ A variant of
+\family typewriter
+default
+\family default
+ for internal use by 1&1 Internet AG.
+ You may call this complex macro by saying
+\family typewriter
+marsadm view-1and1 all
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Note: the
+\family typewriter
+marsadm view-1and1
+\family default
+ command has been intensely tested in Spring 2014 to produce exactly the
+ same output as the 1&1 internal
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In addition to allowing for customization, the macro processor is also meant
+ as an exit strategy for removing dependencies from non-free software.
+
+\series bold
+Please put your future macros also under GPL!
+\end_layout
+
+\end_inset
+
+ tool
+\family typewriter
+marsview
+\family default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+There are some subtle differences: numbers are displayed in a different
+ precision, some bug fixes in the macro version (which might have occurred
+
+\emph on
+in the meantime
+\emph default
+ ) may lead to different output as a side effect from bug fixes in
+\emph on
+predefined
+\emph default
+ macros, because the original
+\family typewriter
+marsview
+\family default
+ command is currently not actively maintained.
+ Documentation of
+\family typewriter
+marsview
+\family default
+ can be found in the corresponding manpage, see
+\family typewriter
+man marsview
+\family default
+.
+ By construction, this is also the (unmaintained) documentation of
+\family typewriter
+marsadm view-1and1
+\family default
+ and other
+\family typewriter
+-1and1
+\family default
+ macros.
+ Notice that all
+\family typewriter
+*-1and1
+\family default
+ macros are not officially supported by the developer of MARS, and they
+ may disappear in a future major release.
+ However, they could be useful for your own customization macros.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Customization via your own macros (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Creating-your-own"
+
+\end_inset
+
+) is explicitly encouraged by the developer.
+ It would be nice if a vibrant user community would emerge, helping each
+ other by exchange of macros.
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Hint: in order to produce your own customized inspection / monitoring tools,
+ you may ask the author for an official reservation of a macro sub-namespace
+ such as
+\family typewriter
+*-
+\emph on
+yourcompanyname
+\family default
+\emph default
+.
+ You will be fully responsible for your own reserved namespace and can do
+ with it whatever you want.
+ The official MARS release will guarantee that
+\emph on
+no name clashes
+\emph default
+ with your reserved sub-namespace will occur in future.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+default-global
+\family default
+ Currently, this just calls
+\family typewriter
+comminfo
+\family default
+ (see below).
+ May be extended in future.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+diskstate
+\family default
+ Shows the status of the underlying disk device, in the following order
+ of precedence
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+When an earlier list item is displayed, no combinations with following items
+ are possible.
+ This kind of
+\begin_inset Quotes eld
+\end_inset
+
+hiding effect
+\begin_inset Quotes erd
+\end_inset
+
+ can lead to an
+\emph on
+information loss
+\emph default
+.
+ In order to get a non-lossy picture from the state of your system, please
+ look at the
+\family typewriter
+flags
+\family default
+ which are able to display cartesian combinations of more detailed internal
+ states.
+\end_layout
+
+\end_inset
+
+:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+NotJoined
+\family default
+ (cf
+\family typewriter
+%get-disk{}
+\family default
+) No underlying disk device is configured.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+NotPresent
+\family default
+ (cf
+\family typewriter
+%disk-present{}
+\family default
+) The underlying disk device (as configured, see
+\family typewriter
+marsadm view-get-disk
+\family default
+) does not exist or the device node is not accessible.
+ Therefore MARS cannot work.
+ Check that LVM or other software is properly configured and running.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+Detached
+\family default
+ (cf
+\family typewriter
+InConsistent
+\family default
+,
+\family typewriter
+NeedsReplay
+\family default
+,
+\family typewriter
+%todo-attach{}
+\family default
+,
+\family typewriter
+%is-attach{}
+\family default
+) The underlying disk is willingly switched off (see
+\family typewriter
+marsadm detach
+\family default
+), and it actually is no longer opened by MARS.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+Detaching
+\family default
+ (cf
+\family typewriter
+%todo-attach{}
+\family default
+ and
+\family typewriter
+%is-attach{}
+\family default
+) Access to the underlying disk is switched off, but actually not yet
+\family typewriter
+close()
+\family default
+d by MARS.
+ This can happen for a long time on a primary when other secondaries are
+ accessing the disk remotely for syncing.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+DefectiveLog[
+\emph on
+description-text
+\emph default
+]
+\family default
+ (cf
+\family typewriter
+%replay-code{}
+\family default
+) Typically this indicates an
+\family typewriter
+md5
+\family default
+ checksum error in a transaction logfile, or another (hardware / filesystem)
+ defect.
+ This occurs extremely rarely in practice, but has been observed more frequently
+ during a massive failure of air conditioning in a datacenter, when disk
+ temperatures rose to more than 80° Celsius.
+ Notice that a secondary
+\series bold
+refuses
+\series default
+ to apply any knowingly defective logfile data to the disk.
+ Although this message is
+\emph on
+not directly
+\emph default
+ referring to the underlying disk, it is mentioned here because of its superior
+
+\series bold
+relevance
+\series default
+ for the diskstate.
+ A damaged transaction logfile will always affect the
+\emph on
+actuality
+\emph default
+ of the disk, but not its
+\emph on
+integrity
+\emph default
+ (by itself).
+ What to do in such a case?
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+When the damage is only at one of your secondaries, you should first ensure
+ that the primary has a good logfile after a
+\family typewriter
+marsadm log-rotate
+\family default
+, then try
+\family typewriter
+marsadm invalidate
+\family default
+ at the damaged secondary.
+ It is crucial that the primary has a fresh correct logfile behind the error
+ position, and that it is continuing to operate correctly.
+\end_layout
+
+\begin_layout Enumerate
+When
+\emph on
+all
+\emph default
+ of your secondaries are reporting
+\family typewriter
+DefectiveLog
+\family default
+, the primary could have
+\emph on
+produced
+\emph default
+ a damaged logfile (e.g.
+ in RAM, in a DMA channel, etc) while continuing to operate, and all of
+ your secondaries got that defective logfile.
+ After
+\family typewriter
+marsadm log-delete-all all
+\family default
+, you can check this by comparing the
+\family typewriter
+md5sum
+\family default
+ of the first primary logfile (having the lowest serial number) with the
+ versions on your replicas.
+ The problem is that you don't know whether the primary side has a silent
+ corruption on any of its disks, or not.
+ You will need to take an operational decision whether to switchover to
+ a secondary via
+\family typewriter
+primary --force
+\family default
+, or whether to continue operation at the primary and
+\family typewriter
+invalidate
+\family default
+ your secondaries.
+\end_layout
+
+\begin_layout Enumerate
+When the original primary is affected in a very bad way, such that it crashed
+ badly and afterwards even recovery of the
+\emph on
+primary
+\emph default
+ is impossible
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In such a rare case, the
+\emph on
+original primary
+\emph default
+ (but not any other host)
+\series bold
+refuses
+\series default
+ to come up during recovery with
+\emph on
+his own
+\emph default
+ logfile originally produced by
+\emph on
+himself
+\emph default
+.
+ This is not a bug, but saves you from incorrectly assuming that your original
+ primary disk were consistent - it is
+\emph on
+known
+\emph default
+ to be inconsistent, but recovery is impossible due to the damaged logfile.
+ Thus
+\emph on
+this one
+\emph default
+ replica is trapped by defective hardware.
+ The other replicas shouldn't be.
+\end_layout
+
+\end_inset
+
+ due to this error (which typically occurs extremely rarely, observed two
+ times during 7 million operating hours on defective hardware), you
+ need to take an operational decision between the following alternatives:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+switchover to a former secondary via
+\family typewriter
+primary --force
+\family default
+, producing a split brain, and producing some (typically small) data loss.
+ However, integrity is more important than actuality in such an extreme
+ case.
+\end_layout
+
+\begin_layout Enumerate
+deconstruction of the resource at
+\emph on
+all
+\emph default
+ replicas via
+\family typewriter
+leave-resource --force
+\family default
+, running
+\family typewriter
+fsck
+\family default
+ or similar tools by hand at the underlying disks, selecting the best replica
+ out of them, and finally re-constructing the resource again.
+\end_layout
+
+\begin_layout Enumerate
+restore your backup.
+\end_layout
+
+\end_deeper
+\end_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+Orphan
+\family default
+ The secondary cannot replay data anymore, because it has been kicked out
+ for avoidance of emergency mode.
+ The data is not recent anymore.
+ Typically,
+\family typewriter
+marsadm invalidate
+\family default
+ needs to be done.
+\begin_inset Newline newline
+\end_inset
+
+There is an exception: shortly after
+\family typewriter
+join-resource
+\family default
+ or
+\family typewriter
+invalidate
+\family default
+, it may take some time until state
+\family typewriter
+Orphan
+\family default
+ may be left, and until the newest logfile has appeared at your secondary
+ site (depending on the size of logfiles, and on your network).
+ In case of network problems, this may take very long.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ This state tells you that your replica is not current, and currently not
+ being updated at all.
+ Don't forget to
+\series bold
+monitor
+\series default
+ for longer occurrences of this state! Otherwise you may get a big surprise
+ when you need a forceful emergency failover, but your replica is very old
+ or even does not really exist at all.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+NoAttach
+\family default
+ (cf
+\family typewriter
+%is-attach{}
+\family default
+) The underlying disk is currently not opened by MARS.
+ Reasons may be that the kernel module is not loaded, or an exclusive
+\family typewriter
+open()
+\family default
+ is currently not possible because somebody else has already opened it.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+InConsistent
+\family default
+ (cf
+\family typewriter
+%is-consistent{}
+\family default
+) A logfile replay and/or sync is known to be needed / or to complete (e.g.
+ after
+\family typewriter
+invalidate
+\family default
+ has started) in order to restore local consistency (for details, look at
+
+\family typewriter
+flags
+\family default
+).
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Hint: in the current implementation of MARS, this will never happen on secondari
+es during ordinary replay (but only when either sync has not yet finished,
+ or when the
+\emph on
+initial
+\emph default
+ logfile replay after the sync has not yet finished), because the ordinary
+ logfile replay always maintains anytime consistency once a consistent state
+ had been reached.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\emph on
+Only
+\emph default
+ in case of a primary node crash, and
+\emph on
+only
+\emph default
+ after attempts have failed to become primary again (e.g.
+ IO errors, etc), this
+\emph on
+can
+\emph default
+ (but need not) mean that something went wrong.
+ Even in such an extremely unlikely event, chances are high that
+\family typewriter
+fsck
+\family default
+ can fix any remaining problems (and, of course, you can also switchover
+ to a former secondary).
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+When this message appears, simply start MARS again (e.g.
+
+\family typewriter
+modprobe mars; marsadm up all
+\family default
+), in whatever role you are intending.
+ This will
+\emph on
+automatically
+\emph default
+ try to replay any necessary transaction logfile(s) in order to fix the
+ inconsistency.
+ Only if the automatic fix fails and this message persists for a long time
+ without progress, you
+\emph on
+might
+\emph default
+ have a problem.
+ Typically, as observed at a large installation at 1&1, this happens extremely
+ rarely, and then typically indicates that your hardware is likely to be
+ defective.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+OutDated[FR]
+\family default
+ (cf
+\family typewriter
+%work-reached{}
+\family default
+) Only at secondaries.
+ Tells whether it is
+\emph on
+currently known
+\emph default
+ that the disk has any lag-behind when compared to the
+\emph on
+currently known
+\emph default
+ state of the current designated primary (if there exists one).
+ Only meaningful if a current designated primary exists.
+ Notice that this kind of status display is subject to
+\emph on
+natural races
+\emph default
+, for example when new logfile data has been produced in parallel, or network
+ propagation is very slow.
+ Additional information is in brackets:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+[F]
+\family default
+ Fetch is known to be needed.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+[R]
+\family default
+ Replay is known to be needed.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+[FR]
+\family default
+ Both are known to be needed.
+\end_layout
+
+\end_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+WriteBack[
+\emph on
+amount
+\emph default
+]
+\family default
+ (cf
+\family typewriter
+%is-primary{}
+\family default
+ and amount via
+\family typewriter
+%writeback-rest{}
+\family default
+) Appears only at actual primaries (whether designated or not), when the
+ writeback from the RAM buffer is active (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Transaction-Logger"
+
+\end_inset
+
+).
+ The
+\emph on
+amount
+\emph default
+ is displayed in human readable form, and may be used for a very rough estimatio
+n of recovery time after a primary crash.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+Recovery
+\family default
+ (cf
+\family typewriter
+%todo-primary{}
+\family default
+) Appears only at the designated primary before it actually has become primary.
+ Similar to database recovery, this indicates the recovery phase after a
+ crash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In some cases,
+\family typewriter
+primary --force
+\family default
+ may also trigger this message.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+EmergencyMode
+\family default
+ (cf
+\family typewriter
+%is-emergency{}
+\family default
+) A current designated primary exists, and it is known that this host has
+ entered emergency mode.
+ See section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Emergency-Mode"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+UpToDate
+\family default
+ Displayed when none of the above has been detected.
+\end_layout
+
+\end_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+diskstate-1and1
+\family default
+ A variant for internal use by 1&1 Internet AG.
+ See above note.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+replstate
+\family default
+ Shows the status of the replication in the following order of precedence:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+ModuleNotLoaded
+\family default
+ (cf
+\family typewriter
+%is-module-loaded{}
+\family default
+) No kernel module is loaded, and as a consequence no
+\family typewriter
+/proc/sys/mars/
+\family default
+ does exist.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+UnResponsive
+\family default
+ (cf
+\family typewriter
+%is-alive{%{host}}
+\family default
+) The main thread
+\family typewriter
+mars_light
+\family default
+ did not do any noticeable work for more than
+\family typewriter
+%{window}
+\family default
+ (default 60) seconds.
+ Notice that this may happen when deleting
+\emph on
+extremely
+\emph default
+ large logfiles (up to hundreds of gigabytes or terabytes).
+ If this happens for a
+\emph on
+very
+\emph default
+ long time, you should check whether you might need a reboot in order to
+ fix the hang.
+ The time window may be changed by
+\family typewriter
+--window=$seconds
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+NotJoined
+\family default
+ (cf
+\family typewriter
+%get-disk{}
+\family default
+) No underlying disk device is configured for this resource.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+NotStarted
+\family default
+ (cf
+\family typewriter
+%todo-attach{}
+\family default
+) Replication has not been started.
+\end_layout
+
+\begin_layout Itemize
+When the current host is designated as a primary, the rest of the precedence
+ list looks as follows:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+EmergencyMode
+\family default
+ (cf.
+
+\family typewriter
+%is-emergency{}
+\family default
+) See section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Emergency-Mode"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+Replicating
+\family default
+ (cf.
+
+\family typewriter
+%is-primary{}
+\family default
+) Primary mode has been entered.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+NotYetPrimary
+\family default
+ (catchall) This means the current host
+\emph on
+should
+\emph default
+ act as a primary (see
+\family typewriter
+marsadm primary
+\family default
+ or
+\family typewriter
+marsadm primary --force
+\family default
+), but currently doesn't (yet).
+ This happens during logfile replay, before primary mode is actually entered.
+ Notice that replay of very big logfiles may take a long time.
+\end_layout
+
+\end_deeper
+\begin_layout Itemize
+When the current host is
+\emph on
+not
+\emph default
+ designated as a primary:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+PausedSync
+\family default
+ (cf.
+
+\family typewriter
+%sync-rest{}
+\family default
+ and
+\family typewriter
+%todo-sync{}
+\family default
+) Some data needs to be synced, but sync is currently switched off.
+ See
+\family typewriter
+marsadm {pause,resume}-sync
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+Syncing
+\family default
+ (cf.
+
+\family typewriter
+%is-sync{}
+\family default
+) Sync is currently running.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+PausedFetch
+\family default
+ (cf.
+
+\family typewriter
+%todo-fetch{}
+\family default
+) Fetch is currently switched off.
+ See
+\family typewriter
+marsadm {pause,resume}-fetch
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+PausedReplay
+\family default
+ (cf.
+
+\family typewriter
+%todo-replay{}
+\family default
+) Replay is currently switched off.
+ See
+\family typewriter
+marsadm {pause,resume}-replay
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+NoPrimaryDesignated
+\family default
+ (cf.
+
+\family typewriter
+%get-primary{}
+\family default
+) A
+\family typewriter
+secondary
+\family default
+ command has been given somewhere in the cluster.
+ Thus no designated primary exists.
+ All resource members are in state
+\family typewriter
+Secondary
+\family default
+ or try to approach it.
+ Sync and other operations are not possible.
+ This state is therefore not recommended.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+PrimaryUnreachable
+\family default
+ (cf.
+
+\family typewriter
+%is-alive{}
+\family default
+) A current designated primary has been set, but this host has not been
+ remotely updated for more than 60 seconds (see also
+\family typewriter
+--window=$seconds
+\family default
+).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+Orphan
+\family default
+ The secondary cannot replay data anymore, because it has been kicked out
+ for avoidance of emergency mode.
+ The data is not recent anymore.
+ Typically,
+\family typewriter
+marsadm invalidate
+\family default
+ needs to be done.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+Replaying
+\family default
+ (catchall) None of the previous conditions have triggered.
+\end_layout
+
+\end_deeper
+\end_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+replstate-1and1
+\family default
+ A variant for internal use by 1&1 Internet AG.
+ See above note.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+flags
+\family default
+ For each of disk, consistency, attach, sync, fetch, and replay, show exactly
+ one character.
+ Each character is either a capital one, or the corresponding lowercase
+ one, or a dash.
+ The meaning is as follows:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+disk/device:
+\family typewriter
+D
+\family default
+ = the device
+\family typewriter
+/dev/mars/mydata
+\family default
+ is present,
+\family typewriter
+d
+\family default
+ = only the underlying disk
+\family typewriter
+/dev/lv-x/mydata
+\family default
+ is present,
+\family typewriter
+-
+\family default
+ = none present / configured.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+consistency: this relates to the
+\emph on
+underlying disk
+\emph default
+, not to
+\family typewriter
+/dev/mars/mydata
+\family default
+!
+\family typewriter
+C
+\family default
+ = locally consistent,
+\family typewriter
+c
+\family default
+ = maybe inconsistent (no guarantee), - = cannot determine.
+ Notice: this does not tell anything about
+\emph on
+actuality
+\emph default
+.
+ Notice: like the other flags, this flag is subject to races and therefore
+ should be relied on only in
+\emph on
+detached
+\emph default
+ state! See also description of macro
+\family typewriter
+is-consistent
+\family default
+ below.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+attach:
+\family typewriter
+A
+\family default
+ = attached,
+\family typewriter
+a
+\family default
+ = currently trying to attach/detach but not yet ready (intermediate state),
+
+\family typewriter
+-
+\family default
+ = attach is switched off.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+sync:
+\family typewriter
+S
+\family default
+ = sync finished,
+\family typewriter
+s
+\family default
+ = currently syncing,
+\family typewriter
+-
+\family default
+ = sync is switched off.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+fetch:
+\family typewriter
+F
+\family default
+ = according to knowledge, fetched logfiles are up-to-date, 
+\family typewriter
+f
+\family default
+ = currently fetching (some parts of) a logfile,
+\family typewriter
+-
+\family default
+ = fetch is switched off.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+replay:
+\family typewriter
+R
+\family default
+ = all fetched logfiles are replayed,
+\family typewriter
+r
+\family default
+ = currently replaying,
+\family typewriter
+-
+\family default
+ = replay is switched off.
+\end_layout
+
+\end_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+flags-1and1
+\family default
+ A variant for internal use by 1&1 Internet AG.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+todo-role
+\family default
+ Shows the
+\emph on
+designated
+\emph default
+ state:
+\family typewriter
+None
+\family default
+,
+\family typewriter
+Primary
+\family default
+ or
+\family typewriter
+Secondary
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+role
+\family default
+ Shows the
+\emph on
+actual
+\emph default
+ state:
+\family typewriter
+None
+\family default
+,
+\family typewriter
+NotYetPrimary
+\family default
+,
+\family typewriter
+Primary
+\family default
+,
+\family typewriter
+RemainsPrimary
+\family default
+, or
+\family typewriter
+Secondary
+\family default
+.
+ Any differences to the designated state are indicated by a prefix to the
+ keyword
+\family typewriter
+Primary
+\family default
+:
+\family typewriter
+NotYet
+\family default
+ means that it
+\emph on
+should
+\emph default
+ become primary, but actually hasn't.
+ Vice versa,
+\family typewriter
+Remains
+\family default
+ means that it
+\emph on
+should
+\emph default
+ leave primary state in order to become secondary, but actually cannot do
+ that because the
+\family typewriter
+/dev/mars/mydata
+\family default
+ device is currently in use.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+%todo-primary{} == 0
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+%todo-primary{} == 1
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+%is-primary{} == 0
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+None
+\family default
+ /
+\family typewriter
+Secondary
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+NotYetPrimary
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+%is-primary{} == 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+RemainsPrimary
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+Primary
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+role-1and1
+\family default
+ A variant for internal use by 1&1 Internet AG.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+primarynode
+\family default
+ Display
+\family typewriter
+(none)
+\family default
+ or the hostname of the designated primary.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+primarynode-1and1
+\family default
+ A variant for internal use by 1&1 Internet AG.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+commstate
+\family default
+ When the last metadata communication to the designated primary is longer
+ ago than
+\family typewriter
+${window}
+\family default
+ (see also
+\family typewriter
+--window=
+\emph on
+seconds
+\family default
+\emph default
+ option), display that age in human readable form.
+ See also primitive macro
+\family typewriter
+%alive-age{}
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+syncinfo
+\family default
+ Shows an informational progress bar when sync is running.
+ Intended for humans.
+ Scripts should not rely on any details from this.
+ Scripts may use this only as an
+\emph on
+approximate
+\emph default
+ means for detecting progress (when comparing the
+\emph on
+full
+\emph default
+ output text to a prior version and finding
+\emph on
+any
+\emph default
+ difference, they may conclude that some progress has happened, however
+ small).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+syncinfo-1and1
+\family default
+ A variant for internal use by 1&1 Internet AG.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+replinfo
+\family default
+ Shows an informational progress bar when fetch is running.
+ This should not be used for scripting at all, because it contains realtime
+ information in human-readable form.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+replinfo-1and1
+\family default
+ A variant for internal use by 1&1 Internet AG.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+fetch-line
+\family default
+ Additional details, called by
+\family typewriter
+replinfo
+\family default
+.
+ Shows the amount of data to be fetched, as well as the current transfer
+ rate and a very rough estimation of the future duration.
+ When primitive macros
+\family typewriter
+%fetch-age{}
+\family default
+ or
+\family typewriter
+%fetch-lag{}
+\family default
+ exceed
+\family typewriter
+${window}
+\family default
+, their values are also displayed for human informational purposes.
+ See description of these primitive macros.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+replay-line
+\family default
+ Additional details, called by
+\family typewriter
+replinfo
+\family default
+.
+ Shows the amount of data to be replayed, as well as the current replay
+ rate and a very rough estimation of the future duration.
+ When primitive macro
+\family typewriter
+%replay-age{}
+\family default
+ exceeds
+\family typewriter
+${window}
+\family default
+, it is also displayed for human informational purposes.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+comminfo
+\family default
+ When the network communication is in an unusual condition, display it.
+ Otherwise, don't produce any output.
+\end_layout
+
+\begin_layout Subsection
+Predefined Primitive Macros
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Predefined-Trivial-Macros"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Intended for Humans
+\end_layout
+
+\begin_layout Standard
+In the following, shell glob notation
+\family typewriter
+{a,b}
+\family default
+ is used to document similar variants of similar macros in a single place.
+ When you actually call the macro, you must choose one of the possible variants
+ (excluding the braces).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+the-err-msg
+\family default
+ Show reported errors for a resource.
+ When the resource argument is missing or empty, show global error information.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+all-err-msg
+\family default
+ Like before, but show all information including those which are
+\family typewriter
+OK
+\family default
+.
+ This way, you get a list
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The list may be extended in future versions of MARS.
+\end_layout
+
+\end_inset
+
+ of
+\emph on
+all
+\emph default
+ potential error information present in the system.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{all,the}-wrn-msg
+\family default
+ Show all / reported warnings in the system.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{all,the}-inf-msg
+\family default
+ Show all / reported informational messages in the system.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{all,the}-msg
+\family default
+ Show all / reported messages regardless of their classification.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{all,the}-global-msg
+\family default
+ Show global messages not associated with any resource (the resource argument
+ of the
+\family typewriter
+marsadm
+\family default
+ command is ignored in this case).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{all,the}-global-{inf,wrn,err}-msg
+\family default
+ Ditto, but more specific.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{all,the}-pretty-{global-,}{inf-,wrn-,err-,}msg
+\family default
+ Ditto, but show numerical timestamps in a human readable form.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{all,the}-{global-,}{inf-,wrn-,err-,}count
+\family default
+ Instead of showing the messages, show their count (number of lines).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+errno-text
+\family default
+ This macro takes 1 argument, which must represent a Linux
+\family typewriter
+errno
+\family default
+ number, and converts it to human readable form (similar to the C
+\family typewriter
+strerror()
+\family default
+ function).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+todo-{attach,sync,fetch,replay,primary}
+\family default
+ Shows a boolean value (0 or 1) indicating the current state of the correspondin
+g todo switch (whether on or off).
+ The meaning of todo switches is illustrated in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-State-of"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+get-resource-{fat,err,wrn}
+\family default
+ Access to the internal error status files.
+ This is not an official interface and may thus change at any time without
+ notice.
+ Use this only for human inspection, not for scripting!
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ These macros, as well as the error status files, are likely to disappear
+ in future versions of MARS.
+ They should be used for debugging only.
+ At least when merging into the upstream Linux kernel, only the
+\family typewriter
+*-msg
+\family default
+ macros will likely survive.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+get-resource-{fat,err,wrn}-count
+\family default
+ Ditto, but get the number of lines instead of the text.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+replay-code
+\family default
+ Indicate the current state of logfile replay / recovery:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+(empty) Unknown.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+0 No replay is currently running.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+1 Replay is currently running.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+2 Replay has successfully stopped.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+<0 See Linux
+\family typewriter
+errno
+\family default
+ code.
+ Typically this indicates a damaged logfile, or another filesystem error
+ at
+\family typewriter
+/mars
+\family default
+.
+\end_layout
+
+\end_deeper
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+is-{attach,sync,fetch,replay,primary,module-loaded}
+\family default
+ Shows a boolean value (0 or 1) indicating the
+\emph on
+actual
+\emph default
+ state, whether the corresponding action has been actually carried out,
+ or not (yet).
+ Notice that the values indicated by
+\family typewriter
+is-*
+\family default
+ may differ from the
+\family typewriter
+todo-*
+\family default
+ values when something is not (yet) working.
+ More explanations can be found in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-State-of"
+
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+is-split-brain
+\family default
+ Shows whether split brain (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+) has been detected, or not.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+is-consistent
+\family default
+ Shows whether the
+\emph on
+underlying disk
+\emph default
+ is in a locally consistent state, i.e.
+ whether it
+\emph on
+could
+\emph default
+ be (potentially) detached and then used for read-only test-mounting
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that the
+\emph on
+writeback
+\emph default
+ at the primary side is out-of-order by default, for performance reasons.
+ Therefore, the underlying disk is only guaranteed to be consistent when
+ there is no data left to be written back.
+ Notice that this condition is racy by construction.
+ When your primary node crashes during writeback and then comes up again,
+ you must do a
+\family typewriter
+modprobe mars
+\family default
+ first in order to automatically replay the transaction logfiles, which
+ will automatically heal such temporary inconsistencies.
+\end_layout
+
+\end_inset
+
+.
+ Don't confuse this with the consistency of
+\family typewriter
+/dev/mars/mydata
+\family default
+, which is by construction
+\emph on
+always
+\emph default
+ locally consistent once it has appeared
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Exceptions are possible when using
+\family typewriter
+marsadm fake-sync
+\family default
+.
+ Even in split brain situations,
+\family typewriter
+marsadm primary --force
+\family default
+ tries to prevent any further potential exception as best as it can, by
+ not letting
+\family typewriter
+/dev/mars/mydata
+\family default
+ appear and by insisting on split brain resolution first.
+ In future implementations, this might change if more pressure is put on
+ the developer to sacrifice consistency in preference to not waiting for
+ a full logfile replay.
+\end_layout
+
+\end_inset
+
+.
+ By construction of MARS, the disk of secondaries will
+\emph on
+always
+\emph default
+ remain in a locally consistent state once the initial sync has finished
+ as well as the initial logfile replay.
+ Notice that local consistency does not necessarily imply actuality (see
+ high-level explanation in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Behaviour-of-MARS"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+is-emergency
+\family default
+ Shows whether emergency mode (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Emergency-Mode"
+
+\end_inset
+
+) has been entered for the named resource, or not.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+rest-space
+\family default
+ (global, no resource argument necessary) Shows the
+\emph on
+logically
+\emph default
+ available space in
+\family typewriter
+/mars/
+\family default
+, which may deviate from the physically available space as indicated by
+ the
+\family typewriter
+df
+\family default
+ command.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+get-{disk,device}
+\family default
+ Show the name of the underlying disk, or of the
+\family typewriter
+/dev/mars/mydata
+\family default
+ device (if it is available).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{disk,device}-present
+\family default
+ Show (as a boolean value) whether the underlying disk, or the
+\family typewriter
+/dev/mars/mydata
+\family default
+ device, is available.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+device-opened
+\family default
+ Show (as a number) how often
+\family typewriter
+/dev/mars/mydata
+\family default
+ has been actually opened, e.g.
+ by
+\family typewriter
+mount
+\family default
+ or by some processes like
+\family typewriter
+dd
+\family default
+, or by iSCSI, etc.
+\end_layout
+
+\begin_layout Subsubsection
+Intended for Scripting
+\end_layout
+
+\begin_layout Standard
+While complex macros may output a whole bunch of information, the following
+ primitive macros are outputting exactly one value.
+ They are intended for script use (cf.
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Scripting-HOWTO"
+
+\end_inset
+
+).
+ Of course, curious humans may also try them :)
+\end_layout
+
+\begin_layout Standard
+In the following, shell glob notation
+\family typewriter
+{a,b}
+\family default
+ is used to document similar variants of similar macros in a single place.
+ When you actually call the macro, you must choose one of the possible variants
+ (excluding the braces).
+\end_layout
+
+\begin_layout Paragraph
+Name Querying
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+cluster-members
+\family default
+ Show a newline-separated list of all host names participating in the cluster.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+resource-members
+\family default
+ Show a newline-separated list of all host names participating in the particular
+ resource
+\family typewriter
+%{res}
+\family default
+.
+ Notice that this may be a subset of
+\family typewriter
+%cluster-members{}
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{my,all}-resources
+\family default
+ Show a newline-separated list of either all resource names existing in
+ the cluster, or only those where the current host
+\family typewriter
+%{host}
+\family default
+ is member.
+ Optionally, you may specify the hostname as a parameter, e.g.
+
+\family typewriter
+%my-resources{
+\emph on
+otherhost
+\emph default
+}
+\family default
+.
+\end_layout
+
+\begin_layout Paragraph
+Amounts of Data Inquiry
+\end_layout
+
+\begin_layout Standard
+\begin_inset Float figure
+placement h
+wide false
+sideways false
+status open
+
+\begin_layout Plain Layout
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/fetch-replay-total.fig
+ width 80col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Caption Standard
+
+\begin_layout Plain Layout
+overview on amounts / cursors
+\begin_inset CommandInset label
+LatexCommand label
+name "fig:overview-on-amounts"
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The following macros are meaningful for both primary and secondary nodes:
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+deletable-size
+\family default
+ Show the total amount of
+\emph on
+locally present
+\emph default
+ logfile data which
+\emph on
+could
+\emph default
+ be deleted by
+\family typewriter
+marsadm log-delete-all mydata
+\family default
+.
+ This differs almost always from both
+\family typewriter
+replay-pos
+\family default
+ and
+\family typewriter
+occupied-size
+\family default
+ due to granularity reasons (only whole logfiles can be deleted).
+ Units are
+\emph on
+bytes
+\emph default
+, not kilobytes.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+occupied-size
+\family default
+ Show the total amount of
+\emph on
+locally present
+\emph default
+ logfile data (sum of all file sizes).
+ This is often roughly approximate to
+\family typewriter
+fetch-pos
+\family default
+, but it may differ vastly (in both directions) when logfiles are not completely
+ transferred, when some are damaged, during split brain, after a
+\family typewriter
+join-resource
+\family default
+ /
+\family typewriter
+invalidate
+\family default
+, or when the resource is in emergency mode (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Emergency-Mode"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+disk-size
+\family default
+ Show the size of the underlying local disk in bytes.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+resource-size
+\family default
+ Show the logical size of the resource in bytes.
+ When this value is lower than
+\family typewriter
+disk-size
+\family default
+, you are wasting space.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+device-size
+\family default
+ At a primary node, this may differ from
+\family typewriter
+resource-size
+\family default
+ only for a very short time during the
+\family typewriter
+resize
+\family default
+ operation.
+ At secondaries, there will be no difference.
+\end_layout
+
+\begin_layout Standard
+\noindent
+The following macros are only meaningful for resources in primary mode:
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+writeback-rest
+\family default
+ Show the amount of data which is already in the transaction logfile, but
+ has not yet been written back to the underlying disk.
+ This may be used for estimation of recovery time after a potential primary
+ crash.
+ The writeback buffer is explained by the graphics at
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:The-Transaction-Logger"
+plural "false"
+caps "false"
+noprefix "false"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+The following macros are only meaningful for resources in secondary mode.
+ By information theoretic limits, they can only tell what is
+\emph on
+locally known
+\emph default
+.
+ They
+\series bold
+cannot
+\series default
+ reflect the
+\begin_inset Quotes eld
+\end_inset
+
+true (global) state
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that according to Einstein's law, and according to observations by
+ Lamport, the concept of
+\begin_inset Quotes eld
+\end_inset
+
+true state
+\begin_inset Quotes erd
+\end_inset
+
+ does not exist at all in a distributed system.
+ Anything you can know in a distributed system is always local knowledge,
+ which races with other (remote) knowledge, and may be outdated at 
+\emph on
+any
+\emph default
+ time.
+\end_layout
+
+\end_inset
+
+
+\begin_inset Quotes erd
+\end_inset
+
+ of a cluster, in particular during network partitions.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{sync,fetch,replay,work}-size
+\family default
+ Show the total amount of data which is / was to be processed by either
+ sync, fetch, or replay.
+
+\family typewriter
+work-size
+\family default
+ is equivalent to
+\family typewriter
+fetch-size
+\family default
+.
+
+\family typewriter
+replay-size
+\family default
+ is equivalent to
+\family typewriter
+fetch-pos
+\family default
+ (see below).
+ Units are
+\emph on
+bytes
+\emph default
+, not kilobytes.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{sync,fetch,replay,work}-pos
+\family default
+ Show the total amount of data which is already processed (current
+\begin_inset Quotes eld
+\end_inset
+
+cursor
+\begin_inset Quotes erd
+\end_inset
+
+ position).
+
+\family typewriter
+work-pos
+\family default
+ is equivalent to
+\family typewriter
+replay-pos
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+The 0% point is the
+\emph on
+locally contiguous
+\emph default
+ amount of data since the last
+\family typewriter
+create-resource
+\family default
+,
+\family typewriter
+join-resource
+\family default
+, or
+\family typewriter
+invalidate
+\family default
+, or since the last emergency mode, but possibly shortened by
+\family typewriter
+log-delete
+\family default
+s.
+ Notice that the 0% point may be different on different cluster nodes, because
+ their resource history may be different or non-contiguous during split
+ brain, or after a
+\family typewriter
+join-resource
+\family default
+, or after
+\family typewriter
+invalidate
+\family default
+, or during / after emergency mode.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{sync,fetch,replay,work}-rest
+\family default
+ Shows the difference between
+\family typewriter
+*-size
+\family default
+ and
+\family typewriter
+*-pos
+\family default
+ (amount of work to do).
+
+\family typewriter
+work-rest
+\family default
+ is therefore the difference between
+\family typewriter
+fetch-size
+\family default
+ and
+\family typewriter
+replay-pos
+\family default
+, which is the
+\emph on
+total
+\emph default
+ amount of work to do (regardless whether to be fetched and/or to be replayed).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{sync,fetch,replay,work}-reached
+\family default
+ Boolean value indicating whether
+\family typewriter
+*-rest
+\family default
+ dropped down to zero
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Recall from chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Use-Cases-for"
+
+\end_inset
+
+ that MARS (in its current stage of development) does only guarantee local
+ consistency, but cannot guarantee actuality in all imaginable situations.
+ Notice that a general notion of
+\begin_inset Quotes eld
+\end_inset
+
+actuality
+\begin_inset Quotes erd
+\end_inset
+
+ is
+\emph on
+undefinable
+\emph default
+ in a widely distributed system at all, according to Einstein's laws.
+\end_layout
+
+\begin_layout Plain Layout
+Let's look at an example.
+ In case of a node crash, and after the node is up again, a
+\family typewriter
+modprobe mars
+\family default
+ has to occur, in order to replay the transaction logs of MARS again.
+ However, at the recovery phase before, the journalling
+\family typewriter
+ext4
+\family default
+ filesystem
+\family typewriter
+/mars/
+\family default
+
+\emph on
+may
+\emph default
+ have rolled back some internal symlink updates which have occurred immediately
+ before the crash.
+ MARS is relying on the fact that journalling filesystems like
+\family typewriter
+ext4
+\family default
+ should do their recovery in a consistent way, possibly by sacrificing actuality
+ a little bit.
+ Therefore, the above macros cannot guarantee to deliver true information
+ about what is persisted at the moment.
+\end_layout
+
+\begin_layout Plain Layout
+Notice that there are further potential caveats.
+\end_layout
+
+\begin_layout Plain Layout
+In case of
+\family typewriter
+{sync,fetch}-reached
+\family default
+, MARS uses
+\family typewriter
+bio
+\family default
+ callbacks resp.
+
+\family typewriter
+fdatasync()
+\family default
+ by default, thus the underlying storage layer has
+\emph on
+told
+\emph default
+ us that it
+\emph on
+believes
+\emph default
+ it has committed the data in a reboot-safe way.
+ Whether this is
+\emph on
+really
+\emph default
+ true does not depend on MARS, but on the lower layers of the storage hierarchy.
+ There exists hardware where this claim is known to be wrong under certain
+ circumstances, such as certain hard disk drives in certain modes of operation.
+ Please check the hardware for any violations of storage semantics under
+ certain circumstances such as power loss, and check information sources
+ like magazines about the problem area.
+ Please notice that such a problem, if it exists at all, is independent
+ from MARS.
+ It would also exist if you didn't use MARS on the same system.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{fetch,replay,work}-threshold-reached
+\family default
+ Boolean value indicating whether
+\family typewriter
+*-rest
+\family default
+ dropped down to
+\family typewriter
+%{threshold}
+\family default
+, which is pre-settable by the
+\family typewriter
+--threshold=
+\emph on
+size
+\family default
+\emph default
+ command line option (default is 10 MiB).
+ In asynchronous use cases of MARS, this should be preferred over
+\family typewriter
+*-reached
+\family default
+ for
+\emph on
+human display
+\emph default
+, because it produces less flickering by the inevitable replication delay.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{fetch,replay,work}-almost-reached
+\family default
+ Boolean value indicating whether
+\family typewriter
+*-rest
+\family default
+
+\emph on
+almost
+\emph default
+ /
+\emph on
+approximately
+\emph default
+ dropped down to zero.
+ The default is that at least 990 permille are reached.
+ In asynchronous use cases of MARS, this can be preferred over
+\family typewriter
+*-reached
+\family default
+ for
+\emph on
+human display
+\emph default
+ only, because it produces less flickering by the inevitable replication
+ delay.
+ However, don't base any decisions on this!
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{sync,fetch,replay,work}-percent
+\family default
+ The cursor position
+\family typewriter
+*-pos
+\family default
+ as a percentage of
+\family typewriter
+*-size
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{sync,fetch,replay,work}-permille
+\family default
+ The cursor position
+\family typewriter
+*-pos
+\family default
+ as permille of
+\family typewriter
+*-size
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{sync,fetch,replay,work}-rate
+\family default
+ Show the current throughput in bytes
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that the internal granularity reported by the kernel may be coarser,
+ such as KiB.
+ This interface abstracts away from kernel internals and thus presents
+ everything in byte units.
+\end_layout
+
+\end_inset
+
+ per second.
+
+\family typewriter
+work-rate
+\family default
+ is the
+\emph on
+maximum
+\emph default
+ of
+\family typewriter
+fetch-rate
+\family default
+ and
+\family typewriter
+replay-rate
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{sync,fetch,replay,work}-remain
+\family default
+ Show the
+\emph on
+estimated
+\emph default
+ remaining time for completion of the respective operation.
+ This is just a very raw guess.
+ Units are seconds.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+summary-vector
+\family default
+ Show the colon-separated CSV value
+\family typewriter
+%replay-pos{}:%fetch-pos{}:%fetch-size{}
+\family default
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+replay-basenr
+\family default
+ Get currently first reachable logfile number (see figure
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "fig:overview-on-amounts"
+
+\end_inset
+
+).
+ Only for curious humans or for debugging / monitoring - don't base any
+ decisions on this.
+ Use the
+\family typewriter
+*-{pos,size}
+\family default
+ macros instead.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{replay,fetch,work}-lognr
+\family default
+ Get current logfile number of replay or fetch position, or of the currently
+ known last reachable number (see figure
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "fig:overview-on-amounts"
+
+\end_inset
+
+).
+ Only for curious humans or for debugging / monitoring - don't base any
+ decisions on this.
+ Use the
+\family typewriter
+*-{pos,size}
+\family default
+ macros instead.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{replay,fetch,work}-logcount
+\family default
+ Get current number of logfiles which are already replayed, or are already
+ fetched, or are to be applied in total (see figure
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "fig:overview-on-amounts"
+
+\end_inset
+
+).
+ Only for curious humans or for debugging / monitoring - don't base any
+ decisions on this.
+ Use the
+\family typewriter
+*-{rest}
+\family default
+ macros instead.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+alive-timestamp
+\family default
+ Tell the Lamport Unix timestamp (seconds since 1970) of the last metadata
+ communication to the designated primary (or to any other host given by
+ the first argument).
+ Returns
+\begin_inset Formula $-1$
+\end_inset
+
+ if no such host exists.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{fetch,replay,work}-timestamp
+\family default
+ Tell the Lamport Unix timestamp (seconds since 1970) when the last progress
+ has been made.
+ When no such action exists,
+\begin_inset Formula $-1$
+\end_inset
+
+ is returned.
+
+\family typewriter
+%work-timestamp{
+\emph on
+hostname
+\emph default
+}
+\family default
+ is the maximum of
+\family typewriter
+%fetch-timestamp{
+\emph on
+hostname
+\emph default
+}
+\family default
+ and
+\family typewriter
+%replay-timestamp{
+\emph on
+hostname
+\emph default
+}
+\family default
+.
+ When the parameter
+\family typewriter
+\emph on
+hostname
+\family default
+\emph default
+ is empty, the local host will be reported (default).
+ Example usage:
+\family typewriter
+marsadm view all --macro=
+\begin_inset Quotes erd
+\end_inset
+
+%replay-timestamp{%todo-primary{}}
+\begin_inset Quotes erd
+\end_inset
+
+
+\family default
+ shows the timestamp of the last reported
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Updates of this information are occurring with lower frequency than actual
+ writebacks, for performance reasons.
+ The metadata network update protocol will add further delays.
+ Therefore, the accuracy is only in the range of minutes.
+\end_layout
+
+\end_inset
+
+ writeback action at the designated primary.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{alive,fetch,replay,work}-age
+\family default
+ Tell the number of seconds since the last respective action, or
+\begin_inset Formula $-1$
+\end_inset
+
+ if none exists.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+{alive,fetch,replay,work}-lag
+\family default
+ Report the time difference (in seconds) between the last
+\emph on
+known
+\emph default
+ action at the local host and at the designated primary (or between any
+ other hosts when 2 parameters are given).
+ Returns
+\begin_inset Formula $-1$
+\end_inset
+
+ if no such action exists at any of the two hosts.
+ Attention! This need not reflect the
+\emph on
+actual
+\emph default
+ state in case of networking problems.
+ Don't draw wrong conclusions from a high
+\family typewriter
+{fetch,replay}-lag
+\family default
+ value: it could also mean that simply no write operation at all has occurred
+ at the primary side for a long time.
+ Conversely, a low lag value does not imply that the replication is recent:
+ it may refer to
+\emph on
+different
+\emph default
+ write operations at each of the hosts; therefore it only tells that
+\emph on
+some
+\emph default
+ progress has been made, but says nothing about the amount of the progress.
+\end_layout
+
+\begin_layout Paragraph
+Misc Informational Status
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+get-primary
+\family default
+ Return the name of the current designated primary node as locally known.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+actual-primary
+\family default
+ (deprecated) try to determine the name of the node which
+\emph on
+appears
+\emph default
+ to be the actual primary.
+ This is only a
+\series bold
+\emph on
+guess
+\series default
+\emph default
+, because it is not generally unique in split brain situations! Don't use
+ this macro.
+ Instead, use
+\family typewriter
+is-primary
+\family default
+ on those nodes you are interested in.
+ The explanations from section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-State-of"
+
+\end_inset
+
+ also apply to
+\family typewriter
+get-primary
+\family default
+ versus
+\family typewriter
+actual-primary
+\family default
+ analogously.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+is-alive
+\family default
+ Boolean value indicating whether all other nodes participating in
+\family typewriter
+mydata
+\family default
+ are reachable / healthy.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+uuid
+\family default
+ (global) Show the unique identifier created by
+\family typewriter
+create-cluster
+\family default
+ or by
+\family typewriter
+create-uuid
+\family default
+.
+ Hint: this is immutable, and it is firmly bound to the
+\family typewriter
+/mars/
+\family default
+ filesystem.
+ It can only be destroyed by deleting the whole filesystem (see section
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "leave-cluster"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+tree
+\family default
+ (global) Indicate symlink tree version (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Symlink-Tree"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Paragraph
+Experts Only
+\end_layout
+
+\begin_layout Standard
+The following is for hackers who know what they are doing.
+ The following is not officially supported.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+wait-{is,todo}-{attach,sync,fetch,replay,primary}-{on,off}
+\family default
+ This may be used to program some useful waiting conditions in advanced
+ macro scripts.
+ Use at your own risk!
+\end_layout
+
+\begin_layout Section
+Creating your own Macros
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Creating-your-own"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In order to create your own macros, you could start writing them from scratch
+ with your favorite ASCII text editor.
+ However, it is much easier to take an existing macro and to customize it
+ to your needs.
+ In addition, you can learn something about macro programming by looking
+ at the existing macro code.
+\end_layout
+
+\begin_layout Standard
+Go to a new empty directory and say
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm dump-macros
+\end_layout
+
+\begin_layout Standard
+in order to get the most interesting complex macros, or say
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm dump-all-macros
+\end_layout
+
+\begin_layout Standard
+in order to additionally get some primitive macros which could be customized
+ if needed.
+ This will write lots of files
+\family typewriter
+*.tpl
+\family default
+ into your current working directory.
+\end_layout
+
+\begin_layout Standard
+Any modified or new macro file should be placed either into the current working
+ directory
+\family typewriter
+./
+\family default
+ , or into
+\family typewriter
+$HOME/.marsadm/
+\family default
+ , or into
+\family typewriter
+/etc/marsadm/
+\family default
+ .
+ They will be searched in this order, and the first match will win.
+ When no macro file is found, the built-in version will be used if it exists.
+ This way, you may override builtin macros.
+\end_layout
+
+\begin_layout Standard
+Example: if you have a file
+\family typewriter
+./mymacro.tpl
+\family default
+ you just need to say
+\family typewriter
+marsadm view-mymacro mydata
+\family default
+ in order to invoke it in the resource context
+\family typewriter
+mydata
+\family default
+.
+\end_layout
+
+\begin_layout Subsection
+General Macro Syntax
+\end_layout
+
+\begin_layout Standard
+Macros are simple ASCII text, enriched with calls to other macros.
+\end_layout
+
+\begin_layout Standard
+ASCII text outside of comments is copied to the output verbatim.
+ Comments are skipped.
+ Comments may have one of the following well-known forms:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+# skipped text until / including next newline character
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+// skipped text until / including next newline character
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+/* skipped text including any newline characters */
+\end_layout
+
+\begin_layout Itemize
+denoted as Perl regex:
+\family typewriter
+
+\backslash
+
+\backslash
+
+\backslash
+n
+\backslash
+s*
+\family default
+(single backslash directly followed by a newline character, and eating up
+ any whitespace characters at the beginning of the next line) Hint: this
+ may be fruitfully used to structure macros in a more readable form / indentatio
+n.
+\end_layout
+
+\begin_layout Standard
+Special characters are always initiated by a backslash.
+ The following pre-defined special character sequences are recognized:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+
+\backslash
+n
+\family default
+ newline
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+
+\backslash
+r
+\family default
+ return (useful for DOS compatibility)
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+
+\backslash
+t
+\family default
+ tab
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+
+\backslash
+f
+\family default
+ formfeed
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+
+\backslash
+b
+\family default
+ backspace
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+
+\backslash
+a
+\family default
+ alarm (bell)
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+
+\backslash
+e
+\family default
+ escape (e.g.
+ for generating ANSI escape sequences)
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+
+\backslash
+
+\family default
+ followed by anything else: assure that the next character is taken verbatim.
+ Although possible, please don't use this for escaping letters, because
+ further escape sequences might be pre-defined in future.
+ Best practice is to use this only for escaping the backslash itself, or
+ for escaping the percent sign when you don't want to call a macro (protect
+ against evaluation), or to escape a brace directly after a macro call (verbatim
+ brace not to be interpreted as a macro parameter).
+\end_layout
+
+\begin_layout Itemize
+All other characters stand for themselves.
+ If you like, you should be able to produce XML, HTML, JSON and other ASCII-base
+d output formats this way.
+\end_layout
+
+\begin_layout Standard
+Macro calls have the following syntax:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%
+\emph on
+macroname
+\emph default
+{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}{
+\emph on
+argn
+\emph default
+}
+\end_layout
+
+\begin_layout Itemize
+Of course, arguments may be empty, denoted as
+\family typewriter
+{}
+\end_layout
+
+\begin_layout Itemize
+It is possible to supply more arguments than required.
+ These are simply ignored.
+\end_layout
+
+\begin_layout Itemize
+There must always be at least 1 argument, even for parameterless macros.
+ In such a case, it is good style to leave it empty (even if it is actually
+ ignored).
+ Just write
+\family typewriter
+%parameterlessmacro{}
+\family default
+ in such a case.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{
+\emph on
+varname
+\emph default
+}
+\family default
+ syntax: As a special case, the macro name may be empty, but then the first
+ argument must denote a previously defined variable (such as assigned via
+
+\family typewriter
+%let{varname}{myvalue}
+\family default
+, or a pre-defined standard variable like
+\family typewriter
+%{res}
+\family default
+ for the current resource name, see later paragraph
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "par:Predefined-Variables"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Itemize
+Of course, macro calls may be (almost) arbitrarily nested.
+\end_layout
+
+\begin_layout Itemize
+Of course, the
+\emph on
+correctness
+\emph default
+ of nesting of braces must be generally obeyed, as usual in any other macro
+ processor language.
+ General rule: for each opening brace, there must be exactly one closing
+ brace somewhere afterwards.
+\end_layout
+
+\begin_layout Standard
+These rules are hopefully simple and intuitive.
+ There are currently no exceptions.
+ In particular, there is no special infix operator syntax for arithmetic
+ expressions, and therefore no operator precedence rules are necessary.
+ You have to write nested arithmetic expressions always in the above prefix
+ syntax, like
+\family typewriter
+%*{7}{%+{2}{3}}
+\family default
+ (similar to non-reverse Polish notation).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ When deeply nesting macros and their braces, you may easily feel like you
+ are back in the good old days of Lisp.
+ Use the above backslash-newline syntax to indent your macros in a readable
+ and structured way.
+ Fortunately, modern text editors like (x)emacs or vim have modes for dealing
+ with the correctness of nested braces.
+\end_layout
+
+\begin_layout Subsection
+Calling Builtin / Primitive Macros
+\end_layout
+
+\begin_layout Standard
+Primitive macros can be called in two alternate forms:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%primitive-
+\emph on
+macroname
+\emph default
+{
+\emph on
+something
+\emph default
+}
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%
+\emph on
+macroname
+\emph default
+{
+\emph on
+something
+\emph default
+}
+\end_layout
+
+\begin_layout Standard
+When using the
+\family typewriter
+%primitive-*{}
+\family default
+ form, you
+\emph on
+explicitly disallow
+\emph default
+ interception of the call by a
+\family typewriter
+*.tpl
+\family default
+ file.
+ Otherwise, you may override the standard definition even of primitive macros
+ by your own template files.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Notice that
+\family typewriter
+%call{}
+\family default
+ conventions are used in such a case.
+ The parameters are passed via
+\family typewriter
+%{0}
+\family default
+
+\begin_inset Formula $\ldots$
+\end_inset
+
+
+\family typewriter
+%{n}
+\family default
+ variables (see description below).
+\end_layout
+
+\begin_layout Paragraph
+Standard MARS State Inspection Macros
+\end_layout
+
+\begin_layout Standard
+These are already described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Predefined-Trivial-Macros"
+
+\end_inset
+
+.
+ When calling one of them, the call will simply expand to the corresponding
+ value.
+\end_layout
+
+\begin_layout Standard
+Example:
+\family typewriter
+%get-primary{}
+\family default
+ will expand to the hostname of the current designated primary node.
+\end_layout
+
+\begin_layout Paragraph
+Further MARS State Inspection Macros
+\end_layout
+
+\begin_layout Paragraph
+Variable Access Macros
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%let{
+\emph on
+varname
+\emph default
+}{
+\emph on
+expression
+\emph default
+}
+\family default
+Evaluates both
+\family typewriter
+\emph on
+varname
+\family default
+\emph default
+ and the
+\family typewriter
+\emph on
+expression
+\family default
+\emph default
+.
+ The
+\family typewriter
+\emph on
+expression
+\family default
+\emph default
+ is then assigned to
+\family typewriter
+varname
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%append{
+\emph on
+varname
+\emph default
+}{
+\emph on
+expression
+\emph default
+}
+\family default
+Evaluates both
+\family typewriter
+\emph on
+varname
+\family default
+\emph default
+ and the
+\family typewriter
+\emph on
+expression
+\family default
+\emph default
+.
+ The
+\family typewriter
+\emph on
+expression
+\family default
+\emph default
+ is then appended to
+\family typewriter
+varname
+\family default
+ (concatenation).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{
+\emph on
+varname
+\emph default
+}
+\family default
+Evaluates
+\family typewriter
+\emph on
+varname
+\family default
+\emph default
+, and outputs the value of the corresponding variable.
+ When the variable does not exist, the empty string is returned.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{++}{
+\emph on
+varname
+\emph default
+}
+\family default
+or
+\family typewriter
+%{
+\emph on
+varname
+\emph default
+}{++}
+\family default
+ Has the obvious well-known side effect e.g.
+ from C or Java.
+ You may also use
+\family typewriter
+--
+\family default
+ instead of
+\family typewriter
+++
+\family default
+.
+ This is handy for programming loops (see below).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%dump-vars{}
+\family default
+Writes all currently defined variables (from the currently active scope)
+ to
+\family typewriter
+stderr
+\family default
+.
+ This is handy for debugging.
+\end_layout
+
+\begin_layout Paragraph
+CSV Array Macros
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{
+\emph on
+varname
+\emph default
+}{
+\emph on
+delimiter
+\emph default
+}{
+\emph on
+index
+\emph default
+}
+\family default
+Evaluates all arguments.
+ The contents of
+\family typewriter
+\emph on
+varname
+\family default
+\emph default
+ is interpreted as a comma-separated list, delimited by
+\family typewriter
+\emph on
+delimiter
+\family default
+\emph default
+.
+ The
+\family typewriter
+\emph on
+index
+\family default
+\emph default
+'th list element is returned.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%set{
+\emph on
+varname
+\emph default
+}{
+\emph on
+delimiter
+\emph default
+}{
+\emph on
+index
+\emph default
+}{
+\emph on
+expression
+\emph default
+}
+\family default
+Evaluates all arguments.
+ The contents of the old
+\family typewriter
+\emph on
+varname
+\family default
+\emph default
+ is interpreted as a comma-separated list, delimited by
+\family typewriter
+\emph on
+delimiter
+\family default
+\emph default
+.
+ The
+\family typewriter
+\emph on
+index
+\family default
+\emph default
+'th list element is then assigned to, or substituted by,
+\family typewriter
+\emph on
+expression
+\family default
+\emph default
+.
+\end_layout
+
+\begin_layout Paragraph
+Arithmetic Expression Macros
+\end_layout
+
+\begin_layout Standard
+The following macros can also take more than two arguments, carrying out
+ the corresponding arithmetic operation in sequence (it depends on the operator
+ whether this accords to the associative law).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%+{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+ Evaluates the arguments, interprets them as numbers, and adds them together.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%-{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Subtraction.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%*{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Multiplication.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%/{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Division.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%%{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Modulus.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%&{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Bitwise Binary And.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%|{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Bitwise Binary Or.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%^{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Bitwise Binary Exclusive Or.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%<<{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Binary Shift Left.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%>>{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Binary Shift Right.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%min{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Compute the arithmetic minimum of the arguments.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%max{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Compute the arithmetic maximum of the arguments.
+\end_layout
+
+\begin_layout Paragraph
+Boolean Condition Macros
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%=={
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Numeral Equality.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%!={
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Numeral Inequality.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%<{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+ Numeral Less Than.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%<={
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Numeral Less or Equal.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%>{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+ Numeral Greater Than.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%>={
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Numeral Greater or Equal.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%eq{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+
+\begin_inset space ~
+\end_inset
+
+String Equality.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%ne{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+String Inequality.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%lt{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+ String Less Than.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%le{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+String Less or Equal.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%gt{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+ String Greater Than.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%ge{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+String Greater or Equal.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%=~{
+\emph on
+string
+\emph default
+}{
+\emph on
+regex
+\emph default
+}{
+\emph on
+opts
+\emph default
+}
+\family default
+or
+\family typewriter
+%match{
+\emph on
+string
+\emph default
+}{
+\emph on
+regex
+\emph default
+}{
+\emph on
+opts
+\emph default
+}
+\family default
+ Checks whether
+\family typewriter
+\emph on
+string
+\family default
+\emph default
+ matches the Perl regular expression
+\family typewriter
+\emph on
+regex
+\family default
+\emph default
+.
+ Modifiers can be given via
+\family typewriter
+\emph on
+opts
+\family default
+\emph default
+.
+\end_layout
+
+\begin_layout Paragraph
+Shortcut Evaluation Operators
+\end_layout
+
+\begin_layout Standard
+The following operators evaluate their arguments only when needed (like
+ in C).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%&&{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Logical And.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%and{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Alias for
+\family typewriter
+%&&{}
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%||{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Logical Or.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%or{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}
+\family default
+Alias for
+\family typewriter
+%||{}
+\family default
+.
+\end_layout
+
+\begin_layout Paragraph
+Unary Operators
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%!{
+\emph on
+arg
+\emph default
+}
+\family default
+Logical Not.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%not{
+\emph on
+arg
+\emph default
+}
+\family default
+Alias for
+\family typewriter
+%!{}
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%~{
+\emph on
+arg
+\emph default
+}
+\family default
+ Bitwise Negation.
+\end_layout
+
+\begin_layout Paragraph
+String Functions
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%length{
+\emph on
+string
+\emph default
+}
+\family default
+Return the number of ASCII characters present in
+\family typewriter
+\emph on
+string
+\family default
+\emph default
+.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%toupper{
+\emph on
+string
+\emph default
+}
+\family default
+Return all ASCII characters converted to uppercase.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%tolower{
+\emph on
+string
+\emph default
+}
+\family default
+Return all ASCII characters converted to lowercase.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%append{
+\emph on
+varname
+\emph default
+}{
+\emph on
+string
+\emph default
+}
+\family default
+Equivalent to
+\family typewriter
+%let{
+\emph on
+varname
+\emph default
+}{%{
+\emph on
+varname
+\emph default
+}
+\emph on
+string
+\emph default
+}
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%subst{
+\emph on
+string
+\emph default
+}{
+\emph on
+regex
+\emph default
+}{
+\emph on
+subst
+\emph default
+}{
+\emph on
+opts
+\emph default
+}
+\family default
+Perl regex substitution.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%sprintf{
+\emph on
+fmt
+\emph default
+}{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}{
+\emph on
+argn
+\emph default
+}
+\family default
+Perl
+\family typewriter
+sprintf()
+\family default
+ operator.
+ For details, see the Perl manual.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%human-number{
+\emph on
+unit
+\emph default
+}{
+\emph on
+delim
+\emph default
+}{
+\emph on
+unit-sep
+\emph default
+}{
+\emph on
+number
+\emph default
+1}{
+\emph on
+number
+\emph default
+2}
+\begin_inset Formula $\ldots$
+\end_inset
+
+
+\family default
+Convert a number or a list of numbers into human-readable
+\family typewriter
+B
+\family default
+,
+\family typewriter
+KiB
+\family default
+,
+\family typewriter
+MiB
+\family default
+,
+\family typewriter
+GiB
+\family default
+,
+\family typewriter
+TiB
+\family default
+, as given by
+\family typewriter
+\emph on
+unit
+\family default
+\emph default
+.
+ When
+\family typewriter
+\emph on
+unit
+\family default
+\emph default
+ is empty, a reasonable unit will be guessed automatically from the maximum
+ of all given numbers.
+ A single result string is produced, where multiple numbers are separated
+ by
+\family typewriter
+\emph on
+delim
+\family default
+\emph default
+ when necessary.
+ When
+\family typewriter
+\emph on
+delim
+\family default
+\emph default
+ is empty, the slash symbol
+\family typewriter
+/
+\family default
+ is used by default (the most obvious use case is result strings like
+\family typewriter
+
+\begin_inset Quotes eld
+\end_inset
+
+17/32 KiB
+\begin_inset Quotes erd
+\end_inset
+
+
+\family default
+).
+ The final unit text is separated from the previous number(s) by
+\family typewriter
+\emph on
+unit-sep
+\family default
+\emph default
+.
+ When
+\family typewriter
+\emph on
+unit-sep
+\family default
+\emph default
+ is empty, a single blank is used by default.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%human-seconds{
+\emph on
+number
+\emph default
+}
+\family default
+Convert the given number of seconds into
+\family typewriter
+hh:mm:ss
+\family default
+ format.
+\end_layout
+
+\begin_layout Paragraph
+Complex Helper Macros
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%progress{20}
+\family default
+Return a string containing a progress bar showing the values from
+\family typewriter
+%summary-vector{}
+\family default
+.
+ The default width is 20 characters plus two braces.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%progress{20}{
+\emph on
+minvalue
+\emph default
+}{
+\emph on
+midvalue
+\emph default
+}{
+\emph on
+maxvalue
+\emph default
+}
+\family default
+Instead of taking the values from
+\family typewriter
+%summary-vector{}
+\family default
+, use the supplied values.
+
+\family typewriter
+minvalue
+\family default
+ and
+\family typewriter
+midvalue
+\family default
+ indicate two different intermediate points, while
+\family typewriter
+maxvalue
+\family default
+ will determine the 100% point.
+\end_layout
+
+\begin_layout Paragraph
+Control Flow Macros
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%if{
+\emph on
+expression
+\emph default
+}{
+\emph on
+then-part
+\emph default
+}
+\family default
+ or
+\family typewriter
+%if{
+\emph on
+expression
+\emph default
+}{
+\emph on
+then-part
+\emph default
+}{
+\emph on
+else-part
+\emph default
+}
+\family default
+ Like in any other macro or programming language, this evaluates the
+\family typewriter
+expression
+\family default
+ once, not copying its outcome to the output.
+ If the result is non-empty and is not a string denoting the number
+\family typewriter
+0
+\family default
+, the
+\family typewriter
+\emph on
+then-part
+\family default
+\emph default
+ is evaluated and copied to the output.
+ Otherwise, the
+\family typewriter
+else-part
+\family default
+ is evaluated and copied, provided that one exists.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%unless{
+\emph on
+expression
+\emph default
+}{
+\emph on
+then-part
+\emph default
+}
+\family default
+ or
+\family typewriter
+%unless{
+\emph on
+expression
+\emph default
+}{
+\emph on
+then-part
+\emph default
+}{
+\emph on
+else-part
+\emph default
+}
+\family default
+ Like
+\family typewriter
+%if{}
+\family default
+, but the expression is logically negated.
+ Essentially, this is a shorthand for
+\family typewriter
+%if{%not{expression}}{...}
+\family default
+ or similar.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%elsif{
+\emph on
+expr1
+\emph default
+}{
+\emph on
+then1
+\emph default
+}{
+\emph on
+expr2
+\emph default
+}{
+\emph on
+then2
+\emph default
+}
+\family default
+
+\begin_inset Formula $\ldots$
+\end_inset
+
+ or
+\family typewriter
+%elsif{
+\emph on
+expr1
+\emph default
+}{
+\emph on
+then1
+\emph default
+}{
+\emph on
+expr2
+\emph default
+}{
+\emph on
+then2
+\emph default
+}
+\family default
+
+\begin_inset Formula $\ldots$
+\end_inset
+
+
+\family typewriter
+{
+\emph on
+odd-else-part
+\emph default
+}
+\family default
+ This is for simplification of boring if-else-if chains.
+ The classical if-syntax (as shown above) has the drawback that inner if-parts
+ need to be nested into outer else-parts, so rather deep nestings may occur
+ when you are programming longer chains.
+ This is an alternate syntax for avoidance of deep nesting.
+ When giving an odd number of arguments, the last argument is taken as final
+ else-part.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%elsunless
+\family default
+
+\begin_inset Formula $\ldots$
+\end_inset
+
+ Like
+\family typewriter
+%elsif
+\family default
+, but
+\emph on
+all
+\emph default
+ conditions are negated.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%while{
+\emph on
+expression
+\emph default
+}{
+\emph on
+body
+\emph default
+}
+\family default
+Evaluates the
+\family typewriter
+\emph on
+expression
+\family default
+\emph default
+ in a while loop, like in any other macro or programming language.
+ The
+\family typewriter
+\emph on
+body
+\family default
+\emph default
+ is evaluated exactly as many times as the
+\family typewriter
+\emph on
+expression
+\family default
+\emph default
+ holds.
+ Notice that endless loops can only be avoided by calling a non-pure macro
+ inspecting external state information, or by creating (and checking) another
+ side effect somewhere, like assigning to a variable somewhere.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%until{
+\emph on
+expression
+\emph default
+}{
+\emph on
+body
+\emph default
+}
+\family default
+Like
+\family typewriter
+ %while{
+\emph on
+expression
+\emph default
+}{
+\emph on
+body
+\emph default
+}
+\family default
+, but negate the expression.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%for{
+\emph on
+expr1
+\emph default
+}{
+\emph on
+expr2
+\emph default
+}{
+\emph on
+expr3
+\emph default
+}{
+\emph on
+body
+\emph default
+}
+\family default
+ As you will expect from the corresponding C, Perl, Java, or (add your favorite
+ language) construct.
+ Only the syntactic sugar is a little bit different.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%foreach{
+\emph on
+varname
+\emph default
+}{
+\emph on
+CSV-delimited-string
+\emph default
+}{
+\emph on
+delimiter
+\emph default
+}{
+\emph on
+body
+\emph default
+}
+\family default
+ As you can expect from similar
+\family typewriter
+foreach
+\family default
+ constructs in other languages like Perl.
+ Currently, the macro processor has no arrays, but can use comma-separated
+ strings as a substitute.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%eval{
+\emph on
+count
+\emph default
+}{
+\emph on
+body
+\emph default
+}
+\family default
+ Evaluates the
+\family typewriter
+\emph on
+body
+\family default
+\emph default
+ exactly as many times as indicated by the numeric argument
+\family typewriter
+\emph on
+count
+\family default
+\emph default
+.
+ This may be used to re-evaluate the output of other macros once again.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%protect{
+\emph on
+body
+\emph default
+}
+\family default
+ Equivalent to
+\family typewriter
+%eval{0}{
+\emph on
+body
+\emph default
+}
+\family default
+, which means that the body is not evaluated at all, but copied to the output
+ verbatim
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+TeX
+\end_layout
+
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+or
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+LaTeX
+\end_layout
+
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+fans usually know what this is good for ;)
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%eval-down{
+\emph on
+body
+\emph default
+}
+\family default
+ Evaluates the
+\family typewriter
+\emph on
+body
+\family default
+\emph default
+ in a loop until the result does not change any more
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Mathematicians knowing Banach's fixed-point theorem will know what this is
+ good for ;)
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%tmp{
+\emph on
+body
+\emph default
+}
+\family default
+ Evaluates the
+\family typewriter
+\emph on
+body
+\family default
+\emph default
+ once in a temporary scope which is thrown away afterwards.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%call{
+\emph on
+macroname
+\emph default
+}{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}{
+\emph on
+argn
+\emph default
+}
+\family default
+ Like in many other macro languages, this evaluates the named macro in
+ a new scope.
+ This means that any side effects produced by the called macro, such as
+ variable assignments, will be reverted after the call, and therefore not
+ influence the old scope.
+ However notice that the arguments
+\family typewriter
+\emph on
+arg1
+\family default
+\emph default
+ to
+\family typewriter
+\emph on
+argn
+\family default
+\emph default
+ are evaluated in the
+\emph on
+old
+\emph default
+ scope before the call actually happens (possibly producing side effects
+ if they contain some), and their result is respectively assigned to
+\family typewriter
+%{1}
+\family default
+ until
+\family typewriter
+%{
+\emph on
+n
+\emph default
+}
+\family default
+ in the new scope, analogously to the Shell or to Perl.
+ In addition, the new
+\family typewriter
+%{0}
+\family default
+ gets the
+\family typewriter
+\emph on
+macroname
+\family default
+\emph default
+.
+ Notice that the argument evaluation happens non-lazily in the old scope
+ and therefore differs from other macro processors like
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+TeX
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%include{
+\emph on
+macroname
+\emph default
+}{
+\emph on
+arg1
+\emph default
+}{
+\emph on
+arg2
+\emph default
+}{
+\emph on
+argn
+\emph default
+}
+\family default
+ Like
+\family typewriter
+%call{}
+\family default
+, but evaluates the named macro in the
+\emph on
+current
+\emph default
+ scope (similar to the
+\family typewriter
+source
+\family default
+ command of the Bourne shell).
+ This means that any side effects produced by the called macro, such as
+ variable assignments, will
+\emph on
+not
+\emph default
+ be reverted after the call.
+ Even the
+\family typewriter
+%{0}
+\family default
+ until
+\family typewriter
+%{
+\emph on
+n
+\emph default
+}
+\family default
+ variables will continue to exist (and may lead to confusion if you aren't
+ aware of that).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%callstack{}
+\family default
+ Useful for debugging: show the current chain of macro invocations.
+\end_layout
+
+\begin_layout Paragraph
+Time Handling Macros
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%time{}
+\family default
+ Return the current Lamport timestamp (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Lamport-Clock"
+
+\end_inset
+
+), in units of seconds since the Unix epoch.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%real-time{}
+\family default
+ Return the current system clock timestamp, in units of seconds since the
+ Unix epoch.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%sleep{
+\emph on
+seconds
+\emph default
+}
+\family default
+ Pause the given number of seconds.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%timeout{
+\emph on
+seconds
+\emph default
+}
+\family default
+ Like
+\family typewriter
+%sleep{
+\emph on
+seconds
+\emph default
+}
+\family default
+, but abort the
+\family typewriter
+marsadm
+\family default
+ command after the total waiting time has exceeded the timeout given by
+ the
+\family typewriter
+--timeout=
+\family default
+ parameter.
+\end_layout
+
+\begin_layout Paragraph
+Misc Macros
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%warn{
+\emph on
+text
+\emph default
+}
+\family default
+ Show a WARNING:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%die{
+\emph on
+text
+\emph default
+}
+\family default
+ Abort execution with an error message.
+\end_layout
+
+\begin_layout Paragraph
+Experts Only - Risky
+\end_layout
+
+\begin_layout Standard
+The following macros are unstable and may change at any time without notice.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%get-msg{
+\emph on
+name
+\emph default
+}
+\family default
+ Low-level access to system messages.
+ You should not use this, since this is not extensible (you must know the
+ name in advance).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%readlink{
+\emph on
+path
+\emph default
+}
+\family default
+ Low-level access to symlinks.
+ Don't misuse this for circumvention of the abstraction macros from the
+ symlink tree!
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%setlink{
+\emph on
+value
+\emph default
+}{
+\emph on
+path
+\emph default
+}
+\family default
+ Low-level creation of symlinks.
+ Don't misuse this for circumvention of the abstraction macros for the symlink
+ tree!
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%fetch-info{}
+\family default
+etc.
+ Low-level access to internal symlink formats.
+ Don't use this in scripts! Only for curious humans.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%is-almost-consistent{}
+\family default
+ Whatever you guess what this could mean, don't use it, at least never in
+ place of
+\family typewriter
+%is-consistent{}
+\family default
+ - it is risky to base decisions on this.
+ Mostly for historical reasons.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%does{
+\emph on
+name
+\emph default
+}
+\family default
+Equivalent to
+\family typewriter
+%is-
+\emph on
+name
+\emph default
+{}
+\family default
+ (just more handy for computing the macro name).
+ Use with care!
+\end_layout
+
+\begin_layout Subsection
+Predefined Variables
+\begin_inset CommandInset label
+LatexCommand label
+name "par:Predefined-Variables"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{cmd}
+\family default
+The command argument of the invoked
+\family typewriter
+marsadm
+\family default
+ command.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{res}
+\family default
+The resource name given to the
+\family typewriter
+marsadm
+\family default
+ command as a command line parameter (or, possibly expanded from
+\family typewriter
+all
+\family default
+).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{resdir}
+\family default
+The corresponding resource directory.
+ The current version of MARS uses
+\family typewriter
+/mars/resource-%{res}/
+\family default
+, but this may change in future.
+ Normally, you should not need this, since everything should already be abstracted
+ for you.
+ In case you
+\emph on
+really
+\emph default
+ need low-level access to something, please prefer this variable over
+\family typewriter
+%{mars}/resource-%{res}
+\family default
+ because it is a bit more abstracted.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{mars}
+\family default
+Currently the fixed string
+\family typewriter
+/mars
+\family default
+.
+ This may change in future, probably with the advent of MARS Full.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{host}
+\family default
+The hostname of the local node.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{ip}
+\family default
+The IP address of the local node.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{timeout}
+\family default
+The value given by the
+\family typewriter
+--timeout=
+\family default
+ option, or the corresponding default value.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{threshold}
+\family default
+The value given by the
+\family typewriter
+--threshold=
+\family default
+ option, or the corresponding default value.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{window}
+\family default
+The value given by the
+\family typewriter
+--window=
+\family default
+ option, or the corresponding default value (60s).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{force}
+\family default
+The number of times the
+\family typewriter
+--force
+\family default
+ option has been given.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{dry-run}
+\family default
+The number of times the
+\family typewriter
+--dry-run
+\family default
+ option has been given.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{verbose}
+\family default
+The number of times the
+\family typewriter
+--verbose
+\family default
+ option has been given.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+%{callstack}
+\family default
+Same as the
+\family typewriter
+%callstack{}
+\family default
+ macro.
+ The latter gives you an opportunity for overriding, while the former is
+ firmly built in.
+\end_layout
+
+\begin_layout Section
+Scripting HOWTO
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Scripting-HOWTO"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Both the
+\series bold
+asynchronous communication model
+\series default
+ of MARS (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Lamport-Clock"
+
+\end_inset
+
+) including the Lamport clock, and the
+\series bold
+state model
+\series default
+ (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-State-of"
+
+\end_inset
+
+) are something you
+\emph on
+definitely
+\emph default
+ should have in mind when you want to do some scripting.
+ Here is some further concrete advice:
+\end_layout
+
+\begin_layout Itemize
+Don't access anything on
+\family typewriter
+/mars/
+\family default
+ directly, except for debugging purposes.
+ Use
+\family typewriter
+marsadm
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+Avoid running scripts in parallel, other than for inspection / monitoring
+ purposes.
+ When you give two
+\family typewriter
+marsadm
+\family default
+ commands in parallel (whether on the same host, or on different hosts belonging
+ to the same cluster), it is very likely to produce a mess.
+
+\family typewriter
+marsadm
+\family default
+ has no internal locking.
+ There is no cluster-wide locking at all.
+ Unfortunately, some systems like Pacemaker are violating this in many cases
+ (depending on their configuration).
+ Best is if you have a dedicated / more or less centralized
+\series bold
+control machine
+\series default
+ which controls masses of your georedundant working servers.
+ This reduces the risk of running interfering actions in parallel.
+ Of course, you need backup machines for your control machines, and in different
+ locations.
+ Not obeying this advice can easily lead to problems such as complex races
+ which are very difficult to solve in long-distance distributed systems,
+ even in general (not limited to MARS).
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm wait-cluster
+\family default
+ is your friend.
+ Whenever your (near-)central script has to switch between different hosts
+
+\family typewriter
+A
+\family default
+ and
+\family typewriter
+B
+\family default
+ (of the same cluster), use it in the following way:
+\begin_inset Newline newline
+\end_inset
+
+
+\family typewriter
+ssh A
+\begin_inset Quotes eld
+\end_inset
+
+marsadm action1
+\begin_inset Quotes erd
+\end_inset
+
+; ssh B
+\begin_inset Quotes eld
+\end_inset
+
+marsadm wait-cluster; marsadm action2
+\begin_inset Quotes erd
+\end_inset
+
+
+\begin_inset Newline newline
+\end_inset
+
+
+\family default
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Don't ignore this advice! Interference is almost
+\emph on
+sure
+\emph default
+! As a rule of thumb, precede almost any action command with some appropriate
+ waiting command!
+\end_layout
+
+\begin_layout Itemize
+Further friends are any
+\family typewriter
+marsadm wait-*
+\family default
+ commands, such as
+\family typewriter
+wait-umount
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+In some places, busy-wait loops might be needed, e.g.
+ for waiting until a specific resource is
+\family typewriter
+UpToDate
+\family default
+ or matches some other condition.
+ Examples of waiting conditions can be found under
+\family typewriter
+github.com/schoebel/test-suite
+\family default
+ in subdirectory
+\family typewriter
+mars/modules/
+\family default
+, specifically
+\family typewriter
+02_predicates.sh
+\family default
+ or similar.
+\end_layout
+
+\begin_layout Itemize
+In case of network problems, some command may hang (forever), if you don't
+ set the
+\family typewriter
+--timeout=
+\family default
+ option.
+ Don't forget to check the return state of any failed / timed-out commands,
+ and to take appropriate measures!
+\end_layout
+
+\begin_layout Itemize
+Test your scripts in failure scenarios!
+\end_layout
+
+\begin_layout Chapter
+The Sysadmin Interface (
+\family typewriter
+marsadm
+\family default
+ and
+\family typewriter
+/proc/sys/mars/
+\family default
+)
+\family typewriter
+
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:The-Sysadmin-Interface"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In general, the term
+\begin_inset Quotes eld
+\end_inset
+
+after a while
+\begin_inset Quotes erd
+\end_inset
+
+ means that other cluster nodes will take notice of your actions according
+ to the
+\begin_inset Quotes eld
+\end_inset
+
+eventually consistent
+\begin_inset Quotes erd
+\end_inset
+
+ propagation protocol described in sections
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Lamport-Clock"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Symlink-Tree"
+
+\end_inset
+
+.
+ Please be aware that this
+\begin_inset Quotes eld
+\end_inset
+
+while
+\begin_inset Quotes erd
+\end_inset
+
+ may last very long in case of network outages or bad firewall rules.
+\end_layout
+
+\begin_layout Standard
+In the following tables, column
+\begin_inset Quotes eld
+\end_inset
+
+Cmp
+\begin_inset Quotes erd
+\end_inset
+
+ means compatibility with DRBD.
+ Please note that 100% exact compatibility is not possible, because of the
+ asynchronous communication paradigm.
+\end_layout
+
+\begin_layout Standard
+The following table documents common options which work with (almost) any
+ command:
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Option
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--dry-run
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Run the command without actually creating symlinks or touching files or
+ executing rsync.
+ This option
+\emph on
+should
+\emph default
+ be used first with any dangerous command, in order to check what would happen.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Don't use in scripts! Only use by hand!
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This option does not change the waiting logic.
+ Many commands are waiting until the desired effect has taken place.
+ However, with
+\family typewriter
+--dry-run
+\family default
+ the desired effect will never happen, so the command may wait forever (or
+ abort with a timeout).
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+In addition, this option can lead to additional aborts of the commands due
+ to unmet conditions, which cannot be met because the symlinks are not actually
+ created / altered.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Thus this option can give only a
+\series bold
+rough estimate
+\series default
+ of what would happen later!
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--force
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+almost
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Some preconditions are skipped, i.e.
+ the command will / should work although some (more or less) vital preconditions
+ are violated.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Instead of giving
+\family typewriter
+--force
+\family default
+, you may alternatively prefix your command with
+\family typewriter
+force-
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ THIS OPTION IS DANGEROUS!
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Use it only when you are absolutely sure that you know what you are doing!
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Use it only as a last resort if the same command without
+\family typewriter
+--force
+\family default
+ has failed
+\emph on
+for no good reason
+\emph default
+!
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--ignore-sync
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+almost
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Use this for a
+\emph on
+planned
+\emph default
+ handover instead of
+\family typewriter
+--force
+\family default
+.
+ Only one precondition is relaxed: some sync may be running somewhere.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Careful when using this on extremely huge LVs where the sync may take several
+ days, or weeks.
+ It is your sysadmin decision what you want to prefer: restarting the sync,
+ or planned handover.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--verbose
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Some (few) commands will become more verbose.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--timeout=$seconds
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Some commands require response from either the local kernel module, or from
+ other cluster nodes.
+ In order to prevent infinite waiting in case of network outages or other
+ problems, the command will fail after the given timeout has been reached.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+When $seconds is -1, the command will wait forever.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+When $seconds is 0, the command will not wait in case any precondition is
+ not met, and abort without performing an action.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+The default timeout is 5s.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--window=$seconds
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+The time window for checking the aliveness of other nodes in the network.
+ When no symlink updates have occurred during the last window, the node
+ is considered dead.
+ Default is 60s.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--threshold=$size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+The macros containing the substring
+\family typewriter
+-threshold-
+\family default
+ or
+\family typewriter
+-almost-
+\family default
+ are using this as a default value for approximation whether something has
+ been approximately reached.
+ Default is 10MiB.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+The $size argument may be a number optionally followed by one of the lowercase
+ characters k m g t p for indicating kilo mega giga tera or peta bytes as
+ multiples of 1000.
+ When using the corresponding uppercase character, multiples of 1024 are
+ formed instead.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--host=$host
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+The command acts as if the command were executed on another host $host.
+ This option should not be used regularly, because the local information
+ in the symlink tree may be outdated or even wrong.
+ Additionally, some local information like remote sizes of physical devices
+ (e.g.
+ remote disks) is not present in the symlink tree at all, or is wrong (reflectin
+g only the
+\emph on
+local
+\emph default
+ state).
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ THIS OPTION IS DANGEROUS!
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Use it only for final destruction of dead cluster nodes, see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Final-Destroy-of"
+
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--ip=$ip
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+By default,
+\family typewriter
+marsadm
+\family default
+ always uses the IP for
+\family typewriter
+$host
+\family default
+ as stored in the symlink tree (directory
+\family typewriter
+/mars/ips/
+\family default
+).
+ When such an IP entry does not (yet) exist (e.g.
+
+\family typewriter
+create-cluster
+\family default
+ or
+\family typewriter
+join-cluster
+\family default
+), all local network interfaces are automatically scanned for IPv4 addresses,
+ and the first one is taken.
+ This may lead to wrong decisions if you have multiple network interfaces.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+In order to override the automatic IP detection and to explicitly tell the
+ IP address of your storage network, use this option.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+Usually you will need this only at
+\family typewriter
+{create,join}-cluster
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+--verbose
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Some (few) commands will become more verbose.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Cluster Operations
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Cluster-Operations"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+create-cluster
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the
+\family typewriter
+/mars/
+\family default
+ filesystem must be mounted and it must be empty (
+\family typewriter
+mkfs.ext4
+\family default
+, see instructions in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Setup-your-Cluster"
+
+\end_inset
+
+).
+ The kernel module must
+\emph on
+not
+\emph default
+ be loaded.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the initial symlink tree is created in
+\family typewriter
+/mars/
+\family default
+.
+ Additionally, the
+\family typewriter
+/mars/uuid
+\family default
+ symlink is created for later distribution in the cluster.
+ It uniquely identifies the cluster in the world.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This must be called exactly once at the initial primary.
+
+\end_layout
+
+\begin_layout Plain Layout
+Hint: use the
+\family typewriter
+--ip=
+\family default
+ option if you have multiple interfaces.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+join-cluster
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$host
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the
+\family typewriter
+/mars/
+\family default
+ filesystem must be mounted and it must be empty (
+\family typewriter
+mkfs.ext4
+\family default
+, see instructions in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Setup-your-Cluster"
+
+\end_inset
+
+).
+ The kernel module must
+\emph on
+not
+\emph default
+ be loaded.
+ The cluster must have been already created at another node
+\family typewriter
+$host
+\family default
+.
+ A working ssh connection to $host as root must exist (without password).
+
+\family typewriter
+rsync
+\family default
+ must be installed at all cluster nodes.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the initial symlink tree
+\family typewriter
+/mars/
+\family default
+ is replicated from the remote host
+\family typewriter
+$host
+\family default
+, and the local host has been added as another cluster member.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This must be called exactly once at every initial secondary node.
+\end_layout
+
+\begin_layout Plain Layout
+Hint: use the
+\family typewriter
+--ip=
+\family default
+ option if you have multiple interfaces.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+leave-cluster
+\begin_inset CommandInset label
+LatexCommand label
+name "leave-cluster"
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the
+\family typewriter
+/mars/
+\family default
+ filesystem must be mounted and it must contain a valid MARS symlink tree
+ produced by the other
+\family typewriter
+marsadm
+\family default
+ commands.
+ The local node must no longer be a member of any resource (see
+\family typewriter
+marsadm leave-resource
+\family default
+).
+ The kernel module should be loaded and the network should be operating
+ in order to also propagate the effect to the other nodes.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the local node is removed from the replicated symlink tree
+
+\family typewriter
+/mars/
+\family default
+ such that other nodes will cease to communicate with it after a while.
+ The converse is not true: the local node may continue
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Reason:
+\family typewriter
+leave-cluster
+\family default
+ removes only its
+\emph on
+own
+\emph default
+ IP address from
+\family typewriter
+/mars/ips/
+\family default
+, but does not destroy the usual symmetry of the symlink tree by leaving
+ the other IPs intact.
+ Therefore, the local node will continue fetching updates from all nodes
+ present in
+\family typewriter
+/mars/ips/
+\family default
+.
+ As an effect, the local node will
+\emph on
+passively
+\emph default
+ mirror the symlinks of other cluster members, but not vice versa.
+ There is no communication from the local node to the other ones, turning
+ the local node into a
+\series bold
+witness
+\series default
+ according to some terminology from Distributed Systems.
+ This is a feature, not a bug.
+ It could be used for post-mortem analysis, or for monitoring purposes.
+ However,
+\emph on
+deletions
+\emph default
+ of symlinks are not guaranteed to take place, so your witness may
+\emph on
+accumulate
+\emph default
+ thousands of old symlinks over a long time.
+ If you want to eventually stop all communication to the local node, just
+ run
+\family typewriter
+rmmod
+\family default
+.
+\end_layout
+
+\end_inset
+
+ passively fetching the symlink tree.
+ In order to really stop all communication, the kernel module should be
+ unloaded afterwards.
+ The local
+\family typewriter
+/mars/
+\family default
+ filesystem may be manually destroyed after that (at least if you need to
+ reuse it).
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+In case of an eventual node loss (e.g.
+ fire, water, ...) this command should be used on another node $helper in order
+ to finally remove $damaged from the cluster via the command
+\family typewriter
+marsadm leave-cluster --host=$damaged --force
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+In case you cannot use
+\family typewriter
+leave-resource
+\family default
+ for any reason, you may do the following: just destroy the
+\family typewriter
+/mars/
+\family default
+ filesystem on the host
+\family typewriter
+$deadhost
+\family default
+ you want to remove (e.g.
+ by
+\family typewriter
+mkfs
+\family default
+), or take other measures to
+\emph on
+ensure
+\emph default
+ that it cannot be accidentally re-used in any way (e.g.
+ physical destruction of the underlying RAID,
+\family typewriter
+lvremove
+\family default
+, etc).
+ On all other hosts, do
+\family typewriter
+rmmod mars
+\family default
+, then delete the symlink
+\family typewriter
+/mars/ips/ip-$deadhost
+\family default
+ everywhere by hand, and finally
+\family typewriter
+modprobe mars
+\family default
+ again.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+Notice that the last
+\family typewriter
+leave-resource
+\family default
+ operation does not delete the cluster as such.
+ It just creates an
+\emph on
+empty
+\emph default
+ cluster which no longer has any members.
+ In particular, the cluster ID
+\family typewriter
+/mars/uuid
+\family default
+ is
+\emph on
+not
+\emph default
+ removed, deliberately
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This is a feature, not a bug.
+ The
+\family typewriter
+uuid
+\family default
+ is created once, but never altered anywhere.
+ The only way to get rid of it is
+\emph on
+external
+\emph default
+ deletion (not by
+\family typewriter
+marsadm
+\family default
+)
+\emph on
+together(!)
+\emph default
+ with all other contents of
+\family typewriter
+/mars/
+\family default
+.
+ This prevents you from accidentally merging half-dead remains which could
+ have survived a disaster for any reason, such as snapshotting filesystems
+ / VMs or whatever.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+Before you can re-use
+\emph on
+any
+\emph default
+ left-over
+\family typewriter
+/mars/
+\family default
+ filesystem for creating / joining a new / different cluster, you
+\emph on
+must
+\emph default
+ obey the instructions in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Setup-your-Cluster"
+
+\end_inset
+
+ and use
+\family typewriter
+mkfs.ext4
+\family default
+ accordingly.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+merge-cluster
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$host
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the set of resources at the local cluster (transitively) and
+ at the cluster of
+\family typewriter
+$host
+\family default
+ (transitively) must be disjoint.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Create the union of both clusters, consisting of the union of all participating
+ machines (transitively).
+ Resource memberships are unaffected.
+ This is useful for creating a
+\begin_inset Quotes eld
+\end_inset
+
+virtual LVM cluster
+\begin_inset Quotes erd
+\end_inset
+
+ where resources can be migrated later via
+\family typewriter
+join-resource
+\family default
+ /
+\family typewriter
+leave-resource
+\family default
+ operations.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+Attention! The mars branch
+\family typewriter
+0.1.y
+\family default
+ does not scale well in number of cluster members, because it evolved from
+ a lab prototype with
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ behaviour at metadata exchange.
+ Never exceed the maximum cluster members as described in appendix
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "chap:Technical-Data-MARS"
+
+\end_inset
+
+.
+ For safety, you should better stay at 1/2 of the numbers mentioned there.
+ Use
+\family typewriter
+split-cluster
+\family default
+ for going back to smaller clusters again after your background data migration
+ has completed.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+Future versions of MARS, starting with branch
+\family typewriter
+0.1b.y
+\family default
+ will be constructed for very big clusters in the range of thousands of
+ nodes.
+ Development has not yet stabilized there, and operational experiences are
+ missing at the moment.
+ Be careful until official announcements are appearing in the ChangeLog,
+ reporting of operational experiences from the 1&1 big cluster at metadata
+ level.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+merge-cluster-check
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$host
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Check in advance whether the set of resources at the local cluster and at
+ the other cluster
+\family typewriter
+$host
+\family default
+ are disjoint.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+split-cluster
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This is almost the inverse operation of
+\family typewriter
+merge-cluster
+\family default
+: it determines the minimum sub-cluster groups participating in some common
+ resources.
+ Then it splits the cluster memberships such that unnecessary connections
+ between non-related nodes are interrupted.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Use this for avoidance of too big clusters.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+wait-cluster
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+See section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Waiting"
+
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+create-uuid
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+Deprecated.
+ Only for compatibility with old version light0.1beta05 or earlier.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the
+\family typewriter
+/mars/
+\family default
+ filesystem must be mounted.
+ A
+\family typewriter
+uuid
+\family default
+ (such as automatically created by recent versions of
+\family typewriter
+marsadm create-cluster
+\family default
+) must not already exist; i.e.
+ you have a very old and outdated symlink tree.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the
+\family typewriter
+/mars/uuid
+\family default
+ symlink is created for later distribution in the cluster.
+ It uniquely identifies the cluster in the world.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This must be called at most once at the current primary.
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Resource Operations
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Resource-Operations"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Common precondition for all resource operations is that the
+\family typewriter
+/mars/
+\family default
+ filesystem is mounted, that it contains a valid MARS symlink tree produced
+ by other
+\family typewriter
+marsadm
+\family default
+ commands (including a unique
+\family typewriter
+uuid
+\family default
+), that your current node is a valid member of the cluster, and that the
+ kernel module is loaded.
+ When communication is impossible due to network outages or bad firewall
+ rules, most commands will succeed, but other cluster nodes may take a long
+ time to notice your changes.
+\end_layout
+
+\begin_layout Standard
+Instead of executing
+\family typewriter
+marsadm
+\family default
+ commands several times for each resource argument, you may give the special
+ resource argument
+\family typewriter
+all
+\family default
+.
+ This works even when combined with
+\family typewriter
+--force
+\family default
+, but be cautious when giving dangerous command combinations like
+\family typewriter
+marsadm delete-resource --force all
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+In newer versions of
+\family typewriter
+marsadm
+\family default
+, you may give a comma-separated list of resource names in place of
+\family typewriter
+all
+\family default
+.
+ This way, you have more fine-grained control over the set of resource names
+ you want to use.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Beware when combining this with
+\family typewriter
+--host=somebody
+\family default
+.
+ In some very rare cases, like final destruction of a whole datacenter after
+ an earthquake, you might need a combination like
+\family typewriter
+marsadm --host=defective delete-resource --force all
+\family default
+.
+ Don't use such combinations if you don't need them
+\emph on
+really
+\emph default
+! You can easily shoot yourself in your head if you are not carefully operating
+ such commands!
+\end_layout
+
+\begin_layout Subsection
+Resource Creation / Deletion / Modification
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Resource-Creation"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+create-resource
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$disk_dev
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+[$mars_name]
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+[$size]
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the resource argument
+\family typewriter
+$res
+\family default
+ must not denote an already existing resource name in the cluster.
+ The argument
+\family typewriter
+$disk_dev
+\family default
+ must denote an absolute path to a usable local block device, its size must
+ be greater than zero.
+ When the optional
+\family typewriter
+$mars_name
+\family default
+ is given, that name must not already exist on the local node; when not
+ given,
+\family typewriter
+$mars_name
+\family default
+ defaults to
+\family typewriter
+$res
+\family default
+.
+ When the optional
+\family typewriter
+$size
+\family default
+ argument is given, it must be a number, optionally followed by a lowercase
+ suffix
+\family typewriter
+k
+\family default
+,
+\family typewriter
+m
+\family default
+,
+\family typewriter
+g
+\family default
+,
+\family typewriter
+t
+\family default
+, or
+\family typewriter
+p
+\family default
+ (denoting size factors as multiples of 1000), or an uppercase suffix
+\family typewriter
+K
+\family default
+,
+\family typewriter
+M
+\family default
+,
+\family typewriter
+G
+\family default
+,
+\family typewriter
+T
+\family default
+ or
+\family typewriter
+P
+\family default
+ (denoting size factors as multiples of 1024).
+ The given size must not exceed the actual size of
+\family typewriter
+$disk_dev
+\family default
+.
+ It will specify the future resource size as shown by
+\family typewriter
+marsadm view-resource-size $res
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the resource
+\family typewriter
+$res
+\family default
+ is created, the initial role of the current node is primary.
+ The corresponding symlink tree information is asynchronously distributed
+ in the cluster (in the background).
+ The device
+\family typewriter
+/dev/mars/$mars_name
+\family default
+ should appear after a while.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Notice: when
+\family typewriter
+$size
+\family default
+ is strictly smaller than the size of
+\family typewriter
+$disk_dev
+\family default
+, you will unnecessarily waste some space.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This must be called exactly once for any new resource.
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+join-resource
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$disk_dev
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+[$mars_name]
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the resource argument
+\family typewriter
+$res
+\family default
+ must denote an already existing resource in the cluster (i.e.
+ its symlink tree information must have been received).
+ The resource must have a designated primary, and it must not be in emergency
+ mode.
+ There must not exist a split brain in the cluster.
+ The local node must not already be a member of that resource.
+ The argument
+\family typewriter
+$disk_dev
+\family default
+ must denote an absolute path to a usable (but currently unused) local block
+ device, its size must be greater or equal to the logical size of the resource.
+ When the optional
+\family typewriter
+$mars_name
+\family default
+ is given, that name must not already exist on the local node; when not
+ given,
+\family typewriter
+$mars_name
+\family default
+ defaults to
+\family typewriter
+$res
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the current node becomes a member of resource
+\family typewriter
+$res
+\family default
+, the initial role is secondary.
+ The initial full sync should start after a while.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Notice: when the size of $disk_dev is strictly greater than the size of
+ the resource, you will unnecessarily waste some space.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+After a while, state
+\family typewriter
+Orphan
+\family default
+ should be left.
+ Don't forget to regularly monitor for longer occurrences of
+\family typewriter
+Orphan
+\family default
+!
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+leave-resource
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local node must be a member of the resource
+\family typewriter
+$res
+\family default
+; its current role must be secondary.
+ Sync, fetch and replay must be paused (see commands
+\family typewriter
+pause-{sync,fetch,replay}
+\family default
+ or their abbreviation
+\family typewriter
+down
+\family default
+).
+ The disk must be detached (see commands
+\family typewriter
+detach
+\family default
+ or
+\family typewriter
+down
+\family default
+).
+ The kernel module should be loaded and the network should be operating
+ in order to also propagate the effect to the other nodes.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the local node is no longer a member of
+\family typewriter
+$res
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Notice: as a side effect for other nodes, their
+\family typewriter
+log-delete
+\family default
+ may now become possible, since the current node does no longer count as
+ a candidate for logfile application.
+ In addition, a split brain situation may be (partly) resolved by this.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ Please notice that this command
+\emph on
+may
+\emph default
+ lead to (but does not guarantee) split-brain resolution.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+The contents of the disk are not changed by this command.
+ Before issuing this command, check whether the disk appears to be locally
+ consistent (see
+\family typewriter
+view-is-consistent
+\family default
+)! After giving this command, any internal information indicating the consistenc
+y state will be gone, and you will no longer be able to guess consistency
+ properties.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ When you are
+\emph on
+sure
+\emph default
+ that the disk was consistent before (or is now by manually checking it),
+ you may re-create a new resource out of it via
+\family typewriter
+create-resource
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+In case of an eventual node loss (e.g.
+ fire, water, ...) this command may be used on another node $helper in order
+ to finally remove all the resources $damaged from the cluster via the command
+
+\family typewriter
+marsadm leave-resource $res --host=$damaged --force
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+delete-resource
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the resource must be empty (i.e.
+ all members must have left via
+\family typewriter
+leave-resource
+\family default
+).
+ This precondition is overridable by
+\family typewriter
+--force
+\family default
+, increasing the danger to maximum! It is even possible to combine
+\family typewriter
+--force
+\family default
+ with an invalid resource argument and an invalid
+\family typewriter
+--host=somebodyelse
+\family default
+ argument in order to desperately try to destroy remains of incomplete or
+ physically damaged hardware.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: all cluster members will eventually be forcefully removed from
+
+\family typewriter
+$res
+\family default
+.
+ In case of network interruptions, the forced removal may take place far
+ in the future.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ THIS COMMAND IS
+\emph on
+VERY
+\emph default
+ DANGEROUS!
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Use this only in desperate situations, and only manually.
+ Don't call this from scripts.
+ You are forcefully using a sledgehammer, even without
+\family typewriter
+--force
+\family default
+! The danger is that the
+\emph on
+true
+\emph default
+ state of other cluster nodes need not be known in case of network problems.
+ Even if it were known, it could be compromised by
+\series bold
+byzantine failures
+\series default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+It is strongly advised to try this command with
+\family typewriter
+--dry-run
+\family default
+ first.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+When combined with
+\family typewriter
+--force
+\family default
+, this command will definitely
+\series bold
+murder
+\series default
+ other cluster nodes, possibly after a long while, and even when they are
+ operating in primary mode / having split brains / etc.
+ However, there is no guarantee that other cluster nodes will be
+\emph on
+really
+\emph default
+ dead – it is (theoretically) possible that they remain only
+\emph on
+half
+\emph default
+
+\emph on
+dead
+\emph default
+.
+ For example, a half dead node may continue to write data to
+\family typewriter
+/mars/
+\family default
+ and thus eventually lead to overflow.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+This command implies a forceful detach, possibly destroying consistency.
+
+\size scriptsize
+It is similar in spirit to a
+\series bold
+STONITH
+\series default
+.
+ In particular, when a cluster node was operating in primary mode (
+\family typewriter
+/dev/mars/mydata
+\family default
+ being continuously in use), the forceful detach cannot be carried out until
+ the device is completely unused.
+ In the meantime, the current transaction logfile will be appended to, but
+ the file
+\emph on
+might
+\emph default
+ be already unlinked (orphan file filling up the disk).
+ After the forceful detach, the underlying disk need not be consistent (although
+ MARS does its best).
+ Since this command deletes any symlinks which normally would indicate the
+ consistency state, no guarantees about consistency can be given after this
+
+\emph on
+in general
+\emph default
+! Always check consistency by hand!
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+When possible / as soon as possible, check the local state on the other
+ nodes in order to
+\emph on
+really
+\emph default
+ shutdown the resource everywhere (e.g.
+ to
+\emph on
+really
+\emph default
+ unuse the
+\family typewriter
+/dev/mars/mydata
+\family default
+ device, etc).
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+After this command, you
+\emph on
+should
+\emph default
+ rebuild the resource under a different name, in order to avoid any clashes
+ caused by unexpected resurrection of
+\begin_inset Quotes eld
+\end_inset
+
+dead
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+half-dead
+\begin_inset Quotes erd
+\end_inset
+
+ nodes (beware of snapshot / restores on virtual machines!!).
+ MARS does its best to avoid problems even in case the new resource name
+ should equal the old one, but there can be
+\emph on
+no guarantee
+\emph default
+ in all possible failure scenarios / usage scenarios.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+When possible, prefer
+\family typewriter
+leave-resource
+\family default
+ over this!
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+wait-resource
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+{is-,}{attach,
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+ primary,
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+ device}{-off,}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+See section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Waiting"
+
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Operation of the Resource
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Operation-of-the"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Common preconditions are the preconditions from section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Resource-Operations"
+
+\end_inset
+
+, plus the respective resource
+\family typewriter
+$res
+\family default
+ must exist, and the local node must be a member of it.
+ With the single exception of
+\family typewriter
+attach
+\family default
+ itself, all other operations must be started in
+\family typewriter
+attached
+\family default
+ state.
+\end_layout
+
+\begin_layout Standard
+When
+\family typewriter
+$res
+\family default
+ has the special reserved value
+\family typewriter
+all
+\family default
+, the following operations will work on all resources where the current
+ node is a member (analogously to DRBD).
+\end_layout
+
+\begin_layout Standard
+With newer versions of
+\family typewriter
+marsadm
+\family default
+, you can also give a list of comma-separated resource names in place of
+
+\family typewriter
+all
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+attach
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local disk belonging to $res is not in use by anyone else.
+ Its contents have not been altered in the meantime since the last
+\family typewriter
+detach
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+Mounting
+\emph on
+read-only
+\emph default
+ is allowed during the detached phase.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+However, be careful! If you
+\emph on
+accidentally
+\emph default
+ forget to give the right readonly-mount flags, if you use
+\family typewriter
+fsck
+\family default
+ in repair mode in between, or alter the disk content in any other way (beware
+ of LVM snapshots / restores etc), you will almost certainly produce an
+
+\series bold
+unnoticed inconsistency
+\series default
+ (not reported by
+\family typewriter
+view-is-consistent
+\family default
+)! MARS has
+\emph on
+no chance
+\emph default
+ to notice suchalike!
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: MARS uses the local disk and is able to work with it (e.g.
+ replay logfiles on it).
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Note: the local disk is opened in exclusive read-write mode.
+ This should protect against most common misuse, such as opening the disk
+ in parallel to MARS.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+However, this does not necessarily protect against non-exclusive openers.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+detach
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local
+\family typewriter
+/dev/mars/mydata
+\family default
+ device (when present) is no longer opened by anybody.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the local disk belonging to $res is no longer in use.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+In contrast to DRBD, you need not explicitly pause syncing, fetching, or
+ replaying
+\emph on
+to
+\emph default
+ (as opposed to
+\emph on
+from
+\emph default
+) the local disk.
+ These processes are automatically paused.
+ As another contrast to DRBD, the respective processes will usually
+\emph on
+automatically
+\emph default
+ resume after re-attach, as far as possible in the respective new situation.
+ This will usually work even over
+\family typewriter
+rmmod
+\family default
+ or reboot cycles, since the internal symlink tree will automatically persist
+ all todo switches for you (c.f.
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-State-of"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+Notice: only
+\emph on
+local
+\emph default
+ transfer operations
+\emph on
+to
+\emph default
+ the local disk are paused by a detach.
+ When another node is remotely running a sync
+\emph on
+from
+\emph default
+ your local disk, it will likely remain in use for remote reading.
+ The reason is that the server part of MARS is operating purely passively,
+ in order to serve all remote requests as best as possible (similar to the
+ original Unix philosophy).
+ In order to really stop all accesses, do a
+\family typewriter
+pause-sync
+\family default
+ on all other resource members where a sync is currently running.
+ You may also try
+\family typewriter
+pause-sync-global
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+WARNING! After this, and after having paused any remote data access, you
+ might use the underlying disk for your own purposes, such as test-mounting
+ it in
+\emph on
+readonly
+\emph default
+ mode.
+
+\series bold
+Don't modify
+\series default
+ its contents in any way! Not even by an
+\family typewriter
+fsck
+\family default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Some (but not all)
+\family typewriter
+fsck
+\family default
+ tools for some filesystems have options to start only a test repair / verify
+ mode / dry run, without doing actual modifications to the data.
+ Of course, these modes
+\emph on
+can
+\emph default
+ be used.
+ But be really sure! Double-check for the right options!
+\end_layout
+
+\end_inset
+
+! Otherwise, you will have inconsistencies
+\emph on
+guaranteed
+\emph default
+.
+ MARS has no way for knowing of any modifications to your disk when bypassing
+
+\family typewriter
+/dev/mars/*
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+In case you accidentally modified the underlying disk at the
+\emph on
+primary
+\emph default
+ side, you may choose to resolve the inconsistencies by
+\family typewriter
+marsadm invalidate $res
+\family default
+ on
+\emph on
+each
+\emph default
+ secondary.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-sync
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+pause-sync-local
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-sync-local
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: none additionally.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: any sync operation targeting the local disk (when not yet
+ completed) is paused after a while (cf section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-State-of"
+
+\end_inset
+
+).
+ When successfully completed, this operation will remember the switch state
+ forever and automatically become relevant if a sync is needed again (e.g.
+
+\family typewriter
+invalidate
+\family default
+ or
+\family typewriter
+resize
+\family default
+).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-sync-global
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Like
+\family typewriter
+*-local
+\family default
+, but operates on all members of the resource.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-sync
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+resume-sync-local
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-sync-local
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: additionally, a primary must be designated, and it must not
+ be in emergency mode.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: any sync operation targeting the local disk (when not yet
+ completed) is resumed after a while.
+ When completed, this operation will remember the switch state forever and
+ become relevant if a sync is needed again (e.g.
+
+\family typewriter
+invalidate
+\family default
+ or
+\family typewriter
+resize
+\family default
+).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-sync-global
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Like
+\family typewriter
+*-local
+\family default
+, but operates on all members of the resource.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-fetch
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+pause-fetch-local
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-fetch-local
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: none additionally.
+ The resource
+\emph on
+should
+\emph default
+ be in secondary role.
+ Otherwise the switch has
+\emph on
+no
+\emph default
+
+\emph on
+immediate
+\emph default
+ effect, but will come (possibly unexpectedly) into effect whenever secondary
+ role is entered later for whatever reason.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: any transfers of (parts of) transaction logfiles which are
+ present at another primary host to the local
+\family typewriter
+/mars/
+\family default
+ storage are paused at their current stage.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+This switch works independently from
+\family typewriter
+{pause,resume}-replay
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-fetch-global
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Like
+\family typewriter
+*-local
+\family default
+, but operates on all members of the resource.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-fetch
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+resume-fetch-local
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-fetch-local
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: none additionally.
+ The resource
+\emph on
+should
+\emph default
+ be in secondary role.
+ Otherwise the switch has
+\emph on
+no
+\emph default
+
+\emph on
+immediate
+\emph default
+ effect, but will come (possibly unexpectedly) into effect whenever secondary
+ role is entered later for whatever reason.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: any (parts of) transaction logfiles which are present at
+ another primary host should be transferred to the local
+\family typewriter
+/mars/
+\family default
+ storage as far as not yet locally present.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+This works independently from
+\family typewriter
+{pause,resume}-replay
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-fetch-global
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Like
+\family typewriter
+*-local
+\family default
+, but operates on all members of the resource.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-replay
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+pause-replay-local
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-replay-local
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: none additionally.
+ The resource
+\emph on
+should
+\emph default
+ be in secondary role.
+ Otherwise the switch has
+\emph on
+no
+\emph default
+
+\emph on
+immediate
+\emph default
+ effect, but will come (possibly unexpectedly) into effect whenever secondary
+ role is entered later for whatever reason.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: any local replay operations of transaction logfiles to the
+ local disk are paused at their current stage.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+This works independently from
+\family typewriter
+{pause,resume}-fetch
+\family default
+ resp.
+
+\family typewriter
+{dis,}connect
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+pause-replay-global
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Like
+\family typewriter
+*-local
+\family default
+, but operates on all members of the resource.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-replay
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+resume-replay-local
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-replay-local
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status collapsed
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: must be in secondary role.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: any (parts of) locally existing transaction logfiles (whether
+ replicated from other hosts or produced locally) are started for replay
+ to the local disk, as far as they have not yet been applied.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resume-replay-global
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Like
+\family typewriter
+*-local
+\family default
+, but operates on all members of the resource.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+connect
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+connect-local
+\family default
+ and to
+\family typewriter
+resume-fetch-local
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+Note: although this sounds similar to DRBD's
+\family typewriter
+drbdadm connect
+\family default
+, there are subtle differences.
+ DRBD has exactly one connection per resource, which is associated with
+
+\emph on
+pairs
+\emph default
+ of nodes.
+ In contrast, MARS may create multiple connections per resource at runtime,
+ and these are associated with the
+\emph on
+target
+\emph default
+ host (not with
+\emph on
+pairs
+\emph default
+ of hosts).
+ As a consequence, the fetch may
+\emph on
+potentially
+\emph default
+ occur from any other source host which happens to be reachable (although
+ the current implementation prefers the current designated primary, but
+ this may change in future).
+ In addition,
+\family typewriter
+marsadm disconnect
+\family default
+ does not stop
+\emph on
+all
+\emph default
+ communication.
+ It only stops fetching logfiles.
+ The symlink update running in background is
+\emph on
+not
+\emph default
+ stopped, in order to always propagate as much metadata as possible in the
+ cluster.
+ In case of a later incident, chances are higher for a better knowledge
+ of the
+\emph on
+real
+\emph default
+ state of the cluster.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+connect-local
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+resume-fetch-local
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+connect-global
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+resume-fetch-global
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+disconnect
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+disconnect-local
+\family default
+ and to
+\family typewriter
+pause-fetch-local
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+See above note at
+\family typewriter
+connect
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+disconnect-local
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+pause-fetch-local
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+disconnect-global
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+partly
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+pause-fetch-global
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+up
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+attach
+\family default
+ followed by
+\family typewriter
+resume-fetch
+\family default
+ followed by
+\family typewriter
+resume-replay
+\family default
+ followed by
+\family typewriter
+resume-sync
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+down
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+pause-sync
+\family default
+ followed by
+\family typewriter
+pause-fetch
+\family default
+ followed by
+\family typewriter
+pause-replay
+\family default
+ followed by
+\family typewriter
+detach
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+Hint: consider preferring plain
+\family typewriter
+detach
+\family default
+ over this, because
+\family typewriter
+detach
+\family default
+ will remember the last state of all switches, while
+\family typewriter
+down
+\family default
+ will
+\emph on
+not
+\emph default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+primary
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+almost
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: sync must have finished at any resource member.
+ All relevant transaction logfiles must be either already locally present,
+ or be fetchable (see
+\family typewriter
+resume-fetch
+\family default
+ and
+\family typewriter
+resume-replay
+\family default
+).
+ When some logfile data is locally missing, there must be enough space on
+
+\family typewriter
+/mars/
+\family default
+ to fetch it.
+ Any replay must not have been interrupted by a replay error (see macro
+ %replay-code{} or diskstate
+\family typewriter
+DefectiveLog
+\family default
+).
+ The current designated primary must be reachable over network.
+ When there is no designated primary (i.e.
+
+\family typewriter
+marsadm secondary
+\family default
+ had been executed before, which is explicitly
+\emph on
+not recommended
+\emph default
+),
+\emph on
+all
+\emph default
+ other members of the resource must be reachable (since we have no memory
+ who was the old primary before), and then they must also match the same
+ preconditions.
+ When another host is currently primary (whether designated or not), it
+ must match the preconditions of
+\family typewriter
+marsadm secondary
+\family default
+ (that means, its local
+\family typewriter
+/dev/mars/mydata
+\family default
+ device must not be in use any more).
+ A split brain must not already exist.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition:
+\family typewriter
+/dev/mars/$dev_name
+\family default
+ appears locally and is usable; the current host is in primary role.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Switches the
+\series bold
+designated primary
+\series default
+.
+ There are three variants:
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+1)
+\series bold
+Handover
+\series default
+ when
+\emph on
+not
+\emph default
+ giving
+\family typewriter
+--force
+\family default
+: when another host is currently primary, it is first asked to leave its
+ primary role, and the command waits until it has actually become secondary.
+ After that, the local host is asked to become primary.
+ Before actually becoming primary, all relevant logfiles are transferred
+ over the network and replayed, in order to avoid accidental creation of
+ split brain as best as possible
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Note that split brain avoidance is
+\series bold
+best effort
+\series default
+ and cannot be guaranteed in general.
+ For example, it may be impossible to avoid split brain in case of long-lasting
+ network outages.
+\end_layout
+
+\end_inset
+
+.
+ Only after that,
+\family typewriter
+/dev/mars/$dev_name
+\family default
+ will appear.
+ When network transfers of the symlink tree are very slow (or currently
+ impossible), this command may take a very long time.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+In case a split brain is already detected at the initial situation, the
+ local host will refuse to switch the designated primary without
+\family typewriter
+--force
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ In case of
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas: if you want to handover between host
+\family typewriter
+A
+\family default
+ and
+\family typewriter
+B
+\family default
+ while a sync is currently running at host
+\family typewriter
+C
+\family default
+, you have the following options:
+\end_layout
+
+\begin_layout Enumerate
+
+\size scriptsize
+wait until the sync has finished (see macro
+\family typewriter
+sync-rest
+\family default
+, or
+\family typewriter
+marsadm view
+\family default
+ in general).
+\end_layout
+
+\begin_layout Enumerate
+
+\size scriptsize
+do a
+\family typewriter
+leave-resource
+\family default
+ on host
+\family typewriter
+C
+\family default
+, and later
+\family typewriter
+join-resource
+\family default
+ after the handover completed successfully.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+2)
+\series bold
+Handover ignoring running syncs,
+\series default
+ by adding the option
+\family typewriter
+--ignore-sync
+\family default
+.
+ Any running syncs will restart from scratch, in order to ensure consistency.
+ Use this only when the planned handover is more important than the sync
+ time.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+3)
+\series bold
+Forced switching
+\series default
+: by giving
+\family typewriter
+--force
+\family default
+ while
+\family typewriter
+pause-fetch
+\family default
+ is active (but not
+\family typewriter
+pause-replay
+\family default
+), most preconditions are ignored, and MARS does its best to actually become
+ primary even if some logfiles are missing or incomplete or even defective.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\family typewriter
+\size scriptsize
+primary --force
+\family default
+ is a potentially harmful variant, because it will provoke a split brain
+ in most cases, and therefore in turn will lead to
+\series bold
+data loss
+\series default
+ because one of your split brain versions must be discarded later in order
+ to resolve the split brain (see section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\series bold
+\size scriptsize
+Never
+\series default
+ call
+\family typewriter
+primary --force
+\family default
+ when
+\family typewriter
+primary
+\family default
+ without
+\family typewriter
+--force
+\family default
+ is sufficient! If
+\family typewriter
+primary
+\family default
+ without
+\family typewriter
+--force
+\family default
+ complains that the device is in use at the former primary side, take it
+ seriously! Don't override with
+\family typewriter
+--force
+\family default
+, but rather umount
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+A common misconception is when people think that they can keep their filesystem
+ mounted without provoking a split brain, because they have their application
+ stopped and thus don't write any data into the filesystem.
+ This is a wrong idea, because filesystems may write some metadata, like
+ bookkeeping information, even after hours or days of inactivity.
+ Therefore MARS insists that the device is no longer in use before any handover
+ can take place.
+\end_layout
+
+\end_inset
+
+ the device at the other side!
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ Only use
+\family typewriter
+primary --force
+\family default
+ when something is
+\emph on
+already broken
+\emph default
+, such as a network outage, or a node crash, etc.
+ During ordinary operations (network OK, nodes OK), you should never need
+
+\family typewriter
+primary --force
+\family default
+!
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ If you umount
+\family typewriter
+/dev/mars/mydata
+\family default
+ on the old primary
+\family typewriter
+A
+\family default
+, and then wait until
+\family typewriter
+marsadm view
+\family default
+ (or another suitable macro) on the target host
+\family typewriter
+B
+\family default
+ shows that everything is
+\family typewriter
+UpToDate
+\family default
+, you can prevent a split brain by yourself even when giving
+\family typewriter
+primary --force
+\family default
+ afterwards.
+ However, checking / assuring this is
+\emph on
+your
+\emph default
+ responsibility!
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\family typewriter
+\size scriptsize
+ primary --force
+\family default
+ switches the
+\emph on
+designated
+\emph default
+ primary.
+ In some extremely rare cases, when
+\emph on
+multiple
+\emph default
+ faults have accumulated in a
+\emph on
+weird
+\emph default
+ situation, it
+\emph on
+might
+\emph default
+ be impossible to become the / an actual primary.
+ Typically you may be
+\emph on
+already
+\emph default
+ in a split brain situation.
+ This has not been observed over a long period of operation on recent versions
+ of MARS, but in general becoming primary via
+\family typewriter
+--force
+\family default
+ cannot be guaranteed always, although MARS does its best.
+ In split brain situations, or if you ever encounter such a problem, you
+
+\emph on
+must
+\emph default
+ resolve the split brain immediately after giving this command (see section
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ Hint in case of
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas:
+\family typewriter
+marsadm invalidate
+\family default
+ cannot always resolve a split brain at other secondaries (which are neither
+ the old nor the new designated primary).
+ Therefore, prefer the
+\family typewriter
+leave-resource
+\family default
+ method described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+, starting with a
+\family typewriter
+leave-resource
+\family default
+ phase at the old primary, and proceeding to
+\begin_inset Quotes eld
+\end_inset
+
+unrelated
+\begin_inset Quotes erd
+\end_inset
+
+ secondaries step by step, until the split brain is gone.
+ Don't
+\family typewriter
+join-resource
+\family default
+ again before the split brain is gone! This way, all these replicas will
+ remain consistent for now, but of course outdated (or potentially even
+ a
+\begin_inset Quotes eld
+\end_inset
+
+wrong
+\begin_inset Quotes erd
+\end_inset
+
+ split-brain version, but
+\emph on
+potentially usable
+\emph default
+ in case you get under pressure in some way).
+ In the hopefully unlikely case that you should later discover that you
+ accidentally forced the
+\emph on
+wrong
+\emph default
+ replica via
+\family typewriter
+primary --force
+\family default
+, you will have a chance to recover by either forcing the
+\begin_inset Quotes eld
+\end_inset
+
+correct
+\begin_inset Quotes erd
+\end_inset
+
+ host to primary (if it did not already leave the resource), or by creating
+ a completely fresh resource out of the
+\begin_inset Quotes eld
+\end_inset
+
+correct
+\begin_inset Quotes erd
+\end_inset
+
+ local disk.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ Generally: in case of
+\family typewriter
+primary --force
+\family default
+, the preconditions are different.
+ The fetch
+\emph on
+must
+\emph default
+ be switched off (see
+\family typewriter
+pause-fetch
+\family default
+), in order to get stable logfile positions.
+ See section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Forced-Switching"
+
+\end_inset
+
+.
+ For your safety,
+\family typewriter
+--force
+\family default
+ does not work in newer marsadm (after mars0.1stable52) when your replica
+ is a current sync target.
+ For more explanations, see section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Forced-Switching"
+
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+secondary
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+almost
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local
+\family typewriter
+/dev/mars/$dev_name
+\family default
+ is no longer in use (e.g.
+ umounted).
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: There exists no designated primary any more.
+ During split brain and when the network is OK (again), all actual primaries
+ (including the local host) will leave primary ASAP (i.e.
+ when their
+\family typewriter
+/dev/mars/mydata
+\family default
+ is no longer in use).
+ Any secondary will start following (old) logfiles (even from backlogs)
+ by replaying transaction logs if it is
+\emph on
+uniquely
+\emph default
+ possible (which is often violated during split brain).
+ On any secondary,
+\family typewriter
+/dev/mars/$dev_name
+\family default
+ will have disappeared.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ Notice: in contrast to DRBD, you
+\series bold
+don't need
+\series default
+ this command during normal operation, including handover.
+ Any resource member which is
+\emph on
+not
+\emph default
+ designated as primary will
+\emph on
+automatically
+\emph default
+ go into secondary role.
+ For example, if you have
+\begin_inset Formula $k=4$
+\end_inset
+
+ replicas, only
+\emph on
+one of them
+\emph default
+ can be designated as a primary.
+ When the network is OK, all other 3 nodes will know this fact, and they
+ will
+\emph on
+automatically
+\emph default
+ go into secondary mode, following the transaction logs from the (new) primary.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+Hint: avoid this command.
+ It turns off
+\emph on
+any
+\emph default
+ primary,
+\series bold
+globally
+\series default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+A serious
+\series bold
+misconception
+\series default
+ among some people is when they believe that they can switch
+\begin_inset Quotes eld
+\end_inset
+
+a certain node to secondary
+\begin_inset Quotes erd
+\end_inset
+
+.
+ It is not possible to switch individual nodes to secondary, without affecting
+ other nodes! The concept of
+\begin_inset Quotes eld
+\end_inset
+
+designated primary
+\begin_inset Quotes erd
+\end_inset
+
+ is
+\series bold
+global
+\series default
+ throughout a resource!
+\end_layout
+
+\end_inset
+
+.
+ You cannot start a sync after that (e.g.
+
+\family typewriter
+invalidate
+\family default
+ or
+\family typewriter
+join-resource
+\family default
+ or
+\family typewriter
+resume-sync
+\family default
+), because it is
+\emph on
+not unique
+\emph default
+ wherefrom the data shall be fetched.
+ In split brain situations (when the network is OK again), this may have
+ further drawbacks.
+ It is much better / easier to
+\series bold
+\emph on
+directly
+\emph default
+ switch the designated primary
+\series default
+ from one node to another via the
+\family typewriter
+primary
+\family default
+ command.
+ See also section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Forced-Switching"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\size scriptsize
+ There is only one valid use case where you
+\emph on
+really
+\emph default
+ need this command: before finally destroying a resource via the
+\emph on
+last
+\emph default
+
+\family typewriter
+leave-resource
+\family default
+ (or the dangerous
+\family typewriter
+delete-resource
+\family default
+), you will need this before you can do that.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+wait-umount
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+See section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Waiting"
+
+\end_inset
+
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+log-purge-all
+\begin_inset CommandInset label
+LatexCommand label
+name "log-purge-all$res"
+
+\end_inset
+
+
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: none additionally.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: all locally known logfiles and version links are removed,
+ whenever they are not / no longer reachable by any split brain version.
+\end_layout
+
+\begin_layout Plain Layout
+Rationale: remove hindering split-brain /
+\family typewriter
+leave-resource
+\family default
+ leftovers.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Use this only when split brain does not go away by means of
+\family typewriter
+leave-resource
+\family default
+ (which
+\emph on
+could
+\emph default
+ happen in very weird scenarios such as MARS running on virtual machines
+ doing a restore of their snapshots, or otherwise unexpected resurrection
+ of dead or half-dead nodes).
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ THIS IS POTENTIALLY DANGEROUS!
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This command
+\emph on
+might
+\emph default
+ destroy some valuable logfiles / other information in case the local informatio
+n is outdated or otherwise incorrect.
+ MARS does its best to check everything, but there is no guarantee.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Hint: use
+\family typewriter
+--dry-run
+\family default
+ beforehand for checking!
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+resize
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+[$size]
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+almost
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: The local host must be primary.
+ All disks in the cluster participating in
+\family typewriter
+$res
+\family default
+ must be physically larger than the logical resource size (e.g., by use of
+
+\family typewriter
+lvm
+\family default
+; can be checked by macros
+\family typewriter
+%disk-size{}
+\family default
+ and
+\family typewriter
+%resource-size{}
+\family default
+).
+ When the optional
+\family typewriter
+$size
+\family default
+ argument is present, it must be smaller than the minimum of all physical
+ sizes, but larger than the current logical size of the resource.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the logical size of
+\family typewriter
+/dev/mars/$dev_name
+\family default
+ will reflect the new size after a while.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Logfile Operations
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+cron
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Do all necessary housekeeping tasks.
+ See
+\family typewriter
+log-rotate
+\family default
+ and
+\family typewriter
+log-delete-all
+\family default
+ for details.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This should be regularly called by an external cron job or similar.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+log-rotate
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local node
+\family typewriter
+$host
+\family default
+ must be primary at
+\family typewriter
+$res
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: after a while, a new transaction logfile
+\family typewriter
+/mars/resource-$res/log-$new_nr-$host
+\family default
+ will be used instead of
+\family typewriter
+/mars/resource-$res/log-$old_nr-$host
+\family default
+ where
+\family typewriter
+$new_nr
+\family default
+ =
+\family typewriter
+$old_nr
+\family default
+ + 1.
+ Without
+\family typewriter
+--force
+\family default
+, this will only carry out actions at the primary side since it makes no
+ sense on secondaries.
+ With
+\family typewriter
+--force
+\family default
+, secondaries are
+\emph on
+trying
+\emph default
+ to
+\emph on
+remotely
+\emph default
+ trigger a log-rotate, but without any guarantee (likely even a split-brain
+ may result instead, so use this only if you are
+\emph on
+really
+\emph default
+ desperate).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+log-delete
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local node must be a member of
+\family typewriter
+$res
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: when there exist some old transaction logfiles
+\family typewriter
+/mars/resource-$res/log-*-$some_host
+\family default
+ which are no longer referenced by any of the symlinks
+\family typewriter
+/mars/resource-$res/replay-*
+\family default
+ , those logfiles are marked for deletion in the whole cluster.
+ When no such logfiles exist, nothing will happen.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+log-delete-one
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local node must be a member of
+\family typewriter
+$res
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: when there exists an old transaction logfile
+\family typewriter
+/mars/resource-$res/log-$old_nr-$some_host
+\family default
+ where
+\family typewriter
+$old_nr
+\family default
+ is the minimum existing number and that logfile is no longer referenced
+ by any of the symlinks
+\family typewriter
+/mars/resource-$res/replay-*
+\family default
+ , that logfile is marked for deletion in the whole cluster.
+ When no such logfile exists, nothing will happen.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+log-delete-all
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Alias for
+\family typewriter
+log-delete
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Consistency Operations
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+invalidate
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local node must be in secondary role at
+\family typewriter
+$res
+\family default
+.
+ A
+\emph on
+designated
+\emph default
+ primary must exist.
+ When having
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas, no split brain must exist (otherwise, or when
+\family typewriter
+invalidate
+\family default
+ does not work in case of
+\begin_inset Formula $k=2$
+\end_inset
+
+, use the
+\family typewriter
+leave-resource
+\family default
+ ;
+\family typewriter
+join-resource
+\family default
+ method described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the local disk is marked as inconsistent, and a fast fullsync
+ from the designated primary will start after a while.
+ Notice that
+\family typewriter
+marsadm {pause,resume}-sync
+\family default
+ will influence whether the sync really starts.
+ When the fullsync has finished successfully, the local node will be consistent
+ again.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+After a while, state
+\family typewriter
+Orphan
+\family default
+ should be left.
+ Don't forget to regularly monitor for longer occurrences of
+\family typewriter
+Orphan
+\family default
+!
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+fake-sync
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local node must be in secondary role at
+\family typewriter
+$res
+\family default
+.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: when a fullsync is running, it will stop after a while, and
+ the local node will be
+\emph on
+marked
+\emph default
+ as consistent as if it were consistent again.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+ONLY USE THIS IF YOU REALLY KNOW WHAT YOU ARE DOING!
+\begin_inset Newline newline
+\end_inset
+
+See the WARNING in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Creating-and-Maintaining"
+
+\end_inset
+
+
+\begin_inset Newline newline
+\end_inset
+
+Use this only
+\emph on
+before
+\emph default
+ creating a fresh filesystem inside
+\family typewriter
+/dev/mars/$res
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+set-replay
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\size scriptsize
+ONLY FOR ADVANCED HACKERS WHO KNOW WHAT THEY ARE DOING!
+\begin_inset Newline newline
+\end_inset
+
+This command is deliberately not documented.
+ You need the competence level RTFS (
+\begin_inset Quotes eld
+\end_inset
+
+read the fucking sources
+\begin_inset Quotes erd
+\end_inset
+
+).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Further Operations
+\end_layout
+
+\begin_layout Subsection
+Inspection Commands
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+view-
+\emph on
+macroname
+\begin_inset Newline newline
+\end_inset
+
+
+\emph default
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Display the output of a macro evaluation.
+ See section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Inspecting-the-State"
+
+\end_inset
+
+ for a thorough description.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+view
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Equivalent to
+\family typewriter
+view-default
+\family default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+role
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Use
+\family typewriter
+view-role
+\family default
+ instead.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+state
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Use
+\family typewriter
+view-state
+\family default
+ instead.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+cstate
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Use
+\family typewriter
+view-cstate
+\family default
+ instead.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+dstate
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Use
+\family typewriter
+view-dstate
+\family default
+ instead.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+status
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Use
+\family typewriter
+view-status
+\family default
+ instead.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+show-state
+\end_layout
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Don't use it.
+ Use
+\family typewriter
+view-state
+\family default
+ instead, or other macros.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+show-info
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Don't use it.
+ Use
+\family typewriter
+view-info
+\family default
+ instead, or other macros.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+show
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Don't use it.
+ Use or implement some macros instead.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+show-errors
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Deprecated.
+ Use
+\family typewriter
+view-the-err-msg
+\family default
+ or
+\family typewriter
+view-resource-err
+\family default
+ or similar macros.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+cat
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$file
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Write the file content to stdout, but with all occurrences of numeric
+ timestamps converted to a human-readable format.
+ This is most useful for inspection of status and log files, e.g.
+
+\family typewriter
+marsadm cat /mars/5.total.log
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Setting Parameters
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Setting-Parameters"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Per-Resource Parameters
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+set-emergency-limit $res
+\emph on
+n
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+The argument
+\emph on
+n
+\emph default
+ must be a percentage between 0 and 100 %.
+ When the remaining storage space in
+\family typewriter
+/mars/
+\family default
+ undershoots the given percentage, the resource will go
+\emph on
+earlier
+\emph default
+ into emergency mode than by the global computation described in section
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Defending-Overflow"
+
+\end_inset
+
+.
+ 0 means unlimited.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+get-emergency-limit $res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Inquiry of the preceding value.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Global Parameters
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+set-sync-limit-value
+\emph on
+n
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Limit the concurrency of sync operations to some maximum number.
+ 0 means unlimited.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+get-sync-limit-value
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Inquiry of the preceding value.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+set-connect-pref-list host1,host2,hostn
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Set the order of preferences for connections when there are more than 2
+ hosts participating in a cluster.
+ The argument must be a comma-separated list of node names.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+get-connect-pref-list
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Inquiry of the preceding value.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Waiting
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Waiting"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+wait-cluster
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the
+\family typewriter
+/mars/
+\family default
+ filesystem must be mounted and it must contain a valid MARS symlink tree
+ produced by the other
+\family typewriter
+marsadm
+\family default
+ commands.
+ The kernel module must be loaded.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: none.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Wait until
+\emph on
+all
+\emph default
+ nodes in the cluster have sent a message, or until timeout.
+ The default timeout is 30 s (exceptionally) and
+\size default
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Be
+\size scriptsize
+ may be changed by
+\family typewriter
+ --timeout=$seconds
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+wait-resource
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+{is-,}{attach,
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+ primary,
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+ device}{-off,}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: the local node must be a member of the resource
+\family typewriter
+$res
+\family default
+.
+
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: none.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Wait until the local node reaches a specified condition on
+\family typewriter
+$res
+\family default
+, or until timeout.
+ The default timeout of 60 s may be changed by
+\family typewriter
+ --timeout=$seconds
+\family default
+.
+ The last argument denotes the condition.
+ The condition is inverted if suffixed by
+\family typewriter
+-off
+\family default
+.
+ When preceded by
+\family typewriter
+is-
+\family default
+ (which is the most useful case), it is checked whether the condition is
+ actually reached.
+ When the
+\family typewriter
+is-
+\family default
+ prefix is left off, the check is whether another
+\family typewriter
+marsadm
+\family default
+ command has been already given which
+\emph on
+tries
+\emph default
+ to achieve the intended result (typically, you may use this after the
+
+\family typewriter
+is-
+\family default
+ variant has failed).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+wait-connect
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+almost
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This is an alias for
+\family typewriter
+wait-cluster
+\family default
+ waiting until only those nodes are reachable which belong to
+\family typewriter
+$res
+\family default
+ (instead of waiting for the
+\emph on
+full
+\emph default
+ cluster).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+wait-umount
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$res
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Precondition: none additionally.
+\end_layout
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Postcondition: the local
+\family typewriter
+/dev/mars/$dev_name
+\family default
+ is no longer in use (e.g.
+ umounted).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Low-Level Expert Commands
+\end_layout
+
+\begin_layout Standard
+These commands are for experts and advanced sysadmins only.
+ The interface is not stable, i.e.
+ the meaning may change at any time.
+ Use at your own risk!
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+set-link
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+RTFS.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+get-link
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+RTFS.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+delete-file
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+RTFS.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The following commands are for manual setup / repair of cluster membership.
+ Only to be used by experts who know what they are doing! In general, cluster-wi
+de operations on IP addresses may need to be repeated at all hosts in the
+ cluster iff the communication is not (yet) possible and/or not (yet) actually
+ working (e.g.
+ firewalling problems etc.).
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "30col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+lowlevel-ls-host-ips
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "50col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+List all configured cluster members together with their currently configured
+ IP addresses, as known
+\emph on
+locally
+\emph default
+.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "30col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+lowlevel-set-host-ip
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$hostname
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$ip
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "50col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Change the assignment of IP addresses
+\emph on
+locally
+\emph default
+.
+ May be used when hosts are moved to different network locations, or when
+ different network interfaces are to be used for replication (e.g.
+ dedicated replication IPs).
+ Notice that the names of hosts must not change at all, only their IP addresses
+ may be changed.
+ Check active connections with
+\family typewriter
+netstat
+\family default
+ & friends.
+ Updates may need some time to proceed (socket timeouts etc).
+\begin_inset Newline newline
+\end_inset
+
+Hint: for safety, call this on
+\emph on
+all
+\emph default
+ members of a cluster to ensure consistency.
+ Otherwise it may happen that some cluster members do not know the
+\emph on
+new
+\emph default
+ IP address where to fetch the
+\emph on
+new
+\emph default
+ information from.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "30col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+lowlevel-delete-host
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+strut
+\backslash
+hfill
+\end_layout
+
+\end_inset
+
+$hostname
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "50col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Remove a host from the cluster membership
+\emph on
+locally
+\emph default
+, together with its IP address assignment.
+ This does not remove any further information.
+ In particular, resource memberships are untouched.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Senseless Commands (from DRBD)
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+syncer
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+new-current-uuid
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+create-md
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+dump-md
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+dump
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+get-gi
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+show-gi
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+outdate
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+adjust
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+yes
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Implemented as NOP (not necessary with MARS).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+hidden-commands
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Forbidden Commands (from DRBD)
+\end_layout
+
+\begin_layout Standard
+These commands are not implemented because they would be dangerous in MARS
+ context:
+\end_layout
+
+\begin_layout Standard
+
+\size scriptsize
+\begin_inset Tabular
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Command / Params
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Cmp
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+Description
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+invalidate-remote
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This would be too dangerous in case you have multiple secondaries.
+ A similar effect can be achieved with the
+\family typewriter
+--host=
+\family default
+ option.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "20col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+\size scriptsize
+verify
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+no
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size scriptsize
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "60col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+
+\size scriptsize
+This would cause unintended side effects due to races between logfile transfer
+ / application and block-wise comparison of the underlying disks.
+ However,
+\family typewriter
+marsadm join-resource
+\family default
+ or
+\family typewriter
+invalidate
+\family default
+ will do the same as DRBD verify followed by DRBD resync, i.e.
+ this will automatically correct any found errors.
+ Note that the fast-fullsync algorithm of MARS will minimize network traffic.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+The
+\family typewriter
+/proc/sys/mars/
+\family default
+ and other Expert Tweaks
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:The-/proc/sys/mars/-Expert"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In many cases, you will not need to deal with tweaks in
+\family typewriter
+/proc/sys/mars/
+\family default
+ because everything should already default to reasonable predefined values.
+ This interface allows access to some internal kernel variables of the
+\family typewriter
+mars.ko
+\family default
+ kernel module at
+\emph on
+runtime
+\emph default
+.
+ This means, the values will be reset to default at
+\family typewriter
+rmmod mars
+\family default
+ or at reboot.
+ If you need some persistence, implement it by yourself, e.g.
+ at startup scripts.
+\end_layout
+
+\begin_layout Standard
+
+\family typewriter
+/proc/sys/mars/
+\family default
+ is
+\emph on
+not
+\emph default
+ a stable interface.
+ It is not only specific for MARS, but may also change between releases
+ without notice.
+\end_layout
+
+\begin_layout Standard
+This section describes only those tweaks intended for sysadmins, not those
+ for developers / very deep internals.
+\end_layout
+
+\begin_layout Subsection
+Tuning Network Performance
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Tuning-Network-Performance"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Starting with MARS Light series 0.2, a new feature called
+\begin_inset Quotes eld
+\end_inset
+
+socket bundling
+\begin_inset Quotes erd
+\end_inset
+
+ is available.
+\end_layout
+
+\begin_layout Standard
+It is mostly intended for lines showing high packet loss.
+ By using multiple TCP sockets in parallel for emulating a single logical
+ connection, throughput can be significantly increased.
+\end_layout
+
+\begin_layout Standard
+Example for setting the socket parallelism to 4:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+echo 4 > /proc/sys/mars/parallel_connections
+\end_layout
+
+\begin_layout Standard
+The following graphic shows the throughput of a non-fast
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The fast fullsync algorithm would not saturate the
+\family typewriter
+eth0
+\family default
+ link with traffic from a single resource.
+\end_layout
+
+\end_inset
+
+ fullsync of a
+\emph on
+single
+\emph default
+ 100GiB resource over a loaded long-distance line between Europe/Germany
+ and USA/Midwest.
+ In order to compensate for highly varying load on the line, all the experiments
+ were repeated more than 10 times and averaged.
+ Each bar shows the throughput for a particular socket parallelism.
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/socket-bundling-long-summary.png
+ width 70col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Notice that the uplinks of the two servers are only 1 GBit/s each.
+ When the uplink is saturated, about 100 MByte/s is the maximum possible
+ peak throughput in theory.
+ You can easily recognize that the peak throughput is almost reached with
+ a parallelism degree of 2, but using even more sockets appears to be slightly
+ counter-productive.
+ One of the reasons is that more sockets will increase contention on the
+ line, and thus increase packet loss.
+ Another potential reason is that higher parallelism at sockets will lead
+ to higher parallelism in disk reads, in turn leading to more permutations
+ of disk read positions (more
+\emph on
+random
+\emph default
+ reads instead of purely sequential reads), which is counter-productive
+ for disk readahead strategies.
+\end_layout
+
+\begin_layout Standard
+The next graphic shows the same, but over a medium distance of about 50km.
+ This line is even more heavily loaded with respect to the number of TCP
+ connections running in parallel (probably some 10,000 or even 100,000 if
+ not more), and there is some kind of
+\begin_inset Quotes eld
+\end_inset
+
+traffic shaping
+\begin_inset Quotes erd
+\end_inset
+
+ at some intermediate network gear which will
+\begin_inset Quotes eld
+\end_inset
+
+punish
+\begin_inset Quotes erd
+\end_inset
+
+ those traffic sources disproportionally increasing overall packet loss.
+ This can explain the even higher counter-productive effect of using too
+ many sockets and thus injecting additional packet loss:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/socket-bundling-short-summary.png
+ width 70col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In general, the optimum value for
+\family typewriter
+/proc/sys/mars/parallel_connections
+\family default
+ may depend on many runtime factors such as other load running over some
+ (parts of) physical equipment.
+ You will need to determine optimum values yourself.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Notice that socket bundling is conceptually the
+\begin_inset Quotes eld
+\end_inset
+
+opposite
+\begin_inset Quotes erd
+\end_inset
+
+ of traffic shaping.
+ You are trying to get
+\emph on
+more
+\emph default
+ bandwidth, at the cost of
+\emph on
+other
+\emph default
+ traffic competing for the same network resources.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ If you are operating masses of servers, don't set the MARS socket parallelism
+
+\series bold
+too high
+\series default
+ everywhere.
+ You might
+\begin_inset Quotes eld
+\end_inset
+
+steal
+\begin_inset Quotes erd
+\end_inset
+
+ too much bandwidth from other applications when starting masses of syncs
+ in parallel, e.g.
+ after an incident.
+ Best practice is to start with a default value of 1, and to increase it
+ only
+\emph on
+on demand
+\emph default
+, and/or preferably
+\emph on
+only
+\emph default
+ at those servers where high load really occurs or where some urgent actions
+ need a
+\emph on
+temporary
+\emph default
+ boost.
+\end_layout
+
+\begin_layout Subsection
+Syslogging
+\end_layout
+
+\begin_layout Standard
+All internal messages produced by the kernel module belong to one of the
+ following classes:
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+0 debug messages
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+1 info messages
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+2 warnings
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+3 error messages
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+4 fatal error messages
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+5 any message (summary of 0 to 4)
+\end_layout
+
+\begin_layout Subsubsection
+Logging to Files
+\end_layout
+
+\begin_layout Standard
+This feature will likely disappear when MARS goes to kernel upstream.
+ It was mostly intended for debugging during early beta phases and is no
+ longer needed for stable operation.
+ Developers may use it for spotting potential problems.
+\end_layout
+
+\begin_layout Standard
+The classes may be used to produce status files
+\family typewriter
+$class.*.status
+\family default
+ in the
+\family typewriter
+/mars/
+\family default
+ and/or in the
+\family typewriter
+/mars/resource-
+\emph on
+mydata
+\emph default
+/
+\family default
+ directory / directories.
+\end_layout
+
+\begin_layout Standard
+When you create a file
+\family typewriter
+$class.*.log
+\family default
+ in parallel to any
+\family typewriter
+$class.*.status
+\family default
+, the
+\family typewriter
+*.log
+\family default
+ file will be appended forever with the same messages as in
+\family typewriter
+*.status
+\family default
+.
+ The difference is that *.status is regenerated anew from an empty starting
+ point, while *.log can (potentially) increase indefinitely unless you remove
+ it, or rename it to something else.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Beware, any permanently present
+\family typewriter
+*.log
+\family default
+ file can easily fill up your
+\family typewriter
+/mars/
+\family default
+ partition until the problems described in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Defending-Overflow"
+
+\end_inset
+
+ will appear.
+ Use
+\family typewriter
+*.log
+\family default
+ only for a
+\series bold
+limited time
+\series default
+, and
+\series bold
+only for debugging!
+\end_layout
+
+\begin_layout Subsubsection
+Logging to Syslog
+\end_layout
+
+\begin_layout Standard
+The classes also play a role in the following
+\family typewriter
+/proc/sys/mars/
+\family default
+ tweaks:
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+syslog_min_class
+\family default
+ (rw) The
+\emph on
+minimum
+\emph default
+ class number for
+\emph on
+permanent
+\emph default
+ syslogging.
+ By default, this is set to -1 in order to switch off permanent logging completely.
+ Permanent logging can easily flood your syslog with such huge amounts of
+ messages (in particular when class=0), that your system as a whole may
+ become unusable (because vital kernel threads may be blocked too long or
+ too often by the userspace syslog daemon).
+ Instead, please use the flood-protected syslogging described below!
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+syslog_max_class
+\family default
+ (rw) The
+\emph on
+maximum
+\emph default
+ class number for
+\emph on
+permanent
+\emph default
+ syslogging.
+ Please use the flood-protected version instead.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+syslog_flood_class
+\family default
+ (rw) The minimum class of flood-protected syslogging.
+ The maximum class is always 4.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+syslog_flood_limit
+\family default
+ (rw) The maximum number of messages after which the flood protection will
+ start.
+ This is a hard limit for the number of messages written to the syslog.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+syslog_flood_recovery_s
+\family default
+ (rw) The number of seconds after which the internal flood counter is reset
+ (after flood protection state has been reached).
+ When no new messages appear after this time, the flood protection will
+ start over at count 0.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+The rationale behind flood protected syslogging: sysadmins are usually only
+ interested in the point in time where some problems / incidents / etc have
+
+\emph on
+started
+\emph default
+.
+ They are usually not interested in capturing
+\emph on
+each
+\emph default
+ and
+\emph on
+every
+\emph default
+ single error message (in particular when they are flooding the system logs).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+If you
+\emph on
+really
+\emph default
+ need complete error information, use the
+\family typewriter
+*.log
+\family default
+ files described above, compress them and save them to somewhere else
+\emph on
+regularly
+\emph default
+ by a cron job.
+ This bears much less overhead than filtering via the syslog daemon, or
+ even remote syslogging in real time which will almost surely screw up your
+ system in case of network problems co-inciding with flood messages, such
+ as caused in turn by those problems.
+ Don't rely on real-time concepts, just do it the old-fashioned batch job
+ way.
+\end_layout
+
+\begin_layout Subsubsection
+Tuning Verbosity of Logging
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+show_debug_messages
+\family default
+ Boolean switch, 0 or 1.
+ Mostly useful only for developers.
+ This can easily flood your logs if you are not careful.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+show_log_messages
+\family default
+ Boolean switch, 0 or 1.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+show_connections
+\family default
+ Boolean switch, 0 or 1.
+ Show detailed internal statistics on sockets.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+show_statistics_local
+\begin_inset space ~
+\end_inset
+
+/
+\begin_inset space ~
+\end_inset
+
+show_statistics_global
+\family default
+ Only useful for kernel developers.
+ Shows some internal information on internal brick instances, memory usage,
+ etc.
+\end_layout
+
+\begin_layout Subsection
+Tuning the Sync
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+sync_flip_interval_sec
+\family default
+ (rw) The sync process must not run in parallel to logfile replay, in order
+ to easily guarantee consistency of your disk.
+ If logfile replay would be paused for the full duration of very large or
+ long-lasting syncs (which could take some days over very slow networks),
+ your
+\family typewriter
+/mars/
+\family default
+ filesystem could overflow because no replay would be possible in the meantime.
+ Therefore, MARS regularly flips between actually syncing and actually replaying,
+ if both are enabled.
+ You can set the time interval for flipping here.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+sync_limit
+\family default
+ (rw) When > 0, this limits the maximum number of sync processes actually
+ running in parallel.
+ This is useful if you have a large number of resources, and you don't want
+ to overload the network with sync processes.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+sync_nr
+\family default
+ (ro) Passive indicator for the number of sync processes currently running.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+sync_want
+\family default
+ (ro) Passive indicator for the number of sync processes which
+\emph on
+demand
+\emph default
+ running.
+\end_layout
+
+\begin_layout Subsection
+Lowlevel TCP Tuning (Networking Experts Only)
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:TCP-Tuning"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+When
+\family typewriter
+CONFIG_MARS_SEPARATE_PORTS
+\family default
+ and
+\family typewriter
+CONFIG_MARS_IPv4_TOS
+\family default
+ are enabled, MARS uses the following types of traffic:
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+MARS_TRAFFIC_META
+\family default
+ (by default on port 7777 with
+\family typewriter
+IPTOS_LOWDELAY
+\family default
+) This can be tuned in directory
+\family typewriter
+/proc/sys/mars/tcp_tuning_0_meta_traffic/
+\family default
+.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+MARS_TRAFFIC_REPLICATION
+\family default
+ (by default on port 7778 with
+\family typewriter
+IPTOS_RELIABILITY
+\family default
+) This can be tuned in directory
+\family typewriter
+/proc/sys/mars/tcp_tuning_1_replication_traffic/
+\family default
+.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+MARS_TRAFFIC_SYNC
+\family default
+ (by default on port 7779 with
+\family typewriter
+IPTOS_MINCOST
+\family default
+) This can be tuned in directory
+\family typewriter
+/proc/sys/mars/tcp_tuning_2_sync_traffic/
+\family default
+.
+ Attention: since the advent of
+\family typewriter
+DSCP
+\family default
+, this bit (hex
+\family typewriter
+0x2
+\family default
+ in host byte order) is suppressed by the kernel, and yields
+\family typewriter
+DS0
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+In each of these directories, the following tunables are available (only
+ for networking experts who know what they are doing):
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+ip_tos
+\family default
+ As explained above.
+ Notice: hex constants from
+\family typewriter
+/usr/include/linux/ip.h
+\family default
+ must be converted to decimal before forwarding to the
+\family typewriter
+/proc
+\family default
+ interface.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+tcp_window_size
+\family default
+ Current default is 8 * 1024 * 1024.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+tcp_nodelay
+\family default
+ Current default is 0.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+tcp_timeout
+\family default
+ Current default is 2.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+tcp_keepcnt
+\family default
+ Current default is 3.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+tcp_keepintvl
+\family default
+ Current default is 3.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+tcp_keepidle
+\family default
+ Current default is 4.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Further tuning parameters are in the standard Linux kernel.
+ Notice that
+\family typewriter
+IP_TOS
+\family default
+ is internally converted to
+\family typewriter
+DSCP
+\family default
+, which in turn can be further manipulated by
+\family typewriter
+netfilter
+\family default
+ /
+\family typewriter
+iptables
+\family default
+ and/or by
+\family typewriter
+qdisc
+\family default
+ (
+\family typewriter
+tc
+\family default
+) and/or by further (external) networking components.
+ The ancient TOS settings are meant as a default
+\emph on
+starting point
+\emph default
+ for further customization to your needs.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Typically,
+\emph on
+public
+\emph default
+ internet transports are flattening / ignoring or otherwise manipulating
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+DSCP markings can be only made reliable on private networks (possibly requiring
+ some effort).
+ Public Internet service and transit providers do not necessarily treat
+ the TOS values or DSCP markings with any form of priority and may also
+ remove or change them without any notice.
+ Some internet service or transit providers also do use specific DSCP markings
+ to mark packets for being dropped, which may result in hard to find transmissio
+n errors.
+\end_layout
+
+\begin_layout Plain Layout
+If you want to use MARS on a public internet connection, you should use
+\series bold
+encrypted
+\series default
+
+\series bold
+VPN
+\series default
+ with different DSCP markings, and coordinate them with your network services
+ provider.
+\end_layout
+
+\end_inset
+
+ the TOS / DSCP fields.
+ There it will not work.
+ Anyway, you should never route unencrypted MARS traffic over public transports,
+ for obvious security reasons.
+ Notice: MARS replication is meant for company-
+\emph on
+internal
+\emph default
+ networks like
+\emph on
+internal
+\emph default
+
+\series bold
+replication networks
+\series default
+ (or storage networks) over which some networking department has control.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Playing with the above settings can easily tear down your whole (replication)
+ network if you don't know exactly what you are doing.
+ Please test any changes in the lab first.
+ Mass rollout should be done in incremental phases, each in power of 10
+ units.
+ There might be unexpected effects like packet storms, or packet loss, etc.
+ Some of these effects may only show up when a certain number of hosts is
+ exceeded, or when certain load conditions are hammering the overall Distributed
+ System.
+ Some very old routers / switches are known to break down unexpectedly when
+ overloaded in certain ways.
+ Be careful in a production environment!
+\end_layout
+
+\begin_layout Chapter
+Tips and Tricks
+\end_layout
+
+\begin_layout Section
+IO Performance Tuning
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:IO-Performance-Tuning"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+There
+\emph on
+exist
+\emph default
+ some use cases where MARS
+\emph on
+can
+\emph default
+ deliver better IO performance than a raw block device.
+ However, this cannot be expected
+\emph on
+in general
+\emph default
+.
+ In some
+\emph on
+other
+\emph default
+ cases the performance may be
+\emph on
+lower
+\emph default
+ than with a
+\emph on
+single
+\emph default
+ local raw device.
+\end_layout
+
+\begin_layout Standard
+For demonstration, we use the
+\family typewriter
+blkreplay
+\family default
+ tool from
+\begin_inset Flex URL
+status open
+
+\begin_layout Plain Layout
+
+http://blkreplay.org
+\end_layout
+
+\end_inset
+
+ and a load which has been captured from a
+\series bold
+real datacenter
+\series default
+ (1&1 Ionos ShaHoLin = Shared Hosting Linux).
+ The load already contains a parallelism degree of 20 LXC containers running
+ in parallel on the same iron.
+ This corresponds to about 60,000 web spaces running on 20 Apache instances,
+ already in parallel.
+ In contrast to artificial benchmarks (like pure random IO or pure sequential
+ IO), this benchmark is much closer to real server operations, while
+ artificial benchmarks are not meaningful for practice in general, because
+ they can deviate from real server operations by
+\emph on
+factors
+\emph default
+ or even by
+\series bold
+orders of magnitude
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+In order to determine the limits of the test candidates, the timing of the
+ original workload was converted to a linear ramp-up, simulating an
+\series bold
+overloaded
+\series default
+ system.
+ Otherwise benchmarking would not be possible.
+\end_layout
+
+\begin_layout Standard
+The following
+\family typewriter
+blkreplay
+\family default
+ benchmarks were executed on an otherwise unloaded Dell R630 with 40 CPU
+ threads on 2 sockets, 192 GB RAM, a Dell R730 hardware RAID controller
+ with 2 GB BBU cache, and 10 spindles Dell 1.8 TB 2.5 inch SAS disks configured
+ as RAID-6.
+ All data, including the
+\family typewriter
+/mars
+\family default
+ directory, was located on the hardware RAID via LVM2.
+
+\family typewriter
+/dev/vginfong/lv-0
+\family default
+ was assigned a size of 8 TiB.
+ For testing, vanilla kernel 4.9.x with the MARS pre-patch and
+\family typewriter
+mars0.1astable72
+\family default
+ was used.
+\end_layout
+
+\begin_layout Standard
+The
+\family typewriter
+blkreplay
+\family default
+ parameters were as follows:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+output_label="MARS"
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+# input description
+\end_layout
+
+\begin_layout Plain Layout
+
+input_file_list="http://blkreplay.org/loads/natural/1and1/shared-hosting/2016/Sha
+HoLin_from_bare_metal/x20/shaholin-x20-ramped/shaholin-x20.adjacent.ramped-100.load.
+gz"
+\end_layout
+
+\begin_layout Plain Layout
+
+replay_duration=110
+\end_layout
+
+\begin_layout Plain Layout
+
+speedup=10
+\end_layout
+
+\begin_layout Plain Layout
+
+threads=512
+\end_layout
+
+\begin_layout Plain Layout
+
+cmode=with-conflicts
+\end_layout
+
+\begin_layout Plain Layout
+
+scheduler="noop"
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+# hardware setup
+\end_layout
+
+\begin_layout Plain Layout
+
+replay_host_list="icpu5133"
+\end_layout
+
+\begin_layout Plain Layout
+
+replay_device_list="/dev/vginfong/lv-0"
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+# output description
+\end_layout
+
+\begin_layout Plain Layout
+
+enable_graph=1
+\end_layout
+
+\begin_layout Plain Layout
+
+graph_options="--no-static --dynamic"
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+We start with the
+\series bold
+raw
+\series default
+ device
+\family typewriter
+/dev/vginfong/lv-0
+\family default
+ which had a size of 8 TiB.
+ The throughput is about 1418 IOPS, and the latency diagram shows that the
+ system is overloaded, but can cope with that overload:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/blkreplay/MARS.MARS.raw.iosched-noop.nr_request-128.icpu5133.vginfong.lv-0.g01.latency.realtime.png
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+As you can see in the filename, the NOOP kernel IO scheduler was used, and
+ the kernel parameter
+\family typewriter
+nr_requests
+\family default
+ was left at its default value of 128.
+ When you read the specs of the Dell R730 hardware RAID controller, you
+ will notice that it can handle a much higher IO request parallelism of
+ almost 1024 requests in parallel.
+\end_layout
+
+\begin_layout Standard
+So the first natural tuning attempt is
+\family typewriter
+nr_requests=1020
+\family default
+, in order to release the
+\begin_inset Quotes eld
+\end_inset
+
+kernel IO handbrake
+\begin_inset Quotes erd
+\end_inset
+
+.
+ This results in an improved throughput of 1562 IOPS, and even the
+\emph on
+maximum
+\emph default
+ latencies are improved, but the
+\emph on
+average
+\emph default
+ latencies are becoming a little bit worse:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/blkreplay/MARS.MARS.raw.iosched-noop.nr_request-1020.icpu5133.vginfong.lv-0.g01.latency.realtime.png
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+It has been well known for decades that there is a fundamental tradeoff between
+ throughput and latencies in IO systems.
+ Thus it is not a surprising result.
+\end_layout
+
+\begin_layout Standard
+On servers, overload situations should be rare, and during overload throughput
+ is typically much more important than latencies, as long as latencies are
+ not exceedingly high.
+ Thus we can recommend
+\family typewriter
+nr_requests=1000
+\family default
+ for production.
+\end_layout
+
+\begin_layout Standard
+However, some sysadmins might be tempted to question why the NOOP scheduler
+ has been used.
+ On the internet, there are a ton of claims that CFQ is much better.
+\end_layout
+
+\begin_layout Standard
+Well, testing with CFQ instead of NOOP is no problem for
+\family typewriter
+blkreplay
+\family default
+.
+ However, the result is very surprising.
+ While the IOPS are 1539, which is only a slight decrease which could result
+ from measurement tolerances, the latencies are now turning almost into
+ a disaster:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/blkreplay/MARS.MARS.raw.iosched-cfq.nr_request-1020.icpu5133.vginfong.lv-0.g01.latency.realtime.png
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+In production, you should never encounter IO latencies of almost 15 seconds.
+ So what is going wrong here?
+\end_layout
+
+\begin_layout Standard
+Here is an explanation.
+ A hardware RAID controller
+\emph on
+already
+\emph default
+ has an
+\emph on
+internal
+\emph default
+ IO scheduler.
+ This IO scheduler is hidden in a black box, such that many sysadmins don't
+ know of its existence.
+ If you add another IO scheduler at kernel level, you will have
+\series bold
+two different
+\series default
+ IO schedulers running in parallel, and sometimes taking
+\series bold
+contradictory decisions
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+These contradictory scheduling decisions may lead to problems in certain
+ cases and scenarios.
+\end_layout
+
+\begin_layout Standard
+While kernel-level IO schedulers like CFQ certainly have their merits at
+ improving your workstation's IO behaviour, they are counter-productive
+ at servers with hardware RAID controllers.
+\end_layout
+
+\begin_layout Standard
+So the advice is clear:
+\series bold
+switch them off
+\series default
+
+\emph on
+in such a case
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+Even if you have a software RAID, check with
+\family typewriter
+blkreplay
+\family default
+ that any IO schedulers are
+\emph on
+really
+\emph default
+ improving things.
+ When possible, use your real workload, captured with
+\family typewriter
+blktrace
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Never use a benchmark which only delivers IOPS! As demonstrated, inappropriate
+ IOPS tuning (or choice of inappropriate components) can worsen latencies
+ so much that production can be endangered!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Always look at
+\emph on
+both
+\emph default
+ IOPS
+\emph on
+and
+\emph default
+ latencies!
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+
+\emph on
+Average
+\emph default
+ latencies, even when enriched with
+\emph on
+standard deviation
+\emph default
+, are not enough.
+ Classical statistics does not clearly describe operational problems like
+
+\series bold
+hangs
+\series default
+ and
+\series bold
+exceptionally high latency requests
+\series default
+, which may occur only rarely, but can then lead to
+\series bold
+serious incidents
+\series default
+.
+ Use a tool which can clearly display
+\emph on
+any
+\emph default
+ faulty behaviour, such as
+\family typewriter
+blkreplay
+\family default
+'s
+\series bold
+latency diagrams
+\series default
+!
+\end_layout
+
+\begin_layout Standard
+Now we come to benchmarking
+\family typewriter
+/dev/mars/lv-0
+\family default
+ placed on top of
+\family typewriter
+/dev/vginfong/lv-0
+\family default
+.
+ Notice that MARS needs to write all write requests twice: once into the
+ transaction logfile, and a second time by writeback into
+\family typewriter
+/dev/vginfong/lv-0
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+So you might expect that performance of
+\family typewriter
+/dev/mars/lv-0
+\family default
+ could be worse than at the underlying raw device.
+\end_layout
+
+\begin_layout Standard
+Nevertheless, the
+\series bold
+throughput
+\series default
+ is now measured at 4338 IOPS, which means that performance has
+\series bold
+more than doubled
+\series default
+.
+ You can also see it by the duration of the benchmark at the x axis.
+ Even the latencies have improved in many cases:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/blkreplay/MARS.MARS.mars.iosched-noop.nr_request-1020.icpu5133.mars.lv-0.g01.latency.realtime.png
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+How is it possible to be faster than a RAW device? How can this be explained?
+\end_layout
+
+\begin_layout Standard
+Look at the graphics and at the explanations from section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:The-Transaction-Logger"
+plural "false"
+caps "false"
+noprefix "false"
+
+\end_inset
+
+.
+ The key to local IO performance is the
+\series bold
+re-ordering of writeback
+\series default
+ according to ascending sector numbers.
+ This can reduce mechanical seek times of hard disks considerably, and even
+ by factors, such that it can over-compensate the doubled writes to the
+ transaction logfile, and even when both are residing at the same RAID set.
+\end_layout
+
+\begin_layout Standard
+Notice: this effect is not only dependent on total RAM size and on the
+ maximum size of the MARS temporary memory buffer (tuning parameter
+\family typewriter
+/proc/sys/mars/mars_mem_percent
+\family default
+ which defaults to a limit of 20%).
+ It is also highly dependent on the actual seek behaviour of the
+\series bold
+workload
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+For example, if you use
+\family typewriter
+dd
+\family default
+ for sequentially overwriting /dev/mars/lv-0 with a parallelism degree of
+ 1, the writeback optimization of MARS cannot be exploited.
+ However,
+\family typewriter
+dd
+\family default
+ is not an appropriate benchmarking tool, and has almost nothing to do with
+ real workloads occurring in datacenters, which typically are neither sequential,
+ nor do they have a parallelism degree of only 1.
+ Please don't try to lead any discussions about this: simply use
+\family typewriter
+blktrace
+\family default
+ to capture your real server workload, and compare it to a run of dd.
+ Only if you encounter the same behaviour as
+\family typewriter
+dd
+\family default
+, only then you can really claim that your workload is like
+\family typewriter
+dd
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Any assumptions about workloads are very dangerous: they can deviate from
+ practice not only by factors, but sometimes even by
+\emph on
+orders of magnitude
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+Notice: the writeback optimization of MARS can typically only improve performanc
+e of HDDs, but not of SSDs.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ By placing
+\family typewriter
+/mars
+\family default
+ onto its own physical device with appropriate speed, you can compensate
+ the doubled writes to some degree.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Depending on the workload and on RAID parameters,
+\family typewriter
+/mars
+\family default
+ may be better placed onto SSDs, or better be placed on HDDs.
+ There is no general rule.
+ Just use
+\family typewriter
+blktrace
+\family default
+ on your real workload, and check several configuration alternatives (also
+ different RAID levels etc) with
+\family typewriter
+blkreplay
+\family default
+.
+\end_layout
+
+\begin_layout Section
+Avoiding Inappropriate Clustermanager Types for Medium and Long-Distance
+ Replication
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Inappropriate-Clustermanger"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This section addresses some wide-spread misconceptions.
+ Its main target audience is developers, but sysadmins will profit from
+
+\series bold
+detailed explanations of problems and pitfalls
+\series default
+.
+ When the problems described in this section are solved at some point in the future,
+ this section will be shortened and some relevant parts moved to the appendix.
+\end_layout
+
+\begin_layout Standard
+Doing
+\series bold
+High Availability (HA)
+\series default
+ wrong at
+\emph on
+concept level
+\emph default
+ may easily get you into trouble, and may cost you several millions of €
+ or $ in larger installations, or even knock you out of business when disasters
+ are badly dealt with at higher levels such as clustermanagers.
+\end_layout
+
+\begin_layout Subsection
+General Cluster Models
+\end_layout
+
+\begin_layout Standard
+The most commonly known cluster model is called
+\series bold
+shared-disk
+\series default
+, and typically controlled by clustermanagers like
+\family typewriter
+PaceMaker
+\family default
+:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/shared-disk-model.fig
+ width 50col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The most important property of shared-disk is that there exists only a single
+ disk instance.
+ Nowadays, this disk often has some
+\emph on
+internal
+\emph default
+ redundancy such as RAID.
+ At
+\emph on
+system
+\emph default
+ architecture layer / network level, there exists no redundant disk at all.
+ Only the application cluster is built redundant.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ It should be immediately clear that shared-disk clusters are only suitable
+ for short-distance operations in the same datacenter.
+ Although running one of the data access lines over short distances between
+ very near-by datacenters (e.g.
+ 1 km) would be theoretically possible, there would be no sufficient protection
+ against failure of a whole datacenter.
+\end_layout
+
+\begin_layout Standard
+Both DRBD and MARS belong to a different architectural model called
+\series bold
+shared-nothing
+\series default
+:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/shared-nothing-model.fig
+ width 50col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The characteristic feature of a shared-nothing model is (additional)
+\series bold
+ redundancy at network level
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Shared-nothing
+\begin_inset Quotes eld
+\end_inset
+
+clusters
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that the term
+\begin_inset Quotes eld
+\end_inset
+
+cluster computing
+\begin_inset Quotes erd
+\end_inset
+
+ usually refers to short-distance only.
+ Long-distance coupling should be called
+\begin_inset Quotes eld
+\end_inset
+
+grid computing
+\begin_inset Quotes erd
+\end_inset
+
+ in preference.
+ As known from the scientific literature, grid computing requires different
+ concepts and methods in general.
+ Only for the sake of simplicity, we use
+\begin_inset Quotes eld
+\end_inset
+
+cluster
+\begin_inset Quotes erd
+\end_inset
+
+ and
+\begin_inset Quotes eld
+\end_inset
+
+grid
+\begin_inset Quotes erd
+\end_inset
+
+ interchangeably.
+\end_layout
+
+\end_inset
+
+
+\begin_inset Quotes erd
+\end_inset
+
+ could theoretically be built for
+\emph on
+any
+\emph default
+ distances, from short to medium to long distances.
+ However, concrete technologies of disk coupling such as synchronous operation
+ may pose practical limits on the distances (see chapter
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Use-Cases-for"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+In general, clustermanagers must fit to the model.
+ Some clustermanagers can be configured to fit to multiple models.
+ If so, this must be done properly, or you may get into serious trouble.
+\end_layout
+
+\begin_layout Standard
+Some people don't know, or they don't believe, that different architectural
+ models like shared-disk or shared-nothing will
+\emph on
+require
+\emph default
+ an
+\emph on
+appropriate
+\emph default
+ type of clustermanager and/or a different configuration.
+ Failing to do so, by selection of an inappropriate clustermanager type
+ and/or an inappropriate configuration may be hazardous.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Selection of the right model alone is not sufficient.
+ Some, if not many, clustermanagers have not been designed for long distances.
+ As explained in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Special-Requirements-for"
+
+\end_inset
+
+, long distances have further
+\series bold
+hard requirements
+\series default
+.
+ Disregarding them may also be hazardous!
+\end_layout
+
+\begin_layout Subsection
+Handover / Failover Reasons and Scenarios
+\end_layout
+
+\begin_layout Standard
+From a sysadmin perspective, there exist a number of different
+\series bold
+reasons
+\series default
+ why the application workload must be switched from the currently active
+ side A to the currently passive side B:
+\end_layout
+
+\begin_layout Enumerate
+Some
+\series bold
+defect
+\series default
+ has occurred at cluster side A or at some corresponding part of the network.
+\end_layout
+
+\begin_layout Enumerate
+Some
+\series bold
+maintenance
+\series default
+ has to be done at side A which would cause a longer downtime (e.g.
+ security kernel update or replacement of core network equipment or maintenance
+ of UPS or of the BBU cache etc - hardware isn't 24/7/365 in practice, although
+ some vendors
+\emph on
+claim
+\emph default
+ it - it is either not really true, or it becomes
+\emph on
+extremely
+\emph default
+ expensive).
+\end_layout
+
+\begin_layout Standard
+Both reasons are valid and must be automatically handled in larger installations.
+ In order to deal with all of these reasons, the following basic mechanisms
+ can be used in either model:
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Failover
+\series default
+ (triggered either manually or automatically)
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Handover
+\series default
+ (triggered manually
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Automatic triggering could be feasible for prophylactic treatments.
+\end_layout
+
+\end_inset
+
+)
+\end_layout
+
+\begin_layout Standard
+It is important to not confuse handover with failover at concept level.
+ Not only the reasons / preconditions are very different, but also the
+\emph on
+requirements
+\emph default
+.
+ Example: precondition for handover is that
+\emph on
+both
+\emph default
+ cluster sides are healthy, while precondition for failover is that
+\emph on
+some relevant(!)
+\emph default
+ failure has been
+\emph on
+detected
+\emph default
+ somewhere (whether this is
+\emph on
+really
+\emph default
+ true is another matter).
+ Typically, failover must be able to run in masses, while planned handover
+ often has lower scaling requirements.
+\end_layout
+
+\begin_layout Standard
+Not all existing clustermanagers are dealing with all of these cases (or
+ their variants) equally well, and some are not even dealing with some of
+ these cases / variants
+\emph on
+at all
+\emph default
+.
+
+\end_layout
+
+\begin_layout Standard
+Some clustermanagers cannot easily express the concept of
+\begin_inset Quotes eld
+\end_inset
+
+automatic triggering
+\begin_inset Quotes erd
+\end_inset
+
+ versus
+\begin_inset Quotes eld
+\end_inset
+
+manual triggering
+\begin_inset Quotes erd
+\end_inset
+
+ of an action.
+ There exists simply no cluster-global switch which selects either
+\begin_inset Quotes eld
+\end_inset
+
+manual mode
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+automatic mode
+\begin_inset Quotes erd
+\end_inset
+
+ (except when you start to hack the code and/or write new plugins; then
+ you might notice that there is almost no architectural layering / sufficient
+ separation between mechanism and strategy).
+ Being forced to permanently use an automatic mode for several hundreds
+ or even thousands of clusters is not only boring, but bears a considerable
+ risk when automatics make a wrong decision at hundreds of instances in parallel.
+\end_layout
+
+\begin_layout Subsection
+Granularity and Layering Hierarchy for Long Distances
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Granularity-and-Layering"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Many existing clustermanager solutions are dealing with a single cluster
+ instance, as the term
+\begin_inset Quotes eld
+\end_inset
+
+
+\emph on
+cluster
+\emph default
+manager
+\begin_inset Quotes erd
+\end_inset
+
+ suggests.
+ However, when running several hundreds or thousands of cluster instances,
+ you likely will not want to manage each of them individually.
+ In addition, failover should
+\emph on
+not only
+\emph default
+ be
+\emph on
+triggered
+\emph default
+ (not to be confused with
+\emph on
+executed
+\emph default
+) individually at cluster level, but likely
+\emph on
+also
+\emph default
+ at a higher granularity such as a room, or a whole datacenter.
+ Otherwise, some chaos is likely to happen.
+\end_layout
+
+\begin_layout Standard
+Here is what you probably will
+\series bold
+need
+\series default
+, possibly in contrast to what you may find on the market (whether OpenSource
+ or not).
+ For simplicity, the following diagram shows only two levels of granularity,
+ but can be easily extended to multiple layers of granularity, or to some
+ concept of various
+\emph on
+subsets of clusters
+\emph default
+:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/clustermanager-hierarchy.fig
+ width 70col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Notice that many existing clustermanager solutions are not addressing the
+ datacenter granularity at all.
+ Typically, they use concepts like
+\series bold
+quorums
+\series default
+ for determining failures
+\emph on
+at cluster level
+\emph default
+ solely, and then immediately executing failover of the cluster, sometimes
+ without clean architectural distinction between trigger and execution (similar
+ to the
+\begin_inset Quotes eld
+\end_inset
+
+separation of concerns
+\begin_inset Quotes erd
+\end_inset
+
+ between
+\series bold
+mechanism
+\series default
+ and
+\series bold
+strategy
+\series default
+ in Operating Systems).
+ Sometimes there is even no internal software layering / modularization
+ according to this separation of concerns at all.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ When there is no distinction between different levels of granularity, you
+ are hopelessly bound to a non-extensible and thus non-adaptable system
+ when you need to operate masses of clusters.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ A lacking distinction between automatic mode and manual mode, and/or lack
+ of corresponding
+\series bold
+architectural software layers
+\series default
+ is not only a blatant disregard of well-established best practices of
+
+\series bold
+software engineering
+\series default
+, but will bind you even more firmly to an inflexible system.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Terminology: for practical reasons, we use the general term
+\begin_inset Quotes eld
+\end_inset
+
+clustermanager
+\begin_inset Quotes erd
+\end_inset
+
+ also for speaking about layers dealing with higher granularity, such as
+ datacenter layers, and also for long-distance replication scenarios, although
+ some terminology from grid computing would be more appropriate in a scientific
+ background.
+\end_layout
+
+\begin_layout Standard
+Please consider the following: when it comes to long-distance HA, the above
+ layering architecture is also motivated by vastly different numbers of
+ instances for each layer.
+ Ideally, the topmost automatics layer should be able to oversee several
+ datacenters in parallel, in order to cope with (almost) global network
+ problems such as network partitions.
+ Additionally, it should also detect single cluster failures, or intermediate
+ problems like
+\begin_inset Quotes eld
+\end_inset
+
+rack failure
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+room failure
+\begin_inset Quotes erd
+\end_inset
+
+, as well as various types of (partial / intermediate) (replication) network
+ failures.
+ Incompatible decisions at each of the different granularities would be
+ a no-go in practice.
+ Somewhere and somehow, you need one single
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+If you have
+\emph on
+logical pairs of datacenters
+\emph default
+ which are firmly bound together, you could also have several topmost automatics
+ instances, e.g.
+ for each
+\emph on
+pair
+\emph default
+ of datacenters.
+ However, that would be very
+\series bold
+inflexible
+\series default
+, because then you cannot easily mix locations or migrate your servers between
+ datacenters.
+ Using
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas with MARS would also become a nightmare.
+ In your own interest, please don't create any concepts where masses of
+ hardware are firmly bound to fixed constants at some software layers.
+\end_layout
+
+\end_inset
+
+ top-most
+\emph on
+logical
+\emph default
+ problem detection / ranking instance, which should be
+\emph on
+internally distributed
+\emph default
+ of course, typically using some
+\series bold
+distributed consensus protocol
+\series default
+; but in contrast to many published distributed consensus algorithms it
+ should be able to work with multiple granularities at the same time.
+\end_layout
+
+\begin_layout Subsection
+Methods and their Appropriateness
+\end_layout
+
+\begin_layout Subsubsection
+Failover Methods
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Failover-Methods"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Failover methods are only needed in case of an incident.
+ They should not be used for regular handover.
+\end_layout
+
+\begin_layout Paragraph
+STONITH-like Methods
+\end_layout
+
+\begin_layout Standard
+STONITH = Shoot The Other Node In The Head
+\end_layout
+
+\begin_layout Standard
+These methods are widely known, although they have several serious drawbacks.
+ Some people even believe that
+\emph on
+any
+\emph default
+ clustermanager must
+\emph on
+always
+\emph default
+ have some STONITH-like functionality.
+ This is wrong.
+ There
+\emph on
+exist
+\emph default
+ alternatives, as shown in the next paragraph.
+\end_layout
+
+\begin_layout Standard
+The most obvious drawback is that STONITH will always create a
+\series bold
+damage
+\series default
+, by definition.
+\end_layout
+
+\begin_layout Standard
+Example: a typical contemporary STONITH implementation uses IPMI for automatical
+ly powering off your servers, or at least pushes the (virtual) reset button.
+ This will
+\emph on
+always
+\emph default
+ create a certain type of damage: the affected systems will definitely not
+ be available, at least for some time until they have (manually) rebooted.
+\end_layout
+
+\begin_layout Standard
+This is a conceptual contradiction: the reason for starting failover is
+ that you want to restore availability as soon as possible, but in order
+ to do so you will first
+\emph on
+destroy
+\emph default
+ the availability of a particular
+\emph on
+component
+\emph default
+.
+ This may be counter-productive.
+\end_layout
+
+\begin_layout Standard
+Example: when your hot standby node B does not work as expected, or if it
+ works even
+\emph on
+worse
+\emph default
+ than A before, you will lose some time until you 
+\emph on
+can
+\emph default
+ become operational again at the old side A.
+\end_layout
+
+\begin_layout Standard
+Here is an example method for handling a failure scenario.
+ The old active side A is assumed to be no longer healthy.
+ The method uses a sequential state transition chain with a STONITH-like
+ step:
+\end_layout
+
+\begin_layout Description
+Phase1 Check whether the hot standby B is currently usable.
+ If this is violated (which may happen during certain types of disasters),
+ abort the failover for any affected resources.
+\end_layout
+
+\begin_layout Description
+Phase2
+\emph on
+Try
+\emph default
+ to shutdown the damaged side A (in the
+\emph on
+hope
+\emph default
+ that there is no
+\emph on
+serious
+\emph default
+ damage).
+\end_layout
+
+\begin_layout Description
+Phase3 In case phase2 did not work during a grace period / after a timeout,
+ assume that A is badly damaged and therefore STONITH it.
+\end_layout
+
+\begin_layout Description
+Phase4 Start the application at the hot standby B.
+\end_layout
+
+\begin_layout Standard
+Notice: any cleanup actions, such as
+\series bold
+repair
+\series default
+ of defective hard- or software etc, are outside the scope of failover processes.
+ Typically, they are executed much later when restoring redundancy.
+\end_layout
+
+\begin_layout Standard
+Also notice: this method is a
+\emph on
+heavily
+\emph default
+ distributed one, in the sense that sequential actions are alternated multiple
+ times on different hosts.
+ This is known to be cumbersome in distributed systems, in particular in
+ presence of network problems.
+\end_layout
+
+\begin_layout Standard
+\begin_inset CommandInset label
+LatexCommand label
+name "Phase4-in-more"
+
+\end_inset
+
+Phase4 in more detail for DRBD, augmented with some pseudo code for application
+ control:
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+drbdadm disconnect all
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+drbdadm primary --force all
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+applicationmanager start all
+\end_layout
+
+\begin_layout Standard
+The same phase4 using MARS:
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+marsadm pause-fetch all
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+marsadm primary --force all
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+applicationmanager start all
+\end_layout
+
+\begin_layout Standard
+This sequential 4-phase method is far from optimal, for the following reasons:
+\end_layout
+
+\begin_layout Itemize
+The method tries to handle both failover and handover scenarios with one
+ single sequential recipe.
+ In case of a true failover scenario where it is
+\emph on
+already known for sure
+\emph default
+ that side A is badly damaged, this method will unnecessarily waste time
+ for phase 2.
+ This could be fixed by introduction of a conceptual distinction between
+ handover and failover, but it would not fix the following problems.
+\end_layout
+
+\begin_layout Itemize
+Before phase4 is started (which will re-establish the service from a user's
+ perspective), a lot of time is wasted by
+\emph on
+both
+\emph default
+ phases 2
+\emph on
+and
+\emph default
+ 3.
+ Even if phase 2 would be skipped, phase 3 would unnecessarily cost some
+ time.
+ In the next paragraph, an alternative method is explained which eliminates
+ any unnecessary waiting time at all.
+\end_layout
+
+\begin_layout Itemize
+The above method is adapted to the shared-disk model.
+ It does not take advantage of the shared-nothing model, where further possibili
+ties for better solutions exist.
+\end_layout
+
+\begin_layout Itemize
+In case of long-distance network partitions and/or sysadmin / system management
+ subnetwork outages, you may not even be able to (remotely) start STONITH
+ at all.
+ Thus the above method misses an important failure scenario.
+\end_layout
+
+\begin_layout Standard
+Some people seem to have a
+\emph on
+binary
+\emph default
+ view of the healthiness of a system: in their view, a system is either
+ operational, or it is damaged.
+ This kind of view is ignoring the fact that some systems may be half-alive,
+ showing only
+\emph on
+minor
+\emph default
+ problems, or problems occurring only from time to time.
+\end_layout
+
+\begin_layout Standard
+It is obvious that damaging a healthy system is a bad idea by itself.
+ Even
+\emph on
+generally
+\emph default
+ damaging a half-alive system in order to
+\begin_inset Quotes eld
+\end_inset
+
+fix
+\begin_inset Quotes erd
+\end_inset
+
+ problems is not generally a good idea, because it may increase the damage
+ when you don't know the
+\emph on
+real
+\emph default
+ reason
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Example, occurring in masses: an incorrectly installed bootloader, or a
+ wrong BIOS boot priority order which unexpectedly leads to hangs or infinite
+ reboot cycles once the DHCP or BOOTP servers are no longer available /
+ reachable.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Even worse: in a distributed system
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice: the STONITH concept is more or less associated with short-distance
+ scenarios where
+\series bold
+crossover cables
+\series default
+ or similar equipment are used.
+ The assumption is that crossover cables can't go defective, or at least
+ it would be an extremely unlikely scenario.
+ For long-distance replication, this assumption is simply not true.
+\end_layout
+
+\end_inset
+
+ you sometimes
+\emph on
+cannot(!)
+\emph default
+ know whether a system is healthy, or to what degree it is healthy.
+ Typical STONITH methods as used in some contemporary clustermanagers are
+
+\series bold
+assuming a worst case
+\series default
+, even if that worst case is currently not for real.
+\end_layout
+
+\begin_layout Standard
+Therefore, avoid the following
+\series bold
+fundamental flaws
+\series default
+ in failover concepts and healthiness models, which apply to implementors
+ / configurators of clustermanagers:
+\end_layout
+
+\begin_layout Itemize
+Don't mix up knowledge with conclusions about a (sub)system, and also don't
+ mix this up with the real state of that (sub)system.
+ In reality, you don't have any knowledge about a complex distributed system.
+ You only may have
+\emph on
+some
+\emph default
+ knowledge about
+\emph on
+some
+\emph default
+ parts of the system, but you cannot
+\begin_inset Quotes eld
+\end_inset
+
+see
+\begin_inset Quotes erd
+\end_inset
+
+ a complex distributed system as a whole.
+ What you think is your knowledge, isn't knowledge in reality: in many cases,
+ it is
+\emph on
+conclusion
+\emph default
+, not knowledge.
+ Don't mix this up!
+\end_layout
+
+\begin_layout Itemize
+Some systems are more complex than your model of them.
+ Don't neglect important parts (such as networks, routers, switches, cables,
+ plugs) which may lead you to wrong conclusions!
+\end_layout
+
+\begin_layout Itemize
+Don't restrict your mind to boolean models of healthiness.
+ Doing so can easily create unnecessary damage by construction, and even
+ at concept level.
+ You should know from software engineering that defects in concepts or models
+ are much more serious than simple bugs in implementations.
+ Choosing the wrong model cannot be fixed as easily as a typical bug or
+ a typo.
+\end_layout
+
+\begin_layout Itemize
+Try to deduce the state of a system as
+\series bold
+reliably
+\series default
+ as possible.
+ If you don't know something for sure, don't generally assume that it has
+ gone wrong.
+ Don't confuse missing knowledge with the conclusion that something is bad.
+ Boolean algebra restricts your mind to either
+\begin_inset Quotes eld
+\end_inset
+
+good
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+bad
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Use at least
+\series bold
+tri-state algebra
+\series default
+ which has a means for expressing
+\series bold
+
+\begin_inset Quotes eld
+\end_inset
+
+unknown
+\begin_inset Quotes erd
+\end_inset
+
+
+\series default
+.
+ Even better: attach a probability to anything you (believe to) know.
+ Errare humanum est: nothing is absolutely sure.
+\end_layout
+
+\begin_layout Itemize
+Oversimplification: don't report an
+\begin_inset Quotes eld
+\end_inset
+
+unknown
+\begin_inset Quotes erd
+\end_inset
+
+ or even a
+\begin_inset Quotes eld
+\end_inset
+
+broken
+\begin_inset Quotes erd
+\end_inset
+
+ state for a complex system whenever a smaller subsystem exists for which
+ you have some knowledge (or you can conclude something about it with reasonable
+ evidence).
+ Otherwise, your users / sysadmins may draw wrong conclusions, and assume
+ that the whole system is broken, while in reality only some minor part
+ has some minor problem.
+ Users could then likely make wrong decisions, which may then easily lead
+ to bigger damages.
+\end_layout
+
+\begin_layout Itemize
+Murphy's law:
+\series bold
+never assume that something can't go wrong!
+\series default
+ Doing so is a blatant misconception at topmost level: the
+\emph on
+purpose
+\emph default
+ of a clustermanager is creating High Availability (HA) out of more or less
+
+\begin_inset Quotes eld
+\end_inset
+
+unreliable
+\begin_inset Quotes erd
+\end_inset
+
+ components.
+ It is the damn duty of both a clustermanager and its configurator to try
+ to compensate
+\emph on
+any
+\emph default
+ failures,
+\emph on
+regardless of their probability
+\emph default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Never claim that something has only low probability (and therefore it were
+ not relevant).
+ In the HA area, you simply
+\series bold
+cannot know
+\series default
+ that, because you typically have
+\emph on
+sporadic
+\emph default
+ incidents.
+ In extreme cases, the
+\emph on
+purpose
+\emph default
+ of your HA solution is protection against 1 failure per 10 years.
+ You simply don't have the time to wait for creating incident statistics
+ about that!
+\end_layout
+
+\end_inset
+
+, as best as possible.
+\end_layout
+
+\begin_layout Itemize
+Never confuse
+\series bold
+probability
+\series default
+ with
+\series bold
+ expectancy value!
+\series default
+If you don't know the mathematical term
+\begin_inset Quotes eld
+\end_inset
+
+expectancy value
+\begin_inset Quotes erd
+\end_inset
+
+, or if you don't know what this means
+\emph on
+in practice
+\emph default
+, don't take responsibility for millions of € or $.
+\end_layout
+
+\begin_layout Itemize
+When operating masses of hard- and software: never assume that a particular
+ failure can occur only at a low number of instances.
+ There are
+\series bold
+\emph on
+unknown(!)
+\emph default
+ systematic errors
+\series default
+ which may pop up at the wrong time and in huge masses when you don't expect
+ them.
+\end_layout
+
+\begin_layout Itemize
+Multiple layers of fallback:
+\emph on
+any
+\emph default
+ action can fail.
+ Be prepared to have a plan B, and even a plan C, and even better a plan
+ D, wherever possible.
+\end_layout
+
+\begin_layout Itemize
+Never increase any damage anywhere, unnecessarily! Always try to
+\emph on
+minimize
+\emph default
+ any damage! It can be mathematically proven that in deterministic probabilistic
+ systems having finite state, increases of a damage level
+\emph on
+at the wrong place
+\emph default
+ will
+\emph on
+introduce
+\emph default
+ an
+\emph on
+additional
+\emph default
+
+\emph on
+risk
+\emph default
+ of getting into an
+\series bold
+endless loop
+\series default
+.
+ This is also true for nondeterministic systems, as known from formal language
+ theory
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Finite automata are known to be transformable to deterministic ones, usually
+ by an exponential increase in the number of states.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+Use the
+\series bold
+best effort principle
+\series default
+.
+ You should be aware of the following fact: in general, it is impossible
+ to create an
+\emph on
+absolutely reliable system
+\emph default
+ out of unreliable components.
+ You can
+\emph on
+lower
+\emph default
+ the risk of failures to any
+\begin_inset Formula $\epsilon>0$
+\end_inset
+
+ by investing a lot of resources and of money, but whatever you do:
+\begin_inset Formula $\epsilon=0$
+\end_inset
+
+ is impossible.
+ Therefore, be careful with boolean algebra.
+ Prefer approximation methods / optimizing methods instead.
+ Always do
+\emph on
+your
+\emph default
+ best, instead of trying to reach a
+\emph on
+global
+\emph default
+ optimum which likely does not exist at all (because the
+\begin_inset Formula $\epsilon$
+\end_inset
+
+ can only
+\emph on
+converge
+\emph default
+ to an optimum, but will never actually reach it).
+ The best effort principle means the following: if you discover a method
+ for improving your operating state by reduction of a (potential) damage
+ in a reasonable time and with reasonable effort, then
+\series bold
+simply do it
+\series default
+.
+ Don't argue that a particular step is no 100% solution for all of your
+ problems.
+
+\emph on
+Any
+\emph default
+
+\emph on
+improvement
+\emph default
+ is valuable.
+
+\series bold
+Don't miss any valuable step
+\series default
+ having reasonable costs with respect to your budget.
+ Missing valuable measures which have low costs is certainly a violation
+ of the best effort principle, because you are not doing
+\emph on
+your
+\emph default
+ best.
+ Keep that in mind.
+\begin_inset Newline newline
+\end_inset
+
+If you have
+\emph on
+understood
+\emph default
+ this (e.g.
+ deeply think at least one day about it), you will no longer advocate STONITH
+ methods
+\emph on
+in general
+\emph default
+, when there are alternatives.
+ STONITH methods are only valuable when you
+\emph on
+know in advance
+\emph default
+ that the final outcome (after reboot) will most likely be better, and that
+ waiting for reboot will most likely
+\emph on
+pay off
+\emph default
+.
+ In general, this condition is
+\emph on
+not true
+\emph default
+ if you have a healthy hot standby system.
+ This should be easy to see.
+ But there exist well-known clustermanager solutions / configurations blatantly
+ ignoring
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+For some
+\emph on
+special(!)
+\emph default
+ cases of the shared-disk model, there exist some justifications for doing
+ STONITH
+\emph on
+before
+\emph default
+ starting the application at the hot standby.
+ Under certain circumstances, it can happen that system A running amok could
+ destroy the data on your single shared disk (example: a filesystem doubly
+ mounted
+\emph on
+in parallel
+\emph default
+, which will certainly destroy your data, except you are using
+\family typewriter
+ocfs2
+\family default
+ or suchlike).
+ This argument is only valid for
+\emph on
+passive
+\emph default
+ disks which are
+\emph on
+directly
+\emph default
+ attached to
+\emph on
+both
+\emph default
+ systems A and B, such that there is no
+\emph on
+external
+\emph default
+ means for fencing the disk.
+ In case of iSCSI running over ordinary network equipment such as routers
+ or switches, the argument
+\begin_inset Quotes eld
+\end_inset
+
+fencing the disk is otherwise not possible
+\begin_inset Quotes erd
+\end_inset
+
+ does not apply.
+ You can interrupt the iSCSI connection at the network gear, or you can often
+ do it at cluster A or at the iSCSI target.
+ Even commercial storage appliances speaking iSCSI can be remotely controlled
+ for forcefully aborting iSCSI sessions.
+ In modern times, the STONITH method has no longer such a justification.
+ The justification stems from ancient times when a disk was a purely passive
+ mechanical device, and its disk controller was part of the server system.
+\end_layout
+
+\end_inset
+
+ this.
+ Only when the former standby system does not work as expected (this means
+ that
+\emph on
+all
+\emph default
+ of your redundant systems are not healthy enough for your application),
+
+\emph on
+only then
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that STONITH may be needed for (manual or partially automatic)
+\emph on
+repair
+\emph default
+ in some cases, e.g.
+ when you know that a system has a kernel crash.
+ Don't mix up the repair phase with failover or handover phases.
+ Typically, they are executed at different times.
+ The repair phase is outside the scope of this section.
+\end_layout
+
+\end_inset
+
+
+\emph default
+ STONITH is inevitable as a 
+\emph on
+last resort
+\emph default
+ option.
+\begin_inset Newline newline
+\end_inset
+
+In short: blindly using STONITH without true need during failover is a violation
+ of the best effort principle.
+ You are simply not doing your best.
+\end_layout
+
+\begin_layout Itemize
+When your budget is limited, carefully select those improvements which make
+ your system
+\series bold
+as reliable as possible
+\series default
+, given your fixed budget.
+\end_layout
+
+\begin_layout Itemize
+Create statistics on the duration of your actions.
+ Based on this, try to get a
+\emph on
+balanced
+\emph default
+ optimum between time and costs.
+\end_layout
+
+\begin_layout Itemize
+Whatever actions you can
+\series bold
+start in parallel
+\series default
+ for saving time, do it.
+ Otherwise you are disregarding the best effort principle, and your solution
+ will be sub-optimal.
+ You will require deep knowledge of parallel systems, as well as experience
+ with dealing with problems like (distributed) races.
+ Notice that
+\emph on
+any
+\emph default
+ distributed system is
+\emph on
+inherently parallel
+\emph default
+.
+ Don't believe that sequential methods can deliver an optimum solution in
+ such a difficult area.
+\end_layout
+
+\begin_layout Itemize
+If you don't have the
+\series bold
+necessary skills
+\series default
+ for (a) recognizing already existing parallelism, (b) dealing with parallelism
+ at concept level, (c) programming and/or configuring parallelism race-free
+ and deadlock-free (or if you even don't know what a race condition is and
+ where it may occur in practice), then don't take responsibility for millions
+ of € or $.
+\end_layout
+
+\begin_layout Itemize
+Avoid hard timeouts wherever possible.
+ Use
+\series bold
+adaptive timeouts
+\series default
+ instead.
+ Reason: depending on hardware or workload, the same action A may take a
+ very short time on cluster 1, but take a very long time on cluster 2.
+ If you need to guard action A from hanging (which is almost always the
+ case because of Murphy's law), don't configure any fixed timeout for it.
+ When having several hundreds of clusters, you would need to use the
+\emph on
+worst case value
+\emph default
+, which is the longest time occurring somewhere at the very slow clusters
+ / slow parts of the network.
+ This wastes a lot of time in case one of the fast clusters is hanging.
+ Adaptive timeouts work differently: they use a kind of
+\begin_inset Quotes eld
+\end_inset
+
+progress bar
+\begin_inset Quotes erd
+\end_inset
+
+ to monitor the
+\emph on
+progress
+\emph default
+ of an action.
+ They will abort only if there is
+\emph on
+no progress
+\emph default
+ for a certain amount of time.
+ Hint: among others,
+\family typewriter
+marsadm view-*-rest
+\family default
+ commands or macros are your friend.
+\end_layout
+
+\begin_layout Paragraph
+ITON = Ignore The Other Node
+\end_layout
+
+\begin_layout Standard
+This means
+\series bold
+fencing from application traffic
+\series default
+, and can be used as an alternative to STONITH when done properly.
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/fencing-hierarchy.fig
+ width 60col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Fencing from application traffic is best suited for the shared-nothing model,
+ but can also be adapted to the shared-disk model with some quirks.
+\end_layout
+
+\begin_layout Standard
+The idea is simple: always route your application network traffic to the
+ current (logically) active side, whether it is currently A or B.
+ Just don't route any application requests to the current (logically) passive
+ side at all.
+\end_layout
+
+\begin_layout Standard
+For failover (and
+\emph on
+only
+\emph default
+ for that), you
+\emph on
+should not care about
+\emph default
+ any split brain occurring at the low-level generic block device:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/split-brain-history.fig
+ width 50col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Although having a split brain at the generic low-level block device, you
+ now define the
+\begin_inset Quotes eld
+\end_inset
+
+logically active
+\begin_inset Quotes erd
+\end_inset
+
+ and
+\begin_inset Quotes eld
+\end_inset
+
+logically passive
+\begin_inset Quotes erd
+\end_inset
+
+ side by yourself by
+\emph on
+logically ignoring
+\emph default
+ the
+\begin_inset Quotes eld
+\end_inset
+
+wrong
+\begin_inset Quotes erd
+\end_inset
+
+ side as defined by yourself:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/split-brain-resolved.fig
+ width 50col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+This is possible because the generic block devices provided by DRBD or MARS
+ are completely
+\series bold
+agnostic
+\series default
+ of the
+\begin_inset Quotes eld
+\end_inset
+
+meaning
+\begin_inset Quotes erd
+\end_inset
+
+ of either version A or B.
+ Higher levels such as clustermanagers (or humans like sysadmins) can assign
+ them a meaning like
+\begin_inset Quotes eld
+\end_inset
+
+relevant
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+not relevant
+\begin_inset Quotes erd
+\end_inset
+
+, or
+\begin_inset Quotes eld
+\end_inset
+
+logically active
+\begin_inset Quotes erd
+\end_inset
+
+ or
+\begin_inset Quotes eld
+\end_inset
+
+logically passive
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+As a result of fencing from application traffic, the
+\begin_inset Quotes eld
+\end_inset
+
+logically passive
+\begin_inset Quotes erd
+\end_inset
+
+ side will
+\emph on
+logically
+\emph default
+ cease any actions such as updating user data, even if it is
+\begin_inset Quotes eld
+\end_inset
+
+physically active
+\begin_inset Quotes erd
+\end_inset
+
+ during split-brain (when two primaries exist in DRBD or MARS sense
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Hint: some clustermanagers and/or some people seem to define the term
+\begin_inset Quotes eld
+\end_inset
+
+split-brain
+\begin_inset Quotes erd
+\end_inset
+
+ differently from DRBD or MARS.
+ In the context of generic block devices, split brain means that the
+\emph on
+history
+\emph default
+ of both versions has been split to a Y-like
+\series bold
+fork
+\series default
+ (for whatever reason), such that re-joining them
+\emph on
+incrementally
+\emph default
+ by ordinary write operations is no longer guaranteed to be possible.
+ As a slightly simplified definition, you might alternatively use the definition
+
+\begin_inset Quotes eld
+\end_inset
+
+two incompatible primaries are existing in parallel
+\begin_inset Quotes erd
+\end_inset
+
+, which means almost the same in practice.
+ Details of formal semantics are outside the scope of this treatment.
+\end_layout
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+If you already have some load balancing, or BGP, or another
+\emph on
+mechanism
+\emph default
+ for dynamic routing, you already have an important part for the ITON method.
+ Additionally, ensure by an appropriate
+\emph on
+strategy
+\emph default
+ that your balancer status / BGP announcement etc always coincides with
+ the
+\begin_inset Quotes eld
+\end_inset
+
+logically active
+\begin_inset Quotes erd
+\end_inset
+
+ side (recall that even during split-brain
+\emph on
+you
+\emph default
+ must define
+\begin_inset Quotes eld
+\end_inset
+
+logically active
+\begin_inset Quotes erd
+\end_inset
+
+
+\series bold
+uniquely
+\series default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+A possible strategy is to use a Lamport clock for route changes: the change
+ with the most recent Lamport timestamp will always win over previous changes.
+\end_layout
+
+\end_inset
+
+ by yourself).
+\end_layout
+
+\begin_layout Standard
+Example:
+\end_layout
+
+\begin_layout Description
+Phase1 Check whether the hot standby B is currently usable.
+ If this is violated (which may happen during certain types of disasters),
+ abort the failover for any affected resources.
+\end_layout
+
+\begin_layout Description
+Phase2 Do the following
+\emph on
+in parallel
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+For database applications where no transactions should get lost, you should
+ slightly modify the order of operations: first fence the old side A, then
+ start the application at standby side B.
+ However, be warned that even this cannot guarantee that no transaction
+ is lost.
+ When the network between A and B is interrupted
+\emph on
+before
+\emph default
+ the incident happens, DRBD will automatically disconnect, and MARS will
+ show a lagbehind.
+ In order to fully eliminate this possibility, you can either use DRBD and
+ configure it to hang forever during network outages (such that users will
+ be unable to commit any transactions at all), or you can use the shared-disk
+ model instead.
+ But in the latter case, you are introducing a SPOF at the single shared
+ disk.
+ The former case is logically almost equivalent to shared-disk, but avoiding
+ some parts of the physical SPOF.
+ In a truly distributed system, the famous CAP theorem is limiting your
+ possibilities.
+ Therefore, no general solution exists fulfilling all requirements at the
+ same time.
+\end_layout
+
+\end_inset
+
+:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Itemize
+Start all affected applications at the hot standby B.
+ This can be done with the same DRBD or MARS procedure as described
+\begin_inset CommandInset ref
+LatexCommand vpageref
+reference "Phase4-in-more"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+Fence A by fixedly routing all affected application traffic to B.
+\end_layout
+
+\end_deeper
+\begin_layout Standard
+That's all which has to be done for a shared-nothing model.
+ Of course, this will likely produce a split-brain (even when using DRBD
+ in place of MARS), but that will not matter from a user's perspective,
+ because the users will no longer
+\begin_inset Quotes eld
+\end_inset
+
+see
+\begin_inset Quotes erd
+\end_inset
+
+ the
+\begin_inset Quotes eld
+\end_inset
+
+logically passive
+\begin_inset Quotes erd
+\end_inset
+
+ side A through their network.
+ Only during the relatively small time period where application traffic
+ was going to the old side A while not replicated to B due to the incident,
+ a very small number of updates
+\emph on
+could
+\emph default
+ have been lost.
+ In fields like webhosting, this is taken into account.
+ Users will usually not complain when some (smaller amount of) data is lost
+ due to split-brain.
+ They will complain when the service is unavailable.
+\end_layout
+
+\begin_layout Standard
+This method is the fastest for restoring availability, because it doesn't
+ try to execute any (remote) action at side A.
+ Only from a sysadmin's perspective, there remain some cleanup tasks to
+ be done during the following repair phase, such as split-brain resolution,
+ which are outside the scope of this treatment.
+\end_layout
+
+\begin_layout Standard
+By running the application fencing step
+\emph on
+sequentially
+\emph default
+ (including waiting for its partial success, such that the old side A
+ can no longer be reached by any users) in front of the failover step, you
+ may minimize the amount of lost data, but at the cost of total duration.
+ Your service will take longer to be available again, while the amount of
+ lost data is typically somewhat smaller.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ A few people might clamour when some data is lost.
+ In long-distance replication scenarios with high update traffic, there
+ is
+\emph on
+simply no way at all
+\emph default
+ for guaranteeing that no data can be lost ever.
+ According to the laws of Einstein and the laws of Distributed Systems like
+ the famous CAP theorem, this isn't the fault of DRBD+proxy or MARS, but
+ simply the
+\emph on
+consequence
+\emph default
+ of having long distances.
+ If you want to protect against data loss as best as possible, then don't
+ use
+\begin_inset Formula $k=2$
+\end_inset
+
+ replicas.
+ Use
+\begin_inset Formula $k\geq4$
+\end_inset
+
+, and spread them over different distances, such as mixed small + medium
+ + long distances.
+ Future versions of MARS will support adaptive pseudo-synchronous modes,
+ which will allow individual adaptation to network latencies / distances.
+\end_layout
+
+\begin_layout Standard
+The ITON method can be adapted to shared-disk by additionally fencing the
+ common disk from the (presumably) failed cluster node A.
+\end_layout
+
+\begin_layout Subsubsection
+Handover Methods
+\end_layout
+
+\begin_layout Standard
+Planned handover is conceptually simpler, because both sides must be (almost)
+ healthy as a
+\emph on
+precondition
+\emph default
+.
+ There are simply no pre-existing failures to deal with.
+\end_layout
+
+\begin_layout Standard
+Here is an example using DRBD, some application commands denoted as pseudo
+ code:
+\end_layout
+
+\begin_layout Enumerate
+at side A:
+\family typewriter
+applicationmanager stop all
+\end_layout
+
+\begin_layout Enumerate
+at side A:
+\family typewriter
+drbdadm secondary all
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+drbdadm primary all
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+applicationmanager start all
+\end_layout
+
+\begin_layout Standard
+MARS already has a conceptual distinction between handover and failover.
+ With MARS, it becomes even simpler, because a generic handover procedure
+ is already built in:
+\end_layout
+
+\begin_layout Enumerate
+at side A:
+\family typewriter
+applicationmanager stop all
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+marsadm primary all
+\end_layout
+
+\begin_layout Enumerate
+at side B:
+\family typewriter
+applicationmanager start all
+\end_layout
+
+\begin_layout Subsubsection
+Hybrid Methods
+\end_layout
+
+\begin_layout Standard
+In general, a planned handover may fail at any stage.
+ Notice that such a failure is also a failure, but (partially) caused by
+ the planned handover.
+ You have the following alternatives for automatically dealing with such
+ cases:
+\end_layout
+
+\begin_layout Enumerate
+In case of a failure, switch back to the old side A.
+\end_layout
+
+\begin_layout Enumerate
+Instead, forcefully switch to the new side B, similar to the methods described
+ in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Failover-Methods"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Similar options exist for a failed failover (at least in theory), but chances
+ are lower for actually recovering if you have only
+\begin_inset Formula $k=2$
+\end_inset
+
+ replicas in total.
+\end_layout
+
+\begin_layout Standard
+Whatever you decide to do in what case in whatever priority order, whether
+ you decide it in advance or during the course of a failing action: it simply
+ means that according to the best effort principle, you should
+\series bold
+never leave your system in a broken state
+\series default
+ when there exists a chance to recover availability with any method.
+\end_layout
+
+\begin_layout Standard
+Therefore, you should
+\emph on
+implement
+\emph default
+ neither handover nor failover in their pure forms.
+ Always implement hybrid forms following the best effort principle.
+\end_layout
+
+\begin_layout Subsection
+Special Requirements for Long Distances
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Special-Requirements-for"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Most contemporary clustermanagers have been constructed for short distance
+ shared-nothing clusters, or even for
+\emph on
+local
+\emph default
+ shared-nothing clusters (c.f.
+ DRBD over crossover cables), or even for shared-disk clusters (
+\emph on
+originally
+\emph default
+, when their
+\emph on
+concepts
+\emph default
+ were developed).
+ Blindly using them for long-distance replication without modification /
+ adaptation bears some additional risks.
+\end_layout
+
+\begin_layout Itemize
+Notice that long-distance replication always
+\emph on
+requires
+\emph default
+ a
+\series bold
+shared-nothing
+\series default
+ model.
+\end_layout
+
+\begin_layout Itemize
+As a consequence,
+\series bold
+split brain
+\series default
+ can appear
+\emph on
+regularly
+\emph default
+ during failover.
+ There is no way for preventing it! This is an
+\emph on
+inherent property
+\emph default
+ of distributed systems, not limited to MARS (e.g.
+ also occurring with DRBD if you try to use it over long distances).
+ Therefore, you
+\emph on
+must
+\emph default
+ deal with occurrences of split-brain as a
+\emph on
+requirement
+\emph default
+.
+\end_layout
+
+\begin_layout Itemize
+The probability of
+\series bold
+network partitions
+\series default
+ is much higher: although you should have been required by Murphy's law
+ to deal with network partitions already in short-distance scenarios, it
+ now becomes
+\emph on
+mandatory
+\emph default
+.
+\end_layout
+
+\begin_layout Itemize
+Be prepared that in case of certain types of (more or less global) internet
+ partitions, you may not be able to trigger STONITH actions
+\emph on
+at all
+\emph default
+.
+ Therefore,
+\series bold
+fencing of application traffic
+\series default
+ is
+\emph on
+mandatory
+\emph default
+.
+\end_layout
+
+\begin_layout Section
+
+\family typewriter
+systemd
+\family default
+ Templates
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:systemd-Templates"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Starting with
+\family typewriter
+mars0.1stable57
+\family default
+ (resp.
+
+\family typewriter
+mars0.1abeta9
+\family default
+), you may use
+\family typewriter
+systemd
+\family default
+ as a cluster manager at the Mechanics Layer as explained in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Granularity-and-Layering"
+
+\end_inset
+
+.
+ MARS will replicate some
+\family typewriter
+systemd
+\family default
+-relevant state information across the (big) cluster, so there is some limited
+ remote operation support.
+ In particular, automated handover via
+\family typewriter
+marsadm primary $resource
+\family default
+ is supported.
+ More features will be added to future releases.
+\end_layout
+
+\begin_layout Subsection
+Why
+\family typewriter
+systemd
+\family default
+?
+\end_layout
+
+\begin_layout Standard
+All major Linux distributions are now
+\family typewriter
+systemd
+\family default
+ based.
+ It is the new quasi standard.
+ Although there have been some discussions in the community about its merits
+ and shortcomings, it appears to be accepted now in large parts of the Linux
+ world.
+\end_layout
+
+\begin_layout Standard
+Systemd has a few advantages:
+\end_layout
+
+\begin_layout Enumerate
+It is running as
+\family typewriter
+init
+\family default
+ process under the reserved
+\family typewriter
+pid=1
+\family default
+.
+ If it ever died, then your system would die.
+ There is no need for adding a new MARS clustermanager daemon or similar,
+ which could fail independently from other parts of the system.
+\end_layout
+
+\begin_layout Enumerate
+Although
+\family typewriter
+systemd
+\family default
+ has been criticised as being
+\begin_inset Quotes eld
+\end_inset
+
+monolithic
+\begin_inset Quotes erd
+\end_inset
+
+ (referring to its internal software architecture), its
+\emph on
+usage
+\emph default
+ by sysadmins is easily decomposable into many plugins called
+\series bold
+units
+\series default
+.
+\end_layout
+
+\begin_layout Enumerate
+Local LXC containers, local VMs,
+\family typewriter
+iSCSI
+\family default
+ exports,
+\family typewriter
+nfs
+\family default
+ exports and many other parts of the system are often already controlled
+ by
+\family typewriter
+systemd
+\family default
+.
+ Together with
+\family typewriter
+udev
+\family default
+ and other parts, it already controls devices, LVM, mountpoints, etc.
+ Since MARS is only a particular
+\emph on
+component
+\emph default
+ in a bigger complicated stack, it is an advantage to use the same (more
+ or less standardized and well-integrated) tools for managing the whole
+ stack.
+\end_layout
+
+\begin_layout Standard
+Systemd also has a few disadvantages:
+\end_layout
+
+\begin_layout Enumerate
+It is not accepted everywhere.
+ Therefore the
+\family typewriter
+systemd
+\family default
+ template extensions of
+\family typewriter
+marsadm
+\family default
+ are not mandatory for MARS operations.
+ You can implement your own alternatives when necessary.
+\end_layout
+
+\begin_layout Enumerate
+It can be messy to deal with.
+ In particular, it can sometimes
+\emph on
+believe
+\emph default
+ that the system
+\emph on
+were
+\emph default
+ in a particular state, although in reality it isn't.
+ Compensation is hairy.
+\end_layout
+
+\begin_layout Enumerate
+Usability / reporting: it is less usable for getting an overview of a
+ bigger local system, and is practically unusable (out-of-the-box) for managing
+ a bigger cluster at cluster level.
+ Monitoring needs to be done separately.
+\end_layout
+
+\begin_layout Subsection
+Working Principle of the
+\family typewriter
+systemd
+\family default
+ Template Engine
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Working-Principle-of"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Systemd already has some very basic templating capabilities.
+ It is possible to create unit names containing the
+\family typewriter
+@
+\family default
+ symbol, which can then be expanded under certain circumstances, e.g.
+ to tty names etc.
+ However, automatic expansion is only done when somebody knows the instance
+ name already
+\emph on
+in advance
+\emph default
+.
+ The author has not found any way for creating instance names out of
+\begin_inset Quotes eld
+\end_inset
+
+thin air
+\begin_inset Quotes erd
+\end_inset
+
+, such as from dynamically created MARS resource names.
+ Essentially, an
+\emph on
+inference machine
+\emph default
+ for systemd templates does not yet exist.
+\end_layout
+
+\begin_layout Standard
+This missing functionality is provided by the following macro processing
+ capabilities of
+\family typewriter
+marsadm
+\family default
+:
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+sloppy
+\end_layout
+
+\end_inset
+
+ Some ordinary or templated
+\family typewriter
+systemd
+\family default
+ unit files (see
+\family typewriter
+man systemd.unit
+\family default
+) can be installed into one of the following directories:
+\family typewriter
+./systemd-templates
+\family default
+,
+\family typewriter
+$HOME/.marsadm/systemd-templates/
+\family default
+,
+\family typewriter
+/etc/marsadm/systemd-templates/
+\family default
+,
+\family typewriter
+/usr/lib/marsadm/systemd-templates/
+\family default
+,
+\family typewriter
+/usr/local/lib/marsadm/systemd-templates/
+\family default
+.
+ Further places can be defined by overriding the $
+\family typewriter
+MARS_PATH
+\family default
+ environment variable.
+\end_layout
+
+\begin_layout Standard
+From these directories, ordinary systemd unit files will be just copied
+ into
+\family typewriter
+/run/systemd/system/
+\family default
+ (configurable via
+\family typewriter
+$SYSTEMD_TARGET_DIR
+\family default
+) and then picked up by
+\family typewriter
+systemd
+\family default
+ as ordinary unit files.
+\end_layout
+
+\begin_layout Standard
+Template unit files are nothing but unit files containing
+\family typewriter
+@{
+\emph on
+varname
+\emph default
+}
+\family default
+ parts or other macro definitions in their filename, and possibly also in
+ their bodies, at arbitrary places.
+ These
+\family typewriter
+@{...}
+\family default
+ parts are substituted by a
+\family typewriter
+marsadm
+\family default
+ macro processing engine.
+\end_layout
+
+\begin_layout Standard
+The following macro capabilities are currently defined:
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+@{
+\emph on
+varname
+\emph default
+}
+\family default
+ Expands to the value of the variable.
+ This can be used both in template filenames and in the content of template
+ files.
+ Predefined are the following variables:
+\end_layout
+
+\begin_deeper
+\begin_layout Description
+
+\family typewriter
+@{res}
+\family default
+ The MARS resource name.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+@{resdir}
+\family default
+ The MARS resource directory
+\family typewriter
+/mars/resource-$res/
+\family default
+.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+@{host}
+\family default
+ The local host name as determined by
+\family typewriter
+marsadm
+\family default
+, or as overridden by the
+\family typewriter
+--host=
+\family default
+ parameter.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+@{cmd}
+\family default
+ The
+\family typewriter
+marsadm
+\family default
+ command as given on the command line (only reasonable for debugging or
+ for error messages).
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+@{
+\emph on
+varname
+\emph default
+}
+\family default
+ Further variables as defined by the macro processor, see section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "par:Predefined-Variables"
+
+\end_inset
+
+, and as definable by
+\family typewriter
+%let{
+\emph on
+varname
+\emph default
+}{...}
+\family default
+ statements, see also sections
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Predefined-Complex-and"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Predefined-Trivial-Macros"
+
+\end_inset
+
+.
+\end_layout
+
+\end_deeper
+\begin_layout Description
+
+\family typewriter
+@eval{
+\emph on
+text
+\emph default
+}
+\family default
+ Calls the MARS macro processor as explained in chapter
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "chap:The-Macro-Processor"
+
+\end_inset
+
+, and substitutes its output.
+ Notice that systemd template variables occurring in the macro processor
+
+\family typewriter
+\emph on
+text
+\family default
+\emph default
+ must be accessed via the macro processor syntax
+\family typewriter
+%{varname}
+\family default
+, because the macro processor uses
+\family typewriter
+%
+\family default
+ as an escape symbol, while the systemd template engine uses
+\family typewriter
+@
+\family default
+ instead.
+ This is necessary for distinction of both layers.
+ Notice that variables defined via the macro processor syntax
+\family typewriter
+%let{varname}{value}
+\family default
+ can be afterwards accessed by the template engine via
+\family typewriter
+@{varname}
+\family default
+ syntax, once the macro engine has finished working on
+\family typewriter
+\emph on
+text
+\family default
+\emph default
+.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+^
+\emph on
+{varname
+\emph default
+}
+\family default
+
+\begin_inset space ~
+\end_inset
+
+or
+\begin_inset space ~
+\end_inset
+
+,
+\family typewriter
+^
+\emph on
+{varname
+\emph default
+}{
+\emph on
+regex
+\emph default
+}
+\family default
+ This can be used in template filenames only.
+ The
+\family typewriter
+\emph on
+regex
+\family default
+\emph default
+ denotes a delimiter for scanning the filename until the delimiter is reached.
+ The matching part of the filename is assigned to
+\family typewriter
+\emph on
+varname
+\family default
+\emph default
+, and can be used at any following
+\family typewriter
+ @{
+\emph on
+varname
+\emph default
+}
+\family default
+ substitutions, both in the rest of the filename, and in the content of
+ the file.
+ When
+\family typewriter
+\emph on
+regex
+\family default
+\emph default
+ is omitted or empty, it defaults to
+\family typewriter
+-
+\family default
+ (a single minus symbol) which is suitable for matching paths of mountpoints
+ as written in systemd syntax.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+@esc{
+\emph on
+text
+\emph default
+}
+\family default
+ Calls the
+\family typewriter
+systemd-escape
+\family default
+ tool for conversion of pathnames following the
+\family typewriter
+systemd
+\family default
+ naming conventions (see
+\family typewriter
+man systemd-escape
+\family default
+).
+ For example, a dash is converted to
+\family typewriter
+
+\backslash
+x2d
+\family default
+.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Omitting this can lead to problems when your resource names are containing
+ special characters like dashes or other special symbols (in the sense of
+
+\family typewriter
+systemd
+\family default
+).
+ Bugs of this kind are hard to find and to debug.
+ Either forbid special characters in your installation, or don't forget
+ to test everything with some crude resource names!
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Example snippet from a
+\family typewriter
+.path
+\family default
+ unit.
+ Please notice where escaping is needed and where it must not be used (also
+ notice that a dash is sometimes a legal part of the
+\family typewriter
+.mount
+\family default
+ unit name, except within the resource name part):
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+[Path]
+\end_layout
+
+\begin_layout Plain Layout
+
+PathExists=/dev/mars/@{res}
+\end_layout
+
+\begin_layout Plain Layout
+
+Unit=vol-@escvar{res}.mount
+\end_layout
+
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Another source of crude bugs is the backslash character in the
+\family typewriter
+systemd-escape
+\family default
+ substitution, such as from
+\family typewriter
+
+\backslash
+x2d
+\family default
+.
+ When passed to a shell, such as in certain
+\family typewriter
+ExecStart=
+\family default
+ statements, the backslash will be removed.
+ Therefore, don't forget to either replace any single backslash with two
+ backslashes, or to put the whole pathname in single quotes, or similar.
+ Always check the result of your substitutions! It depends on the
+\emph on
+target
+\emph default
+ (such as
+\family typewriter
+bash
+\family default
+, as opposed to
+\family typewriter
+systemd
+\family default
+) whether further escaping of the escapes is needed, or whether it
+\emph on
+must not
+\emph default
+ be applied.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ Become a master of the escaping hell by inserting debug code into your scripts
+ (reporting to
+\family typewriter
+/dev/stderr
+\family default
+ or to log files) and do thorough testing like a devil.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+@escvar{
+\emph on
+varname
+\emph default
+}
+\family default
+ Equivalent to
+\family typewriter
+@esc{@{
+\emph on
+varname
+\emph default
+}}
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ When creating a new resource via
+\family typewriter
+marsadm create-resource
+\family default
+, or when adding a new replica via
+\family typewriter
+marsadm join-resource
+\family default
+ or similar, the template system will automatically create new instances
+ for the new resource or its replicas.
+ Conversely,
+\family typewriter
+marsadm leave-resource
+\family default
+ and its friends like
+\family typewriter
+delete-resource
+\family default
+ etc will automatically remove the corresponding template instances from
+
+\family typewriter
+/run/systemd/system/
+\family default
+.
+\end_layout
+
+\begin_layout Subsection
+Example
+\family typewriter
+systemd
+\family default
+ Templates
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Example-systemd-Templates"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+These can be found in the MARS repo in the
+\family typewriter
+systemd/
+\family default
+ subdirectory.
+ At the moment, the following are available (subject to further extension
+ and improvements without notice):
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+mars.path
+\family default
+ This ensures that the mountpoint
+\family typewriter
+/mars/
+\family default
+ is already mounted before
+\family typewriter
+mars.service
+\family default
+ is started.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+mars.service
+\family default
+ This starts and stops the MARS kernel module, provided that
+\family typewriter
+/mars
+\family default
+ is (somehow) mounted.
+ The latter can be ensured by classical
+\family typewriter
+/etc/fstab
+\family default
+ methods, or by
+\family typewriter
+.mount
+\family default
+ units like your own hand-crafted
+\family typewriter
+mars.mount
+\family default
+ unit.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+mars-trigger.path
+\family default
+ This is used for remote triggering of the marsadm template engine from
+ another MARS cluster member, e.g.
+ when initiating a handover.
+ Local triggering is also possible via
+\family typewriter
+touch /mars/userspace/systemd-trigger
+\family default
+.
+ When triggered, the command
+\family typewriter
+marsadm systemd-trigger
+\family default
+ is executed.
+ In turn, this will re-compute all
+\family typewriter
+systemd
+\family default
+ templates and start those units where the local host is in primary role.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+dev-mars-@{res}.path
+\family default
+ This is used for generic triggering of any
+\family typewriter
+systemd
+\family default
+ unit as set by
+\family typewriter
+marsadm set-systemd-unit $res $unit
+\family default
+ (see below in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Handover-using-systemd"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+^{unit}-@{res}.mount
+\family default
+ This is one of the possible subordinate targets which depend on
+\family typewriter
+dev-mars-@{res}.path
+\family default
+.
+ For fully automatic activation of this target, use something like
+\family typewriter
+marsadm set-systemd-unit mydata vol-mydata.mount
+\family default
+ or similar.
+ This will automatically mount
+\family typewriter
+/dev/mars/mydata
+\family default
+ to the mountpoint
+\family typewriter
+/vol/mydata
+\family default
+.
+ Notice that the template notation
+\family typewriter
+^{unit}
+\family default
+ can be used for mounting to an arbitrary mountpoint, such as
+\family typewriter
+ /another/mountdir/mydata
+\family default
+, by using the corresponding systemd template syntax in
+\family typewriter
+marsadm set-systemd-unit mydata another-mountdir-mydata.mount
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+ In general, it is good practice to have a
+\emph on
+consistent
+\emph default
+ name scheme.
+ Always use the same name for the underlying LV (called disk in MARS terminology
+), equal to the MARS resource name, equal to the last part of the mountpoint,
+ equal to the IQN of an iSCSI export, equal to the NFS share name, equal
+ to the LXC container name, equal to the KVM/qemu virtual machine name,
+ and so on.
+ Messing around with non-systematic naming conventions can easily result
+ in a hell.
+\end_layout
+
+\begin_layout Subsection
+Handover involving
+\family typewriter
+systemd
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Handover-using-systemd"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+First, you need to install your systemd templates into one of the template
+ directories mentioned in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Working-Principle-of"
+
+\end_inset
+
+.
+ In case you have never used the template engine before, you can create
+ the first instantiation via
+\family typewriter
+marsadm systemd-trigger
+\family default
+.
+ Afterwards, inspect
+\family typewriter
+/run/systemd/system/
+\family default
+ for newly created template instances and check them.
+\end_layout
+
+\begin_layout Standard
+For each resource
+\family typewriter
+$res
+\family default
+, you should set (potentially different) systemd targets via
+\family typewriter
+marsadm set-systemd-unit $res
+\begin_inset Quotes eld
+\end_inset
+
+$start_unit
+\begin_inset Quotes erd
+\end_inset
+
+
+\begin_inset Quotes eld
+\end_inset
+
+$stop_unit
+\family default
+
+\begin_inset Quotes erd
+\end_inset
+
+.
+ Notice that
+\family typewriter
+$start_unit
+\family default
+ and
+\family typewriter
+$stop_unit
+\family default
+ are typically denoting different targets (with few exceptions) for the
+ following reason:
+\end_layout
+
+\begin_layout Description
+Example: assume your stack consists of
+\family typewriter
+vol-@{res}.mount
+\family default
+ and
+\family typewriter
+nfs-export-@{res}.service
+\family default
+.
+ Before the filesystem can be exported via
+\family typewriter
+nfs
+\family default
+, it
+\emph on
+first
+\emph default
+ needs to be mounted.
+ At startup,
+\family typewriter
+systemd
+\family default
+ can do this easily for you: just add a
+\family typewriter
+Requires=
+\family default
+ dependency between both targets, or similar.
+ However, the situation can become tricky upon shutdown.
+ Theoretically,
+\family typewriter
+systemctl stop nfs-export-@{res}.service
+\family default
+
+\emph on
+could
+\emph default
+ work in some cases, but in general it is not reliable.
+ Reason: there might be other
+\emph on
+sister
+\emph default
+ units which
+\emph on
+also
+\emph default
+ depend on the mount.
+ In some cases, you will not necessarily notice those sisters, because systemd
+ can add further (internal) targets
+\emph on
+automatically
+\emph default
+.
+ The problem is easily solvable by
+\family typewriter
+systemctl stop vol-@{res}.mount
+\family default
+, which will automatically tear down all dependencies in reverse order.
+\end_layout
+
+\begin_layout Standard
+For maximum safety,
+\family typewriter
+$start_unit
+\family default
+ should always point at the
+\emph on
+tip
+\emph default
+ of your stack, while
+\family typewriter
+$stop_unit
+\family default
+ should point at the
+\emph on
+bottom
+\emph default
+ (but one level higher than
+\family typewriter
+/dev/mars/$res
+\family default
+).
+\end_layout
+
+\begin_layout Standard
+Removing any systemd targets is also possible via
+\family typewriter
+marsadm set-systemd-unit $res
+\begin_inset Quotes eld
+\end_inset
+
+
+\begin_inset Quotes erd
+\end_inset
+
+
+\family default
+ .
+
+\end_layout
+
+\begin_layout Standard
+When everything is set up properly, the following should work:
+\end_layout
+
+\begin_layout Enumerate
+Issue
+\family typewriter
+marsadm primary $res
+\family default
+ on another node which is currently in secondary role.
+\end_layout
+
+\begin_layout Enumerate
+As a consequence,
+\family typewriter
+systemctl stop
+\begin_inset Quotes eld
+\end_inset
+
+$stop_unit
+\begin_inset Quotes erd
+\end_inset
+
+
+\family default
+ should be automatically executed at the old primary side.
+
+\end_layout
+
+\begin_layout Enumerate
+After a while, the MARS kernel module will notice that
+\family typewriter
+/dev/mars/$res
+\family default
+ is no longer opened.
+ You can check this manually via
+\family typewriter
+ marsadm view-device-opened $res
+\family default
+ which will tell you a boolean result.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ In case the device is not closed, ordinary handover cannot proceed, because
+ somebody could (at least potentially) write some data into it, even after
+ the handover, which would lead to a split brain.
+ Therefore MARS
+\emph on
+must
+\emph default
+ insist that the device is closed before ordinary handover will proceed.
+ In case it is not closed, you can (a) use
+\family typewriter
+primary --force
+\family default
+ which will likely provoke a split brain, or (b) check your
+\family typewriter
+systemd
+\family default
+ configuration or other sources of error why the device is not closed.
+ Possible reasons could be hanging processes or hanging sessions which might
+ need a
+\family typewriter
+kill
+\family default
+ or a
+\family typewriter
+kill -9
+\family default
+ or similar.
+ Notice that
+\family typewriter
+lsof
+\family default
+ does not catch
+\emph on
+all
+\emph default
+ possible sources like (recursive or bind-) mounts.
+\end_layout
+
+\begin_layout Enumerate
+Once
+\family typewriter
+/dev/mars/$res
+\family default
+ has disappeared, the ordinary MARS handover from the old primary to the
+ new site should proceed as usual.
+\end_layout
+
+\begin_layout Enumerate
+After
+\family typewriter
+/dev/mars/$res
+\family default
+ has appeared at the new site,
+\family typewriter
+systemctl start
+\begin_inset Quotes eld
+\end_inset
+
+$start_unit
+\begin_inset Quotes erd
+\end_inset
+
+
+\family default
+ should be automatically executed.
+
+\end_layout
+
+\begin_layout Standard
+The rest depends on your
+\family typewriter
+systemd
+\family default
+ and its configuration.
+ For example, you can configure systemd targets for activation of VMs, or
+ for
+\family typewriter
+LXC
+\family default
+ containers, or for
+\family typewriter
+iSCSI
+\family default
+ exports, or for
+\family typewriter
+nfs
+\family default
+ exports, or for
+\family typewriter
+glusterfs
+\family default
+ exports, or for whatever you need.
+ For true geo-redundancy, you will likely have to include some
+\family typewriter
+quagga
+\family default
+ or
+\family typewriter
+bird
+\family default
+ or other BGP configurations into your stack.
+\end_layout
+
+\begin_layout Section
+Creating Backups via Pseudo Snapshots
+\end_layout
+
+\begin_layout Standard
+When all your secondaries are homogeneously located in a standby datacenter,
+ they will be almost idle all the time.
+ This is a waste of computing resources.
+\end_layout
+
+\begin_layout Standard
+Since MARS is no substitute for a full-fledged backup system, and since
+ backups may put high system load onto your active side, you may want to
+ utilize your passive hardware resources in a better way.
+\end_layout
+
+\begin_layout Standard
+MARS supports this thanks to its ability to switch the
+\family typewriter
+pause-replay
+\family default
+
+\emph on
+independently
+\emph default
+ from
+\family typewriter
+pause-fetch
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+The basic idea is simple: just use
+\family typewriter
+pause-replay
+\family default
+ at your secondary site, but leave the replication of transaction logfiles
+ intact by deliberately
+\emph on
+not
+\emph default
+ saying
+\family typewriter
+pause-fetch
+\family default
+.
+ This way, your secondary replica (block device) will stay frozen for a
+ limited time, without losing your redundancy: since the transaction logs
+ will continue to replicate in the meantime, you can start
+\family typewriter
+resume-replay
+\family default
+ at any time, in particular when a primary-side incident should happen unexpecte
+dly.
+ The former secondary will just catch up by replaying the outstanding parts
+ of the transaction logs in order to become recent.
+\end_layout
+
+\begin_layout Standard
+However, some
+\emph on
+details
+\emph default
+ have to be obeyed.
+ In particular, the current version of MARS needs an additional
+\family typewriter
+detach
+\family default
+ operation, in order to release exclusive access to the underlying disk
+
+\family typewriter
+/dev/vg/$res
+\family default
+.
+ Future versions of MARS are planned to support this more directly, without
+ need for an intermediate
+\family typewriter
+detach
+\family default
+ operation.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Beware:
+\family typewriter
+mount -o ro /dev/vg/$res
+\family default
+ can lead to
+\series bold
+unnoticed write operations
+\series default
+ if you are not careful! Some journalling filesystems like
+\family typewriter
+xfs
+\family default
+ or
+\family typewriter
+ext4
+\family default
+ may replay their journals onto the disk, leading to
+\emph on
+binary
+\emph default
+ differences and thus
+\series bold
+destroying your consistency
+\series default
+ later when you re-enable
+\family typewriter
+resume-replay
+\family default
+!
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Therefore, you may use small LVM snapshots (only in such cases).
+ Typically,
+\family typewriter
+xfs
+\family default
+ journal replay will require only a few megabytes.
+ Therefore you typically don't need much temporary space for this.
+ Here is a more detailed description of steps:
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm pause-replay $res
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm detach $res
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+lvcreate --size 100m --snapshot --name ro-$res /dev/vg/$res
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+mount -o ro /dev/vg/ro-$res /mnt/tmp
+\end_layout
+
+\begin_layout Enumerate
+Now draw your backup from
+\family typewriter
+/mnt/tmp/
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+umount /mnt/tmp
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+lvremove -f /dev/vg/ro-$res
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm up $res
+\end_layout
+
+\begin_layout Standard
+Hint: during the backup, the transaction logs will accumulate on
+\family typewriter
+/mars/
+\family default
+.
+ In order to avoid overflow of
+\family typewriter
+/mars/
+\family default
+ (cf.
+ section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Defending-Overflow"
+
+\end_inset
+
+), don't unnecessarily prolong the backup duration.
+\end_layout
+
+\begin_layout Chapter
+LV Football / VM Football / Container Football
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:LV-Football"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The Football scripts can be obtained in two different ways:
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+git clone --recurse-submodules https://github.com/schoebel/mars
+\begin_inset Newline newline
+\end_inset
+
+
+\family default
+then
+\family typewriter
+cd mars/football/
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+git clone https://github.com/schoebel/football
+\end_layout
+
+\begin_layout Standard
+The
+\family typewriter
+--recurse-submodules
+\family default
+ method is the preferred way for non-developers because the main repo contains
+ a link to the right version of Football.
+\end_layout
+
+\begin_layout Standard
+When switching branches, you should use
+\family typewriter
+git submodule update
+\family default
+ for synchronizing the Football submodule with the MARS main checkout.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Recommended MARS branch for playing Football is
+\family typewriter
+mars0.1a.y
+\family default
+.
+ Although the old stable branch
+\family typewriter
+mars0.1.y
+\family default
+ has been updated for the most important
+\family typewriter
+marsadm
+\family default
+ features
+\family typewriter
+merge-cluster
+\family default
+ and
+\family typewriter
+split-cluster
+\family default
+, it does not scale well for Football and can cause operational problems
+ when merging too many hosts together, showing some
+\begin_inset Formula $O(n^{2})$
+\end_inset
+
+ metadata update behaviour where
+\begin_inset Formula $n$
+\end_inset
+
+ is the number of machines in a MARS cluster.
+ The future branch
+\family typewriter
+mars0.1b.y
+\family default
+ will contain more scalability improvements; in particular the
+\family typewriter
+split-cluster
+\family default
+ operation should no longer be needed at all because it is planned to scale
+ with
+\begin_inset Formula $O(k)$
+\end_inset
+
+ where
+\begin_inset Formula $k$
+\end_inset
+
+ is the number of resources at a
+\emph on
+single
+\emph default
+ host.
+ This should allow creation of a
+\emph on
+virtual(!)
+\emph default
+
+\family typewriter
+BigCluster
+\family default
+ pool at
+\emph on
+metadata
+\emph default
+ level (where metadata transfer rates are typically measured in KiB/s),
+ consisting of thousands of machines, while at the same time creating a
+
+\family typewriter
+LocalSharding
+\family default
+ or
+\family typewriter
+FlexibleSharding
+\family default
+ model at the realtime IO paths (where some petabytes are pumped through
+ thick pipelines).
+ Please check the other branches regularly at the github repo whether some
+ newer branches will be marked
+\begin_inset Quotes eld
+\end_inset
+
+stable
+\begin_inset Quotes erd
+\end_inset
+
+, or at least
+\begin_inset Quotes eld
+\end_inset
+
+beta
+\begin_inset Quotes erd
+\end_inset
+
+.
+ At the moment (spring 2018),
+\family typewriter
+mars0.1a.y
+\family default
+ is marked
+\begin_inset Quotes eld
+\end_inset
+
+beta
+\begin_inset Quotes erd
+\end_inset
+
+ although it has been in production on several thousand machines for several
+ months.
+\end_layout
+
+\begin_layout Standard
+Low-level documentation is available by calling any of the scripts with
+
+\family typewriter
+--help
+\family default
+ parameter (see also appendix
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:football-–help"
+
+\end_inset
+
+ ff):
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+./football.sh --help
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+./screener.sh --help
+\end_layout
+
+\begin_layout Standard
+By adding
+\family typewriter
+--verbose
+\family default
+, you can get a list of parameters for configuring and tweaking.
+\end_layout
+
+\begin_layout Section
+Football Overview
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Football-Overview"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Topmost architectural level (not yet implemented):
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/pool-optimizer.fig
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The planned heart of the Football system is the generic pool optimizer,
+ which aims to provide functionality similar to Kubernetes, but working
+ on a sharding architecture.
+ Instead of controlling
+\emph on
+stateless
+\emph default
+ Docker containers, its designated goal is to control masses of LVs on thousands
+ of machines, creating a
+\begin_inset Quotes eld
+\end_inset
+
+Virtually Distributed LVM pool
+\begin_inset Quotes erd
+\end_inset
+
+ (petabytes of total storage), and doing things similar to Software Defined
+ Storage (SDS) on the virtual pool.
+\end_layout
+
+\begin_layout Standard
+In addition to load balancing of storage space (and its special cases like
+ hardware lifecycle), there will be designated plugins for dealing with
+ CPU and RAM dimensions.
+ Further dimensions and a variety of goal functions could be added via future
+ plugins.
+ The optimizer itself aims to be as generic as possible, while functionality
+ and interfaces can be added via plugins and/or drivers.
+ Future versions might even support DRBD in addition to MARS.
+ The first version may use a simple greedy algorithm for solving the underlying
+
+\begin_inset Formula ${\cal NP}$
+\end_inset
+
+-complete problem, but could be augmented with more sophisticated problem
+ solvers in future.
+\end_layout
+
+\begin_layout Standard
+The automatic operations generated by pool-optimizer will be customizable
+ by dozens of parameters, and also extendable by action plugins.
+ At the moment, the following
+\family typewriter
+football.sh
+\family default
+ actions are planned:
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+migrate
+\family default
+ This will move an LV (together with its VM / LXC container / etc) to a
+ different machine in the machine pool.
+ This is the classical Football
+\begin_inset Quotes eld
+\end_inset
+
+kick
+\begin_inset Quotes erd
+\end_inset
+
+ operation.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+shrink
+\family default
+ This decreases the occupied LV space of a filesystem (currently only
+\family typewriter
+xfs
+\family default
+ implemented, but easily extendable) via creation of a smaller temporary
+ LV at the hypervisor, then transferring all data during operations via
+ local
+\family typewriter
+rsync
+\family default
+, then shutting down the VM for a short period, doing a final incremental
+
+\family typewriter
+rsync
+\family default
+, renaming the copied temporary LV to its original name, restarting the
+ VM on the new version (which contains the same data as before but wastes
+ less space), and finally re-establishing the MARS replicas (but of course
+ with smaller LV size).
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+extend
+\family default
+ This is much easier than shrinking: it first increases the underlying LV
+ size dynamically on all replicas, then
+\family typewriter
+marsadm resize
+\family default
+, and finally calls
+\family typewriter
+xfs_growfs
+\family default
+ while the filesystem remains mounted and while the VM / container is running.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+migrate+shrink
+\family default
+ Similar to
+\family typewriter
+migrate
+\family default
+ immediately followed by
+\family typewriter
+shrink
+\family default
+, but produces less network traffic and runs faster.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+migrate+shrink+back
+\family default
+ Use this when there is not enough local temporary space for shrinking.
+ The LV is first migrated to a temporary host, then shrunk, and finally
+ migrated back to its original position.
+\end_layout
+
+\begin_layout Standard
+By running the overall system in an endless loop, a control loop for permanent
+ optimization can be established.
+ Typical periods are every few days, or once a week.
+ In addition, manual triggering is also possible.
+\end_layout
+
+\begin_layout Standard
+The result of an (incremental) pool-optimizer run is a CSV file, which may
+ be automatically forwarded to the execution engine
+\family typewriter
+football.sh
+\family default
+ for
+\emph on
+manual
+\emph default
+ execution, or to
+\family typewriter
+screener.sh
+\family default
+ for mass execution on a common control machine.
+ Alternatively, intermediate steps like manual checking, filtering etc may
+ be inserted into the processing pipeline.
+\end_layout
+
+\begin_layout Standard
+The execution engine
+\family typewriter
+football.sh
+\family default
+ resp.
+ its 1&1-internal variant
+\family typewriter
+tetris.sh
+\family default
+ is already in production at 1&1, and has already reached more than 300 migrations
+ per week.
+ Architecture of the execution engine:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/football.fig
+ width 90col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+The so-called Screener is simply a generic program allowing mass execution
+ of arbitrary scripts in background
+\family typewriter
+screen
+\family default
+ sessions.
+ This allows masses (several hundreds, possibly thousands) of long-lasting
+ processes (hours or days) to run
+\emph on
+unattended
+\emph default
+ in background, while allowing a (larger) group of sysadmins to attach /
+ detach to
+\family typewriter
+screen
+\family default
+ sessions at any time for corrective by-hand actions, e.g.
+ in case of failures or other problems, or for supervision, etc.
+\end_layout
+
+\begin_layout Standard
+When Screener is combined with the Football execution engine
+\family typewriter
+football.sh
+\family default
+, more specialized functionality is available (via a variety of plugins):
+\end_layout
+
+\begin_layout Itemize
+Optional waiting for sysadmin confirmation before some customer downtime
+ is initiated.
+\end_layout
+
+\begin_layout Itemize
+Automatic generation of
+\family typewriter
+motd
+\family default
+ status reporting to other sysadmins.
+\end_layout
+
+\begin_layout Itemize
+Automatic sending of email alerts or status reports, e.g.
+ on errors or critical errors, etc.
+ By sending email to SMS gateways, real-time alerting can be configured
+ (e.g.
+ over the weekend).
+\end_layout
+
+\begin_layout Itemize
+Generic interfacing to external scripts with configurable parameters, e.g.
+ for triggering monitoring systems, feeding external databases, etc.
+\end_layout
+
+\begin_layout Standard
+Screener can detect and will automatically manage the following states (in
+ this example, all state lists are empty):
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+$common_user> ./screener.sh list
+\end_layout
+
+\begin_layout Plain Layout
+
+List of waiting:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of delayed:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of condition:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of running:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of critical:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of serious:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of interrupted:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of illegal:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of failed:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of timeouted:
+\end_layout
+
+\begin_layout Plain Layout
+
+List of done:
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Screener can discriminate the
+\emph on
+severity
+\emph default
+ of errors as follows:
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+failed
+\family default
+ An error occurred
+\emph on
+outside
+\emph default
+ of critical sections, e.g.
+ during preparation of LV space etc.
+ During ordinary operations, VMs / containers are usually running continuously,
+ and there is no customer impact to be expected.
+ Typically,
+\family typewriter
+./screener.sh restart $resource
+\family default
+ should fix the problem if it is only a temporary problem.
+ However, for maximum safety, manual inspection via
+\family typewriter
+ ./screener.sh attach $resource
+\family default
+ or inspection of the logfile via
+\family typewriter
+ ./screener.sh show $resource
+\family default
+ is recommended before trying an automatic restart.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+serious
+\family default
+ An error occurred while a VM / container was temporarily stopped, which
+
+\series bold
+would
+\series default
+ normally lead to customer downtime, but Football was able to
+\emph on
+compensate
+\emph default
+ the problem
+\emph on
+for now
+\emph default
+ by
+\emph on
+automatically
+\emph default
+ restarting the VM.
+ Thus no long-lasting customer impact has likely occurred.
+ However, manual inspection and repair by sysadmins is likely necessary.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+critical
+\family default
+ An
+\emph on
+uncompensated
+\emph default
+ error occurred during customer downtime.
+ The VM / container is likely down.
+ This will need manual sysadmin actions ASAP, such as hardware replacement,
+ networking fixes, etc.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+timeouted
+\family default
+ This means that the script is assumed to hang because it did not produce
+ any output for more than
+\family typewriter
+$session_timeout
+\family default
+ seconds (default 3600 * 3 = 3 hours).
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+illegal
+\family default
+ This means that a precondition is not met.
+ For example, there is not enough space at the target LVM.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+interrupted
+\family default
+ Somebody has pressed
+\family typewriter
+Ctrl-c
+\family default
+ in a
+\family typewriter
+screen
+\family default
+ session, or has otherwise sent a signal to the running script.
+ As a result, a signal
+\family typewriter
+trap
+\family default
+ has been executed.
+\end_layout
+
+\begin_layout Standard
+\noindent
+Ordinary Screener states during execution:
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+running
+\family default
+ This means that a (background) process is currently running.
+ You can attach to the screen session either manually via
+\family typewriter
+screen -x $pid.$resource
+\family default
+, or more comfortably via
+\family typewriter
+./screener.sh attach $resource
+\family default
+.
+ Then you can use
+\family typewriter
+screen
+\family default
+ as documented in
+\family typewriter
+man screen
+\family default
+.
+ The most important operation is detaching via keystrokes
+\family typewriter
+Ctrl-a d
+\family default
+.
+
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Notice: don't press
+\family typewriter
+Ctrl-c
+\family default
+ unless you know what you are doing.
+ In most cases, this will terminate the running process, and in consequence
+ lead to
+\family typewriter
+\series bold
+interrupted
+\family default
+\series default
+ or
+\family typewriter
+\series bold
+failed
+\family default
+\series default
+ or even
+\family typewriter
+\series bold
+critical
+\family default
+\series default
+ state (depending on the moment of keypress).
+ Depending on parameter
+\family typewriter
+drop_shell
+\family default
+, the Screener session will also terminate, or you will get an interactive
+ shell for manual repair.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+waiting
+\family default
+ When the plugins
+\family typewriter
+football-waiting
+\family default
+ and
+\family typewriter
+screener-waiting
+\family default
+ are configured properly (which is
+\emph on
+not
+\emph default
+ the default), the script execution will pause immediately before a customer
+ downtime action would be started.
+ Now any sysadmin from the larger group has a chance to
+\family typewriter
+./screener.sh attach $resource
+\family default
+ and to press RETURN to continue the waiting script and to personally watch
+ the course of the critical section.
+ There are some more comfortable variants like
+\family typewriter
+./screener.sh continue $resource
+\family default
+ for background continuation of a single session, or
+\family typewriter
+./screener.sh continue 100
+\family default
+ which can be used for continuing masses of waiting sessions.
+ There are further variants which are automatically attaching to sessions,
+ see Appendix
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:screener–help"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+delayed
+\family default
+ This state is only entered before
+\family typewriter
+lvremove $resource
+\family default
+ is executed (which will destroy your old internal backup copy), and when
+ configured appropriately.
+ Typically, you also need to configure the
+\family typewriter
+$wait_before_cleanup
+\family default
+ variable in order to avoid endless waiting.
+ Notice that old LV data becomes outdated after a while, so please don't
+ unnecessarily prolong the running time of your scripts by choosing too
+ long
+\family typewriter
+$wait_before_cleanup
+\family default
+ values.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+condition
+\family default
+ Special case of delay: some condition is currently not met, such as the
+
+\family typewriter
+$business_hours
+\family default
+ feature, where you can configure when customer downtimes are allowed, and
+ when not.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+done
+\family default
+ This means that the script reported successful execution by exit status
+
+\family typewriter
+0
+\family default
+.
+ The background screen session terminated automatically.
+ You can inspect the logfile manually via
+\family typewriter
+./screener.sh show $resource
+\family default
+, or by looking into the directory
+\family typewriter
+$screener_logdir/done/
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Logfiles of other states can also be inspected (or monitored by standard
+ tools like
+\family typewriter
+grep
+\family default
+) by looking into sister directories, such as
+\family typewriter
+$screener_logdir/running/
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+When running Screener for several months or years, old logfiles will accumulate
+ in these directories over time.
+ Call
+\family typewriter
+./screener.sh purge
+\family default
+ or
+\family typewriter
+./screener.sh cron
+\family default
+ regularly via a cron job, or archive your old logfiles from time to time
+ via another method.
+\end_layout
+
+\begin_layout Section
+HOWTO instantiate / customize Football
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:HOWTO-instantiate-Football"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In order to install and operate Football, the recommended
+\emph on
+deployment
+\emph default
+ strategy is bottom-up, layer by layer.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Top-down strategies should be used
+\emph on
+only
+\emph default
+, and
+\emph on
+only
+\emph default
+, for planning.
+ An Egyptian pyramid can never be built, even if you had some billions of
+ workers, by starting at the tip and by creating the foundations as the
+ very last step.
+ Such an attempt would end up in a disaster.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+
+\series bold
+Testing
+\series default
+ of each layer
+\series bold
+separately
+\series default
+ is very important.
+ Before proceeding to the next higher layer, first ensure that any lower
+ layer is working
+\emph on
+correctly
+\emph default
+.
+ Otherwise debugging can become tricky.
+\end_layout
+
+\begin_layout Subsection
+Block Device Layer
+\end_layout
+
+\begin_layout Standard
+Step-by-step instructions can be found in chapter
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "chap:Quick-Start-Guide"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Please ensure that your hardware (including RAID controllers and LVM and
+ so on), and your operating system, and your network / setup, and MARS are
+ working correctly before proceeding to the next layer.
+\end_layout
+
+\begin_layout Subsection
+Mechanics Layer of Cluster Operations
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Mechanics-Layer of Cluster"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In the following example, it is assumed that
+\family typewriter
+systemd
+\family default
+ is used, as explained in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+, and now applied to
+\family typewriter
+vm4711
+\family default
+ supposed to run on hypervisors
+\family typewriter
+hyper1234a
+\family default
+ (primary role) and
+\family typewriter
+hyper1234b
+\family default
+ (secondary role), which is assumed to be controllable via the following
+
+\family typewriter
+systemd
+\family default
+ start and stop units:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+marsadm set-systemd-unit vm4711 lxc-vm4711.target vol-vm4711.mount
+\end_layout
+
+\begin_layout Standard
+Test the cluster mechanics layer like in the following example:
+\end_layout
+
+\begin_layout Itemize
+On host
+\family typewriter
+hyper1234b
+\family default
+, the following must work:
+\family typewriter
+marsadm primary vm4711
+\end_layout
+
+\begin_layout Standard
+This must result in an automatic handover of
+\family typewriter
+vm4711
+\family default
+ from the current primary site
+\family typewriter
+hyper1234a
+\family default
+ to the new primary
+\family typewriter
+hyper1234b
+\family default
+, as explained in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+.
+ Please check that
+\family typewriter
+vm4711
+\family default
+ is running correctly at the new location.
+ It must be reachable via network.
+ In case you are using BGP because
+\family typewriter
+hyper1234a
+\family default
+ and
+\family typewriter
+hyper1234b
+\family default
+ are located in different datacenters, ensure that BGP is also controlled
+ by your
+\family typewriter
+systemd
+\family default
+ unit dependencies, and test it.
+\end_layout
+
+\begin_layout Subsection
+Mechanics Layer of Football Operations
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Mechanics-Layer-of-Football"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+At the moment, there are two alternative plugins already implemented in
+ the Football sub-project (see subdirectory
+\family typewriter
+football/plugins/
+\family default
+).
+ Of course, you can implement some further plugins.
+ Please put them under GPL, and share them.
+ Please contact the author of MARS for inclusion into the official MARS
+ release.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+football-cm3.sh
+\family default
+ This plugin can only be used at Shared Hosting Linux (ShaHoLin) at 1&1,
+ since it is bound to a specific
+\emph on
+proprietary
+\emph default
+ instance.
+ However, the
+\emph on
+sourcecode
+\emph default
+ of the
+\emph on
+plugin
+\emph default
+ itself (not the code called by the plugin, e.g.
+ over REST interfaces) is under GPL.
+ You can (and
+\emph on
+should
+\emph default
+)
+\emph on
+inspect
+\emph default
+ the plugin code, and
+\series bold
+learn
+\series default
+ how a real-world system (which has grown over some decades and bears a
+ lot of history) is actually working at certain points.
+\begin_inset Newline newline
+\end_inset
+
+This plugin is automatically activated when called via the symlink
+\family typewriter
+tetris.sh
+\family default
+ instead of directly calling
+\family typewriter
+football.sh
+\family default
+.
+ This has historic reasons.
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+football-basic.sh
+\family default
+ This plugin uses the new
+\family typewriter
+systemd
+\family default
+ interface of
+\family typewriter
+marsadm
+\family default
+ for controlling the mechanics.
+ See section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:systemd-Templates"
+
+\end_inset
+
+.
+ You should be familiar with commands like
+\family typewriter
+marsadm set-systemd-unit
+\family default
+.
+ Manual handover via
+\family typewriter
+marsadm primary $resource
+\family default
+ must be already working (with high reliability
+\begin_inset Formula $\leadsto$
+\end_inset
+
+ check that any
+\family typewriter
+umount
+\family default
+ works everywhere without hangups) before you can start using this plugin
+ for
+\family typewriter
+football.sh
+\family default
+.
+\begin_inset Newline newline
+\end_inset
+
+This plugin is automatically activated when calling football.sh.
+ It can be deactivated by overriding variable
+\family typewriter
+enable_basic
+\family default
+=0.
+\end_layout
+
+\begin_layout Subsubsection
+Configuring and Overriding Variables
+\end_layout
+
+\begin_layout Standard
+A detailed list of all available customization options can be obtained via
+
+\family typewriter
+./football.sh --help --verbose
+\family default
+.
+ Each option is documented by some help text, and you can always see the
+ default settings.
+ See also section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:football-help-verbose"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+If you create any new plugin for Football, or if you modify an existing
+ one, please follow these standards.
+ Try to describe any option as concisely as possible.
+\end_layout
+
+\begin_layout Standard
+Configuring is possible in the following ways, in order of precedence:
+\end_layout
+
+\begin_layout Itemize
+at the command line via
+\family typewriter
+./football.sh --$variable_name=$value $arguments
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+via environment variables, e.g.
+ globally via
+\family typewriter
+export $variable_name=$value && ./football.sh $arguments
+\family default
+, or locally via
+\family typewriter
+$variable_name=$value ./football.sh $arguments
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+by adding some small
+\family typewriter
+football-*.conf
+\family default
+ files into one of the directories
+\family typewriter
+/usr/lib/mars/plugins
+\family default
+
+\family typewriter
+/etc/mars/plugins
+\family default
+
+\family typewriter
+$script_dir/plugins
+\family default
+
+\family typewriter
+$HOME/.mars/plugins
+\family default
+
+\family typewriter
+./plugins
+\family default
+, in this order of precedence.
+ This list of directories can be modified externally over the environment
+ variable
+\family typewriter
+football_includes
+\family default
+ (but not during already running inclusions of
+\family typewriter
+football-*.conf
+\family default
+ files).
+\end_layout
+
+\begin_layout Subsubsection
+
+\family typewriter
+football-basic.sh
+\family default
+ Customization
+\end_layout
+
+\begin_layout Standard
+Here is a brief summary of the most important configuration tasks and options:
+\end_layout
+
+\begin_layout Description
+
+\family typewriter
+initial_hostname_file
+\family default
+ Somehow, the
+\family typewriter
+football-basic.sh
+\family default
+ plugin must know the hostnames of your pool.
+ Once Football is working, the hostname will be
+\emph on
+automatically
+\emph default
+ maintained whenever
+\family typewriter
+marsadm join-cluster
+\family default
+ or
+\family typewriter
+marsadm merge-cluster
+\family default
+ is executed somewhere.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+For your hardware deployment strategy, this means the following: just deploy
+ any new hardware, or remove your old one (after Football has emptied all
+ of your former LV resources).
+ It does not matter how you are doing this, e.g.
+ via OpenStack, or via the proprietary
+\family typewriter
+Schlunix
+\family default
+ methods used at ShaHoLin, or whatever.
+ Then you have the following options for adding the new machines to the
+ Football hostname cache (see variable
+\family typewriter
+hostname_cache
+\family default
+):
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Write the pure hostname(s) into the file as configured with
+\family typewriter
+initial_hostname_file
+\family default
+ (by default:
+\family typewriter
+./hostnames.input
+\family default
+).
+ Each hostname must be on its own ASCII line.
+ Not only these new hosts will be picked up automatically, but also...
+\end_layout
+
+\begin_layout Enumerate
+...any further hosts reported anywhere (at the already known hosts) by
+\family typewriter
+marsadm view-cluster-members
+\family default
+,
+\series bold
+transitively
+\series default
+.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Consequence: if you are running the new
+\family typewriter
+mars0.1b.y
+\family default
+ (or newer) branch of MARS, you don't need
+\family typewriter
+marsadm split-cluster
+\family default
+ anymore.
+ Then you can operate several thousands of machines as a big
+\series bold
+virtual
+\series default
+ cluster, even if their storage is local (see
+\family typewriter
+LocalSharding
+\family default
+ model described in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "subsec:Variants-of-Sharding"
+
+\end_inset
+
+).
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Previous versions of MARS, like
+\family typewriter
+mars0.1.y
+\family default
+ and
+\family typewriter
+mars0.1a.y
+\family default
+, are not yet scalable at their
+\series bold
+metadata
+\series default
+ exchange level.
+ Trying to
+\family typewriter
+join-cluster
+\family default
+ or
+\family typewriter
+merge-cluster
+\family default
+ several tens or even hundreds of machines with those versions will surely
+ lead to a disaster.
+ Always use
+\family typewriter
+marsadm split-cluster
+\family default
+ at those versions, regularly.
+ First upgrade to the future
+\family typewriter
+mars0.1b.y
+\family default
+ (or later versions) before creating big clusters at
+\emph on
+metadata
+\emph default
+ level!
+\end_layout
+
+\begin_layout Enumerate
+Use
+\family typewriter
+./football.sh basic_add_host $hostname
+\family default
+ for adding a single new host manually.
+ Afterwards, the transitive closure of all reachable hosts is computed as
+ usual.
+ This may also be used for the very first initialization of a fresh Football
+ installation, provided you already have a big cluster at metadata level.
+\end_layout
+
+\end_deeper
+\begin_layout Standard
+Test the Football mechanics like one of the following example command sequences,
+ where it is assumed that
+\family typewriter
+hyper4321a
+\family default
+ and
+\family typewriter
+hyper4321b
+\family default
+ are already
+\emph on
+newly
+\emph default
+ deployed hypervisors having enough local LVM storage, and have been already
+ added to the MARS cluster via
+\family typewriter
+marsadm join-cluster
+\family default
+, or have been at least added to
+\family typewriter
+hostname_cache
+\family default
+ as explained above:
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+ssh-add; ./football.sh migrate vm4711 hyper4321a hyper4321b
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+ssh-add; ./football.sh migrate vm4711 hyper4321a hyper4321b --screener; ./screener.s
+h attach vm4711
+\end_layout
+
+\begin_layout Standard
+Check the automatically produced logfile (via
+\family typewriter
+./screener.sh show vm4711
+\family default
+) that Football has automatically determined the old hypervisor where
+\family typewriter
+vm4711
+\family default
+ was running before, that it has automatically executed
+\family typewriter
+marsadm merge-cluster
+\family default
+ when necessary, and has created the LV replicas at the new hypervisors,
+ and has executed some
+\family typewriter
+marsadm join-resource
+\family default
+ commands, has automatically waited for MARS fast fullsync to finish, then
+ successfully executed an automatic handover to the new primary hypervisor,
+ and finally has destroyed the old MARS replicas including their old LVs.
+ Check that
+\family typewriter
+vm4711
+\family default
+ is running correctly at the new hypervisor pair, and that handover between
+ the new hypervisor sites
+\family typewriter
+*a
+\family default
+ and
+\family typewriter
+*b
+\family default
+ is working correctly.
+
+\end_layout
+
+\begin_layout Standard
+A larger group of sysadmins can co-work over a central common control machine
+ via ssh agent forwarding (which must be enabled in
+\family typewriter
+/etc/ssh/sshd_config
+\family default
+) in the following way:
+\end_layout
+
+\begin_layout Itemize
+At the workstation:
+\family typewriter
+ssh-add; ssh -A football@common-control.mycompany.org
+\family default
+
+\begin_inset Newline newline
+\end_inset
+
+Then
+\family typewriter
+cd $script_dir
+\family default
+ and run your
+\family typewriter
+./football.sh
+\family default
+ or
+\family typewriter
+./screener.sh
+\family default
+ commands as usual.
+ The automatically generated logfiles will be tagged with the
+\emph on
+real
+\emph default
+ usernames from your original workstation login, as reported by
+\family typewriter
+ssh-add -l
+\family default
+, even transitively when using ssh agent forwarding.
+ Thus you may use a common username like
+\family typewriter
+football
+\family default
+ on the common
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Of course, it is also possible to maintain individual accounts for the same
+ Unix group, and set
+\family typewriter
+umask
+\family default
+ and common directory permissions accordingly, such that the classical group-wis
+e working concept from the 1970s will do the rest.
+ This is much more work, but can establish more fine-grained access control.
+ Even more sophisticated methods could involve ACLs, but this is probably
+ only necessary at extremely sensitive installations.
+\end_layout
+
+\end_inset
+
+ control machine.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Hint: use
+\family typewriter
+./screener.sh list
+\family default
+ (or one of its more specific variants like
+\family typewriter
+./screener.sh list-running
+\family default
+) for determining what's currently going on in a larger group of sysadmins.
+\end_layout
+
+\begin_layout Chapter
+MARS for Developers
+\end_layout
+
+\begin_layout Standard
+This chapter is organized strictly top-down.
+\end_layout
+
+\begin_layout Standard
+If you are a sysadmin and want to inform yourself about internals (useful
+ for debugging), the relevant information is at the beginning, and you don't
+ need to dive into all technical details at the end.
+\end_layout
+
+\begin_layout Standard
+If you are a kernel developer and want to contribute code to the emerging
+ MARS community, please read it (almost) all.
+ Due to the top-down organization, sometimes you will need to follow some
+ forward references in order to understand details.
+ Therefore I recommend reading this chapter twice in two different reading
+ modes: in the first reading pass, you just get a raw network of principles
+ and structures in your brain (you don't want to grasp details, therefore
+ don't strive for a full understanding).
+ In the second pass, you will exploit your knowledge from the first pass
+ for a deeper understanding of the details.
+\end_layout
+
+\begin_layout Standard
+Alternatively, you may first read the sections about general architecture,
+ and then start a bottom-up scan by first reading the last section about
+ generic objects and aspects, and working in reverse
+\emph on
+section
+\emph default
+ order (but read
+\emph on
+sub
+\emph default
+sections in-order) until you finally reach the kernel interfaces / symlink
+ trees.
+\end_layout
+
+\begin_layout Section
+Motivation / Politics
+\end_layout
+
+\begin_layout Standard
+MARS is not yet upstream in the Linux kernel.
+ This section tries to clear up some potential doubts.
+ Some people have asked why MARS uses its own internal framework instead
+ of
+\emph on
+directly
+\emph default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that
+\emph on
+indirect
+\emph default
+ use of pre-existing Linux infrastructure is not only possible, but actually
+ implemented, by using it 
+\emph on
+internally
+\emph default
+ in brick
+\emph on
+implementations
+\emph default
+ (black-box principle).
+ However, such bricks are not portable to other environments like userspace.
+\end_layout
+
+\end_inset
+
+ being based on some already existing Linux kernel infrastructures like
+ the device mapper.
+ Here is a list of technical reasons:
+\end_layout
+
+\begin_layout Enumerate
+The existing device mapper infrastructure is based on
+\family typewriter
+struct bio
+\family default
+.
+ In contrast, the new XIO personality of the generic brick infrastructure
+ is based on the concept of AIO (Asynchronous IO), which is a
+\series bold
+true superset
+\series default
+ of block IO.
+\end_layout
+
+\begin_layout Enumerate
+In particular,
+\family typewriter
+struct bio
+\family default
+ is firmly referring to 
+\family typewriter
+struct page
+\family default
+ (via intermediate
+\family typewriter
+struct bio_vec
+\family default
+), using types like
+\family typewriter
+sector_t
+\family default
+ in the field
+\family typewriter
+bi_sector
+\family default
+.
+ Basic transfer units are blocks, or sectors, or pages, or the like.
+ In contrast,
+\family typewriter
+struct aio_object
+\family default
+ used by the XIO personality can address
+\series bold
+arbitrary granularity
+\series default
+ memory with byte resolution even at odd
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Some brick
+\emph on
+implementations
+\emph default
+ (as opposed to the capabilities of the
+\emph on
+interface
+\emph default
+) may be (and, in fact,
+\emph on
+are
+\emph default
+) restricted to
+\family typewriter
+PAGE_SIZE
+\family default
+ operations or the like.
+ This is no general problem, because IOP can automatically insert some translato
+r bricks extending the capabilities to universal granularity (of course
+ at some performance costs).
+\end_layout
+
+\end_inset
+
+ positions in (virtual) files / devices, similar to classical Unix file
+ IO, but
+\emph on
+asynchronously
+\emph default
+.
+ Practical experience shows that even non-functional properties like performance
+ of many datacenter workloads are profiting from that
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The current transaction logger uses variable-sized headers at
+\begin_inset Quotes eld
+\end_inset
+
+odd
+\begin_inset Quotes erd
+\end_inset
+
+ addresses.
+ Although this increases
+\family typewriter
+memcpy()
+\family default
+ load due to
+\begin_inset Quotes eld
+\end_inset
+
+misalignment
+\begin_inset Quotes erd
+\end_inset
+
+, the
+\emph on
+overall performance
+\emph default
+ was provably better than in variants where sector / page alignment was
+ strictly obeyed, but space was wasted for alignments.
+ Such functionality is only possible if the XIO infrastructure
+\emph on
+allows
+\emph default
+
+\emph on
+for
+\emph default
+ (but doesn't force)
+\begin_inset Quotes eld
+\end_inset
+
+mis-aligned
+\begin_inset Quotes erd
+\end_inset
+
+ IO operations.
+ In future, many different transaction logfile formats showing different
+ runtime behaviour (e.g.
+ optimized for high-throughput SSD loads) may co-exist in parallel.
+ Note that properly aligned XIO operations bear no noticeable overhead compared
+ to classical block IO, at least in typical datacenter RAID scenarios.
+\end_layout
+
+\end_inset
+
+.
+ The AIO/XIO abstraction contains no fixed link to kernel abstractions and
+ should be
+\series bold
+easily portable
+\series default
+ to other environments.
+ In summary, the new personality provides a uniform abstraction which abstracts
+ away from multiple different kernel interfaces; it is designed to be useful
+ even in userspace.
+\end_layout
+
+\begin_layout Enumerate
+Kernel infrastructures for the concept of
+\emph on
+direct IO
+\emph default
+ are different from those for
+\emph on
+buffered IO
+\emph default
+.
+ The XIO personality used by MARS subsumes both concepts as use case
+\emph on
+variants
+\emph default
+.
+
+\series bold
+Buffering
+\series default
+ is an optional internal property of XIO bricks (almost non-functional property
+ with support for consistency guarantees).
+\end_layout
+
+\begin_layout Enumerate
+The AIO/XIO personality is generically designed for remote operations over
+ networks, at arbitrary places in the IO stack, with (almost
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+By default, automatic network connection re-establishment and infinite network
+ retries are already implemented in the
+\family typewriter
+xio_client
+\family default
+ and
+\family typewriter
+xio_server
+\family default
+ bricks to provide fully transparent semantics.
+ However, this may be undesirable in case of fatal crashes.
+ Therefore, abort operations are also configurable, as well as network timeouts
+ which are then mapped to classical IO errors.
+\end_layout
+
+\end_inset
+
+) no semantic differences to local operations (built-in
+\series bold
+ network transparency
+\series default
+).
+ There are universal provisions for mixed operation of different versions
+ (
+\series bold
+rolling software updates
+\series default
+ in clusters / grids).
+\end_layout
+
+\begin_layout Enumerate
+The generic brick infrastructure (as well as its personalities like XIO
+ or any other future personality) supports
+\series bold
+dynamic re-wiring / re-configuration
+\series default
+
+\emph on
+during
+\emph default
+ operation (even while parallel IO requests are flying, some of them taking
+ different paths in the IO stack in parallel).
+ This is absolutely needed for MARS logfile rotation.
+ In the long term, this would be useful for many advanced new features and
+ products, not limited to multipathing.
+\end_layout
+
+\begin_layout Enumerate
+The generic brick infrastructure (and in turn all personalities) provides
+
+\series bold
+additional comfort
+\series default
+ to the programmer while enabling
+\series bold
+increased functionality
+\series default
+: by use of a generalization of
+\series bold
+aspect orientation
+\series default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Similar to AOP, insertion of IOP bricks for checking / debugging etc is
+ one of the key advantages of the generic brick infrastructure.
+ In contrast to AOP where debugging is usually {en,dis}abled statically
+ at compile time, IOP allows for
+\emph on
+dynamic
+\emph default
+ (re-)configuration of debugging bricks, automatic repair, and many more
+ features promoted by
+\emph on
+organic computing
+\emph default
+.
+\end_layout
+
+\end_inset
+
+, the programmer need no longer worry about dynamic memory allocations for
+
+\emph on
+local state
+\emph default
+ in a brick instance.
+ MARS is
+\series bold
+automating local state
+\series default
+ even when dynamically instantiating new bricks (possibly having the same
+ brick type) at runtime.
+ Specifically, XIO is automating 
+\series bold
+request stacking
+\series default
+ at the completion path this way, even while dynamically reconfiguring the
+ IO stack
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The generic aspect orientation approach leads to better
+\series bold
+separation of concerns
+\series default
+: local state needed by brick implementations is not visible from outside
+ by default.
+ In other words, local state is also
+\series bold
+private state
+\series default
+.
+ Accidental hampering of internal operations is impeded.
+\end_layout
+
+\begin_layout Plain Layout
+Example from the kernel: in
+\family typewriter
+include/linux/blkdev.h
+\family default
+ the definition of
+\family typewriter
+struct request
+\family default
+ contains the following comment:
+\family typewriter
+/* the following two fields are internal, NEVER access directly */
+\family default
+.
+ It appears that
+\family typewriter
+struct request
+\family default
+ contains not only fields relevant for the caller, but also
+\series bold
+internal fields
+\series default
+ needed only in
+\emph on
+some
+\emph default
+
+\emph on
+specific
+\emph default
+ callees.
+ For example,
+\family typewriter
+rb_node
+\family default
+ is documented to be used only in IO schedulers.
+\end_layout
+
+\begin_layout Plain Layout
+XIO goes one step further: there need not exist exactly one IO scheduler
+ instance in the IO stack for a single device.
+ Future
+\family typewriter
+xio_scheduler_{deadline,cfq,...}
+\family default
+ brick types could be each instantiated many times, and in arbitrary places,
+ even for the same (logical) device.
+ The equivalent of
+\family typewriter
+rb_node
+\family default
+ would then be automatically instantiated multiple times for the same IO
+ request, by automatically instantiating the right local aspect instances.
+\end_layout
+
+\end_inset
+
+.
+ A similar automation
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+DM can achieve stacking and dynamic routing by a workaround called
+\emph on
+request cloning
+\emph default
+, potentially leading to mass creation of temporary / intermediate object
+ instances.
+\end_layout
+
+\end_inset
+
+ does not exist in the rest of the Linux kernel.
+\end_layout
+
+\begin_layout Enumerate
+The generic brick infrastructure, together with personalities like XIO,
+ enables
+\series bold
+new long-term functional and non-functional opportunities
+\series default
+ by use of concepts from instance-oriented programming (IOP
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+See
+\begin_inset Flex URL
+status collapsed
+
+\begin_layout Plain Layout
+
+http://athomux.net/papers/paper_inst2.pdf
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+).
+ The application area is
+\series bold
+not limited to device drivers
+\series default
+.
+ For example, a new personality for
+\emph on
+stackable filesystems
+\emph default
+ could be developed in future.
+\end_layout
+
+\begin_layout Standard
+In summary, anyone who would insist that MARS should be
+\emph on
+directly
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Notice that kernel-specific structures like
+\family typewriter
+struct bio
+\family default
+ are of course used by MARS, but only
+\emph on
+inside
+\emph default
+ the blackbox implementation of bricks like
+\family typewriter
+mars_bio
+\family default
+ or
+\family typewriter
+mars_if
+\family default
+ which act as
+\series bold
+adaptors
+\series default
+ to/from that structure.
+ It is possible to write further adaptors, e.g.
+ for direct interfacing to the device mapper infrastructure.
+\end_layout
+
+\end_inset
+
+
+\emph default
+ based on pre-existing kernel structures / frameworks instead of contributing
+ a new framework would cause a
+\emph on
+massive regression of functionality
+\emph default
+.
+\end_layout
+
+\begin_layout Itemize
+On one hand, all code contributed by the MARS project is
+\series bold
+non-intrusive
+\series default
+ into the rest of the Linux kernel.
+ From the viewpoint of other parts of the kernel, the whole addition
+\emph on
+behaves
+\emph default
+
+\emph on
+like
+\emph default
+ a driver (although its infrastructure is much more than a driver).
+\end_layout
+
+\begin_layout Itemize
+On the other hand, if people are interested, the contributed infrastructure
+
+\emph on
+may
+\emph default
+ be used to
+\emph on
+add
+\emph default
+ to the power of the Linux kernel.
+ It is designed to be
+\series bold
+open for contributions
+\series default
+.
+\end_layout
+
+\begin_layout Itemize
+A
+\emph on
+possible
+\emph default
+ (but not the only possible) way to do this is giving the generic brick
+ framework / the XIO personality as well as future personalities / the MARS
+ application the status of a
+\emph on
+subsystem
+\emph default
+ inside the kernel (in the long term), similar to the SCSI subsystem or
+ the network subsystem.
+ No one is forced to use it, but anybody may use it if he/she likes.
+\end_layout
+
+\begin_layout Itemize
+Politically, the author is a FOSS advocate willing to collaborate and to
+ support anyone interested in contributions.
+ The author's personal interest is long-term and is open for both in-tree
+ and out-of-tree extensions of both the framework and MARS by any other
+ party obeying the GPL and not hazarding FOSS by patents (instead supporting
+ organizations like the Open Invention Network).
+ The author is open to closer relationships with the Linux Foundation and
+ other parts of the Linux ecosystem.
+\end_layout
+
+\begin_layout Section
+Architecture Overview
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MARS_Framework_Architecture.pdf
+ width 100col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Some Architectural Details
+\end_layout
+
+\begin_layout Standard
+The following pictures show some
+\begin_inset Quotes eld
+\end_inset
+
+zones of responsibility
+\begin_inset Quotes erd
+\end_inset
+
+, not necessarily a strict hierarchy (although we try to respect Dijkstra's
+ famous layering rules from THE as much as possible).
+ The construction principle follows the concept of
+\series bold
+Instance Oriented Programming
+\series default
+ (IOP) described in
+\begin_inset Flex URL
+status collapsed
+
+\begin_layout Plain Layout
+
+http://athomux.net/papers/paper_inst2.pdf
+\end_layout
+
+\end_inset
+
+.
+ Please note that MARS is only instance-
+\emph on
+based
+\emph default
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Similar to OOP, where
+\begin_inset Quotes eld
+\end_inset
+
+object-based
+\begin_inset Quotes erd
+\end_inset
+
+ means a weaker form of
+\begin_inset Quotes eld
+\end_inset
+
+object-oriented
+\begin_inset Quotes erd
+\end_inset
+
+, the term
+\begin_inset Quotes eld
+\end_inset
+
+instance-based
+\begin_inset Quotes erd
+\end_inset
+
+ means that the
+\emph on
+strategy
+\emph default
+ brick layer need not be fully modularized according to the IOP principles,
+ but the
+\emph on
+worker
+\emph default
+ brick layer already is.
+\end_layout
+
+\end_inset
+
+, while MARS Full is planned to be fully instance-
+\emph on
+oriented
+\emph default
+.
+\end_layout
+
+\begin_layout Subsection
+MARS Architecture
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/mars-light-architecture.fig
+ width 40col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+MARS Full Architecture (planned)
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Graphics
+ filename images/mars-full-architecture.fig
+ width 80col%
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Documentation of the Symlink Trees
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Documentation-of-the"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The
+\family typewriter
+/mars/
+\family default
+ symlink tree is serving the following purposes, all at the same time:
+\end_layout
+
+\begin_layout Enumerate
+For
+\series bold
+communication
+\series default
+ between cluster nodes, see sections
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Lamport-Clock"
+
+\end_inset
+
+ and
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:The-Symlink-Tree"
+
+\end_inset
+
+.
+ This communication is even the
+\emph on
+only
+\emph default
+ communication between cluster nodes (apart from the
+\emph on
+contents
+\emph default
+ of transaction logfiles and sync data).
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+\emph on
+Internal
+\emph default
+ interface
+\series default
+ between the kernel module and the userspace tool
+\family typewriter
+marsadm
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+\emph on
+Internal
+\emph default
+ persistent repository
+\series default
+ which keeps state information between reboots (also in case of node crashes).
+ It is even the
+\emph on
+only
+\emph default
+ place where state information is kept.
+ There is no other place like
+\family typewriter
+/etc/drbd.conf
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+Because of its internal character, its representation and semantics may
+ change at any time without notice (e.g.
+ via an
+\emph on
+internal
+\emph default
+ upgrade procedure between major releases).
+ It is
+\emph on
+not
+\emph default
+ an external interface to the outer world.
+ Don't build anything on it.
+\end_layout
+
+\begin_layout Standard
+However, knowledge of the symlink tree is useful for advanced sysadmins,
+ for
+\series bold
+human inspection
+\series default
+ and for
+\series bold
+debugging
+\series default
+.
+ And, of course, for developers.
+\end_layout
+
+\begin_layout Standard
+As an
+\begin_inset Quotes eld
+\end_inset
+
+official
+\begin_inset Quotes erd
+\end_inset
+
+ interface from outside, only the
+\family typewriter
+marsadm
+\family default
+ command should be used.
+\end_layout
+
+\begin_layout Subsection
+Documentation of the MARS Symlink Tree
+\end_layout
+
+\begin_layout Section
+XIO Worker Bricks
+\end_layout
+
+\begin_layout Section
+Strategy Worker Bricks
+\end_layout
+
+\begin_layout Standard
+NYI
+\end_layout
+
+\begin_layout Section
+The XIO Brick Personality
+\end_layout
+
+\begin_layout Section
+The Generic Brick Infrastructure Layer
+\end_layout
+
+\begin_layout Section
+The Generic Object and Aspect Infrastructure
+\end_layout
+
+\begin_layout Chapter
+\start_of_appendix
+Technical Data MARS
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Technical-Data-MARS"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+MARS has some built-in limitations which should be overcome
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Some internal algorithms are quadratic.
+ The reason is that MARS evolved from a lab prototype which wasn't originally
+ intended for enterprise grade usage, but should have been succeeded by
+ the fully instance-oriented MARS Full much earlier.
+\end_layout
+
+\end_inset
+
+ by the future MARS Full.
+ Please don't exceed the following limits:
+\end_layout
+
+\begin_layout Itemize
+maximum 10 nodes per cluster
+\end_layout
+
+\begin_layout Itemize
+maximum 10 resources per cluster
+\end_layout
+
+\begin_layout Itemize
+maximum 100 logfiles per resource
+\end_layout
+
+\begin_layout Chapter
+Handout for Midnight Problem Solving
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Handout-for-Midnight"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Here are generic instructions for the generic
+\family typewriter
+marsadm
+\family default
+ and commandline level.
+ Other levels (e.g.
+ different types of cluster managers, PaceMaker, control scripts /
+\family typewriter
+rc
+\family default
+ scripts /
+\family typewriter
+upstart
+\family default
+ scripts, etc.) should be described elsewhere.
+\end_layout
+
+\begin_layout Section
+Inspecting the State of MARS
+\end_layout
+
+\begin_layout Standard
+For manual inspection, please prefer the new
+\family typewriter
+marsadm view all
+\family default
+ over the old
+\family typewriter
+marsadm view-1and1 all
+\family default
+.
+ It shows more appropriate / detailed information.
+\end_layout
+
+\begin_layout Standard
+Hint: this might change in future when somebody will program better macros
+ for the
+\family typewriter
+view-1and1
+\family default
+ variant, or create even better other macros.
+\end_layout
+
+\begin_layout Quotation
+
+\family typewriter
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+# watch marsadm view all
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Checking the low-level network connections at runtime:
+\end_layout
+
+\begin_layout Quotation
+
+\family typewriter
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+# watch "netstat --tcp | grep 777"
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Meaning of the port numbers (as currently configured into the kernel module,
+ may change in future):
+\end_layout
+
+\begin_layout Itemize
+7777 = metadata / symlink propagation
+\end_layout
+
+\begin_layout Itemize
+7778 = transfer of transaction logfiles
+\end_layout
+
+\begin_layout Itemize
+7779 = transfer of sync traffic
+\end_layout
+
+\begin_layout Standard
+7777 must be always active on a healthy cluster.
+ 7778 and 7779 will appear only on demand, when some data is transferred.
+\end_layout
+
+\begin_layout Standard
+Hint: when one of the columns Send-Q or Recv-Q is constantly at high values,
+ you might have a network bottleneck.
+\end_layout
+
+\begin_layout Section
+Replication is Stuck
+\end_layout
+
+\begin_layout Standard
+Indications of stuck replication:
+\end_layout
+
+\begin_layout Itemize
+One of the flags shown by
+\family typewriter
+marsadm view all
+\family default
+ or
+\family typewriter
+marsadm view-flags all
+\family default
+ contains a symbol 
+\family typewriter
+"-"
+\family default
+ (dash).
+ This means that some switch is currently switched off (deliberately).
+ Please check whether there is a valid reason why somebody else switched
+ it off.
+ If the switch-off is just by accident, use the following command to fix
+ the stuck:
+\family typewriter
+
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+# marsadm up all
+\end_layout
+
+\end_inset
+
+
+\family default
+(or replace
+\family typewriter
+all
+\family default
+ by a particular resource name if you want to start only a specific one).
+\begin_inset Newline newline
+\end_inset
+
+Note:
+\family typewriter
+up
+\family default
+ is equivalent to the sequence
+\family typewriter
+attach; resume-fetch; resume-replay; resume-sync
+\family default
+.
+ Instead of switching each individual knob, use
+\family typewriter
+up
+\family default
+ as a shortcut for switching on anything which is currently off.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+netstat --tcp | grep 7777
+\family default
+ does not show anything.
+ Please check the following:
+\begin_inset Separator latexpar
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Itemize
+Is the kernel module loaded? Check
+\family typewriter
+lsmod | grep mars
+\family default
+.
+ When necessary, run
+\family typewriter
+modprobe mars
+\family default
+.
+\end_layout
+
+\begin_layout Itemize
+Is the network interface down? Check
+\family typewriter
+ifconfig
+\family default
+, and/or
+\family typewriter
+ethtool
+\family default
+ and friends, and fix it when necessary.
+\end_layout
+
+\begin_layout Itemize
+Is a
+\family typewriter
+ping
+\family default
+ possible? If not, fix the network / routing / firewall / etc.
+ When fixed, the MARS connections should automatically appear after about
+ 1 minute.
+\end_layout
+
+\begin_layout Itemize
+When
+\family typewriter
+ping
+\family default
+ is possible, but a MARS connection to port 7777 does not appear after a
+ few minutes, try to connect to remote port 7777 by hand via
+\family typewriter
+telnet
+\family default
+.
+ But don't type anything, just abort the connection immediately when it
+ works! Typing anything will almost certainly throw a harsh error message
+ at the other server, which could unnecessarily alarm other people.
+\end_layout
+
+\end_deeper
+\begin_layout Itemize
+Check whether
+\family typewriter
+marsadm view all
+\family default
+ shows some progress bars somewhere.
+ Example:
+\family typewriter
+\size scriptsize
+
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+istore-test-bap1:~# marsadm view all
+\end_layout
+
+\begin_layout Plain Layout
+
+--------- resource lv-0
+\end_layout
+
+\begin_layout Plain Layout
+
+ lv-0 OutDated[F] PausedReplay dCAS-R Secondary istore-test-bs1
+\end_layout
+
+\begin_layout Plain Layout
+
+ replaying: [>...................] 1.21% (12/1020)MiB logs: [2..3]
+\end_layout
+
+\begin_layout Plain Layout
+
+ > fetch: 1008.198 MiB rate: 0 B/sec remaining: --:--:-- hrs
+\end_layout
+
+\begin_layout Plain Layout
+
+ > replay: 0 B rate: 0 B/sec remaining: 00:00:00 hrs
+\end_layout
+
+\end_inset
+
+
+\family default
+\size default
+At least one of the
+\family typewriter
+rate:
+\family default
+ values should be greater than 0.
+ When none of the
+\family typewriter
+rate:
+\family default
+ values indicate any progress for a longer time, try
+\family typewriter
+marsadm up all
+\family default
+ again.
+ If it doesn't help, check and repair the network.
+ If even this does not help, check the hardware for any IO hangups, or kernel
+ hangups.
+ First, check the RAID controllers.
+ Often (but not certainly), a stuck kernel can be recognized when many processes
+ are
+\emph on
+permanently
+\emph default
+ in state "D", for a long time:
+\family typewriter
+ps ax | grep " D" | grep -v grep
+\family default
+ or similar.
+ Please check whether there is just an overload, or
+\emph on
+really
+\emph default
+ a true kernel problem.
+ Discrimination is not easy, and requires experience (as with any other
+ system; not limited to MARS).
+ A truly stuck kernel can only be resurrected by rebooting.
+ The same holds for any hardware problems.
+\end_layout
+
+\begin_layout Itemize
+Check whether
+\family typewriter
+marsadm view all
+\family default
+ reports any lines like
+\family typewriter
+WARNING: SPLIT BRAIN at '' detected
+\family default
+.
+ In such a case, check that there is
+\emph on
+really
+\emph default
+ a split brain, before obeying the instructions in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Resolution-of-Split"
+
+\end_inset
+
+.
+ Notice that network outages or missing
+\family typewriter
+marsadm log-delete-all all
+\family default
+ or
+\family typewriter
+cron
+\family default
+ may continue to report an old split brain which has gone in the meantime.
+\end_layout
+
+\begin_layout Itemize
+Check whether
+\family typewriter
+/mars/
+\family default
+ is too full.
+ For a rough impression,
+\family typewriter
+df /mars/
+\family default
+ may be used.
+ For getting authoritative values as internally used by the MARS emergency-mode
+ computations, use
+\family typewriter
+marsadm view-rest-space
+\family default
+ (the unit is GiB).
+ In practice, the differences are only marginal, at least on bigger
+\family typewriter
+/mars/
+\family default
+ partitions.
+ When there is only little rest space (or none at all), please obey the instruction
+s in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Resolution-of-Emergency"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+Resolution of Emergency Mode
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Resolution-of-Emergency"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Emergency mode occurs when
+\family typewriter
+/mars/
+\family default
+ runs out of space, such that no new logfile data can be written anymore.
+\end_layout
+
+\begin_layout Standard
+In emergency mode, the primary will write any write requests
+\emph on
+directly
+\emph default
+ to the underlying disk, as if MARS were not present at all.
+ Thus, your application will continue to run.
+ Only the
+\emph on
+replication
+\emph default
+ as such is stopped.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Note Greyedout
+status open
+
+\begin_layout Plain Layout
+Notice: emergency mode means that your secondary nodes are usually in a
+
+\emph on
+consistent
+\emph default
+, but
+\emph on
+outdated
+\emph default
+ state (exception: when a sync was running in parallel to the emergency
+ mode, then the sync will be automatically started over again).
+ You can check consistency via
+\family typewriter
+marsadm view-flags all
+\family default
+.
+ Only when a local disk shows a lower-case letter
+\family typewriter
+"d"
+\family default
+ instead of an uppercase
+\family typewriter
+"D"
+\family default
+, it is known to be inconsistent (e.g.
+ during a sync).
+ When there is a dash instead, it usually means that the disk is detached
+ or misconfigured or the kernel module is not started.
+ Please fix these problems first before believing that your local disk is
+ unusable.
+ Even if it is really inconsistent (which is very unlikely, typically occurring
+ only as a consequence of hardware failures, or of the above-mentioned exception
+), you have a big chance to recover most of the data via
+\family typewriter
+fsck
+\family default
+ and friends.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+A currently existing Emergency mode can be detected by
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+primary:~# marsadm view-is-emergency all
+\end_layout
+
+\begin_layout Plain Layout
+
+secondary:~# marsadm view-is-emergency all
+\end_layout
+
+\end_inset
+
+ Notice: this delivers the current state, telling nothing about the past.
+\end_layout
+
+\begin_layout Standard
+Currently, emergency mode will also show something like
+\family typewriter
+WARNING: SPLIT BRAIN at '' detected
+\family default
+.
+ This ambiguity will be resolved in a future MARS release.
+ It is however not crucial: the resolution methods for both cases are very
+ similar.
+ If in doubt, start emergency resolution first, and only proceed to split
+ brain resolution if it did not help.
+\end_layout
+
+\begin_layout Standard
+Preconditions:
+\end_layout
+
+\begin_layout Itemize
+Only current version of MARS: the space at the primary side should have
+ been already released, and the emergency mode should have been already
+ left.
+ Otherwise, you might need the split-brain resolution method from section
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Resolution-of-Split"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+The network
+\series bold
+must
+\series default
+ be working.
+ Check that the following gives an entry for each secondary:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+primary:~# netstat --tcp | grep 7777
+\end_layout
+
+\end_inset
+
+When necessary, fix the network first (see instructions above).
+\end_layout
+
+\begin_layout Standard
+Emergency mode should now be resolved via the following instructions:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+primary:~# marsadm view-is-emergency all
+\end_layout
+
+\begin_layout Plain Layout
+
+primary:~# du -s /mars/resource-* | sort -n
+\end_layout
+
+\end_inset
+
+Remember the affected resources.
+ Best practice is to do the following, starting with the
+\emph on
+biggest
+\emph default
+ resource as shown by the
+\family typewriter
+du | sort
+\family default
+ output in reverse order, but
+\emph on
+starting
+\emph default
+ the following only with the
+\emph on
+affected
+\emph default
+ resources in the first place:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+secondary1:~# marsadm invalidate
+\end_layout
+
+\begin_layout Plain Layout
+
+secondary1:~# marsadm log-delete-all all
+\end_layout
+
+\begin_layout Plain Layout
+
+...
+ ditto with all resources showing emergency mode
+\end_layout
+
+\begin_layout Plain Layout
+
+...
+ ditto on all other secondaries
+\end_layout
+
+\begin_layout Plain Layout
+
+primary:~# marsadm log-delete-all all
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Hint: during the resolution process, some other resources might have gone
+ into emergency mode concurrently.
+ In addition, it is possible that some secondaries are stuck at particular
+ resources while the corresponding primary has
+\emph on
+not yet
+\emph default
+ entered emergency mode.
+ Please repeat the steps in such a case, and look for emergency modes at
+ secondaries additionally.
+ When necessary, extend your list of
+\emph on
+affected
+\emph default
+ resources.
+\end_layout
+
+\begin_layout Standard
+Hint: be patient.
+ Deleting large bulks of logfile data may take a long time, at least on
+ highly loaded systems.
+ You should give the cleanup processes at least 5 minutes before concluding
+ that an
+\family typewriter
+invalidate
+\family default
+ followed by
+\family typewriter
+log-delete-all
+\family default
+ had no effect! Don't forget to give the
+\family typewriter
+log-delete-all
+\family default
+ at all cluster nodes, even when seemingly unaffected.
+\end_layout
+
+\begin_layout Standard
+In very complex scenarios, when the primary roles of different resources
+ are spread over different hosts (aka mixed operation), you may need to repeat
+ the whole cycle iteratively for a few cycles until the jam is resolved.
+\end_layout
+
+\begin_layout Standard
+If it does not go away, you have another chance by the following split-brain
+ resolution process, which will also cleanup emergency mode as a side effect.
+\end_layout
+
+\begin_layout Section
+Resolution of Split Brain and of Emergency Mode
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Resolution-of-Split"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Hint: in many cases (but not guaranteed), the previous recipe for resolution
+ of emergency mode will also cleanup split brain.
+ Good chances are in case of
+\begin_inset Formula $k=2$
+\end_inset
+
+ total replicas.
+ Please collect your own experiences which method works better for you!
+\end_layout
+
+\begin_layout Standard
+Precondition: the network must be working.
+ Check that the following gives an entry for each secondary:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+primary:~# netstat --tcp | grep 7777
+\end_layout
+
+\end_inset
+
+ When necessary, fix the network first (see instructions above).
+\end_layout
+
+\begin_layout Standard
+Inspect the split brain situation:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+primary:~# marsadm view all
+\end_layout
+
+\begin_layout Plain Layout
+
+primary:~# du -s /mars/resource-* | sort -n
+\end_layout
+
+\end_inset
+
+Remember those resources where a message like
+\family typewriter
+WARNING: SPLIT BRAIN at '' detected
+\family default
+ appears.
+ Do the following only for
+\emph on
+affected
+\emph default
+ resources, starting with the biggest one (before proceeding to the next
+ one).
+\end_layout
+
+\begin_layout Standard
+Do the following with only
+\emph on
+one
+\emph default
+ resource at a time (before proceeding to the next one), and repeat the
+ actions on that resource at every secondary (if there are multiple secondaries)
+:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+secondary1:~# marsadm leave-resource $res1
+\end_layout
+
+\begin_layout Plain Layout
+
+secondary1:~# marsadm log-delete-all all
+\end_layout
+
+\end_inset
+
+Check whether the split brain has vanished everywhere.
+ Start over with other resources at their secondaries when necessary.
+\end_layout
+
+\begin_layout Standard
+Finally, when no split brain is reported at any (former) secondary, do the
+ following on the primary:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+primary:~# marsadm log-delete-all all
+\end_layout
+
+\begin_layout Plain Layout
+
+primary:~# sleep 30
+\end_layout
+
+\begin_layout Plain Layout
+
+primary:~# marsadm view all
+\end_layout
+
+\end_inset
+
+ Now, the split brain should be gone even at the primary.
+ If not, repeat this step.
+\end_layout
+
+\begin_layout Standard
+In case even this should fail on some
+\family typewriter
+$res
+\family default
+ (which is very unlikely), read the PDF manual before using
+\family typewriter
+marsadm log-purge-all $res
+\family default
+.
+
+\end_layout
+
+\begin_layout Standard
+Finally, when the split brain is gone everywhere, rebuild the redundancy
+ at every secondary via
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+secondary1:~# marsadm join-resource $res1 /dev//$res1
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+If even this method does not help, setup the whole cluster afresh by
+\family typewriter
+rmmod mars
+\family default
+ everywhere, and creating a fresh
+\family typewriter
+/mars/
+\family default
+ filesystem everywhere, followed by the same procedure as installing MARS
+ for the first time (which is outside the scope of this handout).
+\end_layout
+
+\begin_layout Section
+Handover of Primary Role
+\end_layout
+
+\begin_layout Standard
+When there exists a method for primary handover in higher layers such as
+ cluster managers, please prefer that method (e.g.
+
+\family typewriter
+cm3
+\family default
+ or other tools).
+\end_layout
+
+\begin_layout Standard
+If suchalike doesn't work, or if you need to handover some resource
+\family typewriter
+$res1
+\family default
+ by hand, do the following:
+\end_layout
+
+\begin_layout Itemize
+Stop the load / application corresponding to
+\family typewriter
+$res1
+\family default
+ on the old primary side.
+\end_layout
+
+\begin_layout Itemize
+
+\family typewriter
+umount /dev/mars/$res1
+\family default
+, or otherwise close any openers such as iSCSI.
+\end_layout
+
+\begin_layout Itemize
+At the new primary:
+\family typewriter
+marsadm primary $res1
+\end_layout
+
+\begin_layout Itemize
+Restart the application at the new site (in reverse order to above).
+ In case you want to switch
+\emph on
+all
+\emph default
+ resources which are not yet at the new side, you may use
+\family typewriter
+marsadm primary all
+\family default
+.
+\end_layout
+
+\begin_layout Section
+Emergency Switching of Primary Role
+\end_layout
+
+\begin_layout Standard
+Emergency switching is necessary when your primary is no longer reachable
+ over the network for a
+\emph on
+longer
+\emph default
+ time, or when the hardware is defective.
+\end_layout
+
+\begin_layout Standard
+Emergency switching will very often lead to a split brain, which requires
+ lots of manual actions to resolve (see above).
+ Therefore, try to avoid emergency switching when possible!
+\end_layout
+
+\begin_layout Standard
+Hint: MARS can automatically recover after a primary crash / reboot, as
+ well as after secondary crashes, just by executing
+\family typewriter
+modprobe mars
+\family default
+ after
+\family typewriter
+/mars/
+\family default
+ had been mounted.
+ Please consider to wait until your system comes up again, instead of risking
+ a split brain.
+\end_layout
+
+\begin_layout Standard
+The decision between emergency switching and continuing operation at the
+ same primary side is an operational one.
+ MARS can support your decision by the following information at the potentially
+ new primary side (which was in secondary mode before):
+\family typewriter
+\size scriptsize
+
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+istore-test-bap1:~# marsadm view all
+\end_layout
+
+\begin_layout Plain Layout
+
+--------- resource lv-0
+\end_layout
+
+\begin_layout Plain Layout
+
+lv-0 InConsistent Syncing dcAsFr Secondary istore-test-bs1
+\end_layout
+
+\begin_layout Plain Layout
+
+syncing: [====>..............] 27.84% (567/2048)MiB rate: 72583.00 KiB/sec remaining: 00:00:20
+ hrs
+\end_layout
+
+\begin_layout Plain Layout
+
+> sync: 567.293/2048 MiB rate: 72583 KiB/sec remaining: 00:00:20 hrs
+\end_layout
+
+\begin_layout Plain Layout
+
+replaying: [>:::::::::::::::::::] 0.00% (0/12902)KiB logs: [1..1]
+\end_layout
+
+\begin_layout Plain Layout
+
+> fetch: 0 B rate: 38 KiB/s remaining: 00:00:00
+\end_layout
+
+\begin_layout Plain Layout
+
+> replay: 12902.047 KiB rate: 0 B/s remaining: --:--:--
+\end_layout
+
+\end_inset
+
+
+\family default
+\size default
+When your target is syncing (like in this example), you cannot switch to
+ it (same as with DRBD).
+ When you had an emergency mode before, you should first resolve that (whenever
+ possible).
+ When a split brain is reported, try to resolve it first (same as with DRBD).
+ Only in case you
+\emph on
+know
+\emph default
+ that the primary is really damaged, or it is really impossible to run
+ the application there for some reason, emergency switching is desirable.
+\end_layout
+
+\begin_layout Standard
+Hint: in case the secondary is inconsistent for some reason, e.g.
+ because of an incremental fast full-sync, you have a last chance to recover
+ most data after forceful switching by using a filesystem check or suchalike.
+ This might be even faster than restoring data from the backup.
+ But use it only if you are
+\emph on
+really
+\emph default
+ desperate!
+\end_layout
+
+\begin_layout Standard
+The amount of data which is
+\emph on
+known
+\emph default
+ to be missing at your secondary is shown after the
+\family typewriter
+> fetch:
+\family default
+ in human-readable form.
+ However, in cases of networking problems this information may be outdated.
+ You
+\emph on
+always
+\emph default
+ need to consider further facts which cannot be known by MARS.
+\end_layout
+
+\begin_layout Standard
+When there exists a method for emergency switching of the primary in higher
+ layers such as cluster managers, please prefer that method over
+ the following one.
+\end_layout
+
+\begin_layout Standard
+If suchalike doesn't work, or when a handover attempt has failed several
+ times, or if you
+\emph on
+really need
+\emph default
+ forceful switching of some resource
+\family typewriter
+$res1
+\family default
+ by hand, you can do the following:
+\end_layout
+
+\begin_layout Itemize
+When possible, stop the load / application corresponding to
+\family typewriter
+$res1
+\family default
+ on the old primary side.
+\end_layout
+
+\begin_layout Itemize
+When possible,
+\family typewriter
+umount /dev/mars/$res1
+\family default
+, or otherwise close any openers such as iSCSI.
+\end_layout
+
+\begin_layout Itemize
+When possible (if you have some time), wait until as much data has been
+ propagated to the new primary as possible (watch the
+\family typewriter
+fetch:
+\family default
+ indicator).
+\end_layout
+
+\begin_layout Itemize
+At the new primary:
+\family typewriter
+marsadm disconnect $res1; marsadm primary --force $res1
+\end_layout
+
+\begin_layout Itemize
+Restart the application at the new site (in reverse order to above).
+\end_layout
+
+\begin_layout Itemize
+After the application is known to run reliably, check for split brains and
+ clean them up when necessary.
+\end_layout
+
+\begin_layout Chapter
+Alternative Methods for Split Brain Resolution
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Alternative-Methods-for"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Instead of
+\family typewriter
+marsadm invalidate
+\family default
+, the following steps may be used.
+ In preference, start with the old
+\begin_inset Quotes eld
+\end_inset
+
+wrong
+\begin_inset Quotes erd
+\end_inset
+
+ primaries first:
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm leave-resource mydata
+\end_layout
+
+\begin_layout Enumerate
+After having done this on one cluster node, check whether the split brain
+ is already gone (e.g.
+ by saying
+\family typewriter
+marsadm view mydata
+\family default
+).
+ There are chances that you don't need this on all of your nodes.
+ Only in very rare
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+When your network had partitioned in a very awkward way for a long time,
+ and when your partitioned primaries did several
+\family typewriter
+log-rotate
+\family default
+ operations independently from each other, there is a small chance that 
+\family typewriter
+leave-resource
+\family default
+ does not clean up
+\emph on
+all
+\emph default
+ remains of such an awkward situation.
+ Only in such a case, try
+\family typewriter
+log-purge-all
+\family default
+.
+\end_layout
+
+\end_inset
+
+ cases, it might happen that the preceding 
+\family typewriter
+leave-resource
+\family default
+ operations were not able to clean up all logfiles produced in parallel
+ by the split brain situation.
+
+\end_layout
+
+\begin_layout Enumerate
+Read the documentation about
+\family typewriter
+log-purge-all
+\family default
+ (see page
+\begin_inset CommandInset ref
+LatexCommand pageref
+reference "log-purge-all$res"
+
+\end_inset
+
+) and use it.
+\end_layout
+
+\begin_layout Enumerate
+If you want to restore redundancy, you can follow-up a
+\family typewriter
+join-resource
+\family default
+ phase to the old resource name (using the correct device name, double-check
+ it!). This will restore your redundancy by overwriting your bad split brain
+ version with the correct one.
+\end_layout
+
+\begin_layout Standard
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+It is important to resolve the split brain
+\emph on
+before
+\emph default
+ you can start the
+\family typewriter
+join-resource
+\family default
+ reconstruction phase! In order to keep as many
+\begin_inset Quotes eld
+\end_inset
+
+good
+\begin_inset Quotes erd
+\end_inset
+
+ versions as possible (e.g.
+ for emergency cases), don't re-join them all in parallel, but rather start
+ with the oldest / most outdated / worst / inconsistent version first.
+ It is recommended to start the next one only when the previous one has
+ successfully finished.
+\end_layout
+
+\begin_layout Chapter
+Alternative De- and Reconstruction of a Damaged Resource
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Alternative-De--and"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+In case
+\family typewriter
+leave-resource --host=
+\family default
+ does not work, you may use the following fallback.
+ On the surviving new designated primary, give the following commands:
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm disconnect-all mydata
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm down mydata
+\end_layout
+
+\begin_layout Enumerate
+Check by hand whether your local disk is consistent, e.g.
+ by test-mounting it readonly,
+\family typewriter
+fsck
+\family default
+, etc.
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm delete-resource mydata
+\end_layout
+
+\begin_layout Enumerate
+Check whether the other vital cluster nodes don't report the dead resource
+ any more, e.g.
+
+\family typewriter
+marsadm view all
+\family default
+ at
+\emph on
+each
+\emph default
+ of them.
+ In case the resource has not disappeared everywhere (which may happen during
+ network problems), do the
+\family typewriter
+down ; delete-resource
+\family default
+ steps also there (optionally again with
+\family typewriter
+--force
+\family default
+).
+\end_layout
+
+\begin_layout Enumerate
+Be sure that the resource has disappeared
+\emph on
+everywhere
+\emph default
+.
+ When necessary, repeat the
+\family typewriter
+delete-resource
+\family default
+ with
+\family typewriter
+--force
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+
+\family typewriter
+marsadm create-resource newmydata ...
+
+\family default
+ at the
+\emph on
+correct
+\emph default
+ node using the
+\emph on
+correct
+\emph default
+ disk device containing the
+\emph on
+correct
+\emph default
+ version, and further steps to setup your resource from scratch, preferably
+ under a different name to minimize any risk.
+\end_layout
+
+\begin_layout Standard
+\noindent
+In any case,
+\series bold
+manually check
+\series default
+ whether a split brain is reported for any resource on any of your
+\emph on
+surviving
+\emph default
+ cluster nodes.
+ If you find one there (and only then), please (re-)execute the split brain
+ resolution steps on the affected node(s).
+\end_layout
+
+\begin_layout Chapter
+Cleanup in case of Complicated Cascading Failures
+\begin_inset CommandInset label
+LatexCommand label
+name "subsec:Cleanup-in-case"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+MARS does its best to recover even from multiple failures (e.g.
+
+\series bold
+rolling disasters
+\series default
+).
+ Chances are high that the instructions from sections
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Split-Brain-Resolution"
+
+\end_inset
+
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Final-Destroy-of"
+
+\end_inset
+
+ or appendix
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Alternative-Methods-for"
+
+\end_inset
+
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "chap:Alternative-De--and"
+
+\end_inset
+
+ will work even in case of multiple failures, such as a network failure
+ plus local node failure at only 1 node (even if that node is the former
+ primary node).
+\end_layout
+
+\begin_layout Standard
+However, in general (e.g.
+ when more than 1 node is damaged and/or when the filesystem
+\family typewriter
+/mars/
+\family default
+ is badly damaged) there is no general guarantee that recovery will
+\emph on
+always
+\emph default
+ succeed under
+\emph on
+any
+\emph default
+ (weird) circumstances.
+ That said, your chances for recovery are
+\emph on
+very
+\emph default
+ high when some disk remains usable at least at one of your surviving secondarie
+s.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+It should be very hard to finally trash a secondary, because the transaction
+ logfiles contain 
+\family typewriter
+md5
+\family default
+ checksums for all data records.
+ Any attempt to replay corrupted logfiles is refused by MARS.
+ In addition, the sequence numbers of
+\family typewriter
+log-rotate
+\family default
+d logfiles are checked for contiguity.
+ Finally, the
+\emph on
+sequence path
+\emph default
+ of logfile applications (consisting of logfile names plus their respective
+ length) is additionally secured by a
+\family typewriter
+git
+\family default
+-like incremental checksum over the whole path history (so-called
+\begin_inset Quotes eld
+\end_inset
+
+version links
+\begin_inset Quotes erd
+\end_inset
+
+).
+ This should detect split brains even if logfiles are appended / modified
+
+\emph on
+after
+\emph default
+ a (forceful) switchover has already taken place.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresToxiques.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ That said, your risk of final data loss is very high if you remove the
+
+\series bold
+BBU
+\series default
+ from your hardware RAID controller before all hot data has been flushed
+ to the physical disks.
+ Therefore, never try to
+\begin_inset Quotes eld
+\end_inset
+
+repair
+\begin_inset Quotes erd
+\end_inset
+
+ a seemingly dead node before your replication is up again somewhere else!
+ Only unplug the network cables when advised, but never try to repair the
+ hardware instantly!
+\end_layout
+
+\begin_layout Standard
+In case of desperate situations where none of the previous instructions
+ have succeeded, your last chance is rebuilding all your resources from
+ intact disks as follows:
+\end_layout
+
+\begin_layout Enumerate
+Do
+\family typewriter
+rmmod mars
+\family default
+ on all your cluster nodes and/or reboot them.
+ Note: if you are less desperate, chances are high that the following will
+ also work when the kernel module remains active and everywhere a
+\family typewriter
+marsadm down
+\family default
+ is given instead, but for an
+\emph on
+ultimate
+\emph default
+ instruction you should eliminate
+\emph on
+potential
+\emph default
+ kernel problems by
+\family typewriter
+rmmod
+\family default
+ /
+\family typewriter
+reboot
+\family default
+, at least if you can afford the downtime on concurrently operating resources.
+\end_layout
+
+\begin_layout Enumerate
+For safety, physically remove the storage network cables on
+\emph on
+all
+\emph default
+ your cluster nodes.
+ Note: the same disclaimer holds.
+ MARS really does its best, even when
+\family typewriter
+delete-resource
+\family default
+ is given while the network is fully active and multiple split-brain primaries
+ are actively using their local device in parallel (approved by some testcases
+ from the automatic test suite, but note that it is impossible to catch
+ all possible failure scenarios).
+ Don't challenge your fate if you are desperate! Don't
+\emph on
+rely
+\emph default
+ on this! Nothing is absolutely fail-safe!
+\end_layout
+
+\begin_layout Enumerate
+
+\series bold
+Manually
+\series default
+ check which surviving disk is usable, and which is the
+\begin_inset Quotes eld
+\end_inset
+
+best
+\begin_inset Quotes erd
+\end_inset
+
+ one for your purpose.
+\end_layout
+
+\begin_layout Enumerate
+Do
+\family typewriter
+modprobe mars
+\family default
+
+\emph on
+only
+\emph default
+ on that node.
+ If that fails,
+\family typewriter
+rmmod
+\family default
+ and/or reboot again, and start over with a completely fresh
+\family typewriter
+/mars/
+\family default
+ partition (
+\family typewriter
+mkfs.ext4 /mars/
+\family default
+ or similar)
+\emph on
+everywhere
+\emph default
+ on
+\emph on
+all
+\emph default
+ cluster nodes, and continue with step 7.
+\end_layout
+
+\begin_layout Enumerate
+If your old
+\family typewriter
+/mars/
+\family default
+ works, and you did not already (forcefully) switch your designated primary
+ to the final destination, do it now (see description in section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "subsec:Forced-Switching"
+
+\end_inset
+
+).
+ Wait until any old logfile data has been replayed.
+\end_layout
+
+\begin_layout Enumerate
+Say
+\family typewriter
+marsadm delete-resource mydata --force
+\family default
+.
+ This will clean up all internal symlink tree information for the resource,
+ but will leave your disk data intact.
+\end_layout
+
+\begin_layout Enumerate
+Locally build up the new resource(s) as usual, out of the underlying disks.
+\end_layout
+
+\begin_layout Enumerate
+Check whether the new resource(s) work in standalone mode.
+\end_layout
+
+\begin_layout Enumerate
+When necessary, repeat these steps with other resources.
+\end_layout
+
+\begin_layout Standard
+Now you can choose how to rebuild your cluster.
+ If you rebuilt
+\family typewriter
+/mars/
+\family default
+ anywhere, you
+\emph on
+must
+\emph default
+ rebuild it on
+\emph on
+all
+\emph default
+ new cluster nodes and start over with a fresh
+\family typewriter
+join-cluster
+\family default
+ on each of them, from scratch.
+ It is not possible to mix the old cluster with the new one.
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+begin{enumerate}
+\backslash
+setcounter{enumi}{9}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+item
+\end_layout
+
+\end_inset
+
+ Finally, do all the necessary
+\family typewriter
+join-resource
+\family default
+s on the respective cluster nodes, according to your new redundancy scenario
+ after the failures (e.g.
+ after activating spare nodes, etc).
+ If you have
+\begin_inset Formula $k>2$
+\end_inset
+
+ replicas, start
+\family typewriter
+join-resource
+\family default
+ on the worst / most damaged version first, and start the next preferably
+ only after the previous sync has completed successfully.
+ This way, you will be permanently retaining some (old and outdated, but
+ hopefully potentially usable) replicas while a sync is running.
+ Don't start too many syncs in parallel.
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+end{enumerate}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Never use
+\family typewriter
+delete-resource
+\family default
+ twice on the same resource name, after you already have a working standalone
+ primary
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Of course, when you have not created the
+\emph on
+same
+\emph default
+ resource anew, you may repeat
+\family typewriter
+delete-resource
+\family default
+ on other cluster nodes in order to get rid of local files / symlinks which
+ had not been propagated to other nodes before.
+\end_layout
+
+\end_inset
+
+.
+ You might accidentally destroy your again-working copy! You
+\emph on
+can
+\emph default
+ issue
+\family typewriter
+delete-resource
+\family default
+ multiple times on different nodes, e.g.
+ when the network has problems, but doing so
+\emph on
+after
+\emph default
+ re-establishment of the initial primary bears some risk.
+ Therefore, the safest way is first deleting the resources everywhere, and
+ then starting over afresh.
+\end_layout
+
+\begin_layout Standard
+Before re-connecting any network cable on any non-primary (new secondaries),
+ ensure that all
+\family typewriter
+/dev/mars/mydata
+\family default
+ devices are no longer in use (e.g.
+ from an old primary role before the incident happened), and that each local
+ disk is detached.
+ Only after that, you should be able to safely re-connect the network.
+ The
+\family typewriter
+delete-resource
+\family default
+ given at the new primary should propagate now to each of your secondaries,
+ and your local disk should be usable for a re-
+\family typewriter
+join-resource
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+When you did not rebuild your cluster from scratch with fresh
+\family typewriter
+/mars/
+\family default
+ filesystems, and one of the old cluster nodes is supposed to be removed
+ permanently, use
+\family typewriter
+leave-resource
+\family default
+ (optionally with
+\family typewriter
+--host=
+\family default
+ and/or
+\family typewriter
+--force
+\family default
+) and finally
+\family typewriter
+leave-cluster
+\family default
+.
+\end_layout
+
+\begin_layout Chapter
+Experts only: Special Trick Switching and Rebuild
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Experts-only:-Special"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The following is a further alternative for
+\series bold
+experts
+\series default
+ who really know what they are doing.
+ The method is very simple and therefore well-suited for coping with mass
+ failures, e.g.
+
+\series bold
+power blackout of whole datacenters
+\series default
+.
+\end_layout
+
+\begin_layout Standard
+In case a primary datacenter fails as a whole for whatever reason and you
+ have a backup datacenter, do the following steps in the backup datacenter:
+\end_layout
+
+\begin_layout Enumerate
+Fencing step: by means of firewalling,
+\series bold
+ensure
+\series default
+ that the (virtually) damaged datacenter nodes
+\series bold
+cannot
+\series default
+ be reached over the network.
+ For example, you may place REJECT rules into all of your local iptables
+ firewalls at the backup datacenter.
+ Alternatively / additionally, you may block the routes at the appropriate
+ central router(s) in your network.
+\end_layout
+
+\begin_layout Enumerate
+Run the sequence
+\family typewriter
+marsadm disconnect all; marsadm primary --force all
+\family default
+ on all nodes in the backup datacenter.
+\end_layout
+
+\begin_layout Enumerate
+Restart your services in the backup datacenter (as far as necessary).
+ Depending on your network setup, further steps like switching BGP routes
+ etc may be necessary.
+\end_layout
+
+\begin_layout Enumerate
+Check that
+\emph on
+all
+\emph default
+ your services are
+\emph on
+really
+\emph default
+ up and running, before you try to repair anything! Failing to do so may
+ result in data loss when you execute the following restore method for
+\emph on
+experts
+\emph default
+.
+\end_layout
+
+\begin_layout Standard
+Now your backup datacenter should continue servicing your clients.
+ The final reconstruction of the originally primary datacenter works as
+ follows:
+\end_layout
+
+\begin_layout Enumerate
+At the damaged primary datacenter, ensure that the MARS kernel module is
+ not running anywhere.
+ In case of a power blackout, you shouldn't have executed an automatic
+\family typewriter
+modprobe mars
+\family default
+ anywhere during reboot, so you should be already done when all your nodes
+ are up again.
+ In case some nodes had no reboot, execute
+\family typewriter
+rmmod mars
+\family default
+ everywhere.
+ If
+\family typewriter
+rmmod
+\family default
+ refuses to run, you may need to umount the
+\family typewriter
+/dev/mars/mydata
+\family default
+ device first.
+ When nothing else helps, you may just mass reboot your hanging nodes.
+\end_layout
+
+\begin_layout Enumerate
+At the failed side, do
+\family typewriter
+rm -rf /mars/resource-$mydata/
+\family default
+ for all those resources which had been primary before the blackout.
+ Do this
+\emph on
+only
+\emph default
+ for those cases, otherwise you will need unnecessary
+\family typewriter
+leave-resource
+\family default
+s or
+\family typewriter
+invalidate
+\family default
+s later (e.g.
+ when half of your nodes were already running at the surviving side).
+ In order to avoid unnecessary traffic, please do this only as far as really
+ necessary.
+ Don't remove any other directories.
+ In particular,
+\family typewriter
+/mars/ips/
+\family default
+
+\emph on
+must
+\emph default
+ remain intact.
+ In case you accidentally deleted them, or you had to re-create
+\family typewriter
+/mars/
+\family default
+ from scratch, try
+\family typewriter
+rsync
+\family default
+ with the correct options.
+\begin_inset Newline newline
+\end_inset
+
+
+\begin_inset Graphics
+ filename images/MatieresCorrosives.png
+ lyxscale 50
+ scale 17
+
+\end_inset
+
+ Caution! before doing this, check that the corresponding directory exists
+ at the backup datacenter, and that it is
+\emph on
+really
+\emph default
+ healthy!
+\end_layout
+
+\begin_layout Enumerate
+Un-Fencing: restore your network firewall / routes and check that they work
+ (
+\family typewriter
+ping
+\family default
+ etc).
+\end_layout
+
+\begin_layout Enumerate
+Do
+\family typewriter
+modprobe mars
+\family default
+ everywhere.
+ All missing directories and their missing symlinks should be automatically
+ fetched from the backup datacenter.
+\end_layout
+
+\begin_layout Enumerate
+Run
+\family typewriter
+marsadm join-resource $res
+\family default
+, but only at those places where the directory was removed previously, while
+ using the same disk devices as before.
+ This will minimize actual traffic thanks to the fast full sync algorithm.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+It is
+\series bold
+crucial
+\series default
+ that the fencing step
+\series bold
+must
+\series default
+ be executed
+\emph on
+before
+\emph default
+ any
+\family typewriter
+primary --force
+\family default
+! This way, no split brain will be
+\emph on
+visible
+\emph default
+ at the backup datacenter side, because there is simply no chance for transferri
+ng different versions over the network.
+ It is also crucial to remove any (potentially diverging) resource directories
+
+\emph on
+before
+\emph default
+ the
+\family typewriter
+modprobe
+\family default
+! This way, the backup datacenter never runs into split brain.
+ This saves you a lot of detail work for split brain resolution when you
+ have to restore bulks of nodes in a short time.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+In case the repair of a full datacenter should take so extremely long that
+ some
+\family typewriter
+/mars/
+\family default
+ partitions are about to run out of space at the surviving side, you may
+ use the
+\family typewriter
+leave-resource --host=failed-node
+\family default
+ trick described earlier, followed by
+\family typewriter
+log-delete-all
+\family default
+.
+ Best if you have prepared a fully automatic script long before the incident,
+ which executes suchalike only as far as necessary in each individual case.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\begin_inset Graphics
+ filename images/lightbulb_brightlit_benj_.png
+ lyxscale 12
+ scale 7
+
+\end_inset
+
+Even better: train such scenarios in advance, and prepare scripts for mass
+ automation.
+ Look into section
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Scripting-HOWTO"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Chapter
+Mathematical Model of Architectural Reliability
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Mathematical-Model-of"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The assumptions used in the model are explained in detail in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sub:Detailed-explanation"
+
+\end_inset
+
+.
+ Here is a quick recap of the main parameters:
+\end_layout
+
+\begin_layout Itemize
+\begin_inset Formula $n$
+\end_inset
+
+ is the number of basic storage units.
+ It is also used for the number of application units, assumed to be the
+ same.
+\end_layout
+
+\begin_layout Itemize
+\begin_inset Formula $k$
+\end_inset
+
+ is the replication degree, or number of replicas.
+ In general, you will have to deploy
+\begin_inset Formula $N=k*n$
+\end_inset
+
+ storage servers for getting
+\begin_inset Formula $n$
+\end_inset
+
+ basic storage units.
+ This applies to any of the competing architectures.
+
+\end_layout
+
+\begin_layout Itemize
+\begin_inset Formula $s$
+\end_inset
+
+ is the architecture-dependent spread exponent: it tells whether a storage
+ incident will spread to the application units.
+ Examples:
+\begin_inset Formula $s=0$
+\end_inset
+
+ means that there is no spread between storage unit failures and application
+ unit failures, other than a local 1:1 one.
+
+\begin_inset Formula $s=1$
+\end_inset
+
+ means that an uncompensated storage node incident will cause
+\begin_inset Formula $n$
+\end_inset
+
+ application incidents.
+\end_layout
+
+\begin_layout Itemize
+\begin_inset Formula $p$
+\end_inset
+
+ is the probability of a storage server incident.
+ In the examples at section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sec:Reliability-Arguments-from"
+
+\end_inset
+
+, a fixed
+\begin_inset Formula $p=0.0001$
+\end_inset
+
+ was used for easy understanding, but the following formulae should also
+ hold for any other
+\begin_inset Formula $p\in(0,1)$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Itemize
+\begin_inset Formula $T$
+\end_inset
+
+ is the observational period, introduced for convenience of understanding.
+ The following can also be computed independently from any
+\begin_inset Formula $T$
+\end_inset
+
+, as long as the probability
+\begin_inset Formula $p$
+\end_inset
+
+ does not change over time, which is assumed.
+ Because
+\begin_inset Formula $T$
+\end_inset
+
+ is only here for convenience of understanding, we set it to
+\begin_inset Formula $T=1/p$
+\end_inset
+
+.
+ In the examples from section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sub:Detailed-explanation"
+
+\end_inset
+
+, a fixed
+\begin_inset Formula $T=10{,}000$
+\end_inset
+
+ hours was used.
+\end_layout
+
+\begin_layout Section
+Formula for DRBD / MARS
+\end_layout
+
+\begin_layout Standard
+We need not discriminate between a storage failure probability S and an applicati
+on failure probability A because applications are run locally at the storage
+ servers 1:1.
+ The probability for failure of a single shard consisting of
+\begin_inset Formula $k$
+\end_inset
+
+ nodes is
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+A_{p}(k)=p^{k}
+\]
+
+\end_inset
+
+because all
+\begin_inset Formula $k$
+\end_inset
+
+ shard members have to be down all at the same time.
+ In section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sub:Detailed-explanation"
+
+\end_inset
+
+ we assumed that there is no cross-communication between shards.
+ Therefore they are completely independent from each other, and the total
+ downtime of
+\begin_inset Formula $n$
+\end_inset
+
+ shards during the observational period
+\begin_inset Formula $T$
+\end_inset
+
+ is
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+A_{p,T}(k,n)=T*n*p^{k}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+When introducing the spread exponent
+\begin_inset Formula $s$
+\end_inset
+
+, the formula turns into
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+A_{s,p,T}(k,n)=T*n^{s+1}*p^{k}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Formula for Unweighted BigCluster
+\end_layout
+
+\begin_layout Standard
+This is based on the Bernoulli formula.
+ The probability that exactly
+\begin_inset Formula $\bar{k}$
+\end_inset
+
+ storage nodes out of
+\begin_inset Formula $N=k*n$
+\end_inset
+
+ total storage nodes are down is
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+\bar{S}_{p}(\bar{k},N)=\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Similarly, the probability for getting
+\begin_inset Formula $k$
+\end_inset
+
+ or more storage node failures (up to
+\begin_inset Formula $N$
+\end_inset
+
+) at the same time is
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+S_{p}(k,N)=\sum_{\bar{k}=k}^{N}\bar{S}_{p}(\bar{k},N)=\sum_{\bar{k}=k}^{N}\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+By replacing
+\begin_inset Formula $N$
+\end_inset
+
+ with
+\begin_inset Formula $k*n$
+\end_inset
+
+ (for conversion of the x axis into basic storage units) and by introducing
+
+\begin_inset Formula $T$
+\end_inset
+
+ we get
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+S_{p,T}(k,n)=T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+For comparability with DRBDorMARS, we have to compute the application downtime
+ A instead of the storage downtime S, which depends on the spread exponent
+
+\begin_inset Formula $s$
+\end_inset
+
+ as follows:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+A_{s,p,T}(k,n)=n^{s+1}*S_{p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Notice that at
+\begin_inset Formula $s=0$
+\end_inset
+
+ we have introduced a factor of
+\begin_inset Formula $n$
+\end_inset
+
+, which corresponds to the hashing effect (teardown of
+\begin_inset Formula $n$
+\end_inset
+
+ application instances by a single uncompensated storage incident) as described
+ in section
+\begin_inset CommandInset ref
+LatexCommand vref
+reference "sub:Detailed-explanation"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+Formula for SizeWeighted BigCluster
+\end_layout
+
+\begin_layout Standard
+In contrast to the above, we need to introduce a correction factor by the
+ fraction of affected objects, relative to basic storage units.
+ Otherwise the y axis would not stay comparable due to different units.
+\end_layout
+
+\begin_layout Standard
+For the special case of
+\begin_inset Formula $k=1$
+\end_inset
+
+, there is no difference to above.
+\end_layout
+
+\begin_layout Standard
+For the special case of
+\begin_inset Formula $k=2$
+\end_inset
+
+ replicas, the correction factor is
+\begin_inset Formula $1/(N-1)$
+\end_inset
+
+, because we assume that all the replicas of the affected first node are
+ uniformly spread to all other nodes, of which there are
+\begin_inset Formula $N-1$
+\end_inset
+
+.
+ The probability for hitting the intersection of the first node with the
+ second node is thus
+\begin_inset Formula $1/(N-1)$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+For higher values of
+\begin_inset Formula $k$
+\end_inset
+
+, and with a similar argument (never put another replica of the same object
+ onto the same storage node) we get the correction factor as
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+C(k,N)=\prod_{l=1}^{k-1}\frac{1}{N-l}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+Hint: there are at most
+\begin_inset Formula $k$
+\end_inset
+
+ physical replicas on the disks.
+ For higher values of
+\begin_inset Formula $\bar{k}\geq k$
+\end_inset
+
+, there are
+\begin_inset Formula $\binom{\bar{k}}{k}$
+\end_inset
+
+ combinations of object intersections (when assuming that the number of
+ objects on a node is very large such that no further object repetition can
+ occur except for the
+\begin_inset Formula $k$
+\end_inset
+
+-fold replica placement).
+ Thus the generalization to
+\begin_inset Formula $\bar{k}\geq k$
+\end_inset
+
+ is
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+C(k,\bar{k},N)=\binom{\bar{k}}{k}\prod_{l=1}^{k-1}\frac{1}{N-l}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+By inserting this into the above formula, we get
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula
+\[
+A_{s,p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}C(k,\bar{k},k*n)*\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Chapter
+Command Documentation for Userspace Tools
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Command-Documentation-for"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+
+\family typewriter
+marsadm --help
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:marsadm-–help"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+input{marsadm.help}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+
+\family typewriter
+football.sh --help
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:football-–help"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+input{football.help}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+
+\family typewriter
+football.sh --help --verbose
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:football-help-verbose"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+input{football-verbose.help}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+
+\family typewriter
+screener.sh --help
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:screener–help"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+input{screener.help}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+
+\family typewriter
+screener.sh --help --verbose
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:screener-help-verbose"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+input{screener-verbose.help}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Chapter
+Football Redundancy Diagrams
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:Football-Redundancy-Diagrams"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The following tables show the number of replicas during Football.
+ We focus on the common case of starting with 2 replicas, and ending up
+ in a total of another 2 replicas on other machines.
+ Further cases, involving multiple secondaries, should go analogously.
+\end_layout
+
+\begin_layout Standard
+Active primaries are colored in red.
+\end_layout
+
+\begin_layout Standard
+Backup or shadow replicas (which are present at LVM level, but currently
+ not used by MARS) are in parentheses.
+ In case of emergency, they could be activated again.
+\end_layout
+
+\begin_layout Standard
+Replicas which are not in parentheses are kept in
+\family typewriter
+UpToDate
+\family default
+ state all the time, until they are retired into backup replicas.
+\end_layout
+
+\begin_layout Section
+Parallel
+\family typewriter
+migrate
+\end_layout
+
+\begin_layout Standard
+This creates two additional replicas in parallel, at the target pair.
+ After handover to the new site, and after some configurable waiting time,
+ the old replicas are deleted.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "14col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+migrate
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate x 2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+After Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Stepwise
+\family typewriter
+migrate
+\end_layout
+
+\begin_layout Standard
+This variant is useful for hardware lifecycle.
+ The uplink of the old hardware is only loaded with creation of 1 replica
+ in migration step 1.
+ Step 2 then creates another replica at the new hardware, which should have
+ a better replication network (e.g.
+ better uplinks and/or better capacity for cross-traffic between datacenters).
+\end_layout
+
+\begin_layout Standard
+This variant is selected by parameter
+\family typewriter
+migrate_two_phase=1
+\family default
+.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "14col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+2-Step
+\begin_inset Newline newline
+\end_inset
+
+migrate
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+3
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate 2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+After Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Plain
+\family typewriter
+shrink
+\end_layout
+
+\begin_layout Standard
+Here we need to distinguish between replicas with the old size and replicas
+ with the new size (which is typically smaller than the old size).
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "14col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+shrink
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+old_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+new_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\color inherit
++ (1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Working
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+ 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(2)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Finished
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) + 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(2)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+After Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Full
+\family typewriter
+migrate+shrink
+\end_layout
+
+\begin_layout Standard
+This variant is
+\emph on
+almost
+\emph default
+ equivalent to
+\family typewriter
+migrate
+\family default
+ followed by
+\family typewriter
+shrink
+\family default
+.
+ The only difference is that cleanup is done
+\emph on
+later
+\emph default
+.
+ This means that more replicas are kept for a longer time.
+ Thus this variant is safer than doing
+\family typewriter
+migrate
+\family default
+ and
+\family typewriter
+shrink
+\family default
+ separately.
+\end_layout
+
+\begin_layout Standard
+This variant is selected by parameters
+\family typewriter
+migrate_two_phase=0
+\family default
+ and
+\family typewriter
+migrate_always_all=1
+\family default
+ and
+\family typewriter
+migrate_early_cleanup=0
+\family default
+:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "14col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+FULL
+\begin_inset Newline newline
+\end_inset
+
+migrate+shrink
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+old_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+new_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate x 2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\color inherit
++ (1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Working
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+ 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(4)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Finished
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) + 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(4)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+After Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+There is a variant which does early cleanup, which is roughly equivalent
+ to doing a standalone
+\family typewriter
+migrate
+\family default
+ followed by a standalone
+\family typewriter
+shrink
+\family default
+.
+
+\end_layout
+
+\begin_layout Standard
+This variant is selected by parameters
+\family typewriter
+migrate_two_phase=0
+\family default
+ and
+\family typewriter
+migrate_always_all=1
+\family default
+ and
+\family typewriter
+migrate_early_cleanup=1
+\family default
+.
+ It is less safe because it keeps fewer replicas, and is therefore less recommended:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "14col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+Sequential
+\begin_inset Newline newline
+\end_inset
+
+migrate+shrink
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+old_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+new_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate x 2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Early Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\color inherit
++ (1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Working
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+ 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(2)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Finished
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) + 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(2)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+After Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Stepwise
+\family typewriter
+migrate+shrink
+\end_layout
+
+\begin_layout Standard
+This variant is useful for hardware lifecycle management.
+ The uplink of the old hardware is loaded only by the creation of 1 replica
+ in migration step 1.
+ Step 2 then creates another replica on the new hardware, which should have
+ a better replication network.
+
+\end_layout
+
+\begin_layout Standard
+This variant is selected by parameters
+\family typewriter
+migrate_two_phase=1
+\family default
+ and
+\family typewriter
+migrate_always_all=1
+\family default
+ and
+\family typewriter
+migrate_early_cleanup=0
+\family default
+:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "14col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+2-Step
+\begin_inset Newline newline
+\end_inset
+
+migrate+shrink
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+old_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+new_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+3
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate 2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\color inherit
++ (1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Working
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+ 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(4)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Finished
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) + 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(4)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+After Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+This variant can also be combined with early cleanup.
+ The result is similar to above.
+ The only difference is that the second additional replica is created at
+ the new hardware.
+
+\end_layout
+
+\begin_layout Standard
+This variant is selected by parameters
+\family typewriter
+migrate_two_phase=1
+\family default
+ and
+\family typewriter
+migrate_always_all=0
+\family default
+ and
+\family typewriter
+migrate_early_cleanup=1
+\family default
+.
+ Again, this variant is less safe and therefore less recommended.
+\end_layout
+
+\begin_layout Standard
+However, it keeps at least 2 (backup) replicas all the time and thus could
+ be an alternative when decommissioning of old hardware is time-critical.
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "14col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+2-Step
+\begin_inset Newline newline
+\end_inset
+
+migrate+shrink
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+old_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+new_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+3
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate 2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Early Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\color inherit
++ (1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Working
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+ 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(2)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Finished
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) + 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(2)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+After Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+FAST
+\family typewriter
+migrate+shrink
+\end_layout
+
+\begin_layout Standard
+This variant tries to strike a balance by not creating too many unnecessary
+ replicas and by reducing network traffic.
+\end_layout
+
+\begin_layout Standard
+This variant is selected by parameters
+\family typewriter
+migrate_two_phase=0
+\family default
+ and
+\family typewriter
+migrate_always_all=0
+\family default
+ and
+\family typewriter
+migrate_early_cleanup=0
+\family default
+:
+\end_layout
+
+\begin_layout Standard
+\noindent
+\align center
+\begin_inset Tabular
+
+
+
+
+
+
+
+
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "14col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+FAST
+\begin_inset Newline newline
+\end_inset
+
+migrate+shrink
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+SRC
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Primary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "10col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+DST
+\begin_inset Newline newline
+\end_inset
+
+Secondary
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+old_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+\begin_inset Box Frameless
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 0
+use_makebox 0
+width "12col%"
+special "none"
+height "1in"
+height_special "totalheight"
+thickness "0.4pt"
+separation "3pt"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+# Replicas
+\begin_inset Newline newline
+\end_inset
+
+new_size
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Migrate x 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+3
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Start
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\color inherit
++ (1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+3
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Working
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+ 1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(3)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+Shrink Finished
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(1) +
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+(3)
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+After Cleanup
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\color red
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+ |
+
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+ |
+
+
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+As before, this could
+\emph on
+theoretically
+\emph default
+ be combined with early cleanup.
+ Such a combination is however not recommended because there is one intermediate
+ step where all existing replicas are at the DST primary, and thus this
+ one machine must not fail.
+\end_layout
+
+\begin_layout Chapter
+GNU Free Documentation License
+\begin_inset CommandInset label
+LatexCommand label
+name "chap:GNU-FDL"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\noindent
+
+\family typewriter
+\size footnotesize
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+lstinputlisting{fdl.txt}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_body
+\end_document