From 19e729d2b186ac6ff4f3fd471262e0b3213c88fb Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Wed, 28 Aug 2019 10:21:55 +0200 Subject: [PATCH] doc: prepare split by copying the old manual --- docu/mars-architecture-guide.lyx | 60260 +++++++++++++++++++++++++++++ 1 file changed, 60260 insertions(+) create mode 100644 docu/mars-architecture-guide.lyx diff --git a/docu/mars-architecture-guide.lyx b/docu/mars-architecture-guide.lyx new file mode 100644 index 00000000..173a0c00 --- /dev/null +++ b/docu/mars-architecture-guide.lyx @@ -0,0 +1,60260 @@ +#LyX 2.3 created this file. For more info see +\lyxformat 544 +\begin_document +\begin_header +\save_transient_properties true +\origin unavailable +\textclass scrreprt +\begin_preamble +\usepackage{listings} +\end_preamble +\options abstracton,dvipsnames +\use_default_options true +\begin_modules +customHeadersFooters +enumitem +fixltx2e +\end_modules +\maintain_unincluded_children false +\language english +\language_package default +\inputencoding auto +\fontencoding global +\font_roman "default" "default" +\font_sans "default" "default" +\font_typewriter "default" "default" +\font_math "auto" "auto" +\font_default_family rmdefault +\use_non_tex_fonts false +\font_sc false +\font_osf false +\font_sf_scale 100 100 +\font_tt_scale 100 100 +\use_microtype false +\use_dash_ligatures false +\graphics default +\default_output_format default +\output_sync 0 +\bibtex_command default +\index_command default +\paperfontsize 10 +\spacing single +\use_hyperref true +\pdf_title "MARS Manual" +\pdf_author "Thomas Schöbel-Theuer" +\pdf_bookmarks true +\pdf_bookmarksnumbered false +\pdf_bookmarksopen false +\pdf_bookmarksopenlevel 1 +\pdf_breaklinks true +\pdf_pdfborder true +\pdf_colorlinks true +\pdf_backref false +\pdf_pdfusetitle true +\papersize a4paper +\use_geometry true +\use_package amsmath 1 +\use_package amssymb 1 +\use_package cancel 1 +\use_package esint 1 +\use_package mathdots 1 +\use_package mathtools 1 
+\use_package mhchem 1 +\use_package stackrel 1 +\use_package stmaryrd 1 +\use_package undertilde 1 +\cite_engine basic +\cite_engine_type default +\biblio_style plain +\use_bibtopic false +\use_indices false +\paperorientation portrait +\suppress_date false +\justification true +\use_refstyle 1 +\use_minted 0 +\index Index +\shortcut idx +\color #008000 +\end_index +\leftmargin 3.7cm +\topmargin 2.7cm +\rightmargin 2.8cm +\bottommargin 2.3cm +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation indent +\paragraph_indentation default +\is_math_indent 0 +\math_numbering_side default +\quotes_style english +\dynamic_quotes 0 +\papercolumns 1 +\papersides 2 +\paperpagestyle headings +\tracking_changes false +\output_changes false +\html_math_output 0 +\html_css_as_file 0 +\html_be_strict false +\end_header + +\begin_body + +\begin_layout Title + +\family typewriter +MARS Manual +\begin_inset Newline newline +\end_inset + + +\begin_inset space ~ +\end_inset + + +\end_layout + +\begin_layout Subtitle +Multiversion Asynchronous Replicated Storage +\begin_inset Newline newline +\end_inset + + +\begin_inset space ~ +\end_inset + + +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/earth-mars-transfer.fig + width 70col% + +\end_inset + + +\end_layout + +\begin_layout Author +Thomas Schöbel-Theuer ( +\family typewriter +\family default +) +\end_layout + +\begin_layout Date +Version 0.1a-72 +\end_layout + +\begin_layout Lowertitleback +\noindent +Copyright (C) 2013-16 Thomas Schöbel-Theuer +\begin_inset Newline newline +\end_inset + +Copyright (C) 2013-16 1&1 Internet AG (see +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + shortly called 1&1 in the following). 
+\begin_inset Newline newline +\end_inset + + +\size footnotesize +Permission is granted to copy, distribute and/or modify this document under + the terms of the GNU Free Documentation License, Version 1.3 or any later + version published by the Free Software Foundation; with no Invariant Sections, + no Front-Cover Texts, and no Back-Cover Texts. + A copy of the license is included in the section entitled +\begin_inset Quotes eld +\end_inset + + +\begin_inset CommandInset ref +LatexCommand nameref +reference "chap:GNU-FDL" + +\end_inset + + +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Abstract + +\family typewriter +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +sloppy +\end_layout + +\end_inset + + MARS +\family default + is a block-level storage replication system for long distances / flaky + networks under GPL. + It runs as a Linux kernel module. + The sysadmin interface is similar to DRBD +\begin_inset Foot +status open + +\begin_layout Plain Layout +Registered trademarks are the property of their respective owner. +\end_layout + +\end_inset + +, but its internal engine is completely different from DRBD: it works with + +\series bold +transaction logging +\series default +, similar to some database systems. +\end_layout + +\begin_layout Abstract +Therefore, MARS can provide stronger +\series bold +consistency guarantees +\series default +. + Even in case of network bottlenecks / problems / failures, the secondaries + may become outdated (reflect an elder state), but never become inconsistent. + In contrast to DRBD, MARS preserves the +\series bold +order of write operations +\series default + even when the network is flaky ( +\series bold +Anytime Consistency +\series default +). +\end_layout + +\begin_layout Abstract +The current version of MARS supports +\begin_inset Formula $k>2$ +\end_inset + + replicas and works +\series bold +asynchronously +\series default +. 
+ Therefore, application performance is completely decoupled from any network + problems. + Future versions are planned to also support synchronous or near-synchronous + modes. +\end_layout + +\begin_layout Abstract +MARS supports a new method for building Cloud Storage / Software Defined + Storage, called +\series bold +LV Football +\series default +. +\end_layout + +\begin_layout Abstract +It comes with some automation scripts, leading to a similar functionality + than Kubernetes, but devoted to stateful LVs over +\series bold +virtual LVM pools +\series default + in the petabytes range. +\end_layout + +\begin_layout Abstract +\paragraph_spacing double +\noindent +\begin_inset space ~ +\end_inset + + +\begin_inset Newline newline +\end_inset + + +\begin_inset space ~ +\end_inset + + +\begin_inset Newline newline +\end_inset + + +\begin_inset Box Frameless +position "c" +hor_pos "c" +has_inner_box 1 +inner_pos "c" +use_parbox 0 +use_makebox 1 +width "100col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/earth-mars-transfer.fig + width 70col% + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset CommandInset toc +LatexCommand tableofcontents + +\end_inset + + +\end_layout + +\begin_layout Chapter +Architectures of Cloud Storage / Software Defined Storage / Big Data +\begin_inset CommandInset label +LatexCommand label +name "chap:Cloud-Storage" + +\end_inset + + +\end_layout + +\begin_layout Standard +Datacenter architects have no easy job. + Building up some petabytes of data in the wrong way can easily endanger + a company, as will be shown later. + There are some architectural laws to know and some rules to follow. 
+\end_layout + +\begin_layout Standard +First, we need to take a look at the most general possibilities how storage + can be architecturally designed: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/storage-classification.fig + width 80col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The topmost question is: do we always need to access bigger masses of (typically + unstructured) data over a network? +\end_layout + +\begin_layout Standard +There is a common belief that both reliability and scalability could be + only achieved this way. + In the past, local storage has often been viewed as +\begin_inset Quotes eld +\end_inset + +too simple +\begin_inset Quotes erd +\end_inset + + to provide both enterprise grade reliability, and scalability. + In the past, this was sometimes true. +\end_layout + +\begin_layout Standard +However, this picture has changed with the advent of a new +\series bold +load balancing +\series default + method called +\series bold +LV Football +\series default +, see chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:LV-Football" + +\end_inset + +. + We will later review what level of reliability and scalability can be achieved + with each of the fundamental models mentioned here. +\end_layout + +\begin_layout Section +What is Architecture +\begin_inset CommandInset label +LatexCommand label +name "sec:What-is-Architecture" + +\end_inset + + +\end_layout + +\begin_layout Standard +From +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +: +\end_layout + +\begin_layout Quote +Software architecture refers to the +\series bold +high level structures +\series default + of a software system and the +\series bold +discipline +\series default + of creating such structures and systems. 
+\end_layout + +\begin_layout Standard +Throughout this paper, the term +\begin_inset Quotes eld +\end_inset + +architecture +\begin_inset Quotes erd +\end_inset + + is strictly separated from +\begin_inset Quotes eld +\end_inset + +implementations +\begin_inset Quotes erd +\end_inset + +. + Any of +\begin_inset Quotes eld +\end_inset + +architecture +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +implementation +\begin_inset Quotes erd +\end_inset + + can relate to both hard- and software in general. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Confusion of +\begin_inset Quotes eld +\end_inset + +architecture +\begin_inset Quotes erd +\end_inset + + with +\begin_inset Quotes eld +\end_inset + +implementation +\begin_inset Quotes erd +\end_inset + + is a major source of ill-designs, which then often cause major product + flaws and/or operational problems. + Be sure to understand the difference. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Another source of costly ill-designs is starting with a particular implementatio +n in mind, and not sufficiently reasoning abouts its fundamental architecture. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Recommended best practice is to (1) look at the +\series bold +problem space +\series default +, then (2) consider a +\emph on +set +\emph default + of +\series bold +architectural solution classes +\series default +, and (3) look at the +\series bold +mappings +\series default + between them. 
+ This means: start with +\series bold +architectural requirements +\series default + for a particular +\series bold +application area +\series default + (typically covering +\emph on +multiple +\emph default + use cases), then look at +\series bold +multiple solution architectures +\series default +, and finally go down to a +\series bold +\emph on +set +\series default +\emph default + of potential implementations, but only +\emph on +after +\emph default + the former has been understood. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Starting with a particular single solution in mind is almost a +\emph on +guarantee +\emph default + for a non-optimum solution, or even a failed project, or even a disaster + at company level when +\series bold +enterprise-critical mass data +\series default + is involved. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Nevertheless, don't think in waterfall models. + Always work +\series bold +iteratively +\series default + and +\series bold +evolutionary +\series default +, but nevertheless obey the principle that any bug in an architectural ill-desig +n cannot be fixed by the best implementation of the world. + Be sure to understand the fundamental difference between architecture and + its (multiple / alternative) implemenations by their respective +\series bold +reach +\series default +. 
+\end_layout + +\begin_layout Section +What is +\emph on +Cloud Storage +\begin_inset CommandInset label +LatexCommand label +name "sec:Requirements-for-Cloud" + +\end_inset + + +\end_layout + +\begin_layout Standard +According to a popular definition from +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + (retrieved June 2018), cloud storage is +\end_layout + +\begin_layout Description +(1) Made up of many +\series bold +distributed resources +\series default +, but still +\series bold +act as one +\series default +. +\end_layout + +\begin_layout Description +(2) Highly +\series bold +fault tolerant +\series default + through redundancy and distribution of data. +\end_layout + +\begin_layout Description +(3) Highly +\series bold +durable +\series default + through the creation of versioned copies. +\end_layout + +\begin_layout Description +(4) Typically +\series bold +eventually consistent +\series default + with regard to data replicas. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that the term +\begin_inset Quotes eld +\end_inset + +network +\begin_inset Quotes erd +\end_inset + + does not occur in this definition. + However, the term +\begin_inset Quotes eld +\end_inset + +distributed resources +\begin_inset Quotes erd +\end_inset + + is implying +\emph on +some(!) +\emph default + kind of network. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Important! The definition does +\emph on +not +\emph default + imply some +\emph on +specific +\emph default + type of network, such as a +\series bold +storage network +\series default + which must be capable of transporting masses of IO operations in +\series bold +realtime +\series default +. 
+ We are free to use other types of networks, such as +\series bold +replication networks +\series default +, which need not be dimensioned for realtime IO traffic, but are usable + for +\series bold +background data migration +\series default +, and even over long distances, where the network typically has some bottlenecks. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that the definition says nothing about the +\series bold +time scale +\series default + of operations +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice: go down to a time scale of microseconds. + You will then notice that typical IO operations will require several hundreds + of machine instructions between IO request +\emph on +submission +\emph default + and the corresponding IO request +\emph on +completion +\emph default +. + This is not only true for local IO. + In network clusters like Ceph, it will even involve creation of network + packets, and lead to additional IO latencies implied by the network packet + transfer latencies. +\end_layout + +\end_inset + +. + We are free to implement certain operations, such as background data migrations +, in a rather long timescale (from a human point of view). + Example: increasing the number of replicas in an operational Ceph cluster, + already containing a few hundreds of terabytes of data, will not only require + additional storage hardware, but also take a rather long time, implied + by the very nature of such reorganisational tasks. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +The famous CAP theorem is one of the motivations behind requirement (4) + +\begin_inset Quotes eld +\end_inset + +eventually consistent +\begin_inset Quotes erd +\end_inset + +. + This is not an accident. 
+ There is a +\emph on +reason +\emph default + for it, although it is not a +\emph on +hard +\emph default + requirement. + Strict consistency is not needed for many applications running on top of + cloud storage. + In addition, the CAP theorem and some other theorems cited at +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + are telling us that Strict Consistency would be +\series bold + difficult and expensive +\series default + to achieve at global level in a bigger Distributed System, and at the cost + of other properties. + More detailed explanations are in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Explanation-via-CAP" + +\end_inset + +. +\end_layout + +\begin_layout Standard +There are some consequences from this definition of Cloud Storage, for each + of our high-level storage architectures: +\end_layout + +\begin_layout Description +Distributed +\begin_inset space ~ +\end_inset + +Storage, in particular +\family typewriter +BigCluster +\family default + architectures (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Distributed-vs-Local:" + +\end_inset + +): many of them (with few exceptions) are conforming to all of these requirement +s. + Typical granularity are objects, or chunks, or other relatively small units + of data. +\end_layout + +\begin_layout Description +Centralized +\begin_inset space ~ +\end_inset + +Storage: does not conform to (1) and to (4) by definition +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that sharding on top of CentralStorage is no longer a CentralStorage + model by definition, but a RemoteSharding model according to section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Variants-of-Sharding" + +\end_inset + +. +\end_layout + +\end_inset + +. 
+ By introduction of synchronous or asynchronous replication, it can be made + to +\emph on +almost +\emph default + conform, except for (1) where some concept mismatches remain (probably + resolvable by going to a RemoteSharding model on top of CentralStorage, + where CentralStorage is only a +\emph on +sub-component +\emph default +). + Typical granularity is replication of whole internal storage pools, or + of LVs, or of filesystem instances. +\end_layout + +\begin_layout Description +LocalStorage, and some further models like +\family typewriter +RemoteSharding +\family default + (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Variants-of-Sharding" + +\end_inset + +): +\end_layout + +\begin_deeper +\begin_layout Description +(1) can be achieved at LV granularity with Football (see chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:LV-Football" + +\end_inset + +), which creates a +\series bold +Big Virtual LVM Pool +\series default +. +\end_layout + +\begin_layout Description +(2) can be achieved at disk granularity with local RAID, and at LV granularity + with DRBD or MARS. +\end_layout + +\begin_layout Description +(3) can be achieved at LV granularity with LVM snapshots, and/or ZFS (or + other filesystem) snapshots, and/or above filesystem layer by addition + of classical backup. 
+\end_layout + +\begin_layout Description +(4) at least +\family typewriter +Eventually Consistent +\family default + or better can be alternatively achieved by +\end_layout + +\begin_deeper +\begin_layout Description +(4a) +\series bold +DRBD +\series default +, which provides +\family typewriter +Strict Consistency +\family default + during +\family typewriter +connected +\family default + state, but works only reliably with passive crossover cables over +\series bold +short distances +\series default + (see CAP theorem in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Explanation-via-CAP" + +\end_inset + +). +\begin_inset Newline newline +\end_inset + +Notice: DRBD violates any type of consistency within your +\emph on +replicas +\emph default + during (automatic) re-sync, and thus does not +\emph on +fully +\emph default + comply with the above definition of cloud storage in a +\emph on +strong +\emph default + sense. + But you can argue at a course time granularity level in order to fix this. +\end_layout + +\begin_layout Description +(4b) +\series bold +MARS +\series default +, which works over +\series bold +long distances +\series default + and provides two different consistency guarantees at different levels, + +\emph on +both at the same time +\emph default +: +\end_layout + +\begin_deeper +\begin_layout Description +locally: +\family typewriter + Strict Consistency +\family default + at local LV granularity, also +\emph on +within +\emph default + each of the LV replicas. +\end_layout + +\begin_layout Description +globally: +\family typewriter +Eventually Consistent +\family default + +\emph on +between +\emph default + different LV replicas (global level). 
+\begin_inset Newline newline +\end_inset + +The CAP theorem (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Explanation-via-CAP" + +\end_inset + +) says that +\family typewriter +Strict Consistency +\family default + is +\series bold +not possible +\series default + in general at +\emph on +unplanned failover +\emph default + during long-distance network outages (P = Partitioning Tolerance), when + A = Availability is also a requirement. +\begin_inset Newline newline +\end_inset + +However, in case of a +\emph on +planned handover +\emph default +, MARS is also +\family typewriter +Strictly Consistent +\family default + at a global level, but may need some extra time for catching up. +\begin_inset Newline newline +\end_inset + +Notice: global +\family typewriter +Strict Consistency +\family default + is also possible at a +\emph on +coarse timescale +\emph default +, in accordance with the CAP theorem, if you decide to sacrifice A = Availabilit +y during such a network incident by simply +\emph on +not +\emph default + doing a failover action. + Just wait until the network outage is gone, and MARS will automatically + resume +\begin_inset Foot +status open + +\begin_layout Plain Layout +This automatic MARS behaviour is similar to the behaviour of DRBD in such + situations, when DBRD can automatically go to +\family typewriter +disconnected +\family default +-like state, and you are later manually or automatically resuming the DRBD + connection for an incremental re-sync. + MARS does everything automatically because it has no firmly built-in assumption +s about the actual duration of any network communication. +\end_layout + +\end_inset + + everything ASAP, and thus you are using MARS +\emph on +only +\emph default + as a protection against +\series bold +fatal +\series default + storage failures / unplanned +\series bold +disasters +\series default +. 
+\begin_inset Newline newline +\end_inset + +Notice: A = Availability is +\emph on +not generally +\emph default + required by the above definition of cloud storage, because from a user's + perspective it would not generally make sense in the global internet where + connection loss may anyway occur at any time. + Thus it is a valid operational strategy to +\emph on +not +\emph default + fail-over your LVs during certain major network outages. +\begin_inset Newline newline +\end_inset + +Notice: long-term +\series bold +disaster tolerance +\series default + (e.g. + perpetual loss of some storage nodes during an earthquake) is +\emph on +not +\emph default + modeled by the CAP theorem, but is more or less required by (2) and (3) + from the above definition of cloud storage. +\end_layout + +\end_deeper +\end_deeper +\end_deeper +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice: +\family typewriter +BigCluster +\family default + architectures are creating +\emph on +virtual +\emph default + storage pools out of physically distributed storage servers. + For fairness reasons, creation of a big virtual LVM pool, must be considered + as +\emph on +another +\emph default + valid Cloud Storage +\emph on +model +\emph default +, matching the above definition of Cloud Storage. + The main architectural difference is granularity, as explained in section + +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Granularity-at-Architecture" + +\end_inset + +, and the stacking order of sub-components. + Notice that Football is creating +\series bold +location transparency +\series default + inside of the distributed virtual LVM pool. + This is an important (though not always required) basic property of any + type of clusters and/or grids. 
+\end_layout + +\begin_layout Section +Granularity at Architecture +\begin_inset CommandInset label +LatexCommand label +name "sec:Granularity-at-Architecture" + +\end_inset + + +\end_layout + +\begin_layout Standard +Here are the most important architectural differences between object-based + storages and LV-based (Logical Volume) storages: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +Objects +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +LVs +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Granularity +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +small (typically KiB) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +huge (several TiB) +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Number of instances +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +very high +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +low to medium +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Typical access +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +random keys +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +named +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Update in place +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Resize during operation +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout 
+no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Object support +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +native +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +on top of +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +LV support +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +on top of +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +native +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Filesystem support +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +on top of +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +on top of +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Scalable +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +at cluster +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +both cluster and grid +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Location distances +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +per datacenter / on campus +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +long distances possible +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Centralized pool management +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +per cluster +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +Football uniting clusters +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Easy sharding support +\end_layout + +\end_inset + + +\begin_inset 
Text + +\begin_layout Plain Layout +cumbersome +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +Replication vs Backup +\begin_inset CommandInset label +LatexCommand label +name "sec:Replication-vs-Backup" + +\end_inset + + +\end_layout + +\begin_layout Standard +Intuitively, data backup and data replication are two different solution + classes, addressing different problems. +\end_layout + +\begin_layout Standard +However, there exist descriptions where both solution classes are overlapping, + as well as their corresponding problem classes. + For example, backup as explained in +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + could be seen as also encompassing some types of storage replications explained + in +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +For a rough comparison of +\emph on +typical +\emph default + implementations, see the following +\emph on +typical +\emph default + differences: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +Backup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +Replication +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Fast handover (planned) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no, or cumbersome +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Fast failover (unplanned) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain 
Layout +no, or cumbersome +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Protect for physical failures +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Protect for logical data corruption +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes (partly) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +typically no +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Disaster Recovery Time (MTTR) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +typically (very) slow +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +fast +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Because of these typical differences, enterprise-critical data typically + deserves +\emph on +both +\emph default + solution classes. +\end_layout + +\begin_layout Standard +Confusion of solution classes and/or their corresponding problem classes + / properties can be harmful to enterprises and to carreers of responsible + persons. +\end_layout + +\begin_layout Subsection +Example: Point-in-time Replication via ZFS Snapshots +\begin_inset CommandInset label +LatexCommand label +name "subsec:Example:-ZFS-Replication" + +\end_inset + + +\end_layout + +\begin_layout Standard +Some ZFS advocates believe that ZFS snapshots, which were originally designed + for backup-like use cases, are also appropriate solutions for achieving + geo-redundancy. + The basic idea is to run incremental ZFS snapshots in an endless loop, + e.g. 
+ via some simple scripts, and expediting to another host where the snapshots + are then applied to another ZFS instance. + When there is less data to be expedited, loop cycle times can go down to + a few seconds. + When much data is written at the primary site, loop cycle times will rise + up. +\end_layout + +\begin_layout Standard +The following table tries to explain why geo-redundancy is not as simple + to achieve as believed, at least without addition of sophisticated additional + means +\begin_inset Foot +status open + +\begin_layout Plain Layout +ZFS advocates often argue with many features which aren't present at other + filesystem types. + The above table shows some dimensions not dealing with properties of local + filesystems, but with +\emph on +problems / tasks +\emph default + arising in long-distance distributed systems involving masses of enterprise-cri +tical storage. +\end_layout + +\end_inset + +: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +OpenSource Component +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +DRBD +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +MARS +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +ZFS +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Synchronity (in average) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +delay +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +delay * 1.5 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Generic solution +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain 
Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +FS-specific +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Granularity +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +LVs +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +LVs +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +subvolumes +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Built-in snapshots +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Long distances +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Replication parallelism (per gran.) 
+\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $1$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\geq2$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $1$ +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Built-in primary/secondary roles +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Built-in handover (planned) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +mostly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Built-in failover (unplanned) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Built-in data overflow handling +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +unnecessary +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no, missing +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Unnoticed data loss due to overflow 
+\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +possible +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Split-brain awareness +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Execute split-brain resolution +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Protect against illegal data modification +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +no +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The last item means that ZFS by itself does not protect against amok-running + applications modifying the secondary (backup) side in parallel to the + replication process (at least not by default). + Workarounds may be possible, but are not easy to create and to test for + enterprise-critical applications.
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that zfs snapshots can be combined with DRBD or MARS, because zfs + snapshots are residing at +\emph on +filesystem +\emph default + layer, while DRBD / MARS replicas are located at +\emph on +block +\emph default + layer. + Just create your zpools at the +\emph on +top +\emph default + of DRBD or MARS virtual devices, and import / export them +\emph on +individually +\emph default + upon handover / failover of each LV. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + There is a +\series bold +\emph on +fundamental +\series default +\emph default + difference between zpools and classical RAID / LVM stacked architectures. + Some zfs advocates are propagating zpools as a replacement for both RAID + and LVM. + However, there is a +\series bold +massive difference +\series default + in architecture, as explained in the following example (10 logical resources + over 48 physical spindles), achieving practically the +\series bold +\emph on +same +\series default + zfs snapshot functionality +\emph default + from a user's perspective, but in a different way: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/raid-lvm-architecture.fig + height 6cm + +\end_inset + + +\begin_inset Graphics + filename images/zpool-architecture.fig + height 6cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +When RAID functionality is executed by zfs, it will be located at the +\emph on +top +\emph default + of the hierarchy. + On one hand, this easily allows for different RAID levels for each of the + 10 different logical resources. 
+ On the other hand, this +\emph on +exposes +\emph default + the +\series bold +physical spindle configuration +\series default + to the topmost filesystem layer (48 spindles in this example). + There is no easy way for replication of these +\emph on +physical properties +\emph default + in a larger / heterogeneous distributed system, e.g. + when some hardware components are replaced over a longer period of time + (hardware lifecycle, or LV Football as explained in chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:LV-Football" + +\end_inset + +). + Essentially, replication of +\emph on +logical +\emph default + structures like snapshots remains as the only reasonable option, with its + drawbacks as explained above. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + There is another argument: zfs tries to +\emph on +hide +\emph default + its internal structures and interfaces from the sysadmins, forming a more + or less +\series bold +monolithic +\begin_inset Foot +status open + +\begin_layout Plain Layout +Some sysadmins acting as zfs advocates are claiming this as an advantage, + because they need to understand only a single tool for managing +\begin_inset Quotes eld +\end_inset + +everything +\begin_inset Quotes erd +\end_inset + +. + However, this is a short-sighted argument when it comes to +\emph on +true +\emph default + flexibility as offered by a component-based system, where multiple types + of hardware / software RAID, multiple types of LVM functionality, and much + more can be almost orthogonally combined in a very flexible way. +\end_layout + +\end_inset + + architecture +\series default + as seen from outside. + This violates the classical +\emph on +layering rules +\emph default + from Dijkstra.
+ In contrast, classical LVM-based configurations are +\series bold +component oriented +\series default +, according to the +\series bold +Unix philosophy +\series default +. +\end_layout + +\begin_layout Section +Local vs Centralized Storage +\begin_inset CommandInset label +LatexCommand label +name "sec:Local-vs-Centralized" + +\end_inset + + +\end_layout + +\begin_layout Standard +There is some old-fashioned belief that only centralized storage systems, + as typically sold by commercial storage vendors, could achieve a high degree + of reliability, while local storage were inferior by far. + In the following, we will see that this is only true for an +\series bold +\emph on +unfair +\series default +\emph default + comparison involving different classes of storage systems. +\end_layout + +\begin_layout Subsection +Internal Redundancy Degree +\end_layout + +\begin_layout Standard +Centralized commercial storage systems are typically built up from highly + redundant +\emph on +internal +\emph default + components: +\end_layout + +\begin_layout Enumerate +Redundant power supplies with UPS. +\end_layout + +\begin_layout Enumerate +Redundancy at the storage HDDs / SSDs. +\end_layout + +\begin_layout Enumerate +Redundancy at internal transport busses. +\end_layout + +\begin_layout Enumerate +Redundant RAM / SSD caches. +\end_layout + +\begin_layout Enumerate +Redundant network interfaces. +\end_layout + +\begin_layout Enumerate +Redundant compute heads. +\end_layout + +\begin_layout Enumerate +Redundancy at control heads / management interfaces. +\end_layout + +\begin_layout Standard +What about local hardware RAID controllers? Many people think that these + relatively cheap units were massively inferior at practically each of these + points.
+ However, please take a +\emph on +really deep +\emph default + look at what classical RAID chip manufacturers like LSI / Avago / Broadcom + and their competitors are offering as configuration variants of their top + notch models. + The following enumeration is in the same order as above (item by item): +\end_layout + +\begin_layout Enumerate +Redundant hardware RAID cards with BBU caches, each with local goldcaps + surviving power outages, their BBU caches cross-coupled via high-speed + interconnects. +\end_layout + +\begin_layout Enumerate +HDD / SSD redundancy: almost any RAID level you can think of. +\end_layout + +\begin_layout Enumerate +Redundant SAS cross-cabling: any head can access any device. +\end_layout + +\begin_layout Enumerate +BBU caches are redundant and cross-coupled, similarly to RDMA. + When SSD caches are added to both cards, you also get redundancy there. +\end_layout + +\begin_layout Enumerate +When using cross-coupled redundant cards, you automatically get redundant + host bus interfaces (HBAs). +\end_layout + +\begin_layout Enumerate +The same story: you also get two independent RAID controller instances which + can do RAID computations independently from each other. + Some implementations do this even in hardware (ASICs). +\end_layout + +\begin_layout Enumerate +Ditto: both cards may be plugged into two different servers, thereby creating + redundancy at control level. + As a side effect, you may also get functionality similar to DRBD. +\end_layout + +\begin_layout Standard +If you compare typical prices for both competing systems, you will notice + a huge difference. + See also section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Cost-Arguments-from" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +Capacity Differences +\end_layout + +\begin_layout Standard +There is another hard-to-die myth: commercial storage would provide higher + capacity. + Please read the data sheets.
+ It is +\emph on +possible +\emph default + (but not generally recommended) to put several hundreds of spindles into + several external HDD enclosures, and then connect them to a redundant cross-cou +pled pair of RAID controllers via several types of SAS busses. + By filling a rack this way, you can easily reach similar, if not higher + capacities than commercial storage boxes, for a +\emph on +fraction +\emph default + of the price. + +\end_layout + +\begin_layout Standard +However, this is not the recommended way for general use cases (but could + be an option for low demands like archiving). + The big advantage of RAID-based local storage is +\series bold +massive scale-out by sharding, +\series default + as explained in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Distributed-vs-Local:" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +Caching Differences +\end_layout + +\begin_layout Standard +A frequent argument is that centralized storage systems had bigger caches + than local RAID systems. + While this argument is often true, it neglects an important point: +\end_layout + +\begin_layout Standard +Local RAID systems often +\emph on +don't need +\emph default + bigger caches, because they are typically located at the +\emph on +bottom +\emph default + of a cache hierarchy, playing only a +\emph on +particular +\emph default + role in that hierarchy. + There exist +\emph on +further +\emph default + caches which are +\series bold +erroneously not considered +\series default + by such an argument! +\end_layout + +\begin_layout Standard +Example, see also section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Performance-Arguments-from" + +\end_inset + + for more details: At 1&1 Shared Hosting Linux (ShaHoLin), a typical LXC + container containing several thousands to tens of thousands of customer home + directories, creates a long-term +\emph on +average(!)
+\emph default + IOPS load at block layer of about 70 IOPS. + No, this isn't a typo. + It is not 70,000 IOPS. + It is only 70 IOPS. + +\end_layout + +\begin_layout Standard +Linux kernel experts know why I am not kidding. + The standard Linux kernel has two main caches, the Page Cache for file + content, and the Dentry Cache (plus Inode slave cache) for metadata. + Both caches are residing in +\series bold +RAM +\series default +, which is the +\emph on +fastest +\emph default + type of cache you can get. +\end_layout + +\begin_layout Standard +Nowadays, typical servers have several hundreds of gigabytes of RAM, sometimes + even up to terabytes, resulting in an incredible caching behaviour which + can be measured by those people who know how to do it (caution: it can + be easily done wrongly). +\end_layout + +\begin_layout Standard +Many people are neglecting these caches, sometimes not knowing of their + existence, and are falsely assuming that 1 application r +\family typewriter +ead() +\family default + or +\family typewriter +write() +\family default + operation will also lead to 1 IOPS at block layer. + As a consequence, they are demanding 50,000 IOPS or 100,000 or even 1,000,000 + IOPS. +\end_layout + +\begin_layout Standard +Some (but not all) commercial storage systems can deliver similar IOPS rates, + because they have internal RAM caches in the same order of magnitude. + People who are buying such systems are typically falling into some of the + following classes (list is probably incomplete): +\end_layout + +\begin_layout Itemize +some people know this, but price does not matter - the more caches, the + better. + Wasted money for doubled caches does not count for them, or is even viewed + as an advantage to them (personally). + Original citation of an anonymous person: +\begin_inset Quotes eld +\end_inset + +only the best and the most expensive storage is good enough for us +\begin_inset Quotes erd +\end_inset + +. 
+\end_layout + +\begin_layout Itemize +using NFS, which has extremely poor filesystem caching behaviour because + the Linux nfs client implementation does not take full advantage of the + dentry cache. + Sometimes people know this, sometimes not. + It seems that few people have read an important paper on the Linux implementati +on of nfs. + Please search the internet for +\begin_inset Quotes eld +\end_inset + +Why nfs sucks +\begin_inset Quotes erd +\end_inset + + from Olaf Kirch (who is one of the original Linux nfs implementors), and + +\emph on +read +\emph default + it. + Your opinion about nfs might change. +\end_layout + +\begin_layout Itemize +have transactional databases, where high IOPS may be +\emph on +really +\emph default + needed, but +\series bold +\emph on +exceptionally +\series default +\emph default +(!) for this class of application. + For very big enterprise databases like big SAP installations, there may + be a very valid justification for big RAM caches at storage layers. + However: smaller transactional loads, as in webhosting, are +\emph on +often +\emph default + (not always) hammering a +\emph on +low +\emph default + number of +\series bold +hot spots +\series default +, where +\emph on +big +\emph default + caches are not really needed. + Relatively small BBU caches of RAID cards will do it also. + Often people don't notice this because they don't measure the +\series bold +workingset behaviour +\series default + of their application, as could be done for example with +\family typewriter +blkreplay +\family default + (see +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +). 
+\end_layout + +\begin_layout Itemize +do not notice that +\emph on +well-tuned +\emph default + filesystem caches over iSCSI are typically demanding far fewer IOPS, sometimes + by several orders of magnitude, and are wasting money with caches at commercial + boxes they don't need (classical +\series bold +over-engineering +\series default +). +\end_layout + +\begin_layout Standard +Anyway, local storage can be augmented with various types of local caches + with various dimensioning. +\end_layout + +\begin_layout Standard +However, there is no point in accessing the fastest possible type of RAM + cache remotely over a network. + Even expensive hardware-based RDMA (e.g. + over Infiniband) cannot deliver the same performance as +\series bold +directly caching +\series default + your data in the +\series bold +\emph on +same +\emph default + RAM +\series default + where your application is running. + The Dentry Cache in the Linux kernel provides highly optimized +\series bold +shared metadata +\series default + in SMP and NUMA systems (nowadays scaling to more than 100 processor cores), + while the Page Cache provides +\series bold +shared memory +\series default + via hardware MMU. + This is crucial for the performance of classical local filesystems. +\end_layout + +\begin_layout Standard +The physical laws of Einstein and others are telling us that neither this + type of caching, nor its shared memory behaviour, can be transported over + whatever type of network without causing performance degradation. +\end_layout + +\begin_layout Subsection +Latencies and Throughput +\begin_inset CommandInset label +LatexCommand label +name "subsec:Latencies-and-Throughput" + +\end_inset + + +\end_layout + +\begin_layout Standard +First of all: today there exists only a small number of HDD manufacturers + in the world. + The number of SSD manufacturers will likely decline in the long run.
+ Essentially, commercial storage vendors are more or less selling you the + same HDDs or SSDs as you could buy and deploy yourself. + If at all, there are only some minor technical differences. +\end_layout + +\begin_layout Standard +In the meantime, many people agree with a Google paper that the +\emph on +ratio +\emph default + of market prices (price per terabyte) between HDDs and SSDs is unlikely + to change in a fundamental +\begin_inset Foot +status open + +\begin_layout Plain Layout +In folklore, there exists a +\series bold +fundamental empirical law +\series default +, fuzzily called +\begin_inset Quotes eld +\end_inset + +Storage Pyramid +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +Memory Hierarchy Law +\begin_inset Quotes erd +\end_inset + + or similar, which is well-known at least in German OS academic circles. + The empirical law (extrapolated from +\series bold +observations +\series default +, similarly to Moore's law) tells us that faster storage technology is always + +\series bold +more expensive +\series default + than slower storage technology, and that capacities of faster storage are + typically smaller than capacities of slower storage. + This observation has been roughly valid for more than 50 years now. + You can find it in several German lecture scripts. + Unfortunately, the Wikipedia article +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + (retrieved in June 2018) does not cite this very important fundamental + law about +\series bold +costs +\series default +.
+ In contrast, the German article +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + about roughly the same subject is mentioning +\begin_inset Quotes eld +\end_inset + +Kosten +\begin_inset Quotes erd +\end_inset + + which means +\begin_inset Quotes eld +\end_inset + +cost +\begin_inset Quotes erd +\end_inset + +, and +\begin_inset Quotes eld +\end_inset + +teuer +\begin_inset Quotes erd +\end_inset + + which means +\begin_inset Quotes eld +\end_inset + +expensive +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\end_inset + + way during the next 10 years. + Thus, most large-capacity enterprise storage systems are built on top of + HDDs. +\end_layout + +\begin_layout Standard +Typically, HDDs and their mechanics are forming the overall bottleneck. +\end_layout + +\begin_layout Itemize +by construction, a +\emph on +local +\emph default + HDD attached via HBAs or a hardware RAID controller will show the least + +\emph on +additional +\emph default + overhead in terms of +\emph on +additional +\emph default + latencies and throughput degradation caused by the attachment. +\end_layout + +\begin_layout Itemize +When the +\emph on +same +\emph default + HDD is +\emph on +indirectly +\emph default + attached via Ethernet or Infiniband or another rack-to-rack transport, + both latencies and throughput will become worse. + Depending on further factors and influences, the overall bottleneck may + shift to the network. +\end_layout + +\begin_layout Standard +The laws of information transfer are telling us: with increasing distance, + both latencies (laws of Einstein) and throughput (laws of energy needed + for compensation of SNR = signal to noise ratio) are becoming worse. + Distance matters. + And the number of intermediate components, like routers / switches and + their +\series bold +queuing +\series default +, matters too. 
+\end_layout + +\begin_layout Standard +This means that local storage has +\emph on +always +\emph default + an advantage in front of any attachment via network. + Centralized storages are bound to some network, and thus suffer from disadvanta +ges in terms of latencies and throughput. +\end_layout + +\begin_layout Standard +What is the expected long-term future? Will additional latencies and throughput + of centralized storages become better over time? +\end_layout + +\begin_layout Standard +It is difficult to predict the future. + Let us first look at the past evolution. + The following graphics has taken its numbers from Wikipedia articles +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + and +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +, showing that HDD capacities have grown +\series bold +over-proportionally +\series default + by about 2 orders of magnitude over about 30 years, when compared to the + relative growth of network bandwidth. +\end_layout + +\begin_layout Standard +In the following graphics, effects caused by decreasing form factors have + been neglected, which would even +\emph on +amplify +\emph default + the trend. + For fairness, bundling of parallel disks or parallel communication channels +\begin_inset Foot +status open + +\begin_layout Plain Layout +It is easy to see that the slopes of +\family typewriter +HDD.capacity +\family default + vs +\family typewriter +Infiniband.rates +\family default + are different. + Parallelizing by bundling of Infiniband wires will only lift the line a + little upwards, but will not alter its slope in logarithmic scale. + For extrapolated time +\begin_inset Formula $t\rightarrow\infty$ +\end_inset + +, the extrapolated empirical long-term behaviour is rather striking. +\end_layout + +\end_inset + + have been ignored. 
+ All comparisons are in logarithmic y axis scale: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename BitRates/Capacity-BitRate-Comparison.pdf + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +What does this mean when extrapolated into the future? +\end_layout + +\begin_layout Standard +It means that concentrating more and more capacity into a single rack due + to increasing data density will likely lead to more problems in future. + Accessing more and more data over the network will become increasingly + more difficult when concentrating high-capacity HDDs or SSDs +\begin_inset Foot +status open + +\begin_layout Plain Layout +It is difficult to compare the space density of contemporary SSDs in a fair + way. + There are too many different form factors. + For example, M2 cards are typically consuming even less +\begin_inset Formula $cm^{3}/TB$ +\end_inset + + than classical 2.5 inch form factors. + This trend is likely to continue in future. +\end_layout + +\end_inset + + into the same space volume as before. +\end_layout + +\begin_layout Standard +In other words: centralized storages are no good idea yet, and will likely + become an even worse idea in the future. + +\end_layout + +\begin_layout Standard +Example: there was a major incident at a German web hosting company at the + beginning of the 2000's. + Their entire webhosting main business was running on a single proprietary + highly redundant CentralStorage solution, which failed. + Restore from backup took way too long from the viewpoint of a huge number + of customers, leading to major press attention. + Before this incident, they were the #1 webhoster in Germany. + A few years later, 1&1 was the #1 instead. + You can speculate whether this has to do with the incident. 
+ But anyway, the later geo-redundancy strategy of 1&1 based on a sharding + model (originally using DRBD, later MARS) was motivated by conclusions + drawn from this incident. +\end_layout + +\begin_layout Standard +Another example: in the 1980s, a CentralStorage +\begin_inset Quotes eld +\end_inset + +dinosaur +\begin_inset Foot +status open + +\begin_layout Plain Layout +With the advent of NVME, SSDs are almost directly driven by DMA. + Accessing any high-speed DMA devices by default via network is a foolish + idea, as foolish as playing games via an expensive high-end gamer + graphics card which is then +\emph on +indirectly +\emph default + attached via RDMA, or even via Ethernet. + Probably no serious gamer would ever +\emph on +try +\emph default + to do that. + But some storage vendors do, for strategic reasons. + Probably for their own survival, their customers are to be misguided to + overlook the blinking red indicators that centralized SSD storage is likely + nothing but an expensive dead end in the history of dinosaur architectures. +\end_layout + +\end_inset + + +\begin_inset Quotes erd +\end_inset + + architecture called SLED = Single Large Expensive Disk was propagated with + huge marketing noise and effort, but its historic fate was predictable + for real experts not bound to particular interests: SLED finally lost against + its contemporary RAID competition. + Nowadays, many people don't even remember the term SLED. +\end_layout + +\begin_layout Standard +Today's future is likely dominated by +\series bold +scaling-out architectures +\series default + like sharding, as explained in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Distributed-vs-Local:" + +\end_inset + +.
+\end_layout + +\begin_layout Subsection +Reliability Differences CentralStorage vs Sharding +\begin_inset CommandInset label +LatexCommand label +name "subsec:Reliability-Differences-CentralStorage" + +\end_inset + + +\end_layout + +\begin_layout Standard +In this section, we look at +\emph on +fatal +\emph default + failures only, ignoring temporary failures. + A fatal failure of a storage is an incident which needs to be corrected + by +\series bold +restore from backup +\series default +. +\end_layout + +\begin_layout Standard +By definition, even a +\emph on +highly redundant +\emph default + CentralStorage is +\emph on +nevertheless +\emph default + a SPOF = Single Point of Failure. + This also applies to fatal failures. +\end_layout + +\begin_layout Standard +Some people are incorrectly arguing with redundancy. + However, the problem is that +\emph on +any +\emph default + system, even a highly redundant one, can fail fatally. + There exists no perfect system on earth. + One of the biggest known sources of fatal failure is +\series bold +human error +\series default +. +\end_layout + +\begin_layout Standard +In contrast, sharded storage (for example the LocalSharding model, see also + section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Variants-of-Sharding" + +\end_inset + +) has MPOF = Multiple Points Of Failure. + It is unlikely that many shards are failing fatally at the same time, because + shards are +\emph on +independent +\emph default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +When all shards are residing in the same datacenter, there exists a SPOF + by power loss or other impacts onto the whole datacenter. + However, this applies to both the CentralStorage and to the LocalSharding + model. + In contrast to CentralStorage, LocalSharding can be more easily distributed + over multiple datacenters. 
+\end_layout + +\end_inset + + from each other by definition (cf paragraph +\begin_inset CommandInset ref +LatexCommand vref +reference "par:Definition-of-Sharding" + +\end_inset + + for disambiguation of terms +\begin_inset Quotes eld +\end_inset + +sharding +\begin_inset Quotes erd +\end_inset + + and +\begin_inset Quotes eld +\end_inset + +shared-nothing +\begin_inset Quotes erd +\end_inset + +). +\end_layout + +\begin_layout Standard +What is the difference from the viewpoint of customers of the services? +\end_layout + +\begin_layout Standard +When a CentralStorage fails fatally, a +\emph on +huge +\emph default + number of customers will be affected for a +\emph on +long +\emph default + time (see the example German webhoster mentioned in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Latencies-and-Throughput" + +\end_inset + +). + Reason: restore from backup will take extremely long because huge masses + of data have to be restored. + MTBF = Mean Time Between Failures is (hopefully) longer thanks to redundancy, + but MTTR = Mean Time To Repair is also very long. +\end_layout + +\begin_layout Standard +With (Local)Sharding, the risk of +\emph on +some +\emph default + fatal incident +\emph on +somewhere +\emph default + in the sharding pool is higher, but the +\series bold +\emph on +size +\series default +\emph default + of such an incident is smaller in three dimensions at the same time: +\end_layout + +\begin_layout Enumerate +There are much +\series bold +less customers affected +\series default + (typically only +\begin_inset Formula $1$ +\end_inset + + shard out of +\begin_inset Formula $n$ +\end_inset + + shards). +\end_layout + +\begin_layout Enumerate + +\series bold +MTTR +\series default + = Mean Time To Repair is typically much better because there is much less + data to be restored. 
+\end_layout + +\begin_layout Enumerate + +\series bold +Residual risk +\series default + plus resulting fatal damage by +\series bold +un-repairable problems +\series default + is thus lower. +\end_layout + +\begin_layout Standard +What does this mean from the viewpoint of an investor of a big +\begin_inset Quotes eld +\end_inset + +global player +\begin_inset Quotes erd +\end_inset + + company? +\end_layout + +\begin_layout Standard +As is promised by the vendors, let us assume that failure of CentralStorage + might be occurring less frequently. + But +\emph on +when +\emph default + it happens on +\series bold +enterprise-critical mass data +\series default +, the stock exchange value of the affected company will be exposed to a + +\series bold +hazard +\series default +. + This is not bearable from the viewpoint of an investor. +\end_layout + +\begin_layout Standard +In contrast, the (Local)Sharding model is +\emph on +distributing +\emph default + the +\series bold +inevitable incidents +\series default + (because +\series bold +perfect systems do not exist +\series default +, and +\series bold +perfect humans do not exist +\series default +) to a lower number of customers per incident, at a higher frequency, such that the +\series bold +total impact on the business +\series default + becomes bearable.
+\end_layout + +\begin_layout Standard +Risk analysis of enterprise-critical use cases is summarized in the following + table: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +CentralStorage +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(Local)Sharding +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Probability of +\emph on +some +\emph default + fatal incident +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +lower +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +higher +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +# Customers affected +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +very high +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +very low +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +MTBF per storage +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +higher +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +lower +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +MTTR per storage +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +higher +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +lower +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Unrepairable residual risk +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +higher +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +lower +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain 
Layout +Total impact +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +higher +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +lower +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Investor's risk +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\series bold +unbearable +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +stock exchange compatible +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Summary: CentralStorage is something for +\end_layout + +\begin_layout Itemize +\noindent +Small to medium-sized companies which don't have the +\series bold +manpower +\series default + and the +\series bold +skills +\series default + for professionally building and operating a (Local)Sharding (or similar) + system for their enterprise-critical mass data their business is relying + upon. +\end_layout + +\begin_layout Itemize + +\series bold +\emph on +Monolithic +\emph default + enterprise applications +\series default + like classical SAP which are anyway bound to a specific vendor, where you + cannot select a different solution (so-called +\series bold +Vendor Lock-In +\series default +). +\end_layout + +\begin_layout Itemize +When your application +\series bold +is neither shardable +\series default + by construction (c.f. + section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Distributed-vs-Local:" + +\end_inset + +), or when doing so would be a too high effort, +\series bold +nor going to BigCluster +\begin_inset Foot +status open + +\begin_layout Plain Layout +Theoretically, BigCluster can be used to create 1 single huge remote LV + (or 1 single huge remote FS instance) out of a pool of storage machines. 
+ Double-check, better triple-check that such a +\series bold +big +\emph on +logical +\emph default + SPOF +\series default + is +\emph on +really +\emph default + needed, and cannot be circumvented by any means. + Only in such a case, the current version of MARS cannot help (yet), because + its +\emph on +current +\emph default + +\emph on +focus +\emph default + is on a big number of machines each having relatively small LVs. + At 1&1 ShaHoLin, the biggest LVs are 40TiB at the moment, running for years + now, and bigger ones are certainly possible. + Only when current local RAID technology with external enclosures cannot + easily create a single LV in the petabyte scale, BigCluster is probably + the better solution (c.f. + section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Reliability-Arguments-from" + +\end_inset + +). +\end_layout + +\end_inset + + +\series default + (e.g. + Ceph / Swift / etc, see section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Reliability-Arguments-from" + +\end_inset + +) is an option. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +If you have an +\emph on + already sharded +\emph default + system, e.g. + in webhosting, don't convert it to a non-shardable one, and don't introduce + SPOFs needlessly. + You will introduce +\series bold +technical debt +\series default + which is likely to come back and hurt you sometime in the future! +\end_layout + +\begin_layout Standard +As a real big +\begin_inset Quotes eld +\end_inset + +global player +\begin_inset Quotes erd +\end_inset + +, or as a company being part of such a structure, you should be careful + when listening to +\begin_inset Quotes eld +\end_inset + +marketing drones +\begin_inset Quotes erd +\end_inset + + of proprietary CentralStorage vendors. + Always check your +\emph on +concrete +\emph default + use case.
+ Never believe in wrongly generalized claims, which are only valid in some + specific context, but do not really apply to your use case. + It could be about your +\emph on +life +\emph default +. +\end_layout + +\begin_layout Subsection +Proprietary vs OpenSource +\begin_inset CommandInset label +LatexCommand label +name "subsec:Proprietary-vs-OpenSource" + +\end_inset + + +\end_layout + +\begin_layout Standard +In theory, the following dimensions are orthogonal to each other: +\end_layout + +\begin_layout Description +Architecture: LocalStorage vs CentralStorage vs DistributedStorage +\end_layout + +\begin_layout Description +Licensing: Proprietary vs OpenSource +\end_layout + +\begin_layout Standard +In practice, however, many vendors of proprietary storage systems are selecting + the CentralStorage model. + This way, they can avoid inter-operability with their competitors. + This opens the door for the so-called +\series bold +Vendor Lock-In +\series default +. +\end_layout + +\begin_layout Standard +In contrast, the OpenSource community is based on +\emph on +cooperation +\emph default +. + Opting for OpenSource means that you can +\series bold +combine and exchange +\series default + numerous +\series bold +components +\series default + with each other. + +\end_layout + +\begin_layout Standard +Key OpenSource players are +\emph on +basing +\emph default + their business on the +\series bold +usefulness +\series default + of their software components for you, their customer. + Please search the internet for further explanations from Eric S. + Raymond. +\end_layout + +\begin_layout Standard +Therefore +\series bold +interoperability +\series default + is a +\emph on +must +\emph default + in the opensource business. + For example, you can relatively easily migrate between DRBD and MARS, forth + and backwards, see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Setup-Primary-and" + +\end_inset + +. 
+ The +\emph on +generic +\emph default + block devices provided by both DRBD and MARS (and by the kernel LVM2 implementa +tion, and many others +\begin_inset Formula $\ldots$ +\end_inset + +) can interact with zillions of filesystems, VMs, applications, and so forth. +\end_layout + +\begin_layout Standard +Summary: +\series bold +genericity +\series default + is a highly desired property in OpenSource communities, while proprietary + products often try to control their usage by limiting either technical + interoperability at certain layers, and/or legally by contracts. + Trying to do so with OpenSource would make no sense, because +\emph on +you +\emph default +, the customer, are the +\emph on +real +\emph default + king who can +\emph on +really +\emph default + select and combine components. + You can form a +\series bold +really customized system +\series default + to your +\series bold +\emph on +real needs +\series default +\emph default +, not as just promised but not always actually delivered by so-called +\begin_inset Quotes eld +\end_inset + +marketing drones +\begin_inset Quotes erd +\end_inset + + from commercial vendors who are actually preferring the needs of their employer + over yours. +\end_layout + +\begin_layout Standard +There is another fundamental difference between proprietary software and + OpenSource: the former is bound to some company, which may +\emph on +vanish +\emph default + from the market. + Commercial storage systems may be +\series bold +discontinued +\series default +. + +\end_layout + +\begin_layout Standard +This can be a serious threat to your business relying on the value of your + data.
+ In particular, buying storage systems from +\emph on +small +\emph default + vendors may increase this risk +\begin_inset Foot +status open + +\begin_layout Plain Layout +There is a risk of a +\emph on +domino effect +\emph default +: once there is a critical incident on highly redundant CentralStorage boxes + from a particular (smaller) vendor, this may lead to major public media + attention. + This may form the +\emph on +root cause +\emph default + for such a vendor to vanish from the market. + Thus you may be left alone with a buggy system, even if you aren't the + victim of the concrete incident. +\end_layout + +\begin_layout Plain Layout +In contrast, bugs in an OpenSource component can be fixed by a larger community + of interested people, or by yourself if you hire somebody for this. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +OpenSource is different: it cannot die, even if the individual, or the (small) + company which produced it, does no longer exist. + The sourcecode is in the +\series bold +public +\series default +. + It just could get +\emph on +outdated +\emph default + over time. + However, as long as there is enough public interest, you will always find + somebody who is willing to adapt and to +\emph on +maintain +\emph default + it. + Even if you would be the only one having such an interest, you can +\emph on +hire +\emph default + a maintainer for it, specifically for your needs. + You aren't +\series bold +helpless +\series default +. +\end_layout + +\begin_layout Section +Distributed vs Local: Scalability Arguments from Architecture +\begin_inset CommandInset label +LatexCommand label +name "sec:Distributed-vs-Local:" + +\end_inset + + +\end_layout + +\begin_layout Standard +Datacenters aren't usually operated for fun or for hobby. 
+ Scalability of an +\emph on +architecture +\emph default + (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:What-is-Architecture" + +\end_inset + +) is very important, because it can seriously limit your business. + Overcoming architectural ill-designs can grow extremely cumbersome and + costly. +\end_layout + +\begin_layout Standard +Many enterprise system architects are starting with a particular architecture + in mind, called +\begin_inset Quotes eld +\end_inset + +Big Cluster +\begin_inset Quotes erd +\end_inset + +. + There is a common belief that otherwise +\series bold +scalability +\series default + could not be achieved: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/Architecure_Big_Cluster.pdf + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The crucial point is the +\series bold +storage network +\series default + here: +\begin_inset Formula $n$ +\end_inset + + storageservers are interconnected with +\begin_inset Formula $m=O(n)$ +\end_inset + + frontend servers, in order to achieve properties like scalability, failure + tolerance, etc. +\end_layout + +\begin_layout Standard +Since +\emph on +any +\emph default + of the +\begin_inset Formula $m$ +\end_inset + + frontends must be able to access +\emph on +any +\emph default + of the +\begin_inset Formula $n$ +\end_inset + + storages in realtime, the storage network must be dimensioned for +\begin_inset Formula $O(n\cdot m)=O(n^{2})$ +\end_inset + + network connections running in parallel. + Even if the total network throughput is scaling only with +\begin_inset Formula $O(n)$ +\end_inset + +, nevertheless +\begin_inset Formula $O(n^{2})$ +\end_inset + + network connections have to be maintained at connection oriented protocols + and at various layers of the operating software. 
+ The network has to +\emph on +switch +\emph default + the packets from +\begin_inset Formula $n$ +\end_inset + + sources to +\begin_inset Formula $m$ +\end_inset + + destinations (and their opposite way back) in +\series bold +realtime +\series default +. +\end_layout + +\begin_layout Standard +This +\series bold +cross-bar functionality +\series default + in realtime makes the storage network complicated and expensive. + Some further factors are increasing the costs of storage networks: +\end_layout + +\begin_layout Itemize +In order to limit error propagation from other networks, the storage network + is often built as a +\emph on +physically separate +\emph default + = +\emph on +dedicated +\emph default + network. + +\end_layout + +\begin_layout Itemize +Because storage networks are heavily reacting to high latencies and packet + loss, they often need to be dimensioned for the +\series bold +worst case +\series default + (load peaks, packet storms, etc), needing one of the best = typically most + expensive components for reducing latency and increasing throughput. + Dimensioning to the worst case instead of an average case plus some safety + margins is nothing but an expensive +\series bold +overdimensioning +\series default + / +\series bold +over-engineering +\series default +. +\end_layout + +\begin_layout Itemize +When +\series bold +multipathing +\series default + is required for improving fault tolerance of the storage network itself, + these efforts will even +\emph on +double +\emph default +. +\end_layout + +\begin_layout Itemize +When geo-redundancy is required, the total effort may easily more than double + another time because in cases of disasters like terrorist attacks the backup + datacenter must be prepared for taking over for multiple days or weeks. 
+\end_layout + +\begin_layout Standard +Fortunately, there is an alternative called +\begin_inset Quotes eld +\end_inset + + +\series bold +Sharding Architecture +\series default + +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + + +\series bold +Shared-nothing Architecture +\series default + +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Paragraph +Definition of Sharding +\begin_inset CommandInset label +LatexCommand label +name "par:Definition-of-Sharding" + +\end_inset + + +\end_layout + +\begin_layout Standard +Notice that the term +\begin_inset Quotes eld +\end_inset + +Sharding +\begin_inset Quotes erd +\end_inset + + originates from database architecture +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + where it has a slightly different meaning than used here. + Our usage of the term +\begin_inset Quotes eld +\end_inset + +sharding +\begin_inset Quotes erd +\end_inset + + reflects slightly different situations in some webhosting companies +\begin_inset Foot +status open + +\begin_layout Plain Layout +According to +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +, Google also uses the term +\begin_inset Quotes eld +\end_inset + +sharding +\begin_inset Quotes erd +\end_inset + + for a particular +\begin_inset Quotes eld +\end_inset + +shared-nothing architecture +\begin_inset Quotes erd +\end_inset + +. + Although our above definition of +\begin_inset Quotes eld +\end_inset + +sharding +\begin_inset Quotes erd +\end_inset + + does not fully comply with its original meaning, a similar usage by Google + probably means that our usage of the term is not completely uncommon. +\end_layout + +\end_inset + +, and can be certainly transferred to some more application areas. 
+ Our more specific use of the term +\begin_inset Quotes eld +\end_inset + +sharding +\begin_inset Quotes erd +\end_inset + + has the following properties, +\emph on +all at the same time: +\end_layout + +\begin_layout Enumerate +User / customer data is +\series bold +partitioned +\series default +. + This is very similar to database sharding. + However, the original database term also allows +\emph on +some +\emph default + data to remain unpartitioned. + In webhosting, such data may also exist, but typically only for +\emph on +system data, +\emph default + like OS images, including large parts of their configuration data. + Such system data is typically +\emph on +replicated +\emph default + from a central +\begin_inset Quotes eld +\end_inset + +golden image +\begin_inset Quotes erd +\end_inset + + in an +\emph on +offline +\emph default + fashion, e.g. + via regular +\family typewriter +rsync +\family default + cron jobs, etc. + Typically, it comprises only a few gigabytes per instance and is mostly + read-only with a slow change rate, while total customer data is typically + in the range of some petabytes with a higher total change rate. +\end_layout + +\begin_layout Enumerate +Servers have +\series bold +no single point of contention +\series default +, and thus are +\series bold +completely independent +\series default + from each other, like in +\series bold +shared-nothing +\series default + architectures +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +. + However, the original term +\begin_inset Quotes eld +\end_inset + +shared-nothing +\begin_inset Quotes erd +\end_inset + + has also been used for describing +\emph on +replicas +\emph default +, e.g. + DRBD mirrors.
+ In our context of +\begin_inset Quotes eld +\end_inset + +sharding +\begin_inset Quotes erd +\end_inset + +, the shared-nothing principle +\emph on +only +\emph default + refers to the +\begin_inset Quotes eld +\end_inset + + +\series bold +no single point of contention +\series default + +\begin_inset Quotes erd +\end_inset + + principle at +\emph on +partitioning +\emph default + level, which means it +\emph on +only +\emph default + refers to the +\emph on +partitioning +\emph default + of the user data, but +\emph on +not +\emph default + to their replicas. + Shared-nothing replicas in the sense of DRBD may also be present (and in + fact they are at 1&1 Shared Hosting Linux), but these replicas are +\emph on +not +\emph default + meant by our usage of the term +\begin_inset Quotes eld +\end_inset + +sharding +\begin_inset Quotes erd +\end_inset + +. + Customer data replicas form an +\emph on +independent +\emph default + dimension called +\begin_inset Quotes eld +\end_inset + +replication layer +\begin_inset Quotes erd +\end_inset + +. + The replication layer also obeys the shared-nothing principle in its original + sense, but it is +\emph on +not +\emph default + meant by our term +\begin_inset Quotes eld +\end_inset + +sharding +\begin_inset Quotes erd +\end_inset + + in order to avoid confusion +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that typically +\family typewriter +BigCluster +\family default + architectures are also abstracting away their replicas when talking about + their architecture. +\end_layout + +\end_inset + + between these two independent dimensions. +\end_layout + +\begin_layout Standard +Our sharding model does not need a dedicated storage network at all, at + least when built and dimensioned properly.
+ Instead, it +\emph on +should have +\emph default + (but does not always need) a so-called +\series bold +replication network +\series default + which can, when present, be dimensioned much smaller because it needs neither + realtime operations nor scalability to +\begin_inset Formula $O(n^{2})$ +\end_inset + +: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/Architecure_Sharding.pdf + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Sharding architectures are extremely well suited when both the input traffic + and the data are +\series bold +already partitioned +\series default +. + For example, when several thousands or even millions of customers are operating + on disjoint data sets, like in web hosting where each webspace is residing + in its own home directory, or when each of millions of mySQL database instances + has to be isolated from its neighbour. + Masses of customers are also appearing at cloud storage applications like + Cloud Filesystems (e.g. + Dropbox or similar). +\end_layout + +\begin_layout Standard +Even in cases when any customer may potentially access any of the data items + residing in the whole storage pool (e.g. + like in a search engine), sharding can often be applied. + The trick is to create some relatively simple content-based dynamic switching + or redirect mechanism in the input network traffic, similar to HTTP load + balancers or redirectors. +\end_layout + +\begin_layout Standard +Only when partitioning of input traffic plus data is not possible in a reasonabl +e way, big cluster architectures as implemented for example in Ceph or Swift + (and partly even possible with MARS when restricted to the block layer) + have a very clear use case.
+\end_layout + +\begin_layout Standard +In the following sections, we will see: when sharding is possible, it is + the preferred model due to reliability and cost and performance reasons. + Another good explanation can be found at +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +e/ +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Subsection +Variants of Sharding +\begin_inset CommandInset label +LatexCommand label +name "subsec:Variants-of-Sharding" + +\end_inset + + +\end_layout + +\begin_layout Description +LocalSharding The simplest possible sharding architecture is simply putting + both the storage and the compute CPU power onto the same iron. +\begin_inset Newline newline +\end_inset + +Example: at 1&1 Shared Hosting Linux (ShaHoLin), we have dimensioned several + variants of this. + (a) we are using 1U pizza boxes with local hardware RAID controllers with + fast hardware BBU cache and up to 10 local disks for the majority of LXC container + instances where the +\begin_inset Quotes eld +\end_inset + +small-sized +\begin_inset Quotes erd +\end_inset + + customers (up to ~100 GB webspace per customer) are residing. + Since most customers have very small home directories with extremely many + but small files, this is a very cost-efficient model. + (b) less than 1 permille of all customers have > 250 GB (up to 2TB) per + home directory. + For these few customers we are using another dimensioning variant of the + same architecture: 4U servers with 48 high-capacity spindles on 3 RAID + sets, delivering a total PV capacity of ~300 TB, which are then cut down + to ~10 LXC containers of ~30 TB each.
+\begin_inset Newline newline +\end_inset + +In order to operate this model at a bigger scale, you should consider the + +\begin_inset Quotes eld +\end_inset + +container football +\begin_inset Quotes erd +\end_inset + + method as described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Principle-of-Background" + +\end_inset + + and in chapter +\begin_inset CommandInset ref +LatexCommand vref +reference "chap:LV-Football" + +\end_inset + +. +\end_layout + +\begin_layout Description +RemoteSharding This variant needs a (possibly dedicated) storage network, + which is however only +\begin_inset Formula $O(n)$ +\end_inset + + in total. + Each storage server exports a block device over iSCSI (or over another + transport) to at most +\begin_inset Formula $O(k)$ +\end_inset + + dedicated compute nodes where +\begin_inset Formula $k$ +\end_inset + + is some +\series bold +constant +\series default +. +\begin_inset Newline newline +\end_inset + +Hint 1: it is advisable to build this type of storage network with +\series bold +local switches +\series default + and no routers inbetween, in order to avoid +\begin_inset Formula $O(n^{2})$ +\end_inset + +-style network architectures and traffic. + This reduces error propagation upon network failures. + Keep the storage and the compute nodes locally close to each other, e.g. + in the same datacenter room, or even in the same rack. +\begin_inset Newline newline +\end_inset + +Hint 2: additionally, you can provide some (low-dimensioned) backbone for + +\series bold +exceptional(!) +\series default + cross-traffic between the local storage switches. + Don't plan to use any realtime cross-traffic +\emph on +regularly +\emph default +, but only in clear cases of emergency! 
+\begin_inset Newline newline +\end_inset + +Notice: in this model, a shard typically consists of one storage node plus + +\begin_inset Formula $k+1$ +\end_inset + + or +\begin_inset Formula $k+2$ +\end_inset + + compute servers, introducing some additional failure redundancy +\emph on +within +\emph default + such a shard, while retaining the +\begin_inset Quotes eld +\end_inset + +no single point of contention +\begin_inset Quotes erd +\end_inset + + property +\emph on +between +\emph default + the shards (according to the definition +\begin_inset CommandInset ref +LatexCommand vref +reference "par:Definition-of-Sharding" + +\end_inset + +). +\end_layout + +\begin_layout Description +FlexibleSharding This is a dynamic combination of LocalSharding and RemoteShardi +ng, dynamically re-configurable, as explained below. +\end_layout + +\begin_layout Description +BigClusterSharding The sharding model can also be placed +\series bold +on top of +\series default + a BigCluster model, or possibly +\begin_inset Quotes eld +\end_inset + +internally +\begin_inset Quotes erd +\end_inset + + in such a model, leading to a similar effect. + Whether this makes sense needs some discussion. + It can be used to reduce the +\emph on +logical +\emph default + BigCluster size from +\begin_inset Formula $O(n)$ +\end_inset + + to some +\begin_inset Formula $O(k)$ +\end_inset + +, such that it is no longer a +\begin_inset Quotes eld +\end_inset + +big cluster +\begin_inset Quotes erd +\end_inset + + but a +\begin_inset Quotes eld +\end_inset + +small cluster +\begin_inset Quotes erd +\end_inset + +, and thus reducing the serious problems described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Reliability-Arguments-from" + +\end_inset + + to some degree. + This could make sense in the following use cases: +\end_layout + +\begin_deeper +\begin_layout Itemize +When you +\series bold +already have +\series default + invested into a big cluster, e.g. 
+ Ceph or Swift, which does not really scale and/or does not really deliver + the expected reliability. + Some possible reasons for this are explained in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Reliability-Arguments-from" + +\end_inset + +. +\end_layout + +\begin_layout Itemize +When you really need a +\emph on +single +\emph default + LV which is necessarily +\series bold +bigger +\series default + than can be reasonably built on top of local LVM. + This means, you are likely claiming that you really need +\series bold +strict consistency +\series default + as provided by a block device on more than 1 PB with current technology + (2018). + Examples are very +\series bold +big enterprise databases +\series default + like classical SAP (c.f. + section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Local-vs-Centralized" + +\end_inset + +), or if you really need +\series bold +POSIX-compliance +\series default + on a single big filesystem instance. + Be conscious when you think this is the only solution to your problem. + Double-check or triple-check whether there is +\emph on +really +\emph default + no other solution than creating such a huge block device and/or such a + huge filesystem instance. + Such huge SPOFs are tending to create similar problems as described in + section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Reliability-Arguments-from" + +\end_inset + + for similar reasons. +\end_layout + +\end_deeper +\begin_layout Standard +When building a +\series bold +new +\series default + storage system, be sure to check the following use cases. + You should seriously consider a LocalSharding / RemoteSharding / FlexibleShardi +ng model in favor of BigClusterSharding when ... +\end_layout + +\begin_layout Itemize +... + when more than 1 LV instance would be placed onto your +\begin_inset Quotes eld +\end_inset + +small cluster +\begin_inset Quotes erd +\end_inset + + shards. 
+ Then a +\series bold +{Local,Remote,Flexible}Sharding +\series default + model could be likely used instead. + Then the total overhead ( +\series bold +total cost of ownership +\series default +) introduced by a BigCluster +\emph on +model +\emph default + but actually stripped down to a +\begin_inset Quotes eld +\end_inset + +SmallCluster +\begin_inset Quotes erd +\end_inset + + +\emph on +implementation / configuration +\emph default + should be examined separately. + Does it really pay off? +\end_layout + +\begin_layout Itemize +... + when there are +\series bold +legal requirements +\series default + that you can tell at any time where your data is. + Typically, this is anything but easy on a BigCluster model, even when stripped + down to SmallCluster size. +\end_layout + +\begin_layout Subsection +FlexibleSharding +\begin_inset CommandInset label +LatexCommand label +name "subsec:FlexibleSharding" + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that MARS' new remote device feature from the 0.2 branch series (which + is a kind of replacement for iSCSI) +\emph on +could +\emph default + be used for implementing some sort of +\begin_inset Quotes eld +\end_inset + +big cluster +\begin_inset Quotes erd +\end_inset + + model at block layer. +\end_layout + +\begin_layout Standard +Nevertheless, such models re-introducing some kind of +\begin_inset Quotes eld +\end_inset + +big dedicated storage network +\begin_inset Quotes erd +\end_inset + + into MARS operations are not the preferred model. + Following is a super-model which combines both the +\begin_inset Quotes eld +\end_inset + +big cluster +\begin_inset Quotes erd +\end_inset + + and sharding model at block layer in a very flexible way. 
+ The following example shows only two servers from a pool consisting of + hundreds or thousands of servers: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/MARS_Cluster_on_Demand.pdf + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The idea is to use iSCSI or the MARS remote device +\emph on +only where necessary +\emph default +. + Preferably, local storage is divided into multiple Logical Volumes (LVs) + via LVM, which are +\emph on +directly +\emph default + used +\emph on +locally +\emph default + by Virtual Machines (VMs), such as KVM or filesystem-based variants like + LXC containers. +\end_layout + +\begin_layout Standard +In the above example, the left machine has relatively less CPU power or + RAM than storage capacity. + Therefore, not +\emph on +all +\emph default + LVs could be instantiated locally at the same time without causing operational + problems, but +\emph on +some +\emph default + of them can be run locally. + The example solution is to +\emph on +exceptionally(!) +\emph default + export LV3 to the right server, which has some otherwise unused CPU and + RAM capacity. +\end_layout + +\begin_layout Standard +Notice that local operations of VMs don't produce any storage network + traffic at all. + Therefore, this is the preferred runtime configuration. +\end_layout + +\begin_layout Standard +Only in cases of resource imbalance, such as (transient) CPU or RAM peaks + (e.g. + caused by DDOS attacks), +\emph on +some +\emph default + VMs or containers may be run somewhere else over the network. + In a well-balanced and well-dimensioned system, this will be the +\series bold +vast minority +\series default +, and should be only used for dealing with temporary load peaks etc. +\end_layout + +\begin_layout Standard +Running VMs directly on the same servers as their storage is a +\series bold +major cost reducer. 
+\end_layout + +\begin_layout Standard +You simply don't need to buy and operate +\begin_inset Formula $n+m$ +\end_inset + + servers, but only about +\begin_inset Formula $\max(n,m)+m\cdot\epsilon$ +\end_inset + + servers, where +\begin_inset Formula $\epsilon$ +\end_inset + + corresponds to some relatively small extra resources needed by MARS. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In addition to this and to reduced networking costs, there are further cost + savings in power consumption, air conditioning, Height Units (HUs), number + of HDDs, operating costs, etc., as explained below in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Cost-Arguments-from" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +Principle of Background Migration +\begin_inset CommandInset label +LatexCommand label +name "subsec:Principle-of-Background" + +\end_inset + + +\end_layout + +\begin_layout Standard +The sharding model needs a different approach to load balancing of storage + space than the big cluster model. + There are several possibilities at different layers, each addressing different + +\series bold +granularities +\series default +: +\end_layout + +\begin_layout Itemize +Moving customer data at filesystem or database level via +\family typewriter +rsync +\family default + or +\family typewriter +mysqldump +\family default + or similar. + +\begin_inset Newline newline +\end_inset + +Example: at 1&1 Shared Hosting Linux, we have about 9 million customer + home directories. + We also have a script +\family typewriter +\family default + using incremental +\family typewriter +tar +\family default + for their moves. 
+ Now, if we would try to move around +\emph on +all +\emph default + of them this way, it could easily take years or even decades for millions + of extremely small home directories, due to overhead like DNS updates etc. + However, there exist a small handful of large customer home directories + in the terabyte range. + For these, and only for these, it is a clever idea to use +\family typewriter +\family default + because thereby the size of a LV can be regulated more fine grained than + at LV level. +\end_layout + +\begin_layout Itemize +Dynamically growing the sizes of LVs during operations: +\family typewriter +lvresize +\family default + followed by +\family typewriter +marsadm resize +\family default + followed by +\family typewriter +xfs_growfs +\family default + or similar operations. +\end_layout + +\begin_layout Itemize +Moving whole LVs via MARS, as shown in the following example: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/MARS_Background_Migration.pdf + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The idea is to dynamically create +\emph on +additional +\emph default + LV replicas for the sake of +\series bold +background migration +\series default +. + Examples: +\end_layout + +\begin_layout Itemize +In case you had no redundancy at LV level before, you have +\begin_inset Formula $k=1$ +\end_inset + + replicas during ordinary operation. + If not yet done, you should transparently introduce MARS into your LVM-based + stack by using the so-called +\begin_inset Quotes eld +\end_inset + +standalone mode +\begin_inset Quotes erd +\end_inset + + of MARS. + When necessary, create the first MARS replica with +\family typewriter +marsadm create-resource +\family default + on your already-existing LV data, which is retained unmodified, and restart + your application again. 
+ Now, for the sake of migration, you just create an additional replica at + another server via +\family typewriter +marsadm join-resource +\family default + there and wait until the second mirror has been fully +\series bold +synced +\series default + in background, while your application is running and while the contents + of the LV is modified +\emph on +in parallel +\emph default + by your ordinary applications. + Then you do a primary +\series bold +handover +\series default + to your mirror. + This is usually a matter of minutes, or even seconds. + Once the application runs again at the new location, you can delete the + old replica via +\family typewriter +marsadm leave-resource +\family default + and +\family typewriter +lvremove +\family default +. + Finally, you may re-use the freed-up space for something else (e.g. + +\family typewriter +lvresize +\family default + of +\emph on +another +\emph default + LV followed by +\family typewriter +marsadm resize +\family default + followed by +\family typewriter +xfs_growfs +\family default + or similar). + For the sake of some hardware lifecycle, you may run a different strategy: + evacuate the original source server completely via the above MARS migration + method, and eventually decommission it. +\end_layout + +\begin_layout Itemize +In case you already have a redundant LV copy somewhere, you should run a + similar procedure, but starting with +\begin_inset Formula $k=2$ +\end_inset + + replicas, and temporarily increasing the number of replicas to either +\begin_inset Formula $k'=3$ +\end_inset + + when moving each replica step-by-step, or you may even directly go up to + +\begin_inset Formula $k'=4$ +\end_inset + + when moving pairs at once. 
+\begin_inset Newline newline +\end_inset + +Example: see +\family typewriter +\family default + in the +\family typewriter +football/ +\family default + directory of MARS, which is a checkout of the Football sub-project (see + chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:LV-Football" + +\end_inset + +). +\end_layout + +\begin_layout Itemize +When already starting with +\begin_inset Formula $k>2$ +\end_inset + + LV replicas in the starting position, you can do the same analogously, + or you may then use a lesser variant. + For example, we have some mission-critical servers at 1&1 which are running + +\begin_inset Formula $k=4$ +\end_inset + + replicas all the time on relatively small but important LVs for extremely + increased safety. + Only in such a case, you may have the freedom to temporarily decrease from + +\begin_inset Formula $k=4$ +\end_inset + + to +\begin_inset Formula $k'=3$ +\end_inset + + and then go up to +\begin_inset Formula $k''=4$ +\end_inset + + again. + This has the advantage of requiring less temporary storage space for +\emph on +swapping +\emph default + some LVs. +\end_layout + +\begin_layout Section +Cost Arguments +\begin_inset CommandInset label +LatexCommand label +name "sec:Cost-Arguments-from" + +\end_inset + + +\end_layout + +\begin_layout Standard +A common prejudice is that +\begin_inset Quotes eld +\end_inset + +big cluster +\begin_inset Quotes erd +\end_inset + + is the cheapest scaling storage technology when built on so-called +\begin_inset Quotes eld +\end_inset + +commodity hardware +\begin_inset Quotes erd +\end_inset + +. + While this is very often true for the +\begin_inset Quotes eld +\end_inset + +commodity hardware +\begin_inset Quotes erd +\end_inset + + part, it is often not true for the +\begin_inset Quotes eld +\end_inset + +big cluster +\begin_inset Quotes erd +\end_inset + + part. 
+ But let us first look at the +\begin_inset Quotes eld +\end_inset + +commodity +\begin_inset Quotes erd +\end_inset + + part. +\end_layout + +\begin_layout Subsection +Cost Arguments from Technology +\end_layout + +\begin_layout Standard +Here are some rough market prices for basic storage as determined around + end of 2016 / start of 2017: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +Technology +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +Enterprise-Grade +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +Price in € / TB +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +Consumer SATA disks via on-board SATA controllers +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +no (small-scale) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +< 30 possible +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +SAS disks via SAS HBAs (e.g. 
+ in external 14 +\begin_inset Quotes erd +\end_inset + + shelfs) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +halfways +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +< 80 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +SAS disks via hardware RAID + LVM (+DRBD/MARS) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +80 to 150 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +Commercial storage appliances via iSCSI +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +around 1000 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +Cloud storage, S3 over 5 years lifetime +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size small +3000 to 8000 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +You can see that any self-built and self-administered storage (whose price + varies with slower high-capacity disks versus faster low-capacity disks) + is much cheaper than any commercial offering by about a factor of 10 or + even more. + If you need to operate several petabytes of data, self-built storage is + always cheaper than commercial one, even if additional manpower is needed + for commissioning and operating. + You don't have to pay the shareholders of the storage provider. 
+ Here we just assume that the storage is needed permanently for at least + 5 years, as is the case in web hosting, databases, backup / archival systems, + and many other application areas. +\end_layout + +\begin_layout Standard +Commercial offerings of cloud storage are way too much hyped. + Some people apparently don't know that the generic term +\begin_inset Quotes eld +\end_inset + +Cloud Storage +\begin_inset Quotes erd +\end_inset + + refers to a +\emph on +storage class +\emph default +, not to a particular +\emph on +instance +\emph default + like original Amazon S3, and that it is possible to build and operate almost + any instance of any storage class yourself. + From a commercial perspective, +\series bold +outsourcing +\series default + of +\emph on +huge masses +\emph default +of enterprise-critical storage (to whatever class of storage) usually pays + off +\series bold +only when +\series default + your storage demands are either +\emph on +relatively low +\emph default +, or are +\emph on +extremely +\emph default + varying over time, and/or when you need some +\emph on +extra +\emph default + capacity only +\emph on +temporarily +\emph default + for a +\emph on +very +\emph default + short time. +\end_layout + +\begin_layout Subsection +Cost Arguments from Architecture +\end_layout + +\begin_layout Standard +In addition to basic storage prices, many further factors come into play + when roughly comparing big cluster architectures versus sharding. 
+ The following table bears the +\emph on +unrealistic assumption +\emph default + that BigCluster can be reliably operated with 2 replicas ( +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +the suffix +\begin_inset Formula $\times2$ +\end_inset + + +\family default +\series default +\shape default +\size default +\emph default +\bar default +\strikeout default +\uuline default +\uwave default +\noun default +\color inherit + means with additional geo-redundancy): +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +BC +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +SHA +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +BC +\begin_inset Formula $\times2$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +SHA +\begin_inset Formula $\times2$ +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +# of Disks +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +>200% +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +<120% +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +>400% +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +<240% +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +# of Servers +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times1.1$ 
+\end_inset + + possible +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times4$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2.2$ +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Power Consumption +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times1.1$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times4$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2.2$ +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +HU Consumption +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times1.1$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times4$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2.2$ +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +As shown in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Reliability-Arguments-from" + +\end_inset + +, two replicas are typically not sufficient for BigCluster. 
+ Even addicts of BigCluster are typically recommending 3 replicas in some + so-called +\begin_inset Quotes eld +\end_inset + +best practices +\begin_inset Quotes erd +\end_inset + +, leading to the following more realistic table: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +BC +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +SHA +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +BC +\begin_inset Formula $\times2$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +SHA +\begin_inset Formula $\times2$ +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +# of Disks +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +>300% +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +<120% +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +>600% +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +<240% +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +# of Servers +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times3$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times1.1$ +\end_inset + + possible +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times6$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2.2$ +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout 
+Power Consumption +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times3$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times1.1$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times6$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2.2$ +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +HU Consumption +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times3$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times1.1$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times6$ +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Formula $\approx\times2.2$ +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The crucial point is not only the number of extra servers needed for dedicated + storage boxes, but also the total number of HDDs. + While big cluster implementations like Ceph or Swift can +\emph on +theoretically +\emph default + use some erasure encoding for avoiding full object replicas, their +\emph on +practice +\emph default + as seen in internal 1&1 Ceph clusters is similar to RAID-10, but just on + objects instead of block-based sectors. +\end_layout + +\begin_layout Standard +Therefore a big cluster typically needs >300% disks to reach the same net + capacity as a simple sharded cluster. 
+ The latter can typically take advantage of hardware RAID-60 with a significantl +y smaller disk overhead, while providing sufficient failure tolerance at + disk level. +\end_layout + +\begin_layout Standard +There is a surprising consequence from this: geo-redundancy is not as expensive + as many people are believing. + It just needs to be built with the proper architecture. + A sharded geo-redundant pool based on hardware RAID-60 (last column +\begin_inset Quotes eld +\end_inset + +SHA +\begin_inset Formula $\times2$ +\end_inset + + +\begin_inset Quotes erd +\end_inset + +) costs typically +\emph on +less +\emph default + than a non-georedundant big cluster with typically needed / recommended + number of replicas (column +\begin_inset Quotes eld +\end_inset + +BC +\begin_inset Quotes erd +\end_inset + +). + A geo-redundant sharded pool provides even better failure compensation + (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Reliability-Arguments-from" + +\end_inset + +). +\end_layout + +\begin_layout Standard +Notice that geo-redundancy implies by definition that an unforeseeable +\series bold +full datacenter loss +\series default + (e.g. + caused by +\series bold +disasters +\series default + like a terrorist attack or an earthquake) must be compensated for +\series bold +several days or weeks +\series default +. + Therefore it is +\emph on +not +\emph default + sufficient to take a big cluster and just spread it to two different locations. +\end_layout + +\begin_layout Standard +In any case, a MARS-based geo-redundant sharding pool is cheaper than using + commercial storage appliances which are much more expensive by their nature. 
+\end_layout + +\begin_layout Section +Reliability Arguments from Architecture +\begin_inset CommandInset label +LatexCommand label +name "sec:Reliability-Arguments-from" + +\end_inset + + +\end_layout + +\begin_layout Standard +A contemporary common belief is that big clusters and their random replication + methods would provide better reliability than anything else. + There are some practical observations at 1&1 and its daughter companies + which cannot confirm this. +\end_layout + +\begin_layout Standard +Similar experiences are part of a USENIX paper about copysets, see +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +. + Their proposed solution is different from the solution proposed here, but + interestingly their +\emph on +problem analysis +\emph default + part contains not only similar observations, but also comes to similar + conclusions about random replication. + Citation from the abstract: +\end_layout + +\begin_layout Quote +However, random replication is +\series bold +almost guaranteed +\series default + to lose data in the common scenario of simultaneous node failures due to + cluster-wide power outages. + +\size footnotesize + [emphasis added by me] +\end_layout + +\begin_layout Standard +Stimulated by our practical experiences even in truly less disastrous scenarios + than mass power outage, theoretical explanations were sought. + Surprisingly, they show that LocalSharding is superior to true big clusters + under practically important preconditions. + Here is an intuitive explanation. + A detailed mathematical description of the model can be found in appendix + +\begin_inset CommandInset ref +LatexCommand vref +reference "chap:Mathematical-Model-of" + +\end_inset + +. 
+\end_layout + +\begin_layout Subsection +Storage Server Node Failures +\end_layout + +\begin_layout Subsubsection +Simple intuitive explanation +\end_layout + +\begin_layout Standard +Block-level replication systems like DRBD are constructed for failover in + local redundancy scenarios. + Or, when using MARS, even for geo-redundant failover scenarios. + They are traditionally dealing with +\series bold +pairs +\series default + of servers, or with triples, etc. + In order to get a storage incident with them, +\emph on +both +\emph default + sides of a DRBD or MARS small-cluster (also called +\series bold +shard +\series default + in section +\begin_inset CommandInset ref +LatexCommand vref +reference "par:Definition-of-Sharding" + +\end_inset + +) must have an incident +\emph on +at the same time +\emph default +. +\end_layout + +\begin_layout Standard +In contrast, big clusters are conceptually spreading their objects over + a huge number of nodes +\begin_inset Formula $O(n)$ +\end_inset + +, with some redundancy degree +\begin_inset Formula $k$ +\end_inset + + denoting the number of replicas. + As a consequence, +\emph on +any +\emph default + +\begin_inset Formula $k$ +\end_inset + + node failures out of +\begin_inset Formula $O(n)$ +\end_inset + + will produce an incident. + For example, when +\begin_inset Formula $k=2$ +\end_inset + + and +\begin_inset Formula $n$ +\end_inset + + is equal for both models, then +\emph on +any +\emph default + combination of two node failures occurring at the same time will lead to + an incident: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/Incident_Probabilities.pdf + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Intuitively, it is easy to see that hitting both members of the same pair + at the same time is less likely than hitting +\emph on +any +\emph default + two nodes of a big cluster. 
+\end_layout + +\begin_layout Standard +If you are curious about some concrete numbers, read on. +\end_layout + +\begin_layout Subsubsection +Detailed explanation +\begin_inset CommandInset label +LatexCommand label +name "sub:Detailed-explanation" + +\end_inset + + +\end_layout + +\begin_layout Standard +For the sake of simplicity, the following more detailed explanation is based + on the following assumptions: +\end_layout + +\begin_layout Itemize +We are looking at +\series bold +storage node +\series default + failures only. +\end_layout + +\begin_layout Itemize +Disk failures are regarded as already solved (e.g. + by local RAID-6 or by the well-known compensation mechanisms of big clusters). + Only in case they don't work, they are mapped to node failures, and are + already included in the probability of storage node failures. +\end_layout + +\begin_layout Itemize +We only look at +\series bold +data replication +\series default + with a redundancy degree of a relatively small +\begin_inset Formula $k$ +\end_inset + +. + CRC methods are not used across storage nodes, but may be present +\emph on +internally +\emph default + at some storage nodes, e.g. + RAID-5 or RAID-6 or similar methods. + Notice that CRC methods generally involve very high overhead, and even + won't work in realtime across long distances (geo-redundancy). +\end_layout + +\begin_layout Itemize +We restrict ourselves to temporary / +\series bold +transient +\series default + failures, without regarding permanent data loss. + Otherwise, the differences between local-storage sharding architectures + and big clusters would become even worse. + When losing some physical storage nodes forever in a big cluster, it is + typically anything but easy to determine which data of which application + instances / customers have been affected, and which will need a restore + from backup. +\end_layout + +\begin_layout Itemize +Storage network failures (as a whole) are ignored. 
+ Otherwise a fair comparison between the architectures would become difficult. + If they were taken into account, the advantages of LocalSharding would + become even bigger. +\end_layout + +\begin_layout Itemize +We assume that the storage network (when present) forms no bottleneck. + Network implementations like TCP/IP versus Infiniband or similar are thus + ignored. +\end_layout + +\begin_layout Itemize +Software failures / bugs are also ignored. + We only compare +\emph on +architectures +\emph default + here, not their various implementations. +\end_layout + +\begin_layout Itemize +The x axis shows the number of basic storage units +\begin_inset Formula $n$ +\end_inset + + from an +\emph on +application +\emph default + perspective, meaning +\begin_inset Quotes eld +\end_inset + +usable storage +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +net amount of storage +\begin_inset Quotes erd +\end_inset + +. + For simplicity of the model, one basic application storage unit equals + the total disk space provided by one physical storage node in the special + case of +\begin_inset Formula $k=1$ +\end_inset + + replicas. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Attention! when increasing the number of replicas +\begin_inset Formula $k$ +\end_inset + +, the total number of storage nodes needs to be +\series bold +increased accordingly +\series default +. + Typically, you will need to deploy +\begin_inset Formula $k\cdot n$ +\end_inset + + physical storage nodes in order to get +\begin_inset Formula $n$ +\end_inset + + net storage units from a user's perspective. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Attention! 
+\begin_inset space ~ +\end_inset + + +\begin_inset Formula $k$ +\end_inset + + has a strong influence on the +\series bold +price tag +\series default + of any of the competing architectures. + You cannot assume an +\begin_inset Quotes eld +\end_inset + +infinite amount of money +\begin_inset Quotes erd +\end_inset + +. + Therefore, only relatively small +\begin_inset Formula $k$ +\end_inset + + are bearable for business cases. +\end_layout + +\begin_layout Itemize +We assume that the number of application instances is linearly scaling with + +\begin_inset Formula $n$ +\end_inset + +. + For simplicity, we assume that the number of applications running on the + whole pool is exactly +\begin_inset Formula $n$ +\end_inset + +. +\end_layout + +\begin_layout Itemize +We assume that the storage nodes are (almost completely) filled with data + (sectors with RAID, and/or objects with BigCluster). +\end_layout + +\begin_layout Itemize +We assume that the number of sectors / objects per storage node is +\begin_inset Quotes eld +\end_inset + +very large +\begin_inset Quotes erd +\end_inset + +. + Some examples: a logical volume of 4 TB has 1,000,000,000 sectors or objects, + each 4 KB in size. + A physical storage node providing 40 TB of storage will then provide 10 + billion sectors / objects. +\end_layout + +\begin_layout Itemize +For the BigCluster architecture, we assume that all objects are always distribut +ed to +\begin_inset Formula $O(n)$ +\end_inset + + nodes. + For simplicity of the model, we assume a distribution via a +\emph on +uniform +\emph default + hash function. + If other hash functions were used (e.g. + distributing only to a constant number of nodes), it would no longer be + a big cluster architecture in our sense. +\begin_inset Newline newline +\end_inset + +In the following example, we assume a uniform object distribution to exactly + +\begin_inset Formula $n$ +\end_inset + + nodes. 
+ Notice that any other +\begin_inset Formula $n'=O(n)$ +\end_inset + + with +\begin_inset Formula $n' + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +LocalSharding +\size tiny +(DRBDorMARS) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +A up +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +A down +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +B up +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +0 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +B down +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +hfill +\end_layout + +\end_inset + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +BigCluster +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +A up +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +A down +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +B up +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +0 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +B down +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +hfill +\end_layout + 
+\end_inset + + +\begin_inset space ~ +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +What is the heart of the difference? While a node failure at LocalSharding + (DRBDorMARS) will tear down only the local application, the teardown produced + by BigCluster will spread to +\emph on +all +\emph default + of the +\begin_inset Formula $n=2$ +\end_inset + + application units, because of the uniform hashing and because we have only + +\begin_inset Formula $k=1$ +\end_inset + + replica. +\end_layout + +\begin_layout Standard +Would it help to increase both +\begin_inset Formula $n$ +\end_inset + + and +\begin_inset Formula $k$ +\end_inset + + to larger values? +\end_layout + +\begin_layout Standard +In the following graphics, the thick red line shows the behaviour for +\begin_inset Formula $k=1$ +\end_inset + + PlainServers (which is the same as +\begin_inset Formula $k=1$ +\end_inset + + DRBDorMARS) with increasing number of storage units +\begin_inset Formula $n,$ +\end_inset + + ranging from 1 to 10,000 storage units = number of servers for +\begin_inset Formula $k=1$ +\end_inset + +. + Higher values of +\begin_inset Formula $k\in[1,4]$ +\end_inset + + are also displayed. + All lines corresponding to the same +\begin_inset Formula $k$ +\end_inset + + are drawn in the same color. + Notice that both the x and y axes are logscale: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/SERVICE_Comparison_of_Reversible_StorageNode_Failures.pdf + lyxscale 200 + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +When you look at the thin solid BigCluster lines for +\begin_inset Formula $k=2,\ldots$ +\end_inset + + drawn in different colors, you may wonder why they are altogether converging + to the thin red BigCluster line, which corresponds to +\begin_inset Formula $k=1$ +\end_inset + + BigCluster. 
+ And they also converge towards the grey dotted topmost line indicating + the total possible uptime of all applications (depending on x). + It can be explained as follows: +\end_layout + +\begin_layout Standard +The x axis shows the number of basic storage units. + When you have to create 10,000 storage units with a replication degree + of +\begin_inset Formula $k=2$ +\end_inset + + replicas, then you will have to deploy +\begin_inset Formula $k*10,000=20,000$ +\end_inset + + servers in total. + When operating a pool of 20,000 servers, in statistical average 2 servers + of them will be down at any given point in time. + However, 2 is the same number as the replication degree +\begin_inset Formula $k.$ +\end_inset + + Because our BigCluster model as defined above will distribute +\emph on +all +\emph default + objects to +\emph on +all +\emph default + servers uniformly, there will almost always +\emph on +exist +\emph default + some objects for which no replica is available at any given point in time. + This means, you will almost always have a +\series bold +permanent incident +\series default + involving the same number of nodes as your replication degree +\begin_inset Formula $k$ +\end_inset + +, and in turn +\emph on +some +\emph default + of your objects will not be accessible at all. + This means, at +\begin_inset Formula $x=10,000$ +\end_inset + + storage units you will lose almost any advantage from increasing the number + of replicas. + Adding more replicas will no longer help at +\begin_inset Formula $x\geq10,000$ +\end_inset + + storage units. +\end_layout + +\begin_layout Standard +Notice that the +\emph on +solid +\emph default + lines are showing the probability of +\emph on +some +\emph default + incident, disregarding the +\series bold +size of the incident +\series default +. +\end_layout + +\begin_layout Standard +What about the +\emph on +dashed +\emph default + lines showing much better behaviour for BigCluster? 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Under some further preconditions, it would be possible to argue with the + +\emph on +size +\emph default + of incidents. + However, now a big fat warning. + When you are +\series bold +responsible +\series default + for operations of thousands of servers, you should be very conscious about + these preconditions. + Otherwise you could risk your career. + In short: +\end_layout + +\begin_layout Itemize +When your application, e.g. + a smartphone app, consists of accessing only 1 object at all during a reasonabl +y long timeframe, you can safely +\series bold +assume that there is no interdependency +\series default + between all of your objects. + In addition, you have to assume (and you should check) that your cluster + operating software as a whole does not introduce any further +\series bold +hidden / internal interdependencies +\series default +. + Only in this case, and only then, you can take the dashed lines arguing + with the number of inaccessible objects instead of with the number of basic + storage units. +\end_layout + +\begin_layout Itemize +Whenever your application uses +\series bold +bigger structured logical objects +\series default +, such as filesystems or block devices or whole VMs / containers, then you + likely will get +\series bold +interdependent objects +\series default + at your big cluster storage layer. +\begin_inset Newline newline +\end_inset + +Practical example: experienced sysadmins will confirm that even a data loss + rate of only 1/1,000,000 of blocks in a classical Linux filesystem like + +\family typewriter +xfs +\family default + or +\family typewriter +ext4 +\family default + will likely imply the need of an offline filesystem check ( +\family typewriter +fsck +\family default +), which is a major incident for the affected filesystem instances. 
+\begin_inset Newline newline +\end_inset + +Theoretical explanation: servers are running for a very long time, and filesyste +ms are typically also mounted for a long time. + Notice that the probability of hitting any vital filesystem data roughly + equals the probability of hitting any other data. + Sooner or later, any defective sector in the metadata structures or in + freespace management etc. will stop your whole filesystem, and in turn will + stop your application instance(s) running on top of it. +\begin_inset Newline newline +\end_inset + +Similar arguments hold for transient failures: most classical filesystems + are not constructed for compensation of hanging IO, typically leading to + +\series bold +system hangs +\series default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Blindly taking the dashed lines will expose you to a high risk of error. + Practical experience shows that there are often +\series bold +hidden dependencies +\series default + in many applications, often also at application level. + You cannot necessarily see them when inspecting their data structures! + You will only notice some of them by analyzing their +\series bold +runtime behaviour +\series default +, e.g. + with tools like +\family typewriter +strace +\family default +. + Notice that in general the runtime behaviour of an arbitrary program is + +\series bold +undecidable +\series default +. + Be cautious when drawing assumptions out of thin air! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Conversely, the assumption that +\emph on +any +\emph default + inaccessible object may halt your application, might be too strong for + +\emph on +some +\emph default + use cases. 
+ Therefore, some practical behaviour may be in between the solid thin lines + and the dashed lines of some given color. + Be extremely careful when constructing such an intermediate case. + The above example of a loss rate of 1/1,000,000 of sectors in a classical + filesystem should not be extended to lower values like 1/1,000,000,000 + without knowing exactly how the filesystem works, and how it will react + +\emph on +in detail +\emph default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +In general, it is insufficient to analyze the logical dependencies inside + of a filesystem instance, such as which inode contains some pointers to + which other filesystem objects, etc. + There exist further +\series bold +runtime dependencies +\series default +, such as +\family typewriter +nr_requests +\family default + block-layer restrictions on IO queue depths, and/or capabilities / limitations + of the hardware, and so on. + Trying to model all of these influences in a reasonable way could be a + +\emph on +major +\emph default + research undertaking outside the scope of this MARS manual. +\end_layout + +\end_inset + +. + The grey zone between the extreme cases thin solid vs dashed is a +\series bold +dangerous zone +\series default +! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +If you want to stay on the +\series bold +safe side +\series default +, simply obey the fundamental law as explained in the next section: +\end_layout + +\begin_layout Subsection +Optimum Reliability from Architecture +\begin_inset CommandInset label +LatexCommand label +name "subsec:Optimum-Reliability-from" + +\end_inset + + +\end_layout + +\begin_layout Standard +Another argument could be: don't distribute the BigCluster objects to exactly + +\begin_inset Formula $n$ +\end_inset + + nodes, but to fewer nodes. 
+ Would the result be better than DRBDorMARS LocalSharding? +\end_layout + +\begin_layout Standard +When distributing to +\begin_inset Formula $O(k')$ +\end_inset + + nodes with some constant +\begin_inset Formula $k'$ +\end_inset + +, we have no longer a BigCluster architecture, but a mixed BigClusterSharding + form. +\end_layout + +\begin_layout Standard +As can be generalized from the above tables, the reliability of +\series bold +any +\series default + BigCluster on +\begin_inset Formula $k'>k$ +\end_inset + + nodes is +\series bold +always +\series default + worse than of LocalSharding on exactly +\begin_inset Formula $k$ +\end_inset + + nodes, where +\begin_inset Formula $k$ +\end_inset + + is also the redundancy degree. + In general: +\end_layout + +\begin_layout Quote + +\series bold +\size large +The LocalSharding model is the optimum model for reliability of operation, + compared to any other model truly distributing its data and operations + over truly more nodes, like RemoteSharding or BigClusterSharding or BigCluster + does. +\end_layout + +\begin_layout Standard +There exists no better model because shards consisting of exactly +\begin_inset Formula $k$ +\end_inset + + nodes where +\begin_inset Formula $k$ +\end_inset + + is the redundancy degree are already the +\emph on +smallest possible shards +\emph default + under the assumptions of section +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Detailed-explanation" + +\end_inset + +. + Any other model truly involving +\begin_inset Formula $k'>k$ +\end_inset + + nodes for distribution of objects at any shard is +\series bold +always +\series default + worse in the dimension of reliability. + Thus the above sentence follows by induction. +\end_layout + +\begin_layout Standard +The above sentence is formulating a +\series bold +fundamental law of storage systems +\series default +. 
+\end_layout + +\begin_layout Subsection +Error Propagation to Client Mountpoints +\begin_inset CommandInset label +LatexCommand label +name "subsec:Error-Propagation-to" + +\end_inset + + +\end_layout + +\begin_layout Standard +The following is only applicable when filesystems (or their objectstore + counterparts) are exported over a storage network, in order to be mounted + in parallel at +\begin_inset Formula $O(n)$ +\end_inset + + mountpoints each. +\end_layout + +\begin_layout Standard +In such a scenario, any problem / incident inside of your storage pool for + the filesystem instances will be spread to +\begin_inset Formula $O(n)$ +\end_inset + + clients, leading to an increase of the incident size by a factor of +\begin_inset Formula $O(n)$ +\end_inset + + when measured in number of affected mountpoints: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/MOUNTPOINTS_Comparison_of_Reversible_StorageNode_Failures.pdf + lyxscale 200 + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +As a result, we now have a total of +\begin_inset Formula $O(n^{2})$ +\end_inset + + mountpoints = our new basic application units. + Such +\begin_inset Formula $O(n^{2})$ +\end_inset + + architectures are quickly becoming even worse than before. + Thus a clear warning: don't try to build systems in such a way. +\end_layout + +\begin_layout Standard +Notice: DRBD or MARS are traditionally used for running the application + on the same box as the storage. + Thus they are not vulnerable to these kinds of failure propagation over + network. + Even with traditional iSCSI exports over DRBD or MARS, you won't have suchalike + problems. 
+ Your only chance to increase the error propagation are +\begin_inset Formula $O(n)$ +\end_inset + + NFS or +\family typewriter +glusterfs +\family default + exports to +\begin_inset Formula $O(n)$ +\end_inset + + clients leading to a total number of +\begin_inset Formula $O(n^{2})$ +\end_inset + + mountpoints, or similar setups. +\end_layout + +\begin_layout Standard +Clear advice: don't do that. + It's a bad idea. +\end_layout + +\begin_layout Subsection +Similarities and Differences to Copysets +\begin_inset CommandInset label +LatexCommand label +name "subsec:Similarities-and-differences" + +\end_inset + + +\end_layout + +\begin_layout Standard +This section is mostly of academic interest. + You can skip it when looking for practical advice. +\end_layout + +\begin_layout Standard +The USENIX paper about copysets (see +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +) relates to the Sharding model in the following way: +\end_layout + +\begin_layout Paragraph +Similarities +\end_layout + +\begin_layout Standard +The concept of Random Replication of the storage data to a large number of + machines will reduce reliability. + When choosing too big sets of storage machines, then the storage system + as a whole will become practically unusable. + This is common ground between the USENIX paper and the Sharding Approach + as propagated here. +\end_layout + +\begin_layout Paragraph +Differences +\end_layout + +\begin_layout Standard +The USENIX paper and many other Cloud Storage approaches are +\emph on +presuming +\emph default + that there exists a storage network, allowing real-time distribution of + replicas over this kind of network. +\end_layout + +\begin_layout Standard +In contrast, the Sharding Approach to Cloud Storage tries to +\emph on +avoid +\emph default + real-time storage networks +\emph on +as much as possible +\emph default +. 
+ Notice that RemoteSharding and further variants (including future improvements) + do +\emph on +not +\emph default + preclude it, but are trying to +\emph on +avoid +\emph default + real-time storage network traffic. + Instead, the load-balancing problem is addressed via +\series bold +background data migration +\series default +. +\end_layout + +\begin_layout Standard +This changes the +\emph on +temporal granularity +\emph default + of data access: many real-time accesses are +\emph on +shifted over +\emph default + to migration processes, which in turn are weakening the requirements to + the network. +\end_layout + +\begin_layout Standard +In detail, there are some more differences to the USENIX paper. + Some examples: +\end_layout + +\begin_layout Itemize +Terminology: the scatter width +\begin_inset Formula $S$ +\end_inset + + is defined (see page 39 of the paper) as: each node's data is split +\emph on +uniformly +\emph default + across a group of +\begin_inset Formula $S$ +\end_inset + + +\emph on +other +\emph default + nodes. + In contrast, we neither assume uniformity, nor do we require the data + to be distributed to +\emph on +other +\emph default + nodes. + By using the term +\begin_inset Quotes eld +\end_inset + +other +\begin_inset Quotes erd +\end_inset + +, the USENIX paper (as well as many other BigCluster approaches) is probably + presuming something like a distinction between +\begin_inset Quotes eld +\end_inset + +client +\begin_inset Quotes erd +\end_inset + + and +\begin_inset Quotes eld +\end_inset + +server +\begin_inset Quotes erd +\end_inset + + machines: while data processing is done on a +\begin_inset Quotes eld +\end_inset + +client +\begin_inset Quotes erd +\end_inset + +, data storage is on a +\begin_inset Quotes eld +\end_inset + +server +\begin_inset Quotes erd +\end_inset + +. 
+ +\end_layout + +\begin_layout Itemize +We don't disallow this in variants like RemoteSharding or FlexibleSharding + and so on, but we gave some arguments why we are trying to +\emph on +avoid +\emph default + this. +\end_layout + +\begin_layout Itemize +It seems that some definitions in the USENIX paper may implicitly relate + to +\begin_inset Quotes eld +\end_inset + +each chunk +\begin_inset Quotes erd +\end_inset + +. + In contrast, the Sharding Approach typically relates to LVs (logical volumes), + which could however be viewed as a special case of +\begin_inset Quotes eld +\end_inset + +chunk +\begin_inset Quotes erd +\end_inset + +, e.g. + by minimizing the number of chunks in a system. + However notice: there exist definitions of +\begin_inset Quotes eld +\end_inset + +chunk +\begin_inset Quotes erd +\end_inset + + where it is the basic transfer unit. + An LV has the fundamental property that small-granularity +\series bold +update in place +\series default + (at any offset inside the LV) can be executed. +\end_layout + +\begin_layout Itemize +Notice: we do not preclude further fine-grained distribution of LV data, + but this is something which should be +\emph on +avoided +\emph default + if not absolutely necessary. + Preferred method in typical practical use cases: some storage servers may + have some spare RAID slots to be populated later, by resizing the PVs = + Physical Volumes before resizing LVs. +\end_layout + +\begin_layout Itemize +Notice that a typical local RAID system +\emph on +is also +\emph default + a Distributed System, according to some reasonable definition. + Typical RAID implementations just involve SAS cables instead of Ethernet + cables or Infiniband cables. + Notice that this also applies to many +\begin_inset Quotes eld +\end_inset + +Commodity Hardware +\begin_inset Quotes erd +\end_inset + + approaches, like Ceph storage nodes driving dozens of local HDDs connected + over SAS or SATA. 
+ The main difference is just that instead of a hardware RAID controller, + a hardware HBA = Host Bus Adapter is used. + Instead of Ethernet switches, SAS multiplexers in backplanes are used. + Anyway, this forms a locally distributed sub-system. +\end_layout + +\begin_layout Itemize +Future variants of the Sharding Approach might extend this already present + locally Distributed System to a somewhat wider one. + For example, creation of a local LV (called +\begin_inset Quotes eld +\end_inset + +disk +\begin_inset Quotes erd +\end_inset + + in MARS terminology) could be implemented by a subordinate DRBD instance + implementing a future RAID-10 mode over local Infiniband or crossover Ethernet + cables, avoiding local switches. + While DRBD would essentially create the +\begin_inset Quotes eld +\end_inset + +local +\begin_inset Quotes erd +\end_inset + + LV, the higher-level MARS instance would then be responsible for its wide-dista +nce replication. + See chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Use-Cases-for" + +\end_inset + + about use cases of MARS vs DRBD. + Potential future use cases could be +\emph on +extremely huge +\emph default + LVs where external SAS disk shelves are no longer sufficient to get the + desired capacity. +\end_layout + +\begin_layout Itemize +The USENIX paper needs to treat the following parameters as more or less + fixed (or only slowly changeable) +\series bold +constants +\series default +, given by the system designer: the replication degree +\begin_inset Formula $R$ +\end_inset + +, and the scatter width +\begin_inset Formula $S$ +\end_inset + +. + In contrast, the replication degree +\begin_inset Formula $k$ +\end_inset + + of our Sharding Approach is not necessarily firmly given by the system, + but can be +\series bold +dynamically changed +\series default + at runtime on a per-LV basis. 
+ For example, during background migration via MARS the command +\family typewriter +marsadm join-resource +\family default + is used for creating additional per-LV replicas. + However notice: this freedom is limited by the total number of deployed + hardware nodes. + If you want +\begin_inset Formula $k=3$ +\end_inset + + replicas at the +\emph on +whole +\emph default + pool, then you will need to (dynamically) deploy at least about +\begin_inset Formula $k*x$ +\end_inset + + nodes in general. +\end_layout + +\begin_layout Itemize +The USENIX paper defines its copysets on a per-chunk basis. + Similarly to before, we can transfer this definition to a Sharding Approach + by relating it to a per-LV basis. + As a side effect, a copyset can then trivially become identical to +\begin_inset Formula $S$ +\end_inset + + when the definition of +\begin_inset Formula $S$ +\end_inset + + is also changed to a per-LV basis, analogously. + In the Sharding Approach, a distinction is not absolutely necessary, while + the USENIX paper has to invest some effort into clarifying the relationship + between +\begin_inset Formula $S$ +\end_inset + + and copysets as defined on a BigCluster model. +\end_layout + +\begin_layout Itemize +Neglecting the mentioned differences, we see our typical use case (LocalSharding +) roughly equivalent to +\begin_inset Formula $S=R$ +\end_inset + + in the terminology of the USENIX paper, or to +\begin_inset Formula $S=k$ +\end_inset + + (our number of replicas) in our terminology. 
+\end_layout + +\begin_layout Itemize +This means: we try to minimize the +\emph on +size +\emph default + of +\begin_inset Formula $S$ +\end_inset + + for any given per-LV +\begin_inset Formula $k$ +\end_inset + +, which will lead to the best possible reliability (under the conditions + described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Detailed-explanation" + +\end_inset + +) as has been shown in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Optimum-Reliability-from" + +\end_inset + +. +\end_layout + +\begin_layout Section +Performance Arguments from Architecture +\begin_inset CommandInset label +LatexCommand label +name "sec:Performance-Arguments-from" + +\end_inset + + +\end_layout + +\begin_layout Standard +Some people think that replication is easily done at filesystem layer. + There exist lots of cluster filesystems and other filesystem-layer solutions + which claim to be able to replicate your data, sometimes even over long + distances. +\end_layout + +\begin_layout Standard +Trying to replicate several petabytes of data, or some billions of inodes, + is however a much bigger challenge than many people can imagine. +\end_layout + +\begin_layout Standard +Choosing the wrong layer for +\series bold +mass data replication +\series default + may get you into trouble. + Here is an architectural-level (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:What-is-Architecture" + +\end_inset + +) explanation why replication at the block layer is more easy and less error + prone: +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/Layers.pdf + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The picture shows the main components of a standalone Unix / Linux system. + In the late 1970s / early 1980s, a so-called +\emph on +Buffer Cache +\emph default + had been introduced into the architecture of Unix. 
+ Today's Linux has refined the concept to various internal caches such as + the +\series bold +Page Cache +\series default + (for data) and the +\series bold +Dentry Cache +\series default + (for metadata). +\end_layout + +\begin_layout Standard +All these caches serve one main purpose +\begin_inset Foot +status open + +\begin_layout Plain Layout +Another important purpose is +\series bold +providing shared memory +\series default +. +\end_layout + +\end_inset + +: they are reducing the load onto the storage by exploitation of fast RAM. + A well-tuned cache can yield high cache hit ratios, typically 99%. + In some cases (as observed in practice) even more than 99.9%. +\end_layout + +\begin_layout Standard +Now start distributing the system over long distances. + There are potential cut points A and B and C +\begin_inset Foot +status open + +\begin_layout Plain Layout +In theory, there is another cut point D by implementing a generically distribute +d cache. + There exists some academic research on this, but practically usable enterprise- +grade systems are rare and not wide-spread. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +Cut point A is application specific, and can have advantages because it + has knowledge of the application. + For example, replication of mail queues can be controlled much more fine-graine +d than at filesystem or block layer. +\end_layout + +\begin_layout Standard +Cut points B and C are +\emph on +generic +\emph default +, supporting a wide variety of applications, without altering them. + Cutting at B means replication at filesystem level. + C means replication at block level. +\end_layout + +\begin_layout Standard +When replicating at B, you will notice that the caches are +\emph on +below +\emph default + your cut point. + Thus you will have to re-implement +\series bold +distributed caches +\series default +, and you will have to +\series bold +maintain cache coherence +\series default +. 
+\end_layout + +\begin_layout Standard +When replicating at C, the Linux caches are +\emph on +above +\emph default + your cut point. + Thus you will receive much less traffic, typically already reduced by a + factor of 100, or even more. + This is much more easy to cope with. + You will also profit from +\series bold +journalling filesystems +\series default + like +\family typewriter +ext4 +\family default + or +\family typewriter +xfs +\family default +. + In contrast, +\emph on +truly distributed +\begin_inset Foot +status open + +\begin_layout Plain Layout +In this context, +\begin_inset Quotes eld +\end_inset + +truly +\begin_inset Quotes erd +\end_inset + + means that the POSIX semantics would be always guaranteed cluster-wide, + and even in case of partial failures. + In practice, some distributed filesystems like NFS don't even obey the + POSIX standard +\emph on +locally +\emph default + on 1 standalone client. + We know of projects which have +\emph on +failed +\emph default + right because of this. +\end_layout + +\end_inset + + +\emph default + journalling is typically not available with distributed cluster filesystems. +\end_layout + +\begin_layout Standard +A +\emph on +potential +\emph default + drawback of block layer replication is that you are typically limited to + active-passive replication. + An active-active operation is not impossible at block layer (see combinations + of DRBD with +\family typewriter +ocfs2 +\family default +), but less common, and less safe to operate. +\end_layout + +\begin_layout Standard +This limitation isn't necessarily caused by the choice of layer. + It is simply caused by the +\series bold +laws of physics +\series default +: communication is always limited by the speed of light. + A distributed filesystem is nothing else but a logically +\series bold +distributed shared memory +\series default + (DSM). 
+\end_layout + +\begin_layout Standard +Some decades of research on DSM have shown that there exist applications + / workloads where the DSM model is +\emph on +inferior +\emph default + to the direct communication paradigm. + Even in short-distance / cluster scenarios. + Long-distance DSM is extremely cumbersome. +\end_layout + +\begin_layout Standard +Therefore: you simply shouldn't try to solve long-distance communication + needs via communication over filesystems. + Even simple producer-consumer scenarios (one-way communication) are less + performant (e.g. + when compared to plain TCP/IP) when it comes to distributed POSIX semantics. + There is simply too much +\series bold +synchronisation overhead at metadata level +\series default +. +\end_layout + +\begin_layout Standard +If you have a need for mixed operations at different locations in parallel: + just split your data set into disjoint filesystem instances (or database + / VM instances, etc). + All you need is careful thought about the +\emph on +appropriate +\emph default + +\emph on +granularity +\emph default + of your data sets (such as well-chosen +\emph on +sets +\emph default + of user homedirectory subtrees, or database sets logically belonging together, + etc). +\end_layout + +\begin_layout Standard +Replication at filesystem level is often at single-file granularity. + If you have several millions or even billions of inodes, you may easily + find yourself in a snakepit. +\end_layout + +\begin_layout Standard +Conclusion: active-passive operation over long distances (such as between + continents) is even an advantage. + It keeps you from trying bad / almost impossible things. 
+\end_layout + +\begin_layout Section +Scalability Arguments from Architecture +\begin_inset CommandInset label +LatexCommand label +name "sec:Scalability-Arguments-from" + +\end_inset + + +\end_layout + +\begin_layout Standard +Some people are talking about scalability by (1) looking at a relatively + small example cluster +\emph on +implementation +\emph default + of their respective (pre-)chosen +\emph on +architecture +\emph default + having +\begin_inset Formula $n$ +\end_inset + + machines or +\begin_inset Formula $n$ +\end_inset + + network components or running +\begin_inset Formula $n$ +\end_inset + + application instances, and then (2) extrapolating its behaviour to bigger + +\begin_inset Formula $n$ +\end_inset + +. + They think if it runs with small +\begin_inset Formula $n$ +\end_inset + +, it will also run for bigger +\begin_inset Formula $n$ +\end_inset + +. +\end_layout + +\begin_layout Standard +This way of thinking and acting is completely broken, and can endanger both + companies and careers. +\end_layout + +\begin_layout Standard +This is not only because of confusion of +\begin_inset Quotes eld +\end_inset + +architecture +\begin_inset Quotes erd +\end_inset + + with +\begin_inset Quotes eld +\end_inset + +implementation +\begin_inset Quotes erd +\end_inset + +, cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:What-is-Architecture" + +\end_inset + +. + It is also fundamentally broken because it assumes some +\begin_inset Quotes eld +\end_inset + +linearity +\begin_inset Quotes erd +\end_inset + + in a field which is non-linear +\emph on +by definition +\emph default +. + If scalability would be linear, the term would not be useful at all, because + there would be +\emph on +no limit +\emph default +. 
+ However, limits exist in practice, and the term +\begin_inset Quotes eld +\end_inset + +scalability +\begin_inset Quotes erd +\end_inset + + is the +\emph on +means +\emph default + for describing the behaviour at or around the limit. +\end_layout + +\begin_layout Standard +Another +\emph on +incorrect +\emph default + way of ill-defining the term +\begin_inset Quotes eld +\end_inset + +scalability +\begin_inset Quotes erd +\end_inset + + is looking at some relatively big +\emph on +example +\emph default + cluster, which is working in practice. + Arguing with an example of a working system is wrong by construction. +\end_layout + +\begin_layout Standard + +\emph on +Every +\emph default + storage system on this globe has +\emph on +always +\emph default + some scalability limit, somewhere. + Scalability is +\emph on +always +\emph default + a +\series bold +non-linear +\series default + behaviour. + In order to find the practical limit, you must +\emph on +reach +\emph default + it. +\end_layout + +\begin_layout Standard +Therefore, examples are principally insufficient for proving scalability, + as well as for comparing the scalability of architectures and/or of certain + implementations. + Examples can be only used for +\emph on +disproving +\emph default + scalability. +\end_layout + +\begin_layout Subsection +Example Failures of Scalability +\begin_inset CommandInset label +LatexCommand label +name "subsec:Example-Failures-of" + +\end_inset + + +\end_layout + +\begin_layout Standard +The following description is a +\series bold +must read +\series default + for sysadmins and system architects, and also for managers who are +\series bold +responsible +\series default +. + The numbers and some details are from my memory, thus it need not be 100% + accurate in all places. 
+\end_layout + +\begin_layout Standard +It is about an operation environment for a +\emph on +new +\emph default + product, which was a proprietary web page editor running under a complicated + variant of a LAMP stack. +\end_layout + +\begin_layout Standard +The setup started with a +\family typewriter +BigCluster +\family default + +\emph on +architecture +\emph default +, but actually sized as a +\family typewriter + +\begin_inset Quotes eld +\end_inset + +SmallCluster +\begin_inset Quotes erd +\end_inset + + +\family default + implementation. +\end_layout + +\begin_layout Paragraph +Setup 1 (NFS) +\end_layout + +\begin_layout Standard +The first setup consisted of +\begin_inset Formula $n=6$ +\end_inset + + storage servers, each replicated to another datacenter via DRBD. + Each were exporting their filesystems via NFS to about the same number + of client servers, where Apache/PHP was supposed to serve the HTTP requests + from the customers, which were entering the client cluster via a HTTP load + balancer. + The load balancer was supposed to spread the HTTP load to the client servers + in a +\series bold +round-robin +\series default + fashion. +\end_layout + +\begin_layout Standard + +\color lightgray +At this point, eager readers may notice some similarity with the error propagati +on problem treated in section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Error-Propagation-to" + +\end_inset + +. + Notice that this is about +\emph on +scalability +\emph default + instead, but you should compare with that, to find some similarities. +\end_layout + +\begin_layout Standard +After the complicated system was built up and was working well enough, the + new product was launched via a marketing campaign with free trial accounts, + limited to some time. +\end_layout + +\begin_layout Standard +So the number of customers was ramping up from 0 to about 20,000 within + a few weeks. 
+ When about 20,000 customers were running on the client machines, system + hangs were noticed, and also from a customer's perspective. + When too many customers were pressing the +\begin_inset Quotes eld +\end_inset + +save +\begin_inset Quotes erd +\end_inset + + button in parallel on reasonably large web page projects, a big number + of small files, including a huge bunch of small image files, was generated + over a short period of time. + A few customers were pressing the +\begin_inset Quotes eld +\end_inset + +save +\begin_inset Quotes erd +\end_inset + + button several times a minute, each time re-creating all of these files + again and again from the proprietary web page generator. + Result: the system appeared to hang. +\end_layout + +\begin_layout Standard +However, all of the servers, including the storage servers, were almost + +\emph on +idle +\emph default + with respect to CPU consumption. + RAM sizes were also no problem. +\end_layout + +\begin_layout Standard +After investigating the problem for a while, it was noticed that the +\series bold +\emph on +network +\series default +\emph default + was the bottleneck, but not in terms of throughput. + The internal sockets were forming some +\series bold +queues +\series default + which were +\emph on +delaying +\emph default + the NFS requests in some +\series bold +ping-pong +\series default + like fashion, almost resulting in a +\begin_inset Quotes eld +\end_inset + +deadlock +\begin_inset Quotes erd +\end_inset + + from a customer's perspective (a better term would be +\series bold +distributed livelock +\series default + or +\series bold +distributed thrashing +\series default +). +\end_layout + +\begin_layout Paragraph +Setup 2 ( +\family typewriter +ocfs2 +\family default +) +\end_layout + +\begin_layout Standard +Due to some external investigations and recommendations, the system was + converted from NFS to +\family typewriter +ocfs2 +\family default +. 
+ Now DRBD was operated in active-active mode. + Only one system software component was replaced with another one, without + altering the +\family typewriter +BigCluster +\family default + architecture, and without changing the number of servers, which remained + a stripped-down +\family typewriter +SmallCluster +\family default + implementation. +\end_layout + +\begin_layout Standard +Result: the problem with the +\begin_inset Quotes eld +\end_inset + +hangs +\begin_inset Quotes erd +\end_inset + + disappeared. +\end_layout + +\begin_layout Standard +However, after the number of customers had exceeded the +\series bold +next scalability limit +\series default + of about 30,000 customers, the +\begin_inset Quotes eld +\end_inset + +hang +\begin_inset Quotes erd +\end_inset + + problem appeared once again, in a similar way. + The system showed systematical incidents again. +\end_layout + +\begin_layout Paragraph +Setup 3 ( +\family typewriter +glusterfs +\family default + as a substitute for NFS) +\end_layout + +\begin_layout Standard +After investigating the network queueing behaviour and the lock contention + problems of +\family typewriter +ocfs2 +\family default +, the next solution was +\family typewriter +glusterfs +\family default +. +\end_layout + +\begin_layout Standard +However, when the number of customers exceeded the +\series bold +\emph on +next +\emph default + scalability limit +\series default +, which was about 50,000 customers, some of them hammering the cluster with + their +\begin_inset Quotes eld +\end_inset + +save +\begin_inset Quotes erd +\end_inset + + button, the +\begin_inset Quotes eld +\end_inset + +hangs +\begin_inset Quotes erd +\end_inset + + appeared again. 
+\end_layout + +\begin_layout Paragraph +Setup 4 ( +\family typewriter +glusterfs +\family default + replication as a substitute for DRBD) +\end_layout + +\begin_layout Standard +After analyzing the problem once again, it was discovered by accident that + +\family typewriter +drbdadm disconnect +\family default + +\emph on +appeared +\emph default + to +\begin_inset Quotes eld +\end_inset + +solve +\begin_inset Quotes erd +\end_inset + + the problem. +\end_layout + +\begin_layout Standard +Therefore DRBD was replaced with +\family typewriter +glusterfs +\family default + replication. + There exists a +\family typewriter +glusterfs +\family default + feature allowing replication of files at filesystem level. +\end_layout + +\begin_layout Standard +This attempt was +\emph on +immediately +\emph default + resulting in an almost fatal disaster, and thus was stopped immediately: + the cluster completely broke down. + Almost nothing was working anymore. +\end_layout + +\begin_layout Standard +The problem was even worse: switching off the +\family typewriter +glusterfs +\family default + replication and rollback to DRBD did not work. + The system remained unusable. +\end_layout + +\begin_layout Standard +As a temporary workaround, +\family typewriter +drbdadm disconnect +\family default + was improving the situation enough for some humbling operation. +\end_layout + +\begin_layout Standard +Retrospective explanation: some of the reasons can be found in section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Behaviour-of-DRBD" + +\end_inset + +. 
+ +\family typewriter +glusterfs +\family default + replication does not scale at all because it stores its replication information + at +\series bold +per-inode granularity +\series default + in EAs (extended attributes), which must +\emph on +necessarily +\emph default + be worse than DRBD, because there were some hundreds of millions of them + in total as reported by +\family typewriter +df -i +\family default + (see the cut point discussion in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Performance-Arguments-from" + +\end_inset + +). + Overnight in some cron jobs, these EAs had to be deleted in reasonably + sized batches in order to become more or less +\begin_inset Quotes eld +\end_inset + +operable +\begin_inset Quotes erd +\end_inset + + again. +\end_layout + +\begin_layout Paragraph +Setup 5 (Sharding on top of DRBD) +\end_layout + +\begin_layout Standard +After the almost fatal incident had been resolved to a less critical one, + the responsibility for setup was taken over by another person. + After the +\begin_inset Formula $O(n^{2})$ +\end_inset + + behaviour from section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Distributed-vs-Local:" + +\end_inset + + had been understood, and after it was clear that sharding is only +\begin_inset Formula $O(k)$ +\end_inset + + from a customer's perspective, it was the final solution. + Now the problem was resolved at +\series bold +\emph on +architectural level +\series default +\emph default +, no longer by just replacing some components with some others. 
+\end_layout + +\begin_layout Standard +The system was converted to a variant of a +\family typewriter +RemoteSharding +\family default + model (see section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Variants-of-Sharding" + +\end_inset + +), and some +\family typewriter +migrate +\family default + scripts were introduced for load balancing of customer homedirectories + and databases between shards. +\end_layout + +\begin_layout Standard +As a side effect, the load balancer became a new role: instead of spreading + +\emph on +all +\emph default + of the HTTP requests to +\emph on +all +\emph default + of the client servers in a round-robin fashion, it now acted as a redirection + mechanism at +\emph on +shard granularity +\emph default +, e.g. + when one of the client servers was handed over to another one for maintenance. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Retrospective explanation: DRBD was definitely +\emph on +not +\emph default + the real reason for the critical incident. + The replication traffic per shard is so low in average that until today, + no replacement by MARS was absolutely necessary +\begin_inset Foot +status open + +\begin_layout Plain Layout +Many sysadmins are running a conservative strategy: never touch a running + system... +\end_layout + +\end_inset + +, although the distance is over 50 km. + If you wonder why such low write traffic demands can cause such a big incident: + look at the +\series bold +cache reduction +\series default + graphics in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Performance-Arguments-from" + +\end_inset + +. 
+ Today, the +\begin_inset Quotes eld +\end_inset + +save +\begin_inset Quotes erd +\end_inset + + buttons of the customers are just triggering some +\emph on +extra +\emph default + +\series bold +writebacks +\series default + from the Page Cache of the kernel into the block layer, after some +\emph on +delay +\emph default +. + These writebacks are not performance critical in reality, because the Page + Cache is running them +\series bold +\emph on +asynchronously in background +\series default +\emph default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + In contrast, distributed filesystems like +\family typewriter +NFS +\family default + or +\family typewriter +ocfs2 +\family default + or +\family typewriter +glusterfs +\family default + are not working asynchronously in many places, but will often schedule + their requests +\emph on +synchronously +\emph default + into ordinary network queues, which form a +\series bold +sequential bottleneck +\series default +, competing with other high-frequent filesystem operations. + In addition, the +\begin_inset Quotes eld +\end_inset + +save +\begin_inset Quotes erd +\end_inset + + button triggers masses of metadata / inode updates in a short time, often + residing in the same directory. + Such a directory may thus form a +\begin_inset Quotes eld +\end_inset + +global +\begin_inset Quotes erd +\end_inset + + bottleneck. + When suchalike competing +\series bold +metadata updates +\series default + are distributed via a round-robin load balancer, the problem can easily + become critical by the +\series bold +cache coherence problem +\series default +. + While local filesystems can smoothen such application behaviour via the + Dentry Cache plus Inode Cache, which also show some asynchronous writeback + behaviour, network filesystems are often unable to deal with this performantly. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Although DRBD has a similar sequential bottleneck at the low-frequency + block layer by its write-through strategy into its replica, this does not + really matter: all other writebacks from the Page Cache are +\emph on +also +\emph default + started asynchronously, and triggered low-frequently, and are occurring + after some +\emph on +delay +\emph default + (which in turn will smoothen the +\series bold +spikes +\series default + caused by +\series bold +mass dirtification +\series default + of many small files and inodes in a short time as caused by the +\begin_inset Quotes eld +\end_inset + +save +\begin_inset Quotes erd +\end_inset + + button), and thus are not really performance critical for this particular + use case. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + This is a striking example why careful +\series bold +selection of granularity level +\series default + (filesystem vs block layer) is essential. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + This is also a striking example why asynchronous operations can form a + huge advantage in certain use cases. +\end_layout + +\begin_layout Standard +The sharding setup is working until today, scaling up to the current number + of customers, which is more than an order of magnitude, in the range of + about a million of customers. + Of course, the number of shards had to be increased, but this is just what + sharding is about. 
+\end_layout + +\begin_layout Subsection +Properties of Storage Scalability +\begin_inset CommandInset label +LatexCommand label +name "subsec:Properties-Scalability" + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Influence Factors at Scalability +\begin_inset CommandInset label +LatexCommand label +name "subsec:Influence-Factors-Scalability" + +\end_inset + + +\end_layout + +\begin_layout Standard +In general, scalability of storage systems depends on the following factors + (list may be incomplete): +\end_layout + +\begin_layout Enumerate +The +\series bold +application class +\series default +, in particular its principal +\series bold +workingset behaviour +\series default + (in both dimensions: timely and locality). + More explanations about workingsets can be found at +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Enumerate +The +\series bold +size +\series default + +\begin_inset Formula $x$ +\end_inset + + of the application data and/or the +\series bold +number of application instances +\series default + (possibly also denoted by +\begin_inset Formula $x$ +\end_inset + +), and the amount of storage needed for it (could be also termed +\begin_inset Formula $x$ +\end_inset + +). + Besides the data itself, the corresponding +\series bold +metadata +\series default + (inodes, indexes, etc) can form an important factor, or can even +\emph on +dominate +\emph default + the whole story. + Typically, critical datacenter application data is tremendously differently + sized from workstation data. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Caution! Many people think erroneously that scalability would be +\emph on +linearly +\emph default + depending on +\begin_inset Formula $x$ +\end_inset + +. 
+ However, as is known at least since the 1960s (read some ancient papers + from Saltzer and/or from Denning), scalability is +\series bold +never linear +\series default +, but sometimes even +\series bold +\emph on +disruptive +\series default +\emph default +, in particular when RAM size is the bottleneck. + IO queues and/or networking queues are often also reacting to overload + in a disruptive fashion. + This means: after exceeding the +\series bold +scalability limit +\series default + of a particular system for its particular class of applications, the system + will always +\series bold +break down +\series default + from a customer's perspective, sometimes almost completely, and sometimes + even +\series bold +\emph on +fatally +\series default +\emph default +. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + On the other hand, some other systems are reacting with +\series bold +graceful degradation +\series default +. + Whether a particular system reacts to a particular type of (over)load, + either with graceful degradation, or with fatal disruption, or with some + intermediate behaviour, is some sort of +\begin_inset Quotes eld +\end_inset + +quality property +\begin_inset Quotes erd +\end_inset + + of the system and/or of the application. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + EVERY SYSTEM, even sharded systems, and even the internet as a whole, has + +\emph on +always +\emph default + some scalability limit +\emph on +somewhere +\emph default +. + There exists +\series bold +no +\begin_inset Quotes eld +\end_inset + +infinitely scaling +\begin_inset Quotes erd +\end_inset + + system +\series default + on earth! 
+\end_layout + +\begin_layout Enumerate +The +\series bold +\emph on +distribution +\series default +\emph default + of the application behaviour in both +\series bold +timely +\series default + and +\series bold +locality +\series default + dimensions. + Depending on the application class, this is often an +\emph on +exponential +\emph default + distribution according to Zipf's law. + By falsely +\emph on +assuming +\emph default + an equal distribution (or a Gaussian distribution) instead of actually + measuring the distribution in both dimensions, you can easily induce zillions + of costly problems for big +\begin_inset Formula $x$ +\end_inset + +, or even fatal failure of the whole system / project. +\end_layout + +\begin_layout Enumerate +The +\series bold +transformation +\series default + of the application workingset behaviour at architectural level, sometimes + caused by certain components resp their specific implementation or parameteriza +tion. + Examples are intermediate virtualization layers, e.g. + vmware +\family typewriter +*.vmdk +\family default + or KVM +\family typewriter +*.qcow2 +\family default + container formats which can completely change the game, not only in extreme + cases. + Another example is +\series bold +random distribution +\series default + to object stores, which can turn some uncomplicated sequential workloads + into highly problematic +\emph on +random IO +\emph default + workloads. + Don't overlook such potential pitfalls! +\end_layout + +\begin_layout Enumerate +The storage +\series bold +architecture +\series default + to be chosen, such as +\family typewriter +CentralStorage +\family default + vs +\family typewriter +BigCluster +\family default + vs +\family typewriter +*Sharding +\family default +. + Choice of the wrong architecture can be fatal for big +\begin_inset Formula $n$ +\end_inset + + and/or for certain timely / spatial application behaviour. 
+ Changing an architecture during operations on some petabytes of data and/or + some billions of inodes can be almost impossible, and/or can consume a + lot of time and money. +\end_layout + +\begin_layout Enumerate +The +\series bold +number +\series default + of storage +\series bold +nodes +\series default + +\begin_inset Formula $n$ +\end_inset + +. + In some architectures, addition of more nodes can make the system +\emph on +worse +\emph default + instead of better, c.f. + section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Reliability-Arguments-from" + +\end_inset + +. +\end_layout + +\begin_layout Enumerate +In case of architectures relying on a storage network: choice of +\series bold +layer +\series default + for cut point, e.g. + filesystem layer vs block layer, see section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Performance-Arguments-from" + +\end_inset + +, and/or introduction of an additional intermediate object storage layer + (which can result in major degradation from an architectural view). + Due to fundamental differences in distributed vs local +\series bold +cache coherence +\series default +, suchalike can have a +\emph on +tremendous +\emph default + effect on scalability. +\end_layout + +\begin_layout Enumerate +The +\series bold +implementation +\series default + of the architecture. + Be sure to understand the difference between an +\emph on +architecture +\emph default + and an +\emph on +implementation +\emph default + of that architecture. +\end_layout + +\begin_layout Enumerate +The size and types / properties of various +\series bold +caches +\series default + at various layers. + You need to know the general properties of +\series bold +inclusive +\series default + vs +\series bold +exclusive +\series default + cache architecture. + You absolutely need to know what +\series bold +thrashing +\series default + is, and under which conditions it can occur. 
+\begin_inset Newline newline +\end_inset + +It is advantageous for system architects to know +\begin_inset Foot +status open + +\begin_layout Plain Layout +Reading a few Wikipedia articles does not count as +\begin_inset Quotes eld +\end_inset + +knowledge +\begin_inset Quotes erd +\end_inset + +. + You need to be able to +\emph on +apply +\emph default + your knowledge to enterprise level systems (as opposed to workstation-sized + systems), +\emph on +sustainable +\emph default + and +\emph on +reproducible +\emph default +. + Therefore you need to have +\emph on +actually worked +\emph default + in the matter and gained some extraordinary experiences, on top of deep + understanding of the matter. +\end_layout + +\end_inset + + pre-loading strategies, as well as replacement strategies. + It is advantageous to know what +\family typewriter +LRU +\family default + or +\family typewriter +MFU +\family default + means, what their induced +\emph on +overhead +\emph default + is, and how they +\emph on +really +\emph default + work on +\emph on +actual +\emph default + data, not just on some artificial lab data. + You also should know what an +\series bold +anomaly +\series default + is, and how it can be produced not only by +\family typewriter +FIFO +\family default + strategies, but also by certain types of ill-designed multi-layer caching. + Beware: there are places where +\family typewriter +FIFO +\family default +-like behaviour is almost impossible to avoid, such as networks. + All of this is outside the scope of this MARS manual. + You should +\emph on +measure +\emph default +, when possible, the +\series bold +overhead +\series default + of cache implementations. + I know of +\emph on +examples +\emph default + where caching is +\emph on +counter-productive +\emph default +. + For example, certain types and implementations of SSD caches are over-hyped. + Removing a certain cache will then +\emph on +improve +\emph default + the situation. 
+ Notice: caches are conceptually based on some type of +\series bold +associative memory +\series default +, which is either very costly when directly implemented in hardware, or + can suffer from tremendous performance penalties when implemented inappropriate +ly in software. +\end_layout + +\begin_layout Enumerate + +\series bold +Hardware dimensioning +\series default + of the implementation: choice of storage hardware, for each storage node. + This includes SSDs vs HDDs, their attachment (e.g. + SAS multiplexing bottlenecks), RAID level, and controller limitations, + etc. +\end_layout + +\begin_layout Enumerate +Only for architectures relying on a storage network: network +\series bold +throughput +\series default + and network +\series bold +latencies +\series default +, and network +\series bold +bottlenecks +\series default +, including the +\series bold +queueing +\series default + behaviour / congestion control / +\series bold +packet loss +\series default + behaviour upon overload. + The latter is often neglected, leading to unexpected behaviour at load + peaks, and/or leading to costly over-engineering (examples see section + +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Example-Failures-of" + +\end_inset + +). +\end_layout + +\begin_layout Enumerate + +\series bold +\emph on +Hidden +\emph default + bottlenecks +\series default + of various types. + A complete enumeration is almost impossible, because there are too many + +\begin_inset Quotes eld +\end_inset + +opportunities +\begin_inset Quotes erd +\end_inset + +. + To reduce the latter, my general advice is to try to build bigger systems + as +\emph on +simple +\emph default + as possible. + This is why you should involve some +\emph on +real +\emph default + experts in storage systems, at least on critical enterprise data. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\emph on +Any +\emph default + of these factors can be dangerous when not carefully thought about and + treated, depending on your use case. +\end_layout + +\begin_layout Subsubsection +Example Scalability Scenario +\begin_inset CommandInset label +LatexCommand label +name "subsec:Example-Scalability-Scenario" + +\end_inset + + +\end_layout + +\begin_layout Standard +To get an impression what +\begin_inset Quotes eld +\end_inset + +enterprise critical data +\begin_inset Quotes erd +\end_inset + + can mean in a concrete example, here are some characteristic numbers on + 1&1 ShaHoLin (Shared Hosting Linux) around spring 2018, which would be + the +\emph on +input parameters +\emph default + for +\emph on +any +\emph default + potential solution architecture +\family typewriter +CentralStorage +\family default + vs +\family typewriter +BigCluster +\family default + vs +\family typewriter +Sharding +\family default +: +\end_layout + +\begin_layout Itemize +About 9 millions of customer homedirectories. +\end_layout + +\begin_layout Itemize +About 10 billions of inodes, with daily incremental backup. +\end_layout + +\begin_layout Itemize +More than 4 petabytes of +\emph on +net +\emph default + data (total +\family typewriter +df +\family default + filling level) in spring 2018, with a growth rate of 21% per year. +\end_layout + +\begin_layout Itemize +All of this permanently replicated into a second datacenter. +\end_layout + +\begin_layout Itemize +Webhosting very close to 24/7/365. + For maintenance, any resource must be switchable to the other datacenter + at any time, independently from other resources; while in catastrophic failure + scenarios +\emph on +all +\emph default + resources must be switchable within a short time. 
+\end_layout + +\begin_layout Standard +For simplicity of our sandbox game, we assume that all of this is in one + campus. + In reality, about 30% is residing on another continent. + Introducing this as an additional input parameter would not fundamentally + change the game. + Many other factors, like dependencies on existing infrastructure, are + also neglected. +\end_layout + +\begin_layout Paragraph +Theoretical Solution: +\family typewriter +CentralStorage +\end_layout + +\begin_layout Standard +Let us assume somebody would try to operate this on classical +\family typewriter +CentralStorage +\family default +, and let us assume that migration of this amount of data including billions + of inodes would be no technical problem. + What would be the outcome? +\end_layout + +\begin_layout Standard +With current technology, finding a single +\family typewriter +CentralStorage +\family default + appliance would be anything but easy. + Dimensioning would be needed for the +\emph on +lifetime +\emph default + of such a solution, which is at least 5 years. + In five years, the data would grow by a factor of about +\begin_inset Formula $1.21^{5}=2.6$ +\end_inset + +, which is then about +\begin_inset Formula $10.5$ +\end_inset + + petabytes. + This is only the +\emph on +net +\emph default + capacity; at hardware layer much more is needed for spare space and for + local redundancy. + The single +\family typewriter +CentralStorage +\family default + instance will need to scale up to at least this number, in one datacenter + (under the simplified game assumptions). +\end_layout + +\begin_layout Standard +The current number of client LXC containers is about +\begin_inset Formula $2600$ +\end_inset + +, independent from location. + You will have to support growth in number of them. + For maintenance, any of these need to be switchable to a different location + at any time.
+ The number of bare metal servers running them can vary with hardware architectu +re / hardware lifecycle, and with growth. + You will need to dimension a dedicated storage network for all of this. +\end_layout + +\begin_layout Standard +If you find a solution which can do this with current +\family typewriter +CentralStorage +\family default + technology for the next 5 years, then you will have to ensure that restore + from backup +\begin_inset Foot +status open + +\begin_layout Plain Layout +Local snapshots, whether LVM or via some COW filesystem, do not count as + backups! You need a +\emph on +logical +\emph default + copy, not a +\emph on +physical +\emph default + one, in case your production filesystem instance gets damaged. +\end_layout + +\end_inset + + can be done in less than 1 day in case of a fatal disaster, see also treatment + of +\family typewriter +CentralStorage +\family default + reliability in section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Reliability-Differences-CentralStorage" + +\end_inset + +. + Notice that the current self-built backup solution for a total of 15 billions + of inodes is based on a sharding model; converting this to some more or + less centralized solution would turn out as another challenge. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Attention! Buying 10 or 50 or 100 CentralStorage instances does not count + as a +\family typewriter +CentralStorage +\family default + architecture. + By definition, suchalike would be +\family typewriter +RemoteSharding +\family default + instead. + Notice that the current 1&1 solution is already a mixture of +\family typewriter +LocalSharding +\family default + and +\family typewriter +RemoteSharding +\family default +, so you would win +\emph on +nothing +\emph default + at architectural level. 
+ +\end_layout + +\begin_layout Standard +In your business case, you would need to justify the price difference between + the current component-based hardware solution (horizontally extensible + by +\emph on +scale-out +\emph default +) and +\family typewriter +CentralStorage +\family default +, which is about a factor of 10 per terabyte according to the table in section + +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Cost-Arguments-from" + +\end_inset + +. + Even if you manage to find a vendor who is willing to subsidize to a factor + of only 3, this is not all you need. + You need to add the costs for the dedicated storage network. + On top of this, you need to account for the +\emph on +migration costs +\emph default + after the lifetime of 5 years has passed, where the full data set needs + to be migrated to a successor storage system. +\end_layout + +\begin_layout Standard +Notice that classical argumentations with +\series bold +\emph on +manpower +\series default +\emph default + will not work. + The current operating team is about 10 persons, with no dedicated storage + admin. + This relatively small team is not only operating a total of more than 6,000 + shared boxes in all datacenters, but also some tenthousands of managed + dedicated servers, running essentially the same software stack, with practicall +y fully automated mass deployment. + Most of their tasks are related to central software installation, which + is then automatically distributed, and to operation / monitoring / troubleshoot +ing of masses of client servers. + Storage administration tasks in isolation are costing only a +\emph on +fraction +\emph default + of this. + Typical claims that +\family typewriter +CentralStorage +\family default + would require less manpower will not work here. + Almost everything which is needed for +\emph on +mass automation +\emph default + is already automated. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Neglecting the tenthousands of managed dedicated servers would be a catastrophi +c ill-design. + Their hardware is already given, by existing customer contracts, some of + them decades old. + You simply cannot fundamentally change the hardware of these customers + including their +\emph on +dedicated +\emph default + local disks, which is their +\emph on +main selling point +\emph default +. + You cannot simply convert them to a shared +\family typewriter +CentralStorage +\family default +, even if it would be technically possible, and if it would deliver similar + IOPS rates than tenthousands of local spindles (and if you could reach + the bundled performance of local SSDs from newer contracts), and even if + you would introduce some interesting +\series bold +storage classes +\series default + for all of this. + A dedicated server on top of a shared storage is no longer a dedicated + one. + You would have to migrate these customers to another product, with all + of its consequences. + Alone for these machines, +\emph on +most +\begin_inset Foot +status open + +\begin_layout Plain Layout +Only a few out of >1000 self-built or customized Debian packages are dealing + with MARS and/or with the clustermanager +\family typewriter +cm3 +\family default +. +\end_layout + +\end_inset + + +\emph default + of the current automation of +\family typewriter +LocalStorage +\family default + is needed +\emph on +anyway +\emph default +, although they are not geo-redundant at current stage. +\end_layout + +\begin_layout Standard +Conclusion: +\family typewriter +CentralStorage +\family default + is simply +\emph on +unrealistic +\emph default +. 
+\end_layout + +\begin_layout Paragraph +Theoretical Solution: +\family typewriter +BigCluster +\end_layout + +\begin_layout Standard +The main problem of +\family typewriter +BigCluster +\family default + is +\series bold +reliability +\series default +, as explained intuitively in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Reliability-Arguments-from" + +\end_inset + + and mathematically in appendix +\begin_inset CommandInset ref +LatexCommand vref +reference "chap:Mathematical-Model-of" + +\end_inset + +, and as observed in numerous installations not working as expected. +\end_layout + +\begin_layout Standard +Let us assume that all of these massive technical problems were solved, + somehow. + Then the business case would have to deal with the following: +\end_layout + +\begin_layout Standard +The total number of servers would need to be roughly +\emph on +doubled +\emph default +. + Not only their CAPEX, but also the corresponding OPEX (electrical power, + rackspace, manpower) would increase. + Their current electrical power cost alone, including cooling, is more than + the current sysadmin manpower cost. + Datacenter operations would also increase. + On top, a dedicated storage network and its administration would also be + needed. +\end_layout + +\begin_layout Standard +With respect to the tenthousands of managed dedicated servers and their + customer contracts, a similar argument as above holds. + You simply cannot convert them to +\family typewriter +BigCluster +\family default +. +\end_layout + +\begin_layout Standard +Conclusion: +\family typewriter +BigCluster +\family default + is also +\emph on +unrealistic +\emph default +. + There is nothing to win, but a lot to lose.
+\end_layout + +\begin_layout Paragraph +Current Solution: +\family typewriter +LocalSharding +\family default +, sometimes +\family typewriter +RemoteSharding +\end_layout + +\begin_layout Standard +Short story: it has been working for decades, and has been both cheap and robust since + geo-redundancy was added around 2010. +\end_layout + +\begin_layout Standard +With the advent of Football (see chapter +\begin_inset CommandInset ref +LatexCommand vref +reference "chap:LV-Football" + +\end_inset + +), the +\family typewriter +LocalSharding +\family default + architecture is rising to be on par with the most important management abilities + of +\family typewriter +CentralStorage +\family default + and +\family typewriter +BigCluster +\family default + / Software Defined Storage. +\end_layout + +\begin_layout Standard +The story with the tenthousands of managed dedicated servers is arguing + vice versa: without the traditional ShaHoLin sharding architecture and + all of its automation, including the newest addition called Football, the + product +\begin_inset Quotes eld +\end_inset + +managed dedicated servers +\begin_inset Quotes erd +\end_inset + + would not be possible at this scale. + +\end_layout + +\begin_layout Standard +Summary: the sharded +\begin_inset Quotes eld +\end_inset + +shared +\begin_inset Quotes erd +\end_inset + + product enables another +\begin_inset Quotes eld +\end_inset + +dedicated +\begin_inset Quotes erd +\end_inset + + product, which is sharded by definition, and it actually is known to scale + up by at least another order of magnitude (in terms of number of servers).
+\end_layout + +\begin_layout Subsection +Scalability of Filesystem Layer vs Block Layer +\begin_inset CommandInset label +LatexCommand label +name "subsec:Filesystem-Layer-vs" + +\end_inset + + +\end_layout + +\begin_layout Standard +Following factors are responsible for better architectural (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:What-is-Architecture" + +\end_inset + +) scalability of the block layer vs the filesystem layer, at least in many + cases, with a few exceptions (list may be incomplete): +\end_layout + +\begin_layout Enumerate + +\series bold +Granularity +\series default + of access: +\series bold +metadata +\series default + is often smaller than the content data it refers to, but access to data + is typically not possible without accessing corresponding metadata +\emph on +first +\emph default +. + When masses of metadata are present (e.g. + some billions of inodes as in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Example-Scalability-Scenario" + +\end_inset + +), and when it is accessed +\series bold +more frequently +\series default + than the corresponding data (e.g. + in stateless designs like Apache), it is likely to become the bottleneck. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Neglecting metadata and its access patterns is a major source of ill-designs. + I know of projects which have failed (in their original setup) because + of this. + Repair will typically involve some non-trivial architectural changes. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +By default, the block layer itself has almost no metadata at all (or only + tiny ones, such as describing a whole block device). 
+ Therefore it has an +\emph on +inherent advantage +\emph default + over the filesystem layer in such use cases. +\end_layout + +\begin_layout Enumerate + +\series bold +Caching +\series default +: shared memory caches in kernelspace (page cache + dentry cache) vs distributed + caches over network. + See the picture in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Performance-Arguments-from" + +\end_inset + +. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + There exist +\emph on +examples +\emph default + where shared distributed caches do not work at all. + I know of +\emph on +several +\emph default + projects which have failed. + Another project than mentioned in section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Example-Failures-of" + +\end_inset + + has failed because of violations of POSIX filesystem semantics. +\end_layout + +\begin_layout Enumerate +Only in distributed systems: the +\series bold +cache coherence problem +\series default +, both on metadata and on data. + Depending on load patterns, this can lead to tremendous performance degradation +, see example in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Example-Failures-of" + +\end_inset + +. +\end_layout + +\begin_layout Enumerate +Dimensioning of the +\series bold +network +\series default +: throughput, latencies, queueing behaviour. +\end_layout + +\begin_layout Standard +There exist a few known exceptions (list may be incomplete, please report + further examples if you know some): +\end_layout + +\begin_layout Itemize +Databases: these are typically operating on specific container formats, + where no frequent +\emph on +external +\emph default + metadata access is necessary, and where no sharing of the +\emph on +container as such +\emph default + is necessary. 
+ Typically, there is no big difference between storing them in block devices + vs local filesystems. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Exception from the exception: MyISAM is an old design from the 1980s, originall +y based on DBASE data structures. + Don't try to access them over NFS or similar. + Or, better, try to avoid them at all if possible. +\end_layout + +\begin_layout Itemize +VM images: these are logical BLOBS, so there is typically no big difference + whether you have an intermediate +\emph on +true +\emph default + filesystem layer, or not. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Filesystems on top of object stores are no true intermediate filesystems. + They are violating Dijkstra's important layering rules, as stated in his + famous articles on THE. + A similar argument holds for block devices on top of object stores. + Intermediate container formats like +\family typewriter +*.vmdk +\family default + or +\family typewriter +*.qcow2 +\family default + can also act as game changers. + This does not mean that you have to avoid them at all. + However, be sure to +\series bold +check their influence +\series default +, and don't forget their +\emph on +workingset +\emph default + and their +\emph on +caching behaviour +\emph default + (which can go both into positive and into negative direction), in order + to really +\emph on +know what you are doing! +\end_layout + +\begin_layout Standard +There exist a few cases where a distributed filesystem, sometimes even actually + with +\begin_inset Formula $O(n^{2})$ +\end_inset + + behaviour, +\emph on +must +\emph default + be used, because there exists a +\emph on +requirement +\emph default + for it. 
+ Some examples (list is certainly incomplete): +\end_layout + +\begin_layout Itemize +HPC = +\series bold +High Performance Computing +\series default + on modern supercomputers, consisting of a high number of +\begin_inset Formula $n$ +\end_inset + + compute nodes, are often requiring access to a shared persistent data pool, + where each of the +\begin_inset Formula $n$ +\end_inset + + nodes must be sometimes able to access the same persistent data, sometimes + both for reading and writing. + Therefore, several supercomputers are using cluster filesystems like Lustre. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Care must be taken that high-frequency / fine granularity communication + over the distributed filesystem and its dedicated storage network does + not take place, but instead occurs over the ordinary low-latency communication + fabrics each modern supercomputer is relying on. + True +\begin_inset Formula $O(n^{2})$ +\end_inset + + storage access behaviour should be avoided as far as possible (given by + the problem to be solved). + When absolutely necessary, location transparency (as possible with cluster + filesystems like Lustre) as well as its DSM = Distributed Shared Memory + model must be given up, and an +\series bold +explicit communication model +\series default + must be used instead, which allows explicit control over replicas and their + communication paths (e.g. + propagation in a binary tree fashion), although it results in much more + work for the programmers. + Only low frequency / coarse granularity transfers of +\emph on +bulk data +\emph default + with +\emph on +high locality +\emph default + should run over distributed filesystems, preferably in streaming mode. + The total frequency of metadata access should be low, because metadata + consistency may form a bottleneck when updated too frequently. 
+ The programmers of the distributed application software need to take care + for this. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that certain supercomputer workloads may be crying for a RemoteSharding + or FlexibleSharding storage architecture in place of a BigCluster architecture. + However, this is very application specific. +\end_layout + +\begin_layout Itemize +Student pools at universities, or location-independent workplaces at companies. + This is just the usecase where NFS was originally constructed for. + Typically, +\series bold +workstation workloads +\series default + are neither performance critical, nor prone to actual +\begin_inset Formula $O(n^{2})$ +\end_inset + + behaviour (although the network infrastructure would +\emph on +allow +\emph default + for it), because each user has her own home directory which is typically + +\emph on +not shared +\emph default + with others, and she cannot split herself and sit in front of multiple + workstations at the same time. + Thus the +\emph on +local per-workstation +\emph default + NFS caching strategies have a good chance to hide much of the network latencies +, and thus the actual total network workload is typically only +\begin_inset Formula $O(n).$ +\end_inset + + +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + This can lead to a dangerous misinterpretation: because it apparently works + even for a few thousands of workstations, people conclude +\emph on +wrongly +\emph default + that the network filesystem +\begin_inset Quotes eld +\end_inset + +must be scalable +\begin_inset Quotes erd +\end_inset + +. 
+ Some people are then applying their experience to completely different + usecases, where much higher metadata traffic by several orders of magnitude + is occurring (such as in webhosting), or even where true +\begin_inset Formula $O(n^{2})$ +\end_inset + + runtime behaviour is occurring (see example of a failed scalability scenario + in section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Example-Failures-of" +plural "false" +caps "false" +noprefix "false" + +\end_inset + +). +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + In general: when something works for usecase A, this +\series bold +does +\emph on +not +\emph default + prove +\series default + that it will also work for another usecase B. +\end_layout + +\begin_layout Section +Recommendations for Design and Operation of Storage Systems +\begin_inset CommandInset label +LatexCommand label +name "sec:Recommendations-for-Designing" + +\end_inset + + +\end_layout + +\begin_layout Subsection +Recommendations for Managers +\begin_inset CommandInset label +LatexCommand label +name "subsec:Recommendations-for-Managers" + +\end_inset + + +\end_layout + +\begin_layout Standard +When you are responsible for +\series bold +masses of enterprise-critical data +\series default +, the most important point is to get people with +\series bold +the right skills +\series default +, in +\emph on +addition(!) to +\emph default + the +\emph on +right mindset +\emph default +, and to assign the right roles to them.
+\end_layout + +\begin_layout Standard +Practical observation from many groups in many companies: which storage + systems / architectures are in use, and how much they are +\emph on +really +\emph default + failure resistant and reliable, and how much they are +\emph on +really +\emph default + scalable for their workload, and what is their TCO (Total Cost of Ownership), + does often +\emph on +not +\emph default + depend on real knowledge and facts. + It often depends on +\series bold +personal habits +\series default + and +\series bold +pre-judgement +\series default + of staff +\begin_inset Foot +status open + +\begin_layout Plain Layout +\noindent +This can be seen in a bigger company (e.g. + after mergers etc) when very different architectures have been built by + different teams for very similar usecases, although they are sometimes + even roughly comparable in size and workload. +\end_layout + +\end_inset + +. + In essence, this results in a gambling game how safe / cost-effective etc + your critical data +\emph on +really +\emph default + is. +\end_layout + +\begin_layout Standard +As just explained in the previous section, there are so many pitfalls, and + there are only a few people who know them, because more people are working + in small-scale systems than in large-scale enterprise ones. + There are lots of people on the market who +\emph on +claim +\emph default + to have some experience, but in reality they don't know what they don't + know ( +\series bold +second-order ignorance +\series default +). +\end_layout + +\begin_layout Standard +Second-order ignorance is very dangerous, even for affected people themselves, + because they are in good faith about their own skills, and believe that they would + be able to control everything (sometimes they really want to control literally + +\emph on +everything +\emph default +, even other people who have more real experience and knowledge).
+ See for example wrong assumptions and +\begin_inset Quotes eld +\end_inset + +false proofs +\begin_inset Quotes erd +\end_inset + + about scalability, derived from different usecases (or in extreme cases + even from workstation workloads), or the failed scalability scenario in + section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Example-Failures-of" +plural "false" +caps "false" +noprefix "false" + +\end_inset + + where some freelancers were consulted as +\begin_inset Quotes eld +\end_inset + +external experts +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Quotation +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Check your information sources! There is a +\emph on +systematic reason +\emph default + for ill-informed +\begin_inset Quotes eld +\end_inset + +experts +\begin_inset Quotes erd +\end_inset + +. + On the internet, you can find a lot of so-called +\begin_inset Quotes eld +\end_inset + +best practices +\begin_inset Quotes erd +\end_inset + +. + Many of them are propagating badly scaling storage architectures for enterprise + workloads, sometimes even +\emph on +generally +\emph default + claiming they would +\begin_inset Quotes eld +\end_inset + +scale very well +\begin_inset Quotes erd +\end_inset + +, which is however often based on +\emph on +assumptions +\emph default + instead of knowledge (and almost never based on +\emph on +measurements +\emph default + at the right measurement points for deriving substantial knowledge about + your real application behaviour). + Literally +\emph on +anyone +\emph default + can post falsely generalized +\begin_inset Quotes eld +\end_inset + +best practices +\begin_inset Quotes erd +\end_inset + + to the internet.
+ Together with second-order ignorance about the non-transferability of +\begin_inset Quotes eld +\end_inset + +success stories +\begin_inset Quotes erd +\end_inset + + from usecase A to usecase B (resulting in +\emph on +false +\begin_inset Quotes eld +\end_inset + +proofs +\emph default + +\begin_inset Quotes erd +\end_inset + +), the internet is creating +\series bold +information bubbles +\series default +. + +\end_layout + +\begin_layout Quotation +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Real knowledge originates from evaluated sources, such as +\series bold +scientific publications +\series default + which have undergone at least some minimum +\emph on +quality check +\emph default +, and which are trying to describe their preconditions and operating environment +s as precisely +\begin_inset Foot +status open + +\begin_layout Plain Layout +\noindent +Therefore, chances are better to get a real expert when he has some (higher) + academic degrees, and was working in the area for a longer time. +\end_layout + +\end_inset + + as possible. +\end_layout + +\begin_layout Quotation +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Real experts will tell you when they don't know something. + In addition, they will tell you +\emph on +multiple +\emph default + ways of obtaining such information, such as measurements, simulation, + etc.
+\end_layout + +\begin_layout Standard +If you don't have anyone in your teams who knows how +\series bold +caching +\series default + +\emph on +really +\emph default + works, or if it is a single guy who cannot withstand the pressure from + a whole group of +\begin_inset Quotes eld +\end_inset + +alpha animals +\begin_inset Quotes erd +\end_inset + +, you are running an +\series bold +increased risk +\series default + of unnecessary expenses +\begin_inset Foot +status open + +\begin_layout Plain Layout +I know of cases which have produced unnecessary +\emph on +direct +\emph default + cost of at least € 20 million. +\end_layout + +\end_inset + +, worse services (indirect costs), failed projects, and sometimes even resulting + in loss of market share and/or of stock exchange value. +\end_layout + +\begin_layout Standard +The problem is that it +\emph on +looks so easy +\emph default +, as if everyone could build a larger storage system, with ease. + For example, just +\begin_inset Quotes eld +\end_inset + +spend some more money +\begin_inset Quotes erd +\end_inset + +, that's all you would need. + Unfortunately, both +\begin_inset Quotes eld +\end_inset + +marketing drones +\begin_inset Quotes erd +\end_inset + + from commercial storage vendors, and even a few OpenSource advocates, are + propagating this +\series bold +dangerous mindset +\series default +. +\end_layout + +\begin_layout Standard +As a responsible manager, how can you detect dangerous partial knowledge? + Good indicators are wrong usage of the term +\begin_inset Quotes eld +\end_inset + +architecture +\begin_inset Quotes erd +\end_inset + + (see definition in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:What-is-Architecture" +plural "false" +caps "false" +noprefix "false" + +\end_inset + +), and/or +\series bold + confusion of architecture with implementation +\series default +.
+ When somebody confuses +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that there exist people who use the term +\begin_inset Quotes eld +\end_inset + +architecture +\begin_inset Quotes erd +\end_inset + + inadvertently. + They don't even know that they are confusing architecture with implementat +ion. + Pure usage of a certain term is no clear indicator that somebody is really + an expert. +\end_layout + +\end_inset + + this, he does not really have an overview of different architectural solution + classes. + Instead, such people are tending to propagate their random +\begin_inset Quotes eld +\end_inset + +favourite product +\begin_inset Quotes erd +\end_inset + +. + For a responsible manager, this increases the risk of getting a non-optimum or + even bad / dangerous solution. +\end_layout + +\begin_layout Standard +Not everything which works in a garage, or in a student pool, or in the + testlab (whether it's yours or from a commercial storage vendor), or in + a PoC with some +\begin_inset Quotes eld +\end_inset + +friendly customers +\begin_inset Quotes erd +\end_inset + +, is well-suited for large enterprises and their critical data (measured + in petabytes / billions of files / etc), or is the optimum solution for + TCO. + Some rules of thumb, out of experience and observation: +\end_layout + +\begin_layout Itemize +For each 1 or 2 orders of magnitude of the +\series bold +size +\series default + of your data, you need better methods for safe construction and operation. + At least for each 3 to 4 orders of magnitude (sometimes even for less), + you need +\series bold +better architectures +\series default +, and people who can deal with them. +\end_layout + +\begin_layout Itemize +For each 1 or 2 orders of magnitude of +\series bold +criticality +\series default + of your data (measured by +\emph on +losses +\emph default + in case of certain incidents), you will also need better architecture, + not just better components.
+\end_layout + +\begin_layout Subsection +Recommendations for Architects and Sysadmins +\begin_inset CommandInset label +LatexCommand label +name "subsec:Recommendations-for-Architects" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order of precedence, do the following: +\end_layout + +\begin_layout Enumerate + +\series bold +Fix and/or limit and/or tune the +\emph on +application +\series default +\emph default +. +\begin_inset Newline newline +\end_inset + +Some extreme examples: +\end_layout + +\begin_deeper +\begin_layout Itemize +When you encounter a classical Unix +\series bold +fork bomb +\series default +, you have no chance against it. + Even the +\begin_inset Quotes eld +\end_inset + +best and the most expensive hardware +\begin_inset Quotes erd +\end_inset + + is unable to successfully run a fork bomb. + The only countermeasure is +\emph on +limitation of resources +\emph default +. + Reason: unlimited resources do not exist on earth. +\end_layout + +\begin_layout Itemize +If you think that this were only of academic interest: several types of + internet +\series bold +DDOS attacks +\series default + are acting like a fork bomb, and +\series bold +Apache +\series default + is also acting similarly to a fork bomb when not configured properly. + This is not about academics, it is about +\emph on +your survival +\emph default + (in the sense of Darwin). +\end_layout + +\begin_layout Itemize +If you think it cannot hurt you because you are running +\family typewriter +fast-cgi +\family default + or another application scheme where forks are not part of the game (e.g. + databases and many others): please notice that +\series bold +network queues +\series default + are often acting as a replacement for processes. + Overflow of queues can have a similar effect as fork bombs from the viewpoint + of customers: they simply don't get the service they are expecting.
+\end_layout + +\begin_layout Itemize +Real-life example: some percentage of +\family typewriter +WordPress +\family default + customers are typically and +\emph on +systematically +\emph default + +\series bold +misconfiguring +\series default + their +\family typewriter +wp-cron +\family default + cron jobs. + They create backups of their website, which +\emph on +include +\emph default + their old backups. + Result: in each generation of the backups, the needed disk space will roughly + +\emph on +double +\emph default +. + Even if you had +\begin_inset Quotes eld +\end_inset + +unlimited storage +\begin_inset Quotes erd +\end_inset + + on top of the +\begin_inset Quotes eld +\end_inset + +best and the most expensive storage system +\begin_inset Quotes erd +\end_inset + +, and even if you would like to give +\begin_inset Quotes eld +\end_inset + +unlimited storage +\begin_inset Quotes erd +\end_inset + + to your customers, it simply cannot work at all. + Exponential growth is exponential growth. + After a few months of this kind of daily backup, you would need more storage + than atoms exist in the whole universe. + You +\emph on +must +\emph default + introduce some quota limits somewhere. + And you +\emph on +must +\emph default + ensure that the +\family typewriter +wp-cron +\family default + misconfiguration is fixed, whoever is responsible for fixing it. +\end_layout + +\begin_layout Itemize +Another +\family typewriter +WordPress +\family default + example: the +\family typewriter +wp-cron +\family default + configuration syntax is not easily understandable by laymen. + It is easy to +\series bold +misconfigure +\series default + such that a backup is created +\emph on +once per minute +\emph default +. + As long as the website is very small, this will not even be noticed by + sysadmins. 
+ However, for bigger websites (and they are typically growing over time), + the IO load may increase to a point until even asynchronous replication + over 10Gig interfaces cannot catch up. + Even worse: the next run of +\family typewriter +wp-cron +\family default + may start before the old one has finished within a minute. + Again, there is no chance except fixing the +\emph on +root cause +\emph default + at application level. +\end_layout + +\end_deeper +\begin_layout Enumerate + +\series bold +Choose the right +\emph on +overall +\emph default + architecture +\series default + (not limited to storage). +\begin_inset Newline newline +\end_inset + +An impressive example for architectural (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:What-is-Architecture" + +\end_inset + +) ill-design can be found in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Example-Failures-of" + +\end_inset + +. + Important explanations are in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Properties-Scalability" + +\end_inset + +, in particular subsection +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Influence-Factors-Scalability" + +\end_inset + +, and section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Filesystem-Layer-vs" + +\end_inset + +. + A strategic example is in subsection +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Example-Scalability-Scenario" + +\end_inset + +. + It is absolutely necessary to know the standard cache hierarchy of Unix + (similarly also found in Windows) from section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Performance-Arguments-from" + +\end_inset + +. + More explanations are in this manual at many places. 
+\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + In general, major ill-designs of overall architectures (end-to-end) cannot + be fixed at component level. + Even the +\begin_inset Quotes eld +\end_inset + +best tuning of the world +\begin_inset Quotes erd +\end_inset + + executed by the +\begin_inset Quotes eld +\end_inset + +best tuning expert +\begin_inset Quotes erd +\end_inset + + on top of the +\begin_inset Quotes eld +\end_inset + +best and most expensive storage +\emph on +components +\emph default + and the best storage +\emph on +network +\emph default + of the world +\begin_inset Quotes erd +\end_inset + + cannot compensate major ill-designs, such as +\begin_inset Formula $O(n^{2})$ +\end_inset + + behaviour. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Similarly for reliability: if you have problems with too many and/or too + large incidents affecting too many customers, read sections +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Reliability-Arguments-from" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Reliability-Differences-CentralStorage" + +\end_inset + +. +\end_layout + +\begin_layout Enumerate + +\series bold +Choice and tuning of components +\series default +. +\begin_inset Newline newline +\end_inset + +No further explanations necessary, because most people already know this. + In case you think this is the only way: no, it is typically the +\emph on +worst +\emph default + and typically only the +\emph on +last resort +\emph default + when compared to the previous enumeration items. +\begin_inset Newline newline +\end_inset + +Exception: choice of wrong components with insufficient properties for your + particular application / use case. 
+ But this is an +\emph on +architectural +\emph default + problem in reality. +\end_layout + +\begin_layout Chapter +Use Cases for MARS vs DRBD +\begin_inset CommandInset label +LatexCommand label +name "chap:Use-Cases-for" + +\end_inset + + +\end_layout + +\begin_layout Standard +DRBD has a long history of successfully providing HA features to many users + of Linux. + With the advent of MARS, many people are wondering what the difference + is. + They ask for recommendations. + In which use cases should DRBD be recommended, and in which other cases + is MARS the better choice? +\end_layout + +\begin_layout Standard +The following table is a short guide to the most important cases where the + decision is rather clear: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +Use Case +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +Recommendation +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +server pairs, each directly connected via +\series bold +crossover cables +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +DRBD +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\series bold +active-active +\series default + / dual-primary, e.g. 
+ +\family typewriter +\series bold +gfs2 +\family default +\series default +, +\family typewriter +\series bold +ocfs2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +DRBD +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +distance +\series bold +> 50km +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +MARS +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\series bold +> 100 server pairs +\series default + over a short-distance +\series bold +shared +\series default + line +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +MARS +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +all else / intermediate cases +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +read the following details +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +There exist some use cases where DRBD is clearly better than MARS. + 1&1 has a long history of experiences with DRBD where it works very well, + in particular coupling Linux devices rack-to-rack via crossover cables. + DRBD is just +\emph on +constructed +\emph default + for that use case (RAID-1 over network). + In such a scenario, DRBD is better than MARS because it uses up less disk + space resources. + In addition, newer DRBD versions can run over high-speed but short-distance + interconnects like Infiniband (via the SDP protocol). + Another use case for DRBD is active-active / dual-primary mode, e.g. + +\family typewriter +ocfs2 +\family default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that +\family typewriter +ocfs2 +\family default + is apparently not constructed for long distances.
+ 1&1 has some experiences on a specific short distance cluster where the + +\family typewriter +ocfs2 +\family default + / +\family typewriter +DRBD +\family default + combination scaled a little bit better than +\family typewriter +NFS +\family default +, but worse than +\family typewriter +glusterfs +\family default + (using 2 clients in both cases – notice that +\family typewriter +glusterfs +\family default + showed extremely bad performance when trying to enable active-active +\family typewriter +glusterfs +\family default + replication between 2 server instances, therefore we ended up using active-pass +ive DRBD replication below a single +\family typewriter +glusterfs +\family default + server). + Conclusion: +\family typewriter +NFS +\family default + < +\family typewriter +ocfs2 +\family default + < +\family typewriter +glusterfs +\family default + < sharding. + We found that +\family typewriter +glusterfs +\family default + on top of active-passive DRBD scalability was about 2 times better than + +\family typewriter +NFS +\family default + on top of active-passive DRBD, while +\family typewriter +ocfs2 +\family default + on top of +\family typewriter +DRBD +\family default + in active-active mode was somewhere inbetween. + All cluster comparisons with an increasing workload over time (measured + as number of customers which could be safely operated). + Each system was replaced by the next one when the respective scalability + was at its respective end, each time leading to operational problems. + The ultimate solution was to replace all of these clustering concepts by + the general concept of +\series bold +sharding +\series default +. +\end_layout + +\end_inset + + over short +\begin_inset Foot +status open + +\begin_layout Plain Layout +Active-active won't work over long distances at all because of high network + latencies (cf chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Cloud-Storage" + +\end_inset + +). 
+ Probably, for replication of whole clusters over long distances DRBD and + MARS could be stacked: using DRBD on top of MARS for active-active clustering + of +\family typewriter +gfs2 +\family default + or +\family typewriter +ocfs2 +\family default +, and a MARS instance +\emph on +below +\emph default + for failover of +\emph on +one +\emph default + of the DRBD replicas over long distances. +\end_layout + +\end_inset + + distances. +\end_layout + +\begin_layout Standard +On the other hand, there exist other use cases where DRBD did not work as + expected, leading to incidents and other operational problems. + We analyzed them for our specific use cases. + The later author of MARS came to the conclusion that they could only be + resolved by fundamental changes in the overall architecture of DRBD. + The development of MARS started at the personal initiative of the author, + first in the form of a personal project during holidays, but later picked up + by 1&1 as an official project. +\end_layout + +\begin_layout Standard +MARS and DRBD simply have +\series bold +different application areas +\series default +. +\end_layout + +\begin_layout Standard +In the following, we will discuss the pros and cons of each system in particular + situations and contexts, and we shed some light on their conceptual and + operational differences.
+\end_layout + +\begin_layout Section +Network Bottlenecks +\begin_inset CommandInset label +LatexCommand label +name "sec:Network-Bottlenecks" + +\end_inset + + +\end_layout + +\begin_layout Subsection +Behaviour of DRBD +\begin_inset CommandInset label +LatexCommand label +name "subsec:Behaviour-of-DRBD" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to describe the most important problem we found when DRBD was used + to couple whole datacenters (each encompassing thousands of servers) over + metro distances, we strip down that complicated real-life scenario to a + simplified laboratory scenario in order to demonstrate the effect with + minimal means. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that the following DRBD effect does not appear at crossover cables. + The following scenario covers a non-standard case of DRBD. + DRBD works fine when no network bottleneck appears! +\end_layout + +\begin_layout Standard +The following picture illustrates an effect which has been observed in 1&1 + datacenters when running masses of DRBD instances through a single network + bottleneck. + In addition, the effect is also reproducible by an older version of the + MARS test suite +\begin_inset Foot +status open + +\begin_layout Plain Layout +The effect was demonstrated some years ago with DRBD version 8.3.13. + By construction, it is independent from any of the DRBD series 8.3.x, 8.4.x, + or 9.0.x.
+\end_layout + +\end_inset + +: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/network-bottleneck-drbd.fig + width 80col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The simplified scenario is the following: +\end_layout + +\begin_layout Enumerate +DRBD is loaded with a low to medium, but constant rate of write operations + for the sake of simplicity of the scenario. +\end_layout + +\begin_layout Enumerate +The network has some throughput bottleneck, depicted as a red line. + For the sake of simplicity, we just linearly decrease it over time, starting + from full throughput, down to zero. + The decrease is very slow over time (some minutes, or even hours). +\end_layout + +\begin_layout Standard +What will happen in this scenario? +\end_layout + +\begin_layout Standard +As long as the actual DRBD write throughput is lower than the network bandwidth + (left part of the horizontal blue line), DRBD works as expected. +\end_layout + +\begin_layout Standard +Once the maximum network throughput (red line) starts to fall short of the + required application throughput (first blue dotted line), we get into trouble. + By its very nature, DRBD works +\series bold +synchronously +\series default +. + Therefore, it +\emph on +must +\emph default + transfer all your application writes through the bottleneck, but now it + is impossible +\begin_inset Foot +status open + +\begin_layout Plain Layout +This is independent from the DRBD protocols A through C, because it just + depends on an information-theoretic argument independently from any protocol. + We have a fundamental conflict between network capabilities and application + demands here, which cannot be circumvented due to the +\series bold +synchronous +\series default + nature of DRBD. +\end_layout + +\end_inset + + due to the bottleneck.
+ As a consequence, the application running on top of DRBD will see increasingly + higher IO latencies and/or stalls / hangs. + We found practical cases (at least with former versions of DRBD) where + IO latencies exceeded practical monitoring limits such as +\begin_inset Formula $5$ +\end_inset + + s by far, up to the range of +\emph on +minutes +\emph default +. + As an experienced sysadmin, you know what happens next: your application + will run into an incident, and your customers will be dissatisfied. +\end_layout + +\begin_layout Standard +In order to deal with such situations, DRBD has lots of tuning parameters. + In particular, the +\family typewriter +timeout +\family default + parameter and/or the +\family typewriter +ping-timeout +\family default + parameter will determine when DRBD will give up in such a situation and + simply drop the network connection as an emergency measure. + Dropping the network connection is roughly equivalent to an automatic +\family typewriter +disconnect +\family default +, followed by an automatic re-connect attempt after +\family typewriter +connect-int +\family default + seconds. + During the dropped connection, the incident will appear as being resolved, + but at some hidden cost +\begin_inset Foot +status open + +\begin_layout Plain Layout +By appropriately tuning various DRBD parameters, such as +\family typewriter +timeout +\family default + and/or +\family typewriter +ping-timeout +\family default +, you can keep the impact of the incident below some viable limit. + However, the automatic disconnect will then happen earlier and more often + in practice. + Flaky or overloaded networks may easily lead to an enormous number of automatic + disconnects. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +What happens next in our scenario? During the +\family typewriter +disconnect +\family default +, DRBD will record all positions of writes in its bitmap and/or in its activity + log. 
+ As soon as the automatic re-connect succeeds after +\family typewriter +connect-int +\family default + seconds, DRBD has to do a partial re-sync of those blocks which were marked + dirty in the meantime. + This leads to an +\emph on +additional +\emph default + bandwidth demand +\begin_inset Foot +status open + +\begin_layout Plain Layout +DRBD parameters +\family typewriter +sync-rate +\family default + resp +\family typewriter +resync-rate +\family default + may be used to tune the height of the additional demand. + In addition, the newer parameters +\family typewriter +c-plan-ahead +\family default +, +\family typewriter +c-fill-target +\family default +, +\family typewriter +c-delay-target +\family default +, +\family typewriter +c-min-rate +\family default +, +\family typewriter +c-max-rate +\family default + and friends may be used to dynamically adapt to +\emph on +some +\emph default + situations where the application throughput +\emph on +could +\emph default + fit through the bottleneck. + These newer parameters were developed in a cooperation between 1&1 and + Linbit, the maker of DRBD. +\end_layout + +\begin_layout Plain Layout +Please note that lowering / dynamically adapting the resync rates may help + in lowering the +\emph on +probability +\emph default + of occurrences of the above problems in practical scenarios where the bottlenec +k would recover to viable limits after some time. + However, lowering the rates will also increase the +\emph on +duration +\emph default + of re-sync operations accordingly. + The +\emph on +total amount of re-sync data +\emph default + simply does not decrease when lowering +\family typewriter +resync-rate +\family default +; it even tends to increase over time when new requests arrive. + Therefore, the +\emph on +expectancy value +\emph default + of problems caused by +\emph on +strong +\emph default + network bottlenecks (i.e. 
+ when not even the ordinary application rate is fitting through) is +\emph on +not +\emph default + improved by lowering or adapting +\family typewriter +resync-rate +\family default +, but rather the expectancy value mostly depends on the +\emph on +relation +\emph default + between the amount of holdback data versus the amount of application write + data, both measured for the duration of some given strong bottleneck. +\end_layout + +\end_inset + + as indicated by the upper dotted blue box. +\end_layout + +\begin_layout Standard +Of course, there is +\emph on +absolutely no chance +\emph default + to get the increased amount of data through our bottleneck, since not even + the ordinary application load (lower dotted lines) could be transferred. +\end_layout + +\begin_layout Standard +Therefore, you run a +\series bold +very high risk +\series default + that the re-sync cannot finish before the next +\family typewriter +timeout +\family default + / +\family typewriter +ping-timeout +\family default + cycle will drop the network connection again. +\end_layout + +\begin_layout Standard +What will be the final result when that risk becomes true? Simply, your + secondary site will be +\emph on +permanently +\emph default + in state +\family typewriter +inconsistent +\family default +. + This means, you have lost your redundancy. + In our scenario, there is no chance at all to become consistent again, + because the network bottleneck declines more and more, slowly. + It is simply +\emph on +hopeless +\emph default +, by construction. +\end_layout + +\begin_layout Standard +In case you lose your primary site now, you are completely lost. +\end_layout + +\begin_layout Standard +Some people may argue that the probability for a similar scenario were low. + We don't agree with such an argument. + Not only because it really happens in practice, and it may even last some + days until problems are fixed.
+ In case of +\series bold +rolling disasters +\series default +, the network is very likely to become flaky and/or overloaded shortly before + the final damage. + Even in other cases, you can easily end up with inconsistent secondaries. + It occurs not only in the lab, but also in practice if you operate some + hundreds or even thousands of DRBD instances. +\end_layout + +\begin_layout Standard +The point is that you can produce an ill behaviour +\emph on +systematically +\emph default + just by overloading the network a bit for some sufficient duration. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + When coupling whole datacenters via some thousands of DRBD connections, + any (short) network loss will almost certainly increase the re-sync network + load each time the outage appears to be over. + As a consequence, overload may be +\emph on +provoked +\emph default + by the re-sync repair attempts. + This may easily lead to self-amplifying +\series bold +throughput storms +\series default + in some resonance frequency (similar to self-destruction of a bridge when + an army is marching over it in lockstep). +\end_layout + +\begin_layout Standard +The only way for reliable prevention of loss of secondaries is to start + any re-connect +\emph on +only +\emph default + in such situations where you can +\emph on +predict in advance +\emph default + that the re-sync is +\emph on +guaranteed +\emph default + to finish before any network bottleneck / loss will cause an automatic + disconnect again. + We don't know of any method which can reliably predict the future behaviour + of a complex network. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Conclusion: in the presence of network bottlenecks, you run a considerable + risk that your DRBD mirrors get destroyed just in that moment when you + desperately need them. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that crossover cables usually never show a behaviour like depicted + by the red line. + Crossover cables are +\emph on +passive components +\emph default + which normally +\begin_inset Foot +status open + +\begin_layout Plain Layout +Exceptions might be mechanical jiggling of plugs, or electro-magnetical + interferences. + We never noticed any of them. +\end_layout + +\end_inset + + either work, or not. + The binary connect / disconnect behaviour of DRBD has no problems to cope + with that. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +or +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Linbit recommends a +\series bold +workaround +\series default + for the inconsistencies during re-sync: LVM snapshots. + We tried it, but found a +\emph on +performance penalty +\emph default + which made it prohibitive for our concrete application. + A problem seems to be the cost of destroying snapshots. + LVM uses by default a BOW strategy (Backup On Write, which is the counterpart + of COW = Copy On Write). + BOW increases IO latencies during ordinary operation. + Retaining snapshots is cheap, but reverting them may be very costly, depending + on workload. + We didn't fully investigate that effect, and our experience is a few years + old. 
+ You might come to a different conclusion for a different workload, for + newer versions of system software, or for a different strategy if you carefully + investigate the field. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + DRBD problems usually arise +\emph on +only +\emph default + when the network throughput shows some +\begin_inset Quotes eld +\end_inset + +awkward +\begin_inset Quotes erd +\end_inset + + analog behaviour, such as overload, or as occasionally produced by various + switches / routers / transmitters, or other potential sources of packet + loss. +\end_layout + +\begin_layout Subsection +Behaviour of MARS +\begin_inset CommandInset label +LatexCommand label +name "subsec:Behaviour-of-MARS" + +\end_inset + + +\end_layout + +\begin_layout Standard +The behaviour of MARS in the above scenario: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/network-bottleneck-mars.fig + width 80col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +When the network is restrained, an asynchronous system like MARS will continue + to serve the user IO requests (dotted green line) without any impact / + incident while the actual network throughput (solid green line) follows + the red line. + In the meantime, all changes to the block device are recorded at the transactio +n logfiles. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Here is one point in favour of DRBD: MARS stores its transaction logs on + the filesystem +\family typewriter +/mars/ +\family default +. + When the network bottleneck is lasting very long (some days or even some + weeks), the filesystem will eventually run out of space some day. 
+ Section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Defending-Overflow" + +\end_inset + + discusses countermeasures against that in detail. + In contrast to MARS, DRBD allocates its bitmap +\emph on +statically +\emph default + at resource creation time. + It uses up less space, and you don't have to monitor it for (potential) + overflows. + The space for transaction logs is the price you have to pay if you want + or need anytime consistency, or asynchronous replication in general. +\end_layout + +\begin_layout Standard +In order to really grasp the +\emph on +heart +\emph default + of the difference between synchronous and asynchronous replication, we + look at the following modified scenario: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/network-flaky-mars.fig + width 80col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +This time, the network throughput (red line) is varying +\begin_inset Foot +status open + +\begin_layout Plain Layout +In real life, many long-distance lines or even some heavily used metro lines + usually show fluctuations of their network bandwidth by an order of magnitude, + or even higher. + We have measured them. + The overall behaviour can be characterized as +\begin_inset Quotes eld +\end_inset + + +\series bold +chaotic +\series default + +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\end_inset + + in some unpredictable way. + As before, the application throughput served by MARS is assumed to be constant + (dotted green line, often superseded by the solid green line). + The actual replication network throughput is depicted by the solid green + line. +\end_layout + +\begin_layout Standard +As you can see, a network dropdown undershooting the application demand + has no impact on the application throughput, but only on the replication + network throughput. 
+ Whenever the network throughput is held back due to the flaky network, + it simply catches up as soon as possible by overshooting the application + throughput. + The amount of lag-behind is visualized as shaded area: downward shading + (below the application throughput) means an increase of the lag-behind, + while the upwards shaded areas (beyond the application throughput) indicate + a decrease of the lag-behind (catch-up). + Once the lag-behind has been fully caught up, the network throughput suddenly + jumps back to the application throughput (here visible in two cases). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Note that the existence of lag-behind areas is roughly corresponding to + DRBD disconnect states, and in turn to DRBD inconsistent states of the + secondary as long as the lag-behind has not been fully caught up. + The very rough +\begin_inset Foot +status open + +\begin_layout Plain Layout +Of course, this visualization is not exact. + On one hand, the DRBD inconsistency phase may start later than depicted here, + because it only starts +\emph on +after +\emph default + the first automatic disconnect, upon the first automatic re-connect. + In addition, the amount of resync data may be smaller than the amount of + corresponding MARS transaction logfile data, because the DRBD bitmap will + coalesce multiple writes to the same block into one single transfer. + On the other hand, DRBD will transfer no data at all during its disconnected + state, while MARS continues to do its best. + This leads to a prolongation of the DRBD inconsistent phase. + Depending on properties of the workload and of the network, the real duration + of the inconsistency phase may be either shorter or longer. +\end_layout + +\end_inset + + duration of the corresponding DRBD inconsistency phase is visualized as + magenta line at the time scale.
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +MARS utilizes the existing network bandwidth as best as possible in order + to pipe through as much data as possible, provided that there exists some + data requiring expedition. + Conceptually, there exists no better way due to information-theoretic limits + (besides data compression). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Note that +\emph on +on average +\emph default + during a longer period of time, the network must have enough capacity for + transporting all of your data. + MARS cannot magically break through information-theoretic limits. + It cannot magically transport gigabytes of data over modem lines. + Only +\emph on +relatively short +\emph default + network problems / packet loss can be compensated. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In case of lag-behind, the version of the data replicated to the secondary + site corresponds to some time in the past. + Since the data is always transferred in the same order as originally submitted + at the primary site, the secondary never gets inconsistent. + Your mirror always remains usable. + Your only potential problem could be the outdated state, corresponding + to some state in the past. + However, the +\begin_inset Quotes eld +\end_inset + +as-best-as-possible +\begin_inset Quotes erd +\end_inset + + approach to the network transfer ensures that your version is always +\emph on +as up-to-date as possible +\emph default + even under ill-behaving network bottlenecks. + +\series bold +There is simply no better way to do it.
+ +\series default + In presence of temporary network bottlenecks such as network congestion, + there exists no better method than prescribed by the information theoretic + limit (red line, neglecting data compression). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + In order to get all of your data through the line, somewhen the network + must be healthy again. + Otherwise, data will be recorded until the capacity of the +\family typewriter +/mars/ +\family default + filesystem is exhausted, leading to an emergency mode (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Resolution-of-Emergency" + +\end_inset + +). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +MARS' property of never sacrificing local data consistency (at the possible + cost of actuality, as long as you have enough capacity in +\family typewriter +/mars/ +\family default +) is called +\series bold +Anytime Consistency +\series default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Even when the capacity of +\family typewriter +/mars/ +\family default + is exhausted and when emergency mode is entered, the replicas will not + become inconsistent by themselves. + However, when the emergency mode is later +\emph on +cleaned up +\emph default + for a replica, it will become temporarily inconsistent during the fast + full sync. + Details are in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Resolution-of-Emergency" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Conclusion: you can even use +\series bold +traffic shaping +\series default + on MARS' TCP connections in order to globally balance your network throughput + (of course at the cost of actuality, but without sacrificing local data + consistency). + If you would try to do the same with DRBD, you could easily provoke a disaster. + MARS simply tolerates any network problems, provided that there is enough + disk space for transaction logfiles. + Even in case of completely filling up your disk with transaction logfiles + after some days or weeks, you will not lose local consistency anywhere + (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Defending-Overflow" + +\end_inset + +). +\end_layout + +\begin_layout Standard +Finally, here is yet another scenario where MARS can cope with the situation: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/network-constant-mars.fig + width 80col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +This time, the network throughput limit (solid red line) is assumed to be + constant. + However, the application workload (dotted green line) shows some heavy + peaks. + We know from our 1&1 datacenters that such an application behaviour is + very common (e.g. + in case of certain kinds of DDOS attacks etc). +\end_layout + +\begin_layout Standard +When the peaks are exceeding the network capacities for some short time, + the replication network throughput (solid green line) will be limited for + a short time, stay a little bit longer at the limit, and finally drop down + again to the normal workload. + In other words, you get a flexible buffering behaviour, coping with the + peaks. 
+\end_layout + +\begin_layout Standard +Similar scenarios (where both the application workload has peaks and the + network is flaky to some degree) are rather common. + If you would use DRBD there, you were likely to run into regular application + performance problems and/or frequent automatic disconnect cycles, depending + on the height and on the duration of the peaks, and on network resources. +\end_layout + +\begin_layout Section +Long Distances / High Latencies +\end_layout + +\begin_layout Standard +In general and in some theories, latencies are conceptually independent + from throughput, at least to some degree. + There exist all 4 possible combinations: +\end_layout + +\begin_layout Enumerate +There exist communication lines with high latencies but also high throughput. + Examples are raw fibre cables at the ground of the Atlantic. +\end_layout + +\begin_layout Enumerate +High latencies on low-throughput lines is very easy to achieve. + If you never saw it, you never ran interactive +\family typewriter +vi +\family default + over +\family typewriter +ssh +\family default + in parallel to downloads on your old-fashioned modem line. +\end_layout + +\begin_layout Enumerate +Low latencies need not be incompatible with high throughput. + See Myrinet, InfiniBand or high-speed point-to-point interconnects, such + as modern RAM busses. +\end_layout + +\begin_layout Enumerate +Low latency combined with low throughput is also possible: in an ATM system + (or another pre-reservation system for bandwidth), just increase the multiplex + factor on low-capacity but short lines, which is only possible at the cost + of assigned bandwidth. +\end_layout + +\begin_layout Standard +In the +\emph on +internet +\emph default + practice, however, it is very likely that high latencies will also lead + to worse throughput, because of the +\emph on +congestion control algorithms +\emph default + running all over the world. 
+\end_layout + +\begin_layout Standard +We have experimented with extremely large TCP send/receive buffers plus + various window sizes and congestion control algorithms over long-distance + lines between the USA and Europe. + Yes, it is possible to improve the behaviour to some degree. + But magic does not happen. + Natural laws will always hold. + You simply cannot travel faster than the speed of light. +\end_layout + +\begin_layout Standard +Our experience leads to the following rule of thumb, not formally proven + by anything, but just observed in practice: +\end_layout + +\begin_layout Quotation +In general +\begin_inset Foot +status open + +\begin_layout Plain Layout +We have heard of cases where even less than 50 km were not working with + DRBD. + It depends on application workload, on properties of the line, and on congestio +n caused by other traffic. + Some other people told us that according to +\emph on +their +\emph default + experience, much lesser distances should be considered operable, only in + the range of a few single kilometers. + However, they agree that DRBD is rock stable when used on crossover cables. +\end_layout + +\end_inset + +, synchronous data replication (not limited to applications of DRBD) works + reliably only over distances +\begin_inset Formula $<50$ +\end_inset + + km, or sometimes even less. +\end_layout + +\begin_layout Standard +There may be some exceptions, e.g. + when dealing with low-end workstation loads. + But when you are responsible for a whole datacenter and/or some centralized + storage units, don't waste your time by trying (almost) impossible things. + We recommend to use MARS in such use cases. 
+\end_layout + +\begin_layout Section +Explanation via CAP Theorem +\begin_inset CommandInset label +LatexCommand label +name "sec:Explanation-via-CAP" + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-theorem.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The famous CAP theorem, also called Brewer's theorem, is important for a + deeper understanding of the differences between DRBD and MARS. + A good explanation can be found at +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + (retrieved July 2018). +\end_layout + +\begin_layout Standard +The CAP theorem states that only 2 out of 3 properties can be achieved at + the same time, when a Distributed System is under pressure: C = Consistency + means +\series bold +\emph on +Strict +\series default +\emph default + Consistency at the level of the +\emph on +distributed +\emph default + system (which is +\emph on +not +\emph default + the same as strict consistency +\emph on +inside +\emph default + of one of the +\emph on +local +\emph default + systems), A = Availability = intuitively clear from a user's perspective, + and P = Partitioning Tolerance = the network may have its own outages at + any time (which is a negative criterion). +\end_layout + +\begin_layout Standard +As explained in the Wikipedia article, the P = Partitioning Tolerance is + a property which is important at least in +\emph on +wide-distance +\emph default + data replication scenarios, and possibly in some other scenarios. 
+\end_layout + +\begin_layout Subsection +CAP Differences between DRBD and MARS +\begin_inset CommandInset label +LatexCommand label +name "subsec:CAP-Differences" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you are considering only short distances like passive crossover cables + between racks, +\emph on +then +\emph default + (and +\emph on +only then +\emph default +) you may +\emph on +assume(!) +\emph default + that P is not required. + Then, and only then, you can get both A and C at the same time, without + sacrificing P, because P is already for free by assumption. + In such a crossover cable scenario, getting all three C and A and P is + possible, similarly to an explanation in the Wikipedia article. +\end_layout + +\begin_layout Standard +This is the classical use case for DRBD: when both DRBD replicas are always + staying physically connected via a passive crossover cable (which is +\emph on +assumed +\emph default + to never break down), you can get both strict global consistency and availabili +ty, even in cases where one of the DRBD nodes is failing +\begin_inset Foot +status open + +\begin_layout Plain Layout +In addition, you will need some further components like Pacemaker, iSCSI + failover, etc. +\end_layout + +\end_inset + +. + Both C and A are provided by DRBD during +\family typewriter +connected +\family default + state, while P is assumed to be provided by a passive component. + By addition of iSCSI failover, A can be achieved even in case of single + storage node failures, while retaining C from the viewpoint +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice: the CAP theorem does not deal with node failures, only with +\emph on +network +\emph default + failures. + Node failures would always violate C by some +\begin_inset Quotes eld +\end_inset + +strong +\begin_inset Quotes erd +\end_inset + + definition. 
+ By some +\begin_inset Quotes eld +\end_inset + +weaker +\begin_inset Quotes erd +\end_inset + + definition, the downtime plus recovery time (e.g. + DRBD re-sync) can be taken out of the game. + Notice: while a node can always +\begin_inset Quotes eld +\end_inset + +know +\begin_inset Quotes erd +\end_inset + + whether it has failed (at least after reboot), network failures cannot + be distinguished from failures of remote nodes in general. + Therefore node failures and network failures are fundamentally different + by their nature. +\end_layout + +\end_inset + + of the application. +\end_layout + +\begin_layout Standard +This is explained by the thick line in the following variant of the graphics, + which is only valid for crossover cables where P need not be guaranteed + by the replication because it is already assumed for free: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-drbd-operational.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Now look at the case of a truly Distributed System, where P cannot be assumed + as for free. + For example, try to use DRBD in a long-distance replication scenario. + There we cannot assume P as already given. + We +\series bold +must +\emph on +tolerate +\series default +\emph default + replication network outages. + DRBD is reacting to this differently in two different modes. +\end_layout + +\begin_layout Standard +First we look at the (short) time interval +\emph on +before +\emph default + DRBD recognizes the replication network incident, and before it leaves + the +\family typewriter +connected +\family default + state. 
+ During this phase, the application IO will +\series bold +hang +\series default + for some time, indicating the (temporary) sacrifice (from a user's perspective) + by a red X: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-drbd-connected.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Because Availability is one of the highest goods of enterprise-critical + IT operations, you will typically configure DRBD such that it automatically + switches to some variant of a +\family typewriter +disconnected +\family default + state after some timeout, thereby giving up consistency between both replicas. + The red X indicates not only loss of global strict consistency in the sense + of the CAP theorem, but also that your replica will become +\family typewriter +Inconsistent +\family default + during the following re-sync: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-drbd-disconnected.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +You may wonder what the difference to MARS is. + As explained in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Requirements-for-Cloud" + +\end_inset + +, MARS is not only intended for wide distances, but also for +\series bold +Cloud Storage +\series default + where no strict consistency is required at global level by definition, + but instead +\series bold +Eventually Consistent +\series default + is the preferred model for the Distributed System. + Therefore, +\emph on +strict +\emph default + consistency (in the sense of the CAP theorem) is +\emph on +not required by definition +\emph default +. 
+ Therefore, the red X is not present in the following graphics, showing + the state where MARS is remaining +\emph on +locally consistent +\emph default + all the time +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that the +\emph on +initial +\emph default + full sync is not considered here, neither for DRBD, nor for MARS. + +\emph on +Setup +\emph default + of the Distributed System is its own scenario, not considered here. + +\emph on +Repair +\emph default + of a +\emph on +damaged +\emph default + system is also a different scenario, also not considered here. + Notice that MARS' emergency mode also belongs to the class of +\begin_inset Quotes eld +\end_inset + +damages +\begin_inset Quotes erd +\end_inset + +, as well as DRBD's disk failure modes, where it has some additional functionalit +y compared to the current version of MARS. +\end_layout + +\end_inset + +, even when a network outage occurs: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/cap-mars.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice: MARS does not guarantee strict consistency +\emph on +between +\emph default + LV replicas at the level of the Distributed System, but only Eventually + Consistent. + However, +\emph on +at the same time +\emph default + it +\emph on +also +\emph default + guarantees strict consistency +\emph on +locally +\emph default +, and even at +\emph on +each +\emph default + of the passive replicas, each by each. + Don't confuse these different levels. + There are different consistency guarantees at different levels, at the + same time. + This might be confusing if you are not looking at the system at different + levels: (1) overall Distributed System versus (2) each of the local system + instances. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Why does MARS do this? Because a better way is not possible at all. + The CAP theorem tells us that there exists no better way when A has + to be guaranteed (as almost everywhere in enterprise-critical IT operations), + and P has to be ensured in datacenter disaster scenarios or some other + scenarios. + Similarly to natural laws like Einstein's laws of the speed of light, there + +\emph on +does not exist +\emph default + a better way! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Conclusion from the CAP theorem: when P is a +\emph on +hard +\emph default + +\emph on +requirement +\emph default +, don't use DRBD (or other +\emph on +synchronous +\emph default + replication implementations) for long-distance and/or Cloud Storage scenarios. + The red X is in particular problematic during re-sync, after the network + has become healthy again (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Behaviour-of-DRBD" + +\end_inset + +). + MARS has no red X at C because of its +\series bold +Anytime Consistency +\series default +, which refers to +\emph on +local +\emph default + consistency, and which is violated by DRBD during certain important phases + of its regular operation. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Another conclusion from the CAP theorem: when A+C is a +\emph on +hard requirement +\emph default +, and when P can be faithfully assumed as already given by passive crossover + cables, then don't use the current version of MARS. + Use DRBD instead. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + If you think that you require all three properties C+A+P, but you don't + have passive crossover cables over short distances, you are requiring something + which is +\series bold +impossible +\series default +. + There exists no solution, with whatever component, or from whatever commercial + storage vendor. + The CAP theorem is as hard as Einstein's natural laws are. + Rethink your complete concept, from end to end. + Something is wrong, somewhere. + Ignoring this on enterprise-critical use cases can endanger a company and/or + your career. +\end_layout + +\begin_layout Subsection +CAP Commonalities between DRBD and MARS +\begin_inset CommandInset label +LatexCommand label +name "subsec:CAP-Commonalities" + +\end_inset + + +\end_layout + +\begin_layout Standard +In this subsection, we look at the case that P is not for free, but has + to be ensured by the Distributed Storage system. +\end_layout + +\begin_layout Standard +You may have noticed that MARS' ordinary CAP behaviour is similar to DRBD's + CAP picture in +\family typewriter +disconnected +\family default + state, or during similar states when the replication network is interrupted. +\end_layout + +\begin_layout Standard +Replication network interruption is also known as +\begin_inset Quotes eld +\end_inset + +Network Partitioning +\begin_inset Quotes erd +\end_inset + +. + This is where property P = Partitioning Tolerance comes into play. +\end_layout + +\begin_layout Standard +When a network partition has +\emph on +actually occurred +\emph default +, both DRBD and MARS allow you to do the same: you may +\series bold +forcefully switch +\series default + the +\family typewriter +primary +\family default + role, which means activation of a former +\family typewriter +secondary +\family default + node. 
+ In such a situation, you can issue commands like +\family typewriter + drbdadm primary --force +\family default + or +\family typewriter +marsadm primary --force +\family default +. + It is no accident that both commands are looking similar to each other. +\end_layout + +\begin_layout Standard +The outcome will be the same: you will most likely get a +\family typewriter +\series bold +SplitBrain +\family default +\series default + situation. +\end_layout + +\begin_layout Standard +The possibility of getting a split brain is no specific property of either + DRBD or MARS. + It will also happen with any other replication system, whether synchronous + or asynchronous. +\end_layout + +\begin_layout Standard +It is one of the consequences from the CAP theorem when (1a) P has to be + assured, and (1b) a network partition has +\emph on +actually occurred +\emph default +, and (2) when A = Availability is enforced at both sides of the network + partition. + The result is that C = global Consistency is violated, by creation of two + or more versions of the data. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Careful: at least for some application classes, it is a bad idea to systematica +lly create split brain via automatic cluster managers, e.g. + Pacemaker or similar. + As explained in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Inappropriate-Clustermanger" + +\end_inset + +, some cluster managers were originally constructed for truly shared disk + scenarios, where no split brain can occur by construction. + Using them in masses on versioned data in truly distributed systems can + result in existential surprises, once a bigger network partition and/or + a flaky replication network triggers them in masses, and at some moments + where you didn't really want to do what they now are doing automatically, + and in masses. 
+ Split brain should not be provoked when not +\emph on +absolutely +\emph default + necessary. +\end_layout + +\begin_layout Standard +Split brain resolution is anything but easy in general. + When the data is in a generic block device, you typically will have no + general means for merging both versions. + This means, split brain resolution is typically only possible by +\series bold +throwing away +\series default + some of the versions. +\end_layout + +\begin_layout Standard +This kind of split brain resolution problem is no specific property of DRBD + or of MARS. + It is a fundamental property of generic block devices. +\end_layout + +\begin_layout Standard +DRBD and MARS have some commands like +\family typewriter +drbdadm invalidate +\family default + or +\family typewriter +marsadm invalidate +\family default + for this. + Again, the similarity is no accident. +\end_layout + +\begin_layout Standard +Notice that classical filesystems aren't typically better than raw block + devices. + There are even more possibilities for tricky types of +\series bold +conflicts +\series default + (e.g. + on path names in addition to file content). +\end_layout + +\begin_layout Standard +Similarly, BigCluster object stores are often suffering from similar (or + even worse) problems, because higher application layers may have some hidden + internal dependencies between object versions, while the object store itself + is agnostic of version dependencies in general +\begin_inset Foot +status open + +\begin_layout Plain Layout +There exist lots of types of potential dependencies between objects. + Timely ones are easy to capture, but this is not sufficient in general + for everything. +\end_layout + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + When stacking block devices or filesystems (or something else) on top of + some BigCluster object store, the latter will not magically resolve any + split brain for you. + Check whether your favorite object store implementation has some kind of + equivalent of a +\family typewriter +primary --force +\family default + command, and some equivalent +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice: BigCluster architectures are typically discriminating between + client servers and storage servers. + This will typically introduce some more possibilities into the game, such + as forced client failover, independently from forced storage failover. +\end_layout + +\end_inset + + of an +\family typewriter +invalidate +\family default + command. + If it doesn't have one, or only a restricted one, you should be +\emph on +alerted +\emph default +. + In case of a long-lasting storage network partition, you might need suchalike + +\emph on +desperately +\emph default + for ensuring A, even at the cost of C. + Check: whether you need this is heavily depending on the +\series bold +\emph on +application class +\series default +\emph default + (see also the Cloud Storage definition in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Requirements-for-Cloud" + +\end_inset + +, or look at webhosting, etc). + When you +\emph on +would +\emph default + need it, but you are +\series bold +not prepared for suchalike scenarios at your enterprise-critical data +\series default +, it could cost you a lot of money and/or reputation and/or even your existence. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Notice: the +\emph on +concept +\emph default + of +\family typewriter +SplitBrain +\family default + is occurring almost everywhere in truly Distributed Systems when C can + be violated in favour of A+P. + It is a very general consequence +\begin_inset Foot +status open + +\begin_layout Plain Layout +There exist only few opportunities for generic conflict resolution, even + in classical databases where +\emph on +some +\emph default + knowledge about the structure of the data is available. + Typically, there are some more hidden dependencies. + Lossless +\family typewriter +SplitBrain +\family default + resolution will thus need to be implemented at application layer, if it + is possible at all. +\end_layout + +\end_inset + + of the CAP theorem. +\end_layout + +\begin_layout Standard +The only reliable way for avoiding split brain in truly distributed systems + would be: don't insist on A = Availability. + Notice that there exist some application classes, like certain types of + banking, where C is typically a higher good than A. +\end_layout + +\begin_layout Standard +Notice that both DRBD and MARS are supporting this also: just don't add + the option +\family typewriter +--force +\family default + to the +\family typewriter +primary +\family default + switch command. +\end_layout + +\begin_layout Standard +However: even in banking, some +\emph on +extremely extraordinary +\emph default + scenarios might occur, where sacrifice of C in favour of A could be necessary + (e.g. + when +\emph on +manual cleanup +\emph default + of C is cheaper than long-lasting violations of A). + Good to know that both DRBD and MARS have some emergency measure for killing + C in favour of A! 
+\end_layout + +\begin_layout Section +Higher Consistency Guarantees vs Actuality +\end_layout + +\begin_layout Standard +We already saw in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Network-Bottlenecks" + +\end_inset + + that certain types of network bottlenecks can easily (and reproducibly) + destroy the consistency of your DRBD secondary, while MARS will preserve + local consistency at the cost of actuality ( +\series bold +anytime consistency +\series default +). +\end_layout + +\begin_layout Standard +Some people, often located at database operations, are obtrusively arguing + that actuality is such a high good that it must not be sacrificed under + any circumstances. +\end_layout + +\begin_layout Standard +Anyone arguing this way has at least the following choices (list may be + incomplete): +\end_layout + +\begin_layout Enumerate +None of the above use cases for MARS apply. + For instance, short distance replication over crossover cables is sufficient + (which occurs very often), or the network is reliable enough such that + bottlenecks can never occur (e.g. + because the total load is extremely low, or conversely the network is extremely + overengineered / expensive), or the occurrence of bottlenecks can +\emph on +provably +\emph default + be taken into account. + In such cases, DRBD is clearly the better solution than MARS, because it + provides better actuality than the current version of MARS, and it uses + up less disk resources. +\end_layout + +\begin_layout Enumerate +In the presence of network bottlenecks, people didn't notice and/or didn't + understand and/or did under-estimate the risk of accidental invalidation + of their DRBD secondaries. + They should carefully check that risk. + They should convince themselves that the risk is +\emph on +really +\emph default + bearable. 
+ Once they are hit by a systematic chain of events which +\emph on +reproducibly +\emph default + provoke the bad effect, it is too late +\begin_inset Foot +status open + +\begin_layout Plain Layout +Some people seem to need a bad experience before they get the difference + between risk caused by reproducible effects and inverted luck. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Enumerate +In the presence of network bottlenecks, people found a solution such that + DRBD does not automatically re-connect after the connection has been dropped + due to network problems (c.f. + +\family typewriter +ko-count +\family default + parameter). + So the risk of inconsistency +\emph on +appears +\emph default + to have vanished. + In some cases, people did not notice that the risk has +\emph on +not completely +\begin_inset Foot +status open + +\begin_layout Plain Layout +Hint: what's the +\emph on +conceptual +\emph default + difference between an automatic and a manual re-connect? Yes, you can try + to +\emph on +lower +\emph default + the risk in some cases by transferring risks to human analysis and human + decisions, but did you take into account the possibility of human errors? +\end_layout + +\end_inset + + +\emph default + vanished, and/or they did not notice that now the actuality produced by + DRBD is even drastically worse than that of MARS (in the same situation). 
+ It is true that DRBD provides better actuality in +\family typewriter +connected +\family default + state, but for a full picture the actuality in +\family typewriter +disconnected +\family default + state should not be neglected +\begin_inset Foot +status open + +\begin_layout Plain Layout +Hint: a potential hurdle may be the fact that the current format of +\family typewriter +/proc/drbd +\family default + does neither display the timestamp of the first +\emph on +relevant +\emph default + network drop nor the total amount of lag-behind user data (which is +\emph on +not +\emph default + the same as the number of dirty bits in the bitmap), while +\family typewriter +marsadm view +\family default + can display it. + So it is difficult to judge the risks. + Possibly a chance is inspection of DRBD messages in the syslog, but quantificat +ion could remain hard. +\end_layout + +\end_inset + +. + So they didn't notice that their argumentation on the importance of actuality + may be fundamentally wrong. + A possible way to overcome that may be re-reading section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Behaviour-of-MARS" + +\end_inset + + and comparing its outcome with the corresponding outcome of DRBD in the + same situation. +\end_layout + +\begin_layout Enumerate +People are stuck in contradictory requirements because the current version + of MARS does not yet support synchronous or pseudo-synchronous operation + modes. + This should be resolved some day. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +A common misunderstanding is about the actuality guarantees provided by + filesystems. + The buffer cache / page cache uses by default a +\series bold +writeback strategy +\series default + for performance reasons. 
+ Even modern journalling filesystems will (by default) provide only consistency + guarantees, but no strong actuality guarantee. + In case of power loss, some transactions may be even +\emph on +rolled back +\emph default + in order to restore consistency. + According to POSIX +\begin_inset Foot +status open + +\begin_layout Plain Layout +The above argumentation also applies to Windows filesystems in analogous + way. +\end_layout + +\end_inset + + and other standards, the only +\emph on +reliable +\emph default + way to achieve actuality is usage of system calls like +\family typewriter +sync() +\family default +, +\family typewriter +fsync() +\family default +, +\family typewriter +fdatasync() +\family default +, flags like +\family typewriter +O_DIRECT +\family default +, or similar. + For performance reasons, the +\emph on +vast majority of applications +\emph default + don't use them at all, or use them only sparingly! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + It makes no sense to require strong actuality guarantees from any block + layer replication (whether DRBD or future versions of MARS) while higher + layers such as filesystems or even applications are already sacrificing + them! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In summary, the +\series bold +anytime consistency +\series default + provided by MARS is an argument you should consider, even if you need an + extra hard disk for transaction logfiles. +\end_layout + +\begin_layout Chapter +Quick Start Guide +\begin_inset CommandInset label +LatexCommand label +name "chap:Quick-Start-Guide" + +\end_inset + + +\end_layout + +\begin_layout Standard +This chapter is for impatient but experienced sysadmins who already know + DRBD. 
+ For more complete information, refer to chapter +\begin_inset CommandInset ref +LatexCommand nameref +reference "chap:The-Sysadmin-Interface" + +\end_inset + +. +\end_layout + +\begin_layout Section +Preparation: What you Need +\begin_inset CommandInset label +LatexCommand label +name "sec:Preparation:-What-you" + +\end_inset + + +\end_layout + +\begin_layout Standard +Typically, you will use MARS at servers in a datacenter for replication + of big masses of data. +\end_layout + +\begin_layout Standard +Typically, you will use MARS for replication +\emph on +between +\emph default + multiple datacenters, when the distances are greater than +\begin_inset Formula $\approx50$ +\end_inset + + km. + Many other solutions, even from commercial storage vendors, will not work + reliably over large distances when your network is not +\emph on +extremely +\emph default + reliable, or when you try to push huge masses of data from high-performance + applications through a network bottleneck. + If you ever encountered suchalike problems (or try to avoid them in advance), + MARS is for you. +\end_layout + +\begin_layout Standard +You can use MARS both at dedicated storage servers (e.g. + for serving Windows clients), or at standalone Linux servers where CPU + and storage are not separated. +\end_layout + +\begin_layout Standard +In order to protect your data from low-level disk failures, you should use + a hardware RAID controller with BBU. + Software RAID is explicitly +\emph on +not +\emph default + recommended, because it generally provides worse performance due to the + lack of a hardware BBU (for some benchmark comparisons with/out BBU, see + +\begin_inset Flex URL +status collapsed + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +). 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Don't set your hardware BBU cache to +\begin_inset Quotes eld +\end_inset + +writethrough +\begin_inset Quotes erd +\end_inset + + mode. + This may lead to tremendous performance degradation. + Use the +\begin_inset Quotes eld +\end_inset + +writeback +\begin_inset Quotes erd +\end_inset + + strategy instead. + It should be operationally safe, because in case of power loss the BBU + cache content will be preserved thanks to the battery, and/or thanks to + goldcaps for saving the cache content into some flash chips. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +For better performance, use newer MARS versions from branch +\family typewriter +mars0.1a.y +\family default + or later. + Check the tips and tricks from sections +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:IO-Performance-Tuning" +plural "false" +caps "false" +noprefix "false" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Tuning-Network-Performance" +plural "false" +caps "false" +noprefix "false" + +\end_inset + +. + You may also play around with +\family typewriter +/proc/sys/mars/aio_sync_mode +\family default + when actuality is less important. + Further tuning of +\family typewriter +/proc/sys/mars/io_tuning/ +\family default + and many more tunables is currently only recommended for experts. + Future versions of MARS are planned to provide better performance with + software RAID. +\end_layout + +\begin_layout Standard +Typically, you will need more than one RAID set +\begin_inset Foot +status open + +\begin_layout Plain Layout +For low-cost storage, RAID-5 is no longer regarded safe for today's typical + storage sizes, because the error rate is regarded too high.
+ Therefore, use RAID-6. + If you need more than 15 disks in total, create multiple RAID sets (each + having at most 15 disks, better about 12 disks) and stripe them via LVM + (or via your hardware RAID controller if it supports RAID-60). +\end_layout + +\end_inset + + for big masses of data. + Therefore, use of LVM is also recommended +\begin_inset Foot +status open + +\begin_layout Plain Layout +You may also combine MARS with commercial storage boxes connected via Fibrechann +el or iSCSI, but we have not yet operational experiences at 1&1 with such + setups. +\end_layout + +\end_inset + + for your data. +\end_layout + +\begin_layout Standard +MARS' tolerance of networking problems comes with some cost. + You will need some extra space for the transaction logfiles of MARS, residing + at the +\family typewriter +/mars/ +\family default + filesystem. +\end_layout + +\begin_layout Standard +The exact space requirements for +\family typewriter +/mars/ +\family default + depend on the +\emph on +average write rate +\emph default + of your application, not on the size of your data. + We found that only few applications are writing more than 1 TB per day. + Most are writing even less than 100 GB per day. + Usually, you want to dimension +\family typewriter +/mars/ +\family default + such that you can survive a network loss lasting 3 days / about one weekend. + This can be achieved with current technology rather easily: as a simple + rule of thumb, just use one +\series bold +dedicated disk +\series default + having a capacity of 4 TB or more. + Typically, that will provide you with plenty of headroom even for bigger + networking incidents. +\end_layout + +\begin_layout Standard +Dedicated disks for +\family typewriter +/mars/ +\family default + have another advantage: their mechanical head movement is completely independen +t from your data head movements. 
+ For best performance, attach that dedicated disk to your hardware RAID + controller with BBU, building a separate RAID set (even if it consists + only of a single disk – notice that the +\series bold +hardware BBU +\series default + is the crucial point). +\end_layout + +\begin_layout Standard +If you are concerned about reliability, use two disks switched together + as a relatively small RAID-1 set. + For extremely high performance demands, you may consider (and check) RAID-10. +\end_layout + +\begin_layout Standard +Since the transaction logfiles are highly sequential in their access pattern, + a cheap but high-capacity SATA disk (or nearline-SAS disk) is usually sufficien +t. + At the time of this writing, standard SATA SSDs have shown to be +\emph on +not +\emph default + (yet) preferable. + Although they offer high random IOPS rate, their sequential throughput + is worse, and their long-term stability is questioned by many people at + the time of this writing. + However, as technology evolves and becomes more mature, this could change + in future. +\end_layout + +\begin_layout Standard +Use +\family typewriter +ext4 +\family default + for +\family typewriter +/mars/ +\family default +. + Avoid +\family typewriter +ext3 +\family default +, and don't use +\family typewriter +xfs +\family default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +It seems that the late internal resource allocation strategy of +\family typewriter +xfs +\family default + (or another currently unknown reason) could be the reason for some resource + deadlocks which appear only with +\family typewriter +xfs +\family default + and only under +\emph on +extremely +\emph default + high IO load in combination with high memory pressure. +\end_layout + +\end_inset + + at all. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Notice that the filesystem +\family typewriter +/mars/ +\family default + has nothing to do with an ordinary filesystem. + It is completely reserved for MARS internal purposes, namely as a +\series bold +storage container +\series default + for MARS' persistent data. + It does not obey any userspace rules like FHS (filesystem hierarchy standard), + and it should not be accessed by any userspace tool except the official + +\family typewriter +marsadm +\family default + tool. + Its internal data format should be regarded as a +\series bold +blackbox +\series default + by you. + The internal data format may change in future, or the complete +\family typewriter +/mars/ +\family default + filesystem may be even replaced by a totally different container format, + while the official +\family typewriter +marsadm +\family default + interface is supposed to remain stable. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +That said, you might look into its contents +\emph on +by hand +\emph default + for curiosity or for +\emph on +debugging purposes +\emph default +, and only as root. + But don't program any tools / monitoring scripts / etc bypassing the official + +\family typewriter +marsadm +\family default + tool. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Like DRBD, the current version of MARS has +\series bold +no security +\series default + built in. + MARS assumes that it is running in a +\series bold +trusted network +\series default +. + Anyone who can connect to the MARS ports (default 7777 to 7779) can potentially + break in and become root!
Therefore, you +\series bold +must +\series default + protect your network by appropriate means, such as firewalling and/or encrypted + VPN. +\end_layout + +\begin_layout Standard +Currently, MARS provides no shared secret like DRBD, because a simple shared + secret is way too weak to provide any real security (potentially misleading + people about the real level of security). + Future versions of MARS should provide at least 2-factor authorization, + and encryption via dynamic session keys. + Until that is implemented, use a secured VPN instead! And don't forget + to +\emph on +audit +\emph default + it for security holes! +\end_layout + +\begin_layout Section +Setup Primary and Secondary Cluster Nodes +\begin_inset CommandInset label +LatexCommand label +name "sec:Setup-Primary-and" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you already use DRBD, you may migrate to MARS (or even back from MARS + to DRBD) if you use +\emph on +external +\begin_inset Foot +status open + +\begin_layout Plain Layout + +\emph on +Internal +\emph default + DRBD metadata should also work as long as the filesystem inside your block + device / disk already exists and is not re-created. + The latter would destroy the DRBD metadata, but even that will not hurt + you really: you can always switch back to DRBD using +\emph on +external +\emph default + metadata, as long as you have some small spare space somewhere. +\end_layout + +\end_inset + + +\emph default + DRBD metadata (which is not touched by MARS). + +\end_layout + +\begin_layout Subsection +Kernel and MARS Module +\end_layout + +\begin_layout Standard +The MARS kernel module should be available or can be built via one of the + following methods: +\end_layout + +\begin_layout Enumerate +As an external Debian or rpm kernel module, as provided by a package contributor + (or hopefully by standard distros in the future). 
+\end_layout + +\begin_layout Enumerate +As a separate kernel module, only for experienced +\begin_inset Foot +status open + +\begin_layout Plain Layout +You should be familiar with the problems arising from orthogonal combination + of different kernel versions with different MARS module versions and with + different +\family typewriter +marsadm +\family default + userspace tool versions at the package management level. + Hint: +\family typewriter +modinfo +\family default + is your friend. +\end_layout + +\end_inset + + sysadmins: see file +\family typewriter +Makefile.dist +\family default + (tested with some older versions of Debian; may need some extra work with + other distros). +\end_layout + +\begin_layout Enumerate +Build for senior sysadmins or developers, inplace in the kernel source tree: + first apply +\family typewriter +0001-mars-minimum-pre-patch-for-mars.patch +\family default + and +\family typewriter +0001-mars-SPECIAL-for-in-tree-build.patch +\family default + or similar, then +\family typewriter +cd block/ && git clone --recurse-submodules +\family default +. + Then +\family typewriter +cd .. + +\family default + and build your kernel as usual. + Config options for MARS should appear under +\begin_inset Quotes eld +\end_inset + +Enable the block layer +\begin_inset Quotes erd +\end_inset + +. + Just activate MARS as a +\series bold +kernel module +\series default + via +\begin_inset Quotes eld +\end_inset + +m +\begin_inset Quotes erd +\end_inset + + (don't try a fixed compile-in), and leave all else MARS config options + at the default (except you know what you are doing). +\end_layout + +\begin_layout Standard +Further / more accurate / latest instructions can be found in +\family typewriter +README +\family default + and in +\family typewriter +INSTALL +\family default +. 
+ You must not only install the kernel and the +\family typewriter +mars.ko +\family default + kernel module to all of your cluster nodes, but also the +\family typewriter +marsadm +\family default + userspace tool. +\end_layout + +\begin_layout Standard +Starting with +\family typewriter +mars0.1stable38 +\family default + and other branches having merged this feature, a prepatch for vanilla kernels + 3.2 through 4.4 is no longer needed. + However, +\series bold +IO performance +\series default + is currently somewhat worse when the pre-patch is not applied. + This will be addressed in a later release. +\end_layout + +\begin_layout Standard +Therefore, application of the pre-patch to the kernel is +\emph on +recommended +\emph default + for large-scale production systems for now. +\end_layout + +\begin_layout Standard +Kernel pre-patches can be found in the +\family typewriter +pre-patches/ +\family default + subdirectory of the MARS source tree. + Following are the types of pre-patches: +\end_layout + +\begin_layout Itemize + +\family typewriter +0001-mars-minimum-pre-patch-for-mars.patch +\family default + or similar. + Please prefer this one (when present for your kernel version) in front + of +\family typewriter +0001-mars-generic-pre-patch-for-mars.patch +\family default + or similar. + The latter should not be used anymore, except for testing or as an emergency + fallback. +\end_layout + +\begin_layout Itemize + +\family typewriter +0001-mars-SPECIAL-for-in-tree-build.patch +\family default + or similar. + This is +\emph on +only +\emph default + needed when building the MARS kernel module together with all other kernel + modules in a single +\family typewriter +make +\family default + pass. + For separate external module builds, this patch +\emph on +must not +\emph default + be applied (but the pre-patch +\emph on +should +\emph default + when possible). 
+ When using this patch, please apply the aforementioned pre-patch also, + because your kernel is patched anyway. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Starting from version +\family typewriter +mars0.1stable56 +\family default + or +\family typewriter +mars0.1abeta8 +\family default +, +\series bold +submodules +\series default + have been added to the github repo of MARS. + If you have an old checkout, please say +\family typewriter +git pull --recurse-submodules=yes +\family default + or similar. + Otherwise you may be missing an important future part of the MARS release, + without notice (depending on your local +\family typewriter +git +\family default + version and its local configuration). +\end_layout + +\begin_layout Subsection +Setup your Cluster Nodes +\begin_inset CommandInset label +LatexCommand label +name "subsec:Setup-your-Cluster" + +\end_inset + + +\end_layout + +\begin_layout Standard +For your cluster, you need at least two nodes. + In the following, they will be called A and B. + In the beginning, A will have the +\family typewriter +primary +\family default + role, while B will be your initial +\family typewriter +secondary +\family default +. + The roles may change later. +\end_layout + +\begin_layout Enumerate +You must be +\family typewriter +root +\family default +. +\end_layout + +\begin_layout Enumerate +On each of A and B, create the +\family typewriter +/mars/ +\family default + mountpoint. +\end_layout + +\begin_layout Enumerate +On each node, create an +\family typewriter +ext4 +\family default + filesystem on your separate disk / RAID set via +\family typewriter +mkfs.ext4 +\family default + (for requirements on size etc see section +\begin_inset CommandInset ref +LatexCommand nameref +reference "sec:Preparation:-What-you" + +\end_inset + +). 
+\end_layout + +\begin_layout Enumerate +On each node, mount that filesystem to +\family typewriter +/mars/ +\family default +. + It is advisable to add an entry to +\family typewriter +/etc/fstab +\family default +. +\end_layout + +\begin_layout Enumerate +For security reasons, execute +\family typewriter +chmod 0700 /mars +\family default + everywhere after +\family typewriter +/mars/ +\family default + has been mounted. + If you forget this step, any following +\family typewriter +marsadm +\family default + command will drop you a warning, but will fix the problem for you. +\end_layout + +\begin_layout Enumerate +On node A, say +\family typewriter +marsadm create-cluster +\family default +. +\begin_inset Newline newline +\end_inset + +This must be done +\emph on +exactly once +\emph default +, on exactly one node of your cluster. + Never do this twice or on different nodes, because that would create two + different clusters which would have nothing to do with each other. + The +\family typewriter +marsadm +\family default + tool protects you against accidentally joining / merging two different + clusters. + If you accidentally created two different clusters, just umount that +\family typewriter +/mars/ +\family default + partition and start over with step 3 at that node. +\end_layout + +\begin_layout Enumerate +On node B, you must have a working +\family typewriter +ssh +\family default + connection to node A (as +\family typewriter +root +\family default +). + Test it by saying +\family typewriter +ssh A w +\family default + on node B. + It should work without entering a password (otherwise, use +\family typewriter +ssh-agent +\family default + to achieve that). + In addition, +\family typewriter +rsync +\family default + must be installed.
+\end_layout + +\begin_layout Enumerate +On node B, say +\family typewriter +marsadm join-cluster A +\end_layout + +\begin_layout Enumerate +Only +\emph on +after +\begin_inset Foot +status open + +\begin_layout Plain Layout +In fact, you may already +\family typewriter +modprobe mars +\family default + at node A after the +\family typewriter +marsadm create-cluster +\family default +. + Just don't do any of the +\family typewriter +*-cluster +\family default + operations when the kernel module is loaded. + All other operations should have no such restriction. +\end_layout + +\end_inset + + +\emph default + that, do +\family typewriter +modprobe mars +\family default + on each node. +\end_layout + +\begin_layout Section +Creating and Maintaining Resources +\begin_inset CommandInset label +LatexCommand label +name "sec:Creating-and-Maintaining" + +\end_inset + + +\end_layout + +\begin_layout Standard +In the following example session, a block device +\family typewriter +/dev/lv-x/mydata +\family default + (shortly called +\emph on +disk +\emph default +) must already exist on both nodes A and B, respectively, having the same +\begin_inset Foot +status open + +\begin_layout Plain Layout +Actually, the disk at the initially secondary side may be larger than that + at the initially primary side. + This will waste space and is therefore not recommended. +\end_layout + +\end_inset + + size. + For the sake of simplicity, the disk (underlying block device) as well + as its later logical resource name as well as its later virtual device + name will all be named uniformly by the same suffix +\family typewriter +mydata +\family default +. + In general, you might name each of them differently, but that is not recommende +d since it may easily lead to confusion in larger installations. +\end_layout + +\begin_layout Standard +You may have already some data inside your disk +\family typewriter +/dev/lv-x/mydata +\family default + at the initially primary side A. 
+ Before using it for MARS, it must be unused for any other purpose (such + as being mounted, or used by DRBD, etc). + MARS will require +\series bold +exclusive access +\series default + to it. +\end_layout + +\begin_layout Enumerate +On node A, say +\family typewriter +marsadm create-resource mydata /dev/lv-x/mydata +\family default +. +\begin_inset Newline newline +\end_inset + +As a result, a directory +\family typewriter +/mars/resource-mydata/ +\family default + will be created on node A, containing some symlinks. + Node A will automatically start in the primary role for this resource. + Therefore, a new pseudo-device +\family typewriter +/dev/mars/mydata +\family default + will also appear after a few seconds. +\begin_inset Newline newline +\end_inset + +Note that the initial contents of +\family typewriter +/dev/mars/mydata +\family default + will be exactly the same as in your pre-existing disk +\family typewriter +/dev/lv-x/mydata +\family default +. +\begin_inset Newline newline +\end_inset + +If you like, you may already use +\family typewriter +/dev/mars/mydata +\family default + for mounting your already pre-existing data, or for creating a fresh filesystem +, or for exporting via iSCSI, and so on. + You may even do so before any other cluster node has joined the resource + (so-called +\begin_inset Quotes eld +\end_inset + +standalone mode +\begin_inset Quotes erd +\end_inset + +). + But you can also do so later after setup of (one or many) secondaries. +\end_layout + +\begin_layout Enumerate +Wait a few seconds until the directory +\family typewriter +/mars/resource-mydata/ +\family default + and its symlink contents also appear on cluster node B. + The command +\family typewriter +marsadm wait-cluster +\family default + may be helpful. +\end_layout + +\begin_layout Enumerate +On node B, say +\family typewriter +marsadm join-resource mydata /dev/lv-x/mydata +\family default +.
+\begin_inset Newline newline +\end_inset + +As a result, the initial full-sync from node A to node B should start automatica +lly. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Of course, your old contents of your disk +\family typewriter +/dev/lv-x/mydata +\family default + at side B (and +\emph on +only +\emph default + there!) is overwritten by the version from side A. + Since you are an experienced sysadmin, you knew that, and it was just the + effect you deliberately wanted to achieve. + If you didn't check that your old contents didn't contain any valuable + data (or if you accidentally provided a wrong disk device argument), it + is too late now. + The +\family typewriter +marsadm +\family default + command checks that the disk device argument is really a block device, + and that exclusive access to it is possible (as well as some further safety + checks, e.g. + matching sizes). + However, MARS cannot know the +\emph on +purpose +\emph default + of your generic block device. + MARS (as well as DRBD) is completely ignorant of the +\emph on +contents +\emph default + of a generic block device; it does not interpret it in any way. + Therefore, you may use MARS (as well as DRBD) for mirroring Windows filesystems +, or raw devices from databases, or virtual machines, or whatever. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Check that state +\family typewriter +Orphan +\family default + is left after a while on B. + Notice that +\family typewriter +join-resource +\family default + is only +\emph on +starting +\emph default + a new replica, but does not wait for its completion. 
+\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Hint: by default, MARS uses the so-called +\begin_inset Quotes eld +\end_inset + +fast fullsync +\begin_inset Quotes erd +\end_inset + + algorithm. + It works similar to +\family typewriter +rsync +\family default +, first reading the data on both sides and computing an md5 checksum for + each block. + Heavy-weight data is only transferred over the long-distance network upon + checksum mismatch. + This is extremely fast if your data is already (almost) identical on both + sides. + Conversely, if you know in advance that your initial data is completely + different on both sides, you may choose to switch off the fast fullsync + algorithm via +\family typewriter +echo 0 > /proc/sys/mars/do_fast_fullsync +\family default + in order to save the additional IO overhead and network latencies introduced + by the separate checksum comparison steps. +\end_layout + +\begin_layout Enumerate +Optionally, only for experienced sysadmins who +\emph on +really +\emph default + know what they are doing: if you will create a +\emph on +new +\emph default + filesystem on +\family typewriter +/dev/mars/mydata +\family default + +\emph on +after(!) +\emph default + having created the MARS resource as well as +\emph on +after +\emph default + having already joined it on every replica, you may abandon the fast fullsync + phase +\emph on +before +\emph default + creating the fresh filesystem, because the old content of +\family typewriter +/dev/mars/mydata +\family default + will then be just garbage not used by the freshly created filesystem +\begin_inset Foot +status open + +\begin_layout Plain Layout +It is +\emph on +vital +\emph default + that the transaction logfile contents created by +\family typewriter +mkfs +\family default + is +\emph on +fully +\emph default + propagated to the secondaries and then replayed there. 
+\end_layout + +\begin_layout Plain Layout +Analogously, another exception is also possible, but at your own risk (be + careful, really!): when migrating your data from DRBD to MARS, and you + have ensured that (1) at the end of using DRBD both your replicas were + really equal (you should have checked that), and (2) before and after setting + up any side of MARS ( +\family typewriter +create-resource +\family default + as well as +\family typewriter +join-resource +\family default +) nothing has been written at all to it (i.e. + no usage, neither of +\family typewriter +/dev/lv-x/mydata +\family default + nor of +\family typewriter +/dev/mars/mydata +\family default + has occurred in any way), the first transaction logfile +\family typewriter +/mars/resource-mydata/log-000000001-$primary +\family default + created by MARS will be empty. + Check whether this is really true! Then, and only then, you may also issue + a +\family typewriter +fake-sync +\family default +. +\end_layout + +\end_inset + +. + Then, and only then, you may say +\family typewriter +marsadm fake-sync mydata +\family default + in order to abort the sync operation. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Never do a +\family typewriter +fake-sync +\family default + unless you are +\series bold +absolutely sure +\series default + that you really don't need to sync the data! Otherwise, you are +\emph on +guaranteed +\emph default + to have produced harmful inconsistencies. + If you accidentally issued +\family typewriter +fake-sync +\family default +, you may start over the fast full sync at your secondary side by saying + +\family typewriter +marsadm invalidate mydata +\family default + (analogously to the corresponding DRBD command).
+\end_layout + +\begin_layout Section +Keeping Resources Operational +\end_layout + +\begin_layout Subsection +Logfile Rotation / Deletion +\begin_inset CommandInset label +LatexCommand label +name "subsec:Logfile-Rotation" + +\end_inset + + +\end_layout + +\begin_layout Standard +As explained in section +\begin_inset CommandInset ref +LatexCommand nameref +reference "sec:The-Transaction-Logger" + +\end_inset + +, all changes to your resource data are recorded in transaction logfiles + residing on the +\family typewriter +/mars/ +\family default + filesystem. + These files are always growing over time. + In order to avoid filesystem overflow, the following must be done in regular + time intervals: +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm log-rotate all +\family default + +\begin_inset Newline newline +\end_inset + +This starts appending to a new logfile on all of your resources. + The logfiles are automatically numbered by an increasing 9-digit logfile + number. + This will suffice for many centuries even if you would logrotate once a + minute. + Practical frequencies for logfile rotation are more like once an hour, + or every 10 minutes when having highly-loaded storage servers. +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm log-delete-all all +\family default + +\begin_inset Newline newline +\end_inset + +This determines all logfiles from all resources which are no longer needed + (i.e. + which are +\emph on +fully +\emph default + replayed, on +\emph on +all +\emph default + relevant secondaries). + All superfluous logfiles are then deleted, including all copies on all + secondaries. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + The current version of MARS deletes either +\emph on +all +\emph default + replicas of a logfile everywhere, or +\emph on +none +\emph default + of the replicas. 
+ This is a simple rule, but has the drawback that one node may hinder other + nodes from freeing space in +\family typewriter +/mars/ +\family default +. + In particular, the command +\family typewriter +marsadm pause-replay $res +\family default + (as well as +\family typewriter +marsadm disconnect $res +\family default +) will freeze the space reclamation in the whole cluster when the pause + is lasting very long. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + During such space accumulation, also the number of so-called deletions + will accumulate in /mars/todo-global/ and sibling directories. + In very big installations consisting of thousands of nodes, it is a good + idea to regularly monitor the number of deletions similarly to the following: + +\family typewriter +$(find /mars/ -name +\begin_inset Quotes eld +\end_inset + +delete-* +\begin_inset Quotes erd +\end_inset + + | wc -l) +\family default + should not exceed a limit of ~150 entries. +\end_layout + +\begin_layout Standard +Please prefer the short form +\family typewriter +marsadm cron +\family default + as an equivalent to scripting two separate commands +\family typewriter +marsadm log-rotate all +\family default + and +\family typewriter +marsadm log-delete-all all +\family default +. + The short form is not only easier to remember, but also future-proof in + case some new MARS features should be implemented in future. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Best practice is to run +\family typewriter +marsadm cron +\family default + in a +\family typewriter +cron +\family default + job, such as +\family typewriter +/etc/cron.d/mars +\family default +. + An example cronjob can be found in the +\family typewriter +userspace/cron.d/ +\family default + subdirectory of the git repo. 
+\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In addition, you should establish some regular monitoring of the free space + present in the +\family typewriter +/mars/ +\family default + filesystem. +\end_layout + +\begin_layout Standard +More detailed information about avoidance of +\family typewriter +/mars/ +\family default + overflow is in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Defending-Overflow" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +Switch Primary / Secondary Roles +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/switching.fig + width 90col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +In contrast to DRBD, MARS distinguishes between +\emph on +intended +\emph default + and +\emph on +forced +\emph default + switching. + This distinction is necessary due to differences in the communication architect +ure (asynchronous communication vs synchronous communication, see sections + +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Lamport-Clock" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Symlink-Tree" + +\end_inset + +). +\end_layout + +\begin_layout Standard +Asynchronous communication means that (in the worst case) a message may take + (almost) arbitrary time in a distorted network to propagate to another + node. + As a consequence, the risk for accidentally creating an (unintended) split + brain is increased (compared to a synchronous system like DRBD). +\end_layout + +\begin_layout Standard +In order to minimize this risk, MARS has invested a lot of effort into an + internal handover protocol when you start an +\emph on +intended +\emph default + primary switch. 
+\end_layout + +\begin_layout Subsubsection +Intended Switching / Planned Handover +\begin_inset CommandInset label +LatexCommand label +name "subsec:Intended-Switching" + +\end_inset + + +\end_layout + +\begin_layout Standard +Before starting a planned handover from your old primary +\family typewriter +A +\family default + to a new primary +\family typewriter +B +\family default +, you should check the replication of the resource. + As a human, use +\family typewriter +marsadm view mydata +\family default +. + For scripting, use the macros from section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Predefined-Trivial-Macros" + +\end_inset + + (see also section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Scripting-HOWTO" + +\end_inset + +; an example can be found in +\begin_inset Flex URL +status collapsed + +\begin_layout Plain Layout + +contrib/example-scripts/ +\end_layout + +\end_inset + +). + The network should be OK, and the amount of replication delay should be + as low as possible. + Otherwise, handover may take a very long time. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Best practice is to +\series bold +prepare a planned handover +\series default + by the following steps: +\end_layout + +\begin_layout Enumerate +Check the network and the replication lag. + It should be low (a few hundred megabytes, or a low number of gigabytes + - see also the rough time forecast shown by +\family typewriter +marsadm view mydata +\family default + when there is a larger replication delay, or directly access the forecast + by +\family typewriter +marsadm view-replinfo +\family default +). 
+\end_layout + +\begin_layout Enumerate +Only when the +\family typewriter +systemd +\family default + method from section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:systemd-Templates" + +\end_inset + + is +\emph on +not +\emph default + used: stop your application, then umount +\family typewriter +/dev/mars/mydata +\family default + on host +\family typewriter +A +\family default +. +\end_layout + +\begin_layout Enumerate +Optionally: when the +\family typewriter +systemd +\family default + method from section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:systemd-Templates" + +\end_inset + + is +\emph on +not +\emph default + used, and when scripting something else, or when typing extremely fast + by hand, or for better safety: say +\family typewriter +marsadm wait-umount mydata +\family default + on host +\family typewriter +B +\family default +. + When your network is OK, the propagation of the device usage state +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that the usage check for +\family typewriter +/dev/mars/mydata +\family default + on host +\family typewriter +B +\family default + is based on the +\emph on +open count +\emph default + transferred from +\emph on +another +\emph default + node +\family typewriter +A +\family default +. + Since MARS is operating asynchronously (in contrast to DRBD), it may take + some time until our node +\family typewriter +B +\family default + knows that the device is no longer used at +\family typewriter +A +\family default +. 
+ This can lead to a race condition if you automate an intended takeover + with a script like +\family typewriter +ssh root@A +\begin_inset Quotes eld +\end_inset + +umount /dev/mars/mydata +\begin_inset Quotes erd +\end_inset + +; ssh root@B +\begin_inset Quotes eld +\end_inset + +marsadm primary mydata +\begin_inset Quotes erd +\end_inset + + +\family default + because your second ssh command may be faster than the internal MARS symlink + tree propagation (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Symlink-Tree" + +\end_inset + +). + In order to prevent such races, you are strongly advised to use the command +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm wait-umount mydata +\end_layout + +\begin_layout Plain Layout +on node +\family typewriter +B +\family default + before trying to become primary. + See also section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Scripting-HOWTO" + +\end_inset + +. +\end_layout + +\end_inset + + should take only a few seconds. + Otherwise, check for any network problems or any other problems. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +This step is not really necessary, because +\family typewriter +marsadm primary +\family default + will also wait for the +\family typewriter +umount +\family default + before it will proceed. + However, scripting this intermediate step gives you some more options: + if the +\family typewriter +umount +\family default + takes too long, you may program a different action, like re-starting at + the old primary, or its contrary, some forced umount, or even continuing + with a forceful failover instead (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Forced-Switching" + +\end_inset + +). 
+\end_layout + +\begin_layout Enumerate +Optionally, and when the +\family typewriter +systemd +\family default + method from section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:systemd-Templates" + +\end_inset + + is +\emph on +not +\emph default + used: on host +\family typewriter +B +\family default +, wait until +\family typewriter +marsadm view mydata +\family default + (or +\family typewriter +view-diskstate +\family default +) shows +\family typewriter +UpToDate +\family default +. + It is possible to omit this step, but then you have no control on the duration + of the handover, and in case of any transfer problems, disk space problems, + etc you are potentially risking to produce a split brain (although +\family typewriter +marsadm +\family default + will do its best to avoid it). + Doing the wait by yourself, +\emph on +before +\emph default + starting +\family typewriter +marsadm primary +\family default +, has a big advantage: you can abort the handover cycle at any time, just + by re-mounting the device +\family typewriter +/dev/mars/mydata +\family default + at the old primary +\family typewriter +A +\family default + again, and by re-starting your application. + Once you have started +\family typewriter +marsadm primary +\family default + on host +\family typewriter +B +\family default +, you might have to switch back, or possibly even via +\family typewriter +primary --force +\family default + (see sections +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Forced-Switching" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + +). +\end_layout + +\begin_layout Standard +Switching the roles is very similar to DRBD: just issue the command +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm primary mydata +\end_layout + +\begin_layout Standard +on your formerly secondary node +\family typewriter +B +\family default +. 
+ In combination with a properly set-up +\family typewriter +systemd +\family default + method (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:systemd-Templates" + +\end_inset + +), this will even automatically start your application at the new site. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +The most important difference to DRBD: don't use an intermediate +\family typewriter +marsadm secondary mydata +\family default + anywhere. + Although it would be possible, it has some +\emph on +disadvantages +\emph default +. + Always switch +\emph on +directly +\emph default +! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In contrast to DRBD, MARS remembers the designated primary, even when your + system crashes and reboots. + While in case of a crash you have to re-setup DRBD with commands like +\family typewriter +drbdadm up +\begin_inset Formula $\ldots$ +\end_inset + +; drbdadm primary +\begin_inset Formula $\ldots$ +\end_inset + + +\family default +, MARS will automatically resume its former roles just by saying +\family typewriter +modprobe mars +\family default +. + In combination with a properly set-up +\family typewriter +systemd +\family default + method (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:systemd-Templates" + +\end_inset + +), this will even automatically re-start your application. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Another fundamental difference to DRBD: when the network is healthy, there + can only exist +\emph on +one +\emph default + designated primary at a time (modulo some communication delays caused by + the +\begin_inset Quotes eld +\end_inset + +eventually consistent +\begin_inset Quotes erd +\end_inset + + communication model, see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Lamport-Clock" + +\end_inset + +). + By saying +\family typewriter +marsadm primary mydata +\family default + on host +\family typewriter +B +\family default +, +\series bold +all other +\series default + hosts (including +\family typewriter +A +\family default +) will +\series bold +automatically go into secondary role +\series default + after a while! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +You simply +\emph on +don't need +\emph default + an intermediate +\family typewriter +marsadm secondary mydata +\family default + for planned handover! +\end_layout + +\begin_layout Standard +Precondition for a plain +\family typewriter +marsadm primary +\family default + (without +\family typewriter +systemd +\family default +) is that you are up, that means in attached and connected state (cf. + +\family typewriter +marsadm up +\family default +), that you are no sync target anymore, and (only when +\family typewriter +systemd +\family default + isn't configured to automatically stop the application at the old site) + that any old primary (in this case +\family typewriter +A +\family default +) does not use its +\family typewriter +/dev/mars/mydata +\family default + device any longer, and that the network is healthy. 
+ If some (parts of) logfiles are not yet (fully) transferred to the new + primary, you will need enough space on +\family typewriter +/mars/ +\family default + at the target side. + If one of the preconditions described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Operation-of-the" + +\end_inset + + is violated, +\family typewriter +marsadm primary +\family default + may refuse to start. +\end_layout + +\begin_layout Standard +These preconditions try to protect you from doing silly things, such as + accidentally provoking a split brain error state. + We try to avoid split brain as best as we can. + Therefore, we distinguish between +\emph on +intended +\emph default + and +\emph on +emergency +\emph default + switching. + Intended switching will try to avoid split brain +\emph on +as best as it can +\emph default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Don't +\emph on +rely +\emph default + on split brain avoidance, in particular when scripting any higher-level + applications such as cluster managers (cf. + section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Scripting-HOWTO" + +\end_inset + +). + +\family typewriter +marsadm +\family default + does its best, but at least in case of (unnoticed) network outages / partitions + (or +\emph on +extremely, really extremely +\emph default + slow / overloaded networks), an attempt to become +\family typewriter +UpToDate +\family default + may fail. 
+ If you want to +\emph on +ensure +\emph default + that no split brain can result from intended primary switching, please + obey the best practices from above, and please give the +\family typewriter +primary +\family default + command only after your secondary is +\emph on +known +\begin_inset Foot +status open + +\begin_layout Plain Layout +As noted in many places in this manual, checking this cannot be done by + looking at the local state of a single cluster node. + You have to check several nodes. + +\family typewriter +marsadm +\family default + can only check the +\emph on +local +\emph default + node reliably! +\end_layout + +\end_inset + + +\emph default + to be +\emph on +really +\emph default + +\family typewriter +UpToDate +\family default + (see +\family typewriter +marsadm wait-cluster +\family default + and +\family typewriter +marsadm view +\family default + and other macros described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Inspecting-the-State" + +\end_inset + +). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + A +\emph on +very rough +\emph default + estimation of the time to become +\family typewriter +UpToDate +\family default + is displayed by +\family typewriter +marsadm view mydata +\family default + or other macros (e.g. + +\family typewriter +view-replinfo +\family default +). + However, on very flaky networks, the estimation may not only flicker much, + but also be inaccurate. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Planned handover is refused +\emph on +by default +\emph default + when some sync is running somewhere. 
+ By adding the option +\family typewriter +--ignore-sync +\family default +, you are no longer protected by this +\emph on +safety measure +\emph default +, and you are willing to accept that any already running syncs will restart + from point 0, in order to ensure consistency. +\end_layout + +\begin_layout Subsubsection +Forced Switching +\begin_inset CommandInset label +LatexCommand label +name "subsec:Forced-Switching" + +\end_inset + + +\end_layout + +\begin_layout Standard +In case the connection to the old primary is lost for whatever reason, we + just don't know anything about its +\emph on +current +\emph default + state (which may deviate from its +\emph on +last known +\emph default + state). + The following command sequence will skip many checks (essentially you just + need to be attached and you must not be a current sync target) and tell + your node to become primary forcefully: +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm pause-fetch mydata +\family default + +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + notice that this is similar to +\family typewriter +drbdadm disconnect mydata +\family default + as you are probably used from DRBD. + For better compatibility with DRBD, you may use the alternate syntax +\family typewriter +marsadm disconnect mydata +\family default + instead. + However, there is a subtle difference to DRBD: DRBD will drop +\emph on +both +\emph default + sides of its single bi-directional connection and no longer try to re-connect + from any of both sides, while +\family typewriter +pause-fetch +\family default + is equivalent to +\family typewriter +pause-fetch-local +\family default +, which instructs only the +\emph on +local +\emph default + host to stop fetching logfiles. 
+ Other members of the cluster, including the former primary, are +\emph on +not +\emph default + instructed to do so. + They may continue fetching logfiles over their own private TCP connections, + potentially using many connections in parallel, and potentially even from + any +\emph on +other +\emph default + member of the resource, if they think they can get the data from there. + In order to instruct +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that not all such instructions may arrive at all sites when the network + is interrupted (or extremely slow). +\end_layout + +\end_inset + + +\emph on +all +\emph default + members of the resource to stop fetching logfiles, you may use +\family typewriter +marsadm pause-fetch-global mydata +\family default + instead (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Operation-of-the" + +\end_inset + +). +\end_layout + +\end_deeper +\begin_layout Itemize + +\family typewriter +marsadm primary mydata --force +\family default + +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + this is the forceful failover. + Depending on the current replication lag, you may lose some data. + Use +\family typewriter +--force +\family default + only if you know what you are doing! +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + When +\family typewriter +systemd +\family default + is configured properly (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:systemd-Templates" + +\end_inset + +), your application will start automatically at the new primary site. 
+\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + when the network is interrupted, the old primary site cannot know this, + and will continue running. + Once the metadata exchange is working again (by default on port 7777), + the old site will be automatically shut down by its local +\family typewriter +systemd +\family default + configuration, when configured properly (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:systemd-Templates" + +\end_inset + +). + In difference to the +\emph on +planned +\emph default + handover from section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Intended-Switching" + +\end_inset + +, this may happen much later. + In case of long-lasting network outages, even days or weeks! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + +Running both sites in parallel for a long time may seriously damage your + business. + Ensure that any +\series bold +customer traffic +\series default + cannot go to the old site! Be sure to configure your BGP in a proper way, + such that +\emph on +only +\emph default +, and +\emph on +only +\emph default + the new site will receive any customer traffic from both inside and outside + networks, like the internet. +\end_layout + +\end_deeper +\begin_layout Itemize + +\family typewriter +marsadm resume-fetch mydata +\family default + +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Standard +As such, the new primary does not really need this, because primaries are + producing their own logfiles without need for fetching. 
+ This is only to undo the previous +\family typewriter +pause-fetch +\family default +, in order to avoid future surprises when the new primary will somewhen + change to secondary mode again (in the far-distant future), and you have + forgotten to remember the fact that fetching had been switched off. + +\end_layout + +\end_deeper +\begin_layout Standard +When using +\family typewriter +--force +\family default +, many precondition checks and other internal checks are skipped, and in + particular the internal handover protocol for split brain avoidance. +\end_layout + +\begin_layout Standard +Therefore, use of +\family typewriter +--force +\family default + is +\emph on +likely +\emph default + to +\series bold +provoke a split brain +\series default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\series bold +Split brain +\series default + is always an +\series bold +erroneous state +\series default + which should be never entered deliberately! Once you have entered it accidental +ly, you +\series bold +must +\series default + resolve it ASAP (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + +), otherwise you cannot operate your resource in the long term. +\end_layout + +\begin_layout Standard +In order to impede you from giving an accidental +\family typewriter +--force +\family default +, the precondition is different: +\family typewriter +--force +\family default + works only in +\emph on +locally disconnected +\emph default + state. + This is similar to DRBD. +\end_layout + +\begin_layout Standard +Remember: +\family typewriter +marsadm primary +\family default + without +\family typewriter +--force +\family default + tries to prevent split brain as best as it can. 
+ Use of the +\family typewriter +--force +\family default + option will almost +\emph on +certainly +\emph default + provoke a split brain, at least if the old primary continues to operate + on its local +\family typewriter +/dev/mars/mydata +\family default + device. + Therefore, you are +\series bold +strongly advised +\series default + to do this +\series bold +only +\series default + after +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm primary +\family default + without +\family typewriter +--force +\family default + has failed +\emph on +for no good reason +\emph default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +Most reasons will be displayed by +\family typewriter +marsadm +\family default + when it is rejecting the planned handover. +\end_layout + +\end_inset + +, and +\end_layout + +\begin_layout Enumerate +You are sure you +\emph on +really +\emph default + want to switch, even when that eventually leads to a split brain. + You also declare that you are willing to do +\emph on +manual +\emph default + split-brain resolution as described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + +, or even destruction / reconstruction of a damaged node as described in + section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Final-Destroy-of" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Notice: in case of +\emph on +connection loss +\emph default + (e.g. + networking problems / network partitions), you may not be able to reliably + detect whether a split brain actually resulted, or not. +\end_layout + +\begin_layout Paragraph +Some Background +\end_layout + +\begin_layout Standard +In contrast to DRBD, split brain situations are handled differently by MARS + . 
+ When two primaries are accidentally active at the same time, each of them + writes into different logfiles +\family typewriter +/mars/resource-mydata/log-000000001-A +\family default + and +\family typewriter +/mars/resource-mydata/log-000000001-B +\family default + where the +\emph on +origin +\emph default + host is always recorded in the filename. + Therefore, both nodes +\emph on +can theoretically +\emph default + run in primary mode independently from each other, at least for some time. + They +\emph on +might +\emph default + even +\family typewriter +log-rotate +\family default + independently from each other. + However, this is really no good idea. + The replication to third nodes will likely get stuck, and your +\family typewriter +/mars/ +\family default + filesystem(s) will eventually run out of space. + Any further secondary node (when having +\begin_inset Formula $k>2$ +\end_inset + + replicas) will certainly get into serious problems: it simply does not + know which split-brain version it should follow. + Therefore, you will certainly lose the actuality of your redundancy. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\family typewriter +marsadm secondary +\family default + is +\emph on +strongly discouraged +\emph default +. + It tells the whole cluster that +\emph on +nobody +\emph default + is designated as primary any more. + +\emph on +All +\emph default + nodes should go into secondary mode, globally. + In the current version of MARS, the secondaries will no longer fetch any + logfiles, since they don't know which version is the +\begin_inset Quotes eld +\end_inset + +right +\begin_inset Quotes erd +\end_inset + + one. + Syncing is also not possible. + When the device +\family typewriter +/dev/mars/mydata +\family default + is in use somewhere, it will remain in +\emph on +actual +\emph default + primary mode during that time. 
+ As soon as the local +\family typewriter +/dev/mars/mydata +\family default + is released, the node will +\emph on +actually +\emph default + go into secondary mode if it is no longer designated as primary. + You should avoid it in advance by always +\emph on +directly +\emph default + switching over from one primary to another one, without intermediate +\family typewriter +secondary +\family default + command. + This is different from DRBD. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Split brain situations are detected +\emph on +passively +\emph default + by secondaries. + Whenever a secondary detects that somewhere a split brain has happened, + it refuses to replay any logfiles behind the split point (and also to fetch + them when possible), or anywhere where something appears suspect or ambiguous. + This tries to keep its local disk state always being consistent, but outdated + with respect to any of the split brain versions. + As a consequence, becoming primary may be impossible, because it cannot + always know which logfiles are the correct ones to replay before +\family typewriter +/dev/mars/mydata +\family default + can appear. + The ambiguity must be resolved first. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + If you +\emph on +really +\emph default + need the local device +\family typewriter +/dev/mars/mydata +\family default + to disappear +\emph on +everywhere +\emph default + in a split brain situation, you don't need a +\emph on +strongly discouraged +\emph default + +\family typewriter +marsadm secondary +\family default + command for this. + +\family typewriter +marsadm detach +\family default + or +\family typewriter +marsadm down +\family default + can do it also, without destroying knowledge about the former designated + primary. 
+\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\family typewriter +marsadm primary --force +\family default + is rejected in newer +\begin_inset Foot +status open + +\begin_layout Plain Layout +Beware: older versions before +\family typewriter +mars0.1stable52 +\family default + did deliberately skip this check because a few years ago somebody at 1&1 + did place a +\emph on +requirement +\emph default + on this. + Fortunately, the requirement now has gone, so a more safe behaviour could + be implemented. + The new behaviour is for your safety, to prevent you from doing +\begin_inset Quotes eld +\end_inset + +silly +\begin_inset Quotes erd +\end_inset + + things in case you are under pressure during an incident (try to safeguard + human error as best as possible). +\end_layout + +\end_inset + + marsadm versions if your replica is a current sync target. + This is not a bug: it should prevent you from forcing an inconsistent replica + into primary mode, which will +\emph on +certainly +\emph default + lead to inconsistent data. + However, in extremely rare cases of severe damage of +\emph on +all +\emph default + of your replicas, you may be desperate. + Only in such a rare case, and only then, you might decide to force any + of your replicas (e.g. + based on their last sync progress bar) into primary role although none + of the re-syncs had finished before. + In such a case, and only if you really know what you are doing, you may + use +\family typewriter +marsadm fake-sync +\family default + to first mark your inconsistent replica as UpToDate (which is a +\series bold +lie +\series default +) and then force it to primary as explained above. + Afterwards, you will certainly need an +\family typewriter +fsck +\family default + or similar repair before you can restart your application. + Good luck! 
And don't forget to check the size of +\family typewriter +lost+found +\family default + afterwards. + This is really your +\emph on +very last +\emph default + chance if nothing else had succeeded before. +\end_layout + +\begin_layout Subsection +Split Brain Resolution +\begin_inset CommandInset label +LatexCommand label +name "subsec:Split-Brain-Resolution" + +\end_inset + + +\end_layout + +\begin_layout Standard +Split brain can naturally occur during a long-lasting network outage (aka + network partition) when you (forcefully) switch primaries inbetween, or + due to final loss of your old primary node (fatal node crash) when not + all logfile data had been transferred immediately before the final crash. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Remember that split brain is an +\series bold +erroneous state +\series default + which must be resolved as soon as possible! +\end_layout + +\begin_layout Standard +Whenever split brain occurs for whatever reason, you have two choices for + resolution: either destroy one of your versions, or retain it under a different + resource name. +\end_layout + +\begin_layout Standard +In any of both cases, do the following steps ASAP: +\end_layout + +\begin_layout Enumerate + +\series bold +Manually +\series default + check which (surviving) version is the +\begin_inset Quotes eld +\end_inset + +right +\begin_inset Quotes erd +\end_inset + + one. + Any error is up to you: destroying the wrong version is +\emph on +your +\emph default + fault, not the fault of MARS. +\end_layout + +\begin_layout Enumerate +If you did not already switch your primary to the final destination determined + in the previous step, do it now (see description in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Forced-Switching" + +\end_inset + +). 
+ Don't use an intermediate +\family typewriter +marsadm secondary +\family default + command (as known from DRBD): +\emph on +directly +\emph default + switch to the new designated primary! +\end_layout + +\begin_layout Enumerate +Unless +\family typewriter +systemd +\family default + is configured properly (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:systemd-Templates" + +\end_inset + +), do the following manually: on each non-right version (which you don't + want to retain) which had been primary before, umount your +\family typewriter +/dev/mars/mydata +\family default + or otherwise stop using it (e.g. + stop iSCSI or other users of the device). + Wait until each of them has actually left primary state and until their + local logfile(s) have been fully written back to the underlying disk. +\end_layout + +\begin_layout Enumerate +Wait until the network works again. + All your (surviving) cluster nodes +\emph on +must +\emph default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +If you are a MARS expert and you really know what you are doing (in particular, + you can anticipate the effects of the Lamport clock and of the symlink + update protocol including the +\begin_inset Quotes eld +\end_inset + +eventually consistent +\begin_inset Quotes erd +\end_inset + + behaviour including the not-yet-consistent intermediate states, see sections + +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Lamport-Clock" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Symlink-Tree" + +\end_inset + +), you may deviate from this requirement. +\end_layout + +\end_inset + + be able to communicate with each other. + If that is not possible, or if it takes too long, you may fall back to + the method described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Final-Destroy-of" + +\end_inset + +, but do this only as far as necessary. 
+\end_layout + +\begin_layout Standard +The next steps are different for different use cases: +\end_layout + +\begin_layout Paragraph +Destroying a Wrong Split Brain Version +\end_layout + +\begin_layout Standard +Continue with the following steps, each on those cluster node(s) where you + do not want to retain its split-brain version. + In preference, start with the old +\begin_inset Quotes eld +\end_inset + +wrong +\begin_inset Quotes erd +\end_inset + + primaries first (see advice at the end of this section): +\end_layout + +\begin_layout Enumerate-Resume + +\family typewriter +marsadm invalidate mydata +\family default + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +setcounter{enumi}{4} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +When no split brain is reported anymore after that (via +\family typewriter +marsadm view all +\family default +), you are done. + You need to repeat this on other secondaries only when necessary. +\end_layout + +\begin_layout Standard +In very rare cases when things are screwed up very heavily (e.g. + a partly destroyed +\family typewriter +/mars/ +\family default + partition), you may try an alternate method described in appendix +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Alternative-Methods-for" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Check that state +\family typewriter +Orphan +\family default + is left after a while. + Notice that +\family typewriter +invalidate +\family default + is only +\emph on +restarting +\emph default + an existing replica, but does not wait for its completion. +\end_layout + +\begin_layout Paragraph +Keeping a Split Brain Version +\end_layout + +\begin_layout Standard +On those cluster node(s) where you want to retain the version (e.g. 
+ for inspection purposes): +\end_layout + +\begin_layout Enumerate-Resume + +\family typewriter +marsadm leave-resource mydata +\end_layout + +\begin_layout Enumerate-Resume +After having done this on +\emph on +all +\emph default + those cluster nodes, check that the split brain is gone (e.g. + by saying +\family typewriter +marsadm view mydata +\family default +), as documented above. + In very rare cases, you might also need a +\family typewriter +log-purge-all +\family default + (see page +\begin_inset CommandInset ref +LatexCommand pageref +reference "log-purge-all$res" + +\end_inset + +). +\end_layout + +\begin_layout Enumerate-Resume +Rename the underlying local disk +\family typewriter +/dev/lv-x/mydata +\family default + into something like +\family typewriter +/dev/lv-x/mynewdata +\family default + (see +\family typewriter +man lvrename +\family default +). This is +\emph on +extremely +\emph default + recommended to avoid confusion with the old resource name! +\end_layout + +\begin_layout Enumerate-Resume +Check that each underlying local disk +\family typewriter +/dev/lv-x/mynewdata +\family default + is really usable afterwards, e.g. + by test-mounting it (or +\family typewriter +fsck +\family default + if you can afford it). + If all is OK, don't forget to umount it before proceeding with the next + step. +\end_layout + +\begin_layout Enumerate-Resume +Create a completely new MARS resource out of the underlying disk +\family typewriter +/dev/lv-x/mynewdata +\family default + having a different name, best is +\family typewriter +mynewdata +\family default + (see description in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Creating-and-Maintaining" + +\end_inset + +).
+\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Generally: +\series bold + best practice +\series default + is to always keep your LV names equal to your MARS resource names. + This can avoid a +\emph on +lot +\emph default + of unnecessary confusion. +\end_layout + +\begin_layout Paragraph +Keeping a Good Version +\end_layout + +\begin_layout Standard +When you had a secondary which did not participate in the split brain, but + just got confused and therefore stopped replaying logfiles immediately + before the split-brain point, it may very well happen +\begin_inset Foot +status open + +\begin_layout Plain Layout +In general, such a +\begin_inset Quotes eld +\end_inset + +good +\begin_inset Quotes erd +\end_inset + + behaviour cannot be guaranteed for all secondaries. + Race conditions in complex networks may asynchronously transfer +\begin_inset Quotes eld +\end_inset + +wrong +\begin_inset Quotes erd +\end_inset + + logfile data to a secondary much earlier than conflicting +\begin_inset Quotes eld +\end_inset + +good +\begin_inset Quotes erd +\end_inset + + logfile data which will be marked +\begin_inset Quotes eld +\end_inset + +good +\begin_inset Quotes erd +\end_inset + + only in the +\emph on +future. + +\emph default + It is impossible to predict this in advance. +\end_layout + +\end_inset + + that you don't need to do any action for it. + When all wrong versions have disappeared from the cluster (by +\family typewriter +invalidate +\family default + or +\family typewriter +leave-resource +\family default + as described before), the confusion should be over, and the secondary should + automatically resume tracking of the new unique version. +\end_layout + +\begin_layout Standard +Please check that +\emph on +all +\emph default + of your secondaries are no longer stuck. 
+ You need to execute split brain resolution only for +\emph on +stuck +\emph default + nodes. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Hint / advice for +\begin_inset Formula $k>2$ +\end_inset + + replicas: it is a good idea to start split brain resolution +\emph on +first +\emph default + with those (few) nodes which had been (accidentally) primary before, but + are not the new designated primary. + Usually, you had 2 primaries during split brain, so this will apply only + to +\emph on +one +\emph default + of them. + Leave the other one intact, by not umounting +\family typewriter +/dev/mars/mydata +\family default + at all, and keeping your applications running. + Even during emergency mode, see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Emergency-Mode" + +\end_inset + +. + +\emph on +First +\emph default + resolve the problem of the +\begin_inset Quotes eld +\end_inset + +wrong +\begin_inset Quotes erd +\end_inset + + primary(s) via +\family typewriter +invalidate +\family default + or +\family typewriter +leave-resource +\family default +. + Wait for a short while. + Then check the rest of your secondaries, whether they now are already following + the new (unique) primary, and finally check whether the split brain warning + reported by +\family typewriter +marsadm view all +\family default + is gone everywhere. + This way, you can often skip unnecessary invalidations of replicas. +\end_layout + +\begin_layout Subsection +Final Destruction of a Damaged Node +\begin_inset CommandInset label +LatexCommand label +name "subsec:Final-Destroy-of" + +\end_inset + + +\end_layout + +\begin_layout Standard +When a node has eventually died, do the following steps ASAP: +\end_layout + +\begin_layout Enumerate + +\emph on +Physically +\emph default + remove the dead node from your network. + Unplug all network cables! 
Failing to do so might provoke a disaster in + case it somehow resurrects in an uncontrolled manner, such as a partly-damaged + +\family typewriter +/mars/ +\family default + filesystem, a half-defective kernel, RAM / kernel memory corruption, disk + corruption, or whatever. + Don't risk any such unpredictable behaviour! +\end_layout + +\begin_layout Enumerate + +\series bold +Manually +\series default + check which of the surviving versions will be the +\begin_inset Quotes eld +\end_inset + +right +\begin_inset Quotes erd +\end_inset + + one. + Any error is up to you: resurrecting an unnecessarily old / outdated version + and/or destroying the newest / best version is +\emph on +your +\emph default + fault, not the fault of MARS. +\end_layout + +\begin_layout Enumerate +If you did not already switch your primary to the final destination determined + in the previous step, do it now (see description in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Forced-Switching" + +\end_inset + +). +\end_layout + +\begin_layout Enumerate +On a surviving node, but preferably +\emph on +not +\emph default + the new designated primary, give the following commands: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Enumerate + +\family typewriter +marsadm --host=your-damaged-host down mydata +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm --host=your-damaged-host leave-resource mydata +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Check for misspellings, in particular the hostname of the dead node, and + check the command syntax before typing return! Otherwise, you may forcefully + destroy the wrong +\begin_inset Foot +status open + +\begin_layout Plain Layout +That said, MARS is rather tolerant of human error. 
+ Once a sysadmin accidentally destroyed a cluster while it was continuously + running as primary. + Fortunately, the problem was detected early enough for a correction without + causing any extraordinary customer downtime outside of accepted tolerances, + and no data loss at all. +\end_layout + +\end_inset + + node! +\end_layout + +\end_deeper +\begin_layout Enumerate +In case any of the previous commands should fail (which is rather likely), + repeat it with an additional +\family typewriter +--force +\family default + option. + Don't use +\family typewriter +--force +\family default + in the first place, always try first without it! +\end_layout + +\begin_layout Enumerate +Repeat the same with +\emph on +all +\emph default + resources which were formerly present at +\family typewriter +your-damaged-host +\family default +. +\end_layout + +\begin_layout Enumerate +Finally, say +\family typewriter +marsadm --host=your-damaged-host leave-cluster +\family default + (optionally augmented with +\family typewriter +--force +\family default +). +\end_layout + +\begin_layout Standard +Now your surviving nodes should +\emph on +believe +\emph default + that the old node +\family typewriter +your-damaged-host +\family default + no longer exists, and that it no longer participates in any resource. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Even if your dead node comes to life again in some way: always ensure that + the mars kernel module cannot run any more. + +\emph on +Never +\emph default + do a +\family typewriter +modprobe mars +\family default + on a node marked as dead this way!
+\end_layout + +\begin_layout Standard +Further instructions for complicated cases are in appendix +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Alternative-De--and" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Cleanup-in-case" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +Online Resizing during Operation +\end_layout + +\begin_layout Standard +You should have LVM or some other means of increasing the physical size + of your disk (e.g. + via firmware of some RAID controllers). + The network must be healthy. + Do the following steps: +\end_layout + +\begin_layout Enumerate +Increase your local disks (usually +\family typewriter +/dev/vg/mydata +\family default +) +\emph on +everywhere +\emph default + in the whole cluster. + In order to avoid wasting space, increase them +\emph on +uniformly +\emph default + to the same size (when possible). + The +\family typewriter +lvresize +\family default + tool is documented elsewhere. +\end_layout + +\begin_layout Enumerate +Check that all MARS switches are on. + If not, say +\family typewriter +marsadm up mydata +\family default + everywhere. +\end_layout + +\begin_layout Enumerate +At the primary: +\family typewriter +marsadm resize mydata +\end_layout + +\begin_layout Enumerate +If you have intermediate layers such as iSCSI, you may need some +\family typewriter +iscsiadm +\family default + update or other command. +\end_layout + +\begin_layout Enumerate +Now you may increase your filesystem. + This is specific for the filesystem type and documented elsewhere. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Hint: the secondaries will start syncing the increased new part of the underlyin +g primary disk. + In many cases, this is not really needed, because the new junk data just + does not care. 
+ If you are sure and if you know what you are doing, you may use +\family typewriter +marsadm fake-sync mydata +\family default + to abort such unnecessary traffic. +\end_layout + +\begin_layout Section +The State of MARS +\begin_inset CommandInset label +LatexCommand label +name "sec:The-State-of" + +\end_inset + + +\end_layout + +\begin_layout Standard +In general, MARS tries to +\emph on +hide +\emph default + any network failures from you as best as it can. + After a network problem, any internal low-level socket connections are + +\emph on +transparently +\emph default + tried to re-open ASAP, without need for sysadmin intervention. + In difference to DRBD, network failures will +\emph on +not +\emph default + automatically alter the state of MARS, such as switching to +\family typewriter +disconnected +\family default + after a +\family typewriter +ko_timeout +\family default + or similar. + From a high-level sysadmin viewpoint, communication may just take a very + long time to succeed. +\end_layout + +\begin_layout Standard +When the behaviour of MARS is different from DRBD, it is usually intended + as a feature. +\end_layout + +\begin_layout Standard +MARS is not only an +\series bold +asynchronous +\series default + system at block IO level, but also +\series bold +at control level +\series default +. +\end_layout + +\begin_layout Standard +This is +\emph on +necessary +\emph default + because in a widely distributed long-distance system running on slow or + even temporarily failing networks, actions may take a long time, and there + may be many actions +\series bold +started in parallel +\series default +. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Synchronous concepts are generally not sufficient for expressing that. 
+ Because of inherent asynchronicity and of dynamic creation / joining of + resources, it is neither possible to comprehensively depict a complex distribut +ed MARS system, nor a comprehensive standalone snippet of MARS, as a finite + state transition diagram +\begin_inset Foot +status open + +\begin_layout Plain Layout +Probably it could be possible to formally model MARS as a Petri net. + However, complete Petri nets tend to become very complex, and to + describe lots of low-level details. + Expressing hierarchy, in a top-down fashion, is cumbersome. + We find no clue in trying to do so. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +Although MARS tries to +\emph on +approximate +\emph default + / +\emph on +emulate +\emph default + the synchronous control behaviour of DRBD at the interface level ( +\family typewriter +marsadm +\family default +) in many situations as best as it can, the +\emph on +internal +\emph default + control model is necessarily asynchronous. + As an experienced sysadmin, you will be curious how it works in principle. + When you know something about it, you will no longer be surprised when + some (detail) behaviour is different from DRBD. +\end_layout + +\begin_layout Standard +The general principle is an asynchronous 2-edge handshake protocol, which + is used almost everywhere in MARS: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/handshake.fig + width 80col% + +\end_inset + + +\end_layout + +\begin_layout Standard +We have a binary todo switch, which can be either in state +\begin_inset Quotes eld +\end_inset + +on +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +off +\begin_inset Quotes erd +\end_inset + +. + In addition, we have an actual response indicator, which is similar to + an LED indicating the actual status.
+ In our example, we imagine that both are used for controlling a big ventilator, + having a huge inert mass. + Imagine a big machine from a power plant, which is as tall as a human. +\end_layout + +\begin_layout Standard +We start in a situation where the binary switch is off, and the ventilator + is stopped. + At point 1, we turn on the switch. + At that moment, a big contactor will sound like +\begin_inset Quotes eld +\end_inset + +zonggg +\begin_inset Quotes erd +\end_inset + +, and a big motor will start to hum. + At first you won't hear anything else. + It will take a while, say 1 minute, until the big wheel will have reached + its final operating RPM, due to the huge inert mass. + During that spin-up, the lights in your room will become slightly darker. + When having reached the full RPM at point 2, your workplace will then be + noisier, but in exchange your room lights will be back at ordinary strength, + and the actual response LED will light up in order to indicate that + the big fan is now operational. +\end_layout + +\begin_layout Standard +Assume we want to turn the system off. + When turning the todo switch to +\begin_inset Quotes eld +\end_inset + +off +\begin_inset Quotes erd +\end_inset + + at point 3, first nothing will seem to happen at all. + The big wheel will keep spinning due to its heavy inert mass, and the RPM + as well as the sound will go down only slowly. + During spin-down, the actual response LED will stay illuminated, in order + to warn you that you should not touch the wheel, otherwise you may get + injured +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that it is only safe to access the wheel when +\emph on +both +\emph default + the switch and the LED are off. + Conversely, if at least one of them is on, something is going on inside + the machine. + Transferred to MARS: always look at +\emph on +both +\emph default + the todo switch and the corresponding actual indicator in order to not miss + something.
+\end_layout + +\end_inset + +. + The LED will only go off after, say, 2 minutes, when the wheel has actually + stopped at point 4. + After that, the cycle may potentially start over again. +\end_layout + +\begin_layout Standard +As you can see, all four possible cartesian product combinations between + two boolean values are occurring in the diagram. +\end_layout + +\begin_layout Standard +The same handshake protocol is used in MARS for communication between userspace + and kernelspace, as well as for communication in the widely distributed + system. +\end_layout + +\begin_layout Section +Inspecting the State of MARS +\begin_inset CommandInset label +LatexCommand label +name "sec:Inspecting-the-State" + +\end_inset + + +\end_layout + +\begin_layout Standard +The main command for viewing the current state of MARS is +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm view mydata +\end_layout + +\begin_layout Standard +or its more specialized variant +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm view- +\emph on +$macroname +\emph default + mydata +\end_layout + +\begin_layout Standard +where +\family typewriter +\emph on +$macroname +\family default +\emph default + is one of the macros described in chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:The-Macro-Processor" + +\end_inset + +, or a macro which has been written by yourself. +\end_layout + +\begin_layout Standard +As always, you may replace the resource name +\family typewriter +mydata +\family default + with the special keyword +\family typewriter +all +\family default + in order to get the state of all locally joined resources, as well as a + list of all those resources. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +When using the variant +\family typewriter +marsadm view all +\family default +, additionally the global communication status will be displayed. + This helps humans in diagnosing problems. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Hint: use the compound command +\family typewriter +watch marsadm view all +\family default + for continuous display of the current state of MARS. + When starting this side-by-side in +\family typewriter +ssh +\family default + terminal windows for all your cluster nodes, you can easily watch what's + going on in the whole cluster. +\end_layout + +\begin_layout Chapter +Basic Working Principle +\end_layout + +\begin_layout Standard +Even if you are impatient, please read this chapter. + At the +\emph on +surface +\emph default +, MARS appears to be very similar to DRBD. + It looks like almost being a drop-in replacement for DRBD. +\end_layout + +\begin_layout Standard +When taking this naïvely, you could easily step into some trivial pitfalls, + because the internal working principle of MARS is totally different from + DRBD. + Please forget (almost) anything you already know about the internal working + principles of DRBD, and look at the very different working principles of + MARS. 
+\end_layout + +\begin_layout Section +The Transaction Logger +\begin_inset CommandInset label +LatexCommand label +name "sec:The-Transaction-Logger" + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/MARS_Data_Flow.pdf + lyxscale 60 + width 100text% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The basic idea of MARS is to record all changes made to your block device + in a so-called +\series bold +transaction logfile +\series default +. + +\emph on +Any +\emph default + write request is treated like a transaction which changes the contents + of your block device. +\end_layout + +\begin_layout Standard +This is similar in concept to some database systems, but there exists no + separate +\begin_inset Quotes eld +\end_inset + +commit +\begin_inset Quotes erd +\end_inset + + operation: +\emph on +any +\emph default + write request acts like a commit. +\end_layout + +\begin_layout Standard +The picture shows the flow of write requests. + Let's start with the primary node. +\end_layout + +\begin_layout Standard +Upon submission of a write request on +\family typewriter +/dev/mars/mydata +\family default +, it is first buffered in a +\emph on +temporary +\emph default + memory buffer. +\end_layout + +\begin_layout Standard +The temporary memory buffer serves multiple purposes: +\end_layout + +\begin_layout Itemize +It keeps track of the order of write operations. +\end_layout + +\begin_layout Itemize +Additionally, it keeps track of the positions in the underlying disk +\family typewriter +/dev/lv-x/mydata +\family default +. + In particular, it detects when the same block is overwritten multiple times. +\end_layout + +\begin_layout Itemize +While a write operation is pending, any concurrent reads are served from the + memory buffer.
+\end_layout + +\begin_layout Standard +After the write has been buffered in the temporary memory buffer, the main + logger thread of the transaction logger creates a so-called +\emph on +log entry +\emph default + and starts an +\begin_inset Quotes eld +\end_inset + +append +\begin_inset Quotes erd +\end_inset + + operation on the transaction logfile. + The log entry contains vital information such as the logical block number + in the underlying disk, the length of the data, a timestamp, some header + magic in order to detect corruption, the log entry sequence number, of + course the data itself, and optional information like a checksum or compression + information. +\end_layout + +\begin_layout Standard +Once the log entry has been written through to the +\family typewriter +/mars/ +\family default + filesystem via fsync(), the application waiting for the write operation + at +\family typewriter +/dev/mars/mydata +\family default + is signalled that the write was successful. +\end_layout + +\begin_layout Standard +This may happen even +\emph on +before +\emph default + the writeback to the underlying disk +\family typewriter +/dev/lv-x/mydata +\family default + has started. + Even when you power off the system right now, the information is not lost: + it is present in the logfile, and can be reconstructed from there. +\end_layout + +\begin_layout Standard +Notice that the order of log records present in the transaction log defines + a total order among the write requests which is +\emph on +compatible +\emph default + to the partial order of write requests issued on +\family typewriter +/dev/mars/mydata +\family default +. +\end_layout + +\begin_layout Standard +Also notice that despite its sequential nature, the transaction logfile + is typically +\emph on +not +\emph default + the performance bottleneck of the system: since appending to a logfile + is almost purely sequential IO, it runs much faster than random IO on typical + datacenter workloads. 
+\end_layout + +\begin_layout Standard +In order to reclaim the temporary memory buffer, its content must be written + back to the underlying disk +\family typewriter +/dev/lv-x/mydata +\family default + at some point. + After writeback, the temporary space is freed. + The writeback can do the following optimizations: +\end_layout + +\begin_layout Enumerate +writeback may be in +\emph on +any +\emph default + order; in particular, it may be +\emph on +sorted +\emph default + according to ascending sector numbers. + This will reduce the average seek distances of magnetic disks in general. +\end_layout + +\begin_layout Enumerate +when the same sector is overwritten multiple times, only the +\begin_inset Quotes eld +\end_inset + +last +\begin_inset Quotes erd +\end_inset + + version needs to be written back, skipping some intermediate versions. +\end_layout + +\begin_layout Standard +In case the primary node crashes during writeback, it suffices to replay + the log entries from some point in the past until the end of the transaction + logfile. + It does no harm if you accidentally replay some log entries twice or even + more often: since the replay is in the original total order, any temporary + inconsistency is +\emph on +healed +\emph default + by the logfile application. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In mathematics, the property that you can apply your logfile twice to your + data (or even as often as you want), is called +\series bold +idempotence +\series default +. + This is a very desirable property: it ensures that nothing goes wrong when + replaying +\begin_inset Quotes eld +\end_inset + +too much +\begin_inset Quotes erd +\end_inset + + / starting your replay +\begin_inset Quotes eld +\end_inset + +too early +\begin_inset Quotes erd +\end_inset + +.
+ Idempotence is even more beneficial: in case anything should go wrong with + your data on your disk (e.g. + IO errors), replaying your logfile once more often may +\begin_inset Foot +status open + +\begin_layout Plain Layout +Miracles cannot be guaranteed, but +\emph on +higher chances +\emph default + and +\emph on +improvements +\emph default + can be expected (e.g. + better chances for +\family typewriter +fsck +\family default +). +\end_layout + +\end_inset + + even +\series bold +heal +\series default + some defects. + Good news for desperate sysadmins forced to work with flaky hardware! +\end_layout + +\begin_layout Standard +The basic idea of the asynchronous replication of MARS is rather simple: + just transfer the logfiles to your secondary nodes, and replay them onto + their copy of the disk data (also called +\emph on +mirror +\emph default +) in the same order as the total order defined by the primary. +\end_layout + +\begin_layout Standard +Therefore, a mirror of your data on any secondary may be outdated, but it + always corresponds to some version which was valid in the past. + This property is called +\series bold +anytime consistency +\begin_inset Foot +status open + +\begin_layout Plain Layout +Your secondary nodes are always consistent in themselves. + Notice that this kind of consistency is a +\emph on +local +\emph default + consistency model. + There exists no global consistency in MARS. + Global consistency would be practically impossible in long-distance replication + where Einstein's law of the speed of light is limiting global consistency. + The front-cover pictures showing the planets Earth and Mars tries to lead + your imagination away from global consistency models as used in +\begin_inset Quotes eld +\end_inset + +DRBD Think(tm) +\begin_inset Quotes erd +\end_inset + +, and try to prepare you mentally for local consistency as in +\begin_inset Quotes eld +\end_inset + +MARS Think(tm) +\begin_inset Quotes erd +\end_inset + +. 
+\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +As you can see in the picture, the process of transferring the logfiles is + +\emph on +independent +\emph default + from the process which replays the logfiles onto the data at some secondary + site. + Both processes can be switched on / off separately (see commands +\family typewriter +marsadm {dis,}connect +\family default + and +\family typewriter +marsadm {pause,resume}-replay +\family default + in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Operation-of-the" + +\end_inset + +). + This may be +\emph on +exploited +\emph default +: for example, you may replicate your logfiles as soon as possible (to protect + against catastrophic failures), but deliberately wait one hour until they + are replayed (under regular circumstances). + If your data inside your filesystem +\family typewriter +/mydata/ +\family default + at the primary site is accidentally destroyed by +\family typewriter +rm -rf /mydata/ +\family default +, you have an old copy at the secondary site. + This way, you can substitute +\emph on +some parts +\begin_inset Foot +status open + +\begin_layout Plain Layout +Please note that MARS cannot +\emph on +fully +\emph default + substitute a backup system, because it can keep only +\emph on +physical +\emph default + copies, and does not create logical copies. +\end_layout + +\end_inset + + +\emph default + of conventional backup functionality by MARS. + In case you need the current version, just replay in +\begin_inset Quotes eld +\end_inset + +fast-forward +\begin_inset Quotes erd +\end_inset + + mode (similar to old-fashioned video tapes).
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Future versions of MARS Full are planned to also allow +\begin_inset Quotes eld +\end_inset + +fast-backward +\begin_inset Quotes erd +\end_inset + + rewinding, of course at some cost. +\end_layout + +\begin_layout Section +The Lamport Clock +\begin_inset CommandInset label +LatexCommand label +name "sec:The-Lamport-Clock" + +\end_inset + + +\end_layout + +\begin_layout Standard +MARS is always +\emph on +asynchronously +\emph default + communicating in the distributed system on +\emph on +any +\emph default + topics, even strategic decisions. +\end_layout + +\begin_layout Standard +If there were a +\emph on +strict +\emph default + global consistency model, which would be roughly equivalent to a standalone + model, we would need +\emph on +locking +\emph default + in order to serialize conflicting requests. + It has been known for many decades that +\emph on +distributed locks +\emph default + do not only suffer from performance problems, but they are also cumbersome + to get them working reliably in scenarios where nodes or network links + may fail at any time. +\end_layout + +\begin_layout Standard +Therefore, MARS uses a very different consistency model: +\series bold +Eventually Consistent +\series default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that the network bottleneck problems described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Network-Bottlenecks" + +\end_inset + + are +\emph on +demanding +\emph default + an +\begin_inset Quotes eld +\end_inset + +eventually consistent +\begin_inset Quotes erd +\end_inset + + model. + You have +\series bold +no chance +\series default + against natural laws, like Einstein's laws.
+ In order to cope with the problem area, you have to +\emph on +invest some additional effort +\emph default +. + Unfortunately, asynchronous communication models are more tricky to program + and to debug than simple strictly consistent models. + In particular, you +\emph on +have to cope with +\emph default + additional +\series bold +race conditions +\series default + +\emph on +inherent +\emph default + +\emph on +to +\emph default + the +\begin_inset Quotes eld +\end_inset + +eventually consistent +\begin_inset Quotes erd +\end_inset + + model. + In the face of the laws of the universe, motivate yourself by looking at + the graphics at the cover page: the planets are a +\emph on +symbol +\emph default + for what you have to do! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Example: the asynchronous communication protocol of MARS leads to a different + behaviour from DRBD in case of +\series bold +network partitions +\series default + (temporary interruption of communication between some cluster nodes), because + MARS +\emph on +remembers +\emph default + the old state of remote nodes over long periods of time, while DRBD knows + absolutely nothing about its peers in disconnected state. + Sysadmins familiar with DRBD might find the following behaviour unusual: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center + +\size tiny +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +Event +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +DRBD Behaviour +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +MARS Behaviour +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +1. 
+ the network partitions +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +automatic disconnect +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +nothing happens, but replication lags behind +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +2. + on A: +\family typewriter +umount $device +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +3. + on A: +\family typewriter +{drbd,mars}adm secondary +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +4. + on B: +\family typewriter +{drbd,mars}adm primary +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works, split brain happens +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\series bold +\size tiny +refused +\series default + because B believes that A is primary +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +5. 
+ the network resumes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +automatic connect attempt fails +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +communication automatically resumes +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +If you intentionally want to switch over (and to produce a split brain as + a side effect), the following variant must be used with MARS: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center + +\size tiny +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +Event +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +DRBD Behaviour +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +MARS Behaviour +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +1. + the network partitions +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +automatic disconnect +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +nothing happens, but replication lags behind +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +2. + on A: +\family typewriter +umount $device +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +3. 
+ on A: +\family typewriter +{drbd,mars}adm secondary +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works (but +\emph on +not recommended! +\emph default +) +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +4. + on B: +\family typewriter +{drbd,mars}adm primary +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +split brain, but nobody knows +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\series bold +\size tiny +refused +\series default + because B believes that A is primary +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +5. + on B: +\family typewriter +marsadm disconnect +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works, nothing happens +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +6. + on B: +\family typewriter +marsadm primary --force +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works, split brain happens on B, but A doesn't know +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +7. + on B: +\family typewriter +marsadm connect +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +works, nothing happens +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +8.
+ the network resumes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +automatic connect attempt fails +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size tiny +communication resumes, A now detects the split brain +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +In order to implement the consistency model +\begin_inset Quotes eld +\end_inset + +eventually consistent +\begin_inset Quotes erd +\end_inset + +, MARS uses a so-called Lamport +\begin_inset Foot +status open + +\begin_layout Plain Layout +Published in the late 1970s by Leslie Lamport, also known as inventor of + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +LaTeX +\end_layout + +\end_inset + +. +\end_layout + +\end_inset + + clock. + MARS uses a special variant called +\begin_inset Quotes eld +\end_inset + +physical Lamport clock +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +The physical Lamport clock is another almost-realtime clock which +\emph on +can +\emph default + run independently from the Linux kernel system clock. + However, the Lamport clock tries to remain as near as possible to the system + clock. +\end_layout + +\begin_layout Standard +Both clocks can be queried at any time via +\family typewriter +cat /proc/sys/mars/lamport_clock +\family default +. + The result will show both clocks in parallel, in units of seconds since + the Unix epoch, with nanosecond resolution. +\end_layout + +\begin_layout Standard +When there are no network messages at all, both the system clock and the + Lamport clock will show almost the same time (except some minor differences + of a few nanoseconds resulting from the finite processor clock speed). 
+\end_layout + +\begin_layout Standard +The physical Lamport clock works rather simply: +\emph on +any +\emph default + message on the network is augmented with a Lamport time stamp telling when + the message was +\emph on +sent +\emph default + according to the local Lamport clock of the sender. + Whenever that message is received by some receiver, it checks whether the + time ordering relation would be violated: whenever the Lamport timestamp + in the message would claim that the sender had sent it +\emph on +after +\emph default + it arrived at the receiver (according to drifts in their respective local + clocks), something must be wrong. + In this case, the local Lamport clock of the +\emph on +receiver +\emph default + is advanced shortly after the sender Lamport timestamp, such that the time + ordering relation is no longer violated. +\end_layout + +\begin_layout Standard +As a consequence, any local Lamport clock may precede the corresponding + local system clock. + In order to avoid accumulation of deltas between the Lamport and the system + clock, the Lamport clock will run slower after that, possibly until it + reaches the system clock again (if no other message arrives which sets + it forward again). + After having reached the system clock, the Lamport clock will continue + with +\begin_inset Quotes eld +\end_inset + +normal +\begin_inset Quotes erd +\end_inset + + speed. +\end_layout + +\begin_layout Standard +MARS uses the local Lamport clock for anything where other systems would + use the local system clock: for example, timestamp generation in the +\family typewriter +/mars/ +\family default + filesystem. + Even symlinks created there are timestamped according to the Lamport clock. + Both the kernel module and the userspace tool +\family typewriter +marsadm +\family default + are always operating in the timescale of the Lamport clock. + Most importantly, all timestamp comparisons are always carried out with + respect to Lamport time.
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Bigger differences between the Lamport and the system clock can be annoying + from a human point of view: when typing +\family typewriter +ls -l /mars/resource-mydata/ +\family default + many timestamps may appear as if they were created in the +\begin_inset Quotes eld +\end_inset + +future +\begin_inset Quotes erd +\end_inset + +, because the +\family typewriter +ls +\family default + command compares the output formatting against the system clock (it does + not even know of the existence of the MARS Lamport clock). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + Always use +\family typewriter +ntp +\family default + (or another clock synchronization service) in order to pre-synchronize + your system clocks as close as possible. + Bigger differences are not only annoying, but may lead some people to wrong + conclusions and therefore even lead to bad human decisions! +\end_layout + +\begin_layout Standard +In a professional datacenter, you should use +\family typewriter +ntp +\family default + anyway, and you should monitor its effectiveness anyway. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Hint: many internal logfiles produced by the MARS kernel module contain + Lamport timestamps written as numerical values. + In order to convert them into human-readable form, use the command +\family typewriter +marsadm cat /mars/ +\family default + or similar. 
+\end_layout + +\begin_layout Section +The Symlink Tree +\begin_inset CommandInset label +LatexCommand label +name "sec:The-Symlink-Tree" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + The symlink tree as described here will be replaced by another representation + in future versions of MARS. + Therefore, don't do any scripting by directly accessing symlinks! Use the + primitive macros described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Predefined-Trivial-Macros" + +\end_inset + +. +\end_layout + +\begin_layout Standard +The current +\family typewriter +/mars/ +\family default + filesystem container format contains not only transaction logfiles, but + also acts as a generic storage for (persistent) state information. + Both configuration information and runtime state information are currently + stored in symlinks. + Symlinks are +\begin_inset Quotes eld +\end_inset + +misused +\begin_inset Foot +status open + +\begin_layout Plain Layout +This means, the symlink targets need not be other files or directories, + but just any values like integers or strings. +\end_layout + +\end_inset + + +\begin_inset Quotes erd +\end_inset + + in order to represent some +\family typewriter +key -> value +\family default + pairs. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +It is not yet clear / decided, but there is a +\emph on +chance +\emph default + that the +\emph on +concept +\emph default + of +\family typewriter +key -> value +\family default + pairs will be retained in future versions of MARS. 
+ Instead of being represented by symlinks, another representation will be + used, such that hopefully the +\family typewriter +key +\family default + part will remain in the form of a pathname, even if there were no longer + a physical representation in an actual filesystem. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + A fundamentally different behaviour than DRBD: when your DRBD primary crashed + some time ago, and now comes up again, you have to setup DRBD again by + a sequence of commands like +\family typewriter +modprobe drbd; drbdadm up all; drbdadm primary all +\family default + or similar. + In contrast, MARS needs only +\family typewriter +modprobe mars +\family default + (after +\family typewriter +/mars/ +\family default + has been mounted by +\family typewriter +/etc/fstab +\family default +). + The +\emph on +persistence +\emph default + of the symlinks residing in +\family typewriter +/mars/ +\family default + will automatically remember your previous state, even if some of your resources + were primary while others were secondary (mixed operations). + You don't need to do any actions in order to +\begin_inset Quotes eld +\end_inset + +restore +\begin_inset Quotes erd +\end_inset + + a previous state, no matter how +\begin_inset Quotes eld +\end_inset + +complex +\begin_inset Quotes erd +\end_inset + + it was.
+\end_layout + +\begin_layout Standard +(Almost) all symlinks appearing in the +\family typewriter +/mars/ +\family default + directory tree are automatically replicated throughout the whole cluster, + provided that the cluster +\family typewriter +uuid +\family default +s are equal +\begin_inset Foot +status open + +\begin_layout Plain Layout +This is protection against accidental +\begin_inset Quotes eld +\end_inset + +merging +\begin_inset Quotes erd +\end_inset + + of two unrelated clusters which had been created at different times with + different +\family typewriter +uuids +\family default +. +\end_layout + +\end_inset + + at all sites. + Thus the +\family typewriter +/mars/ +\family default + directory forms some kind of +\emph on +global namespace +\emph default +. +\end_layout + +\begin_layout Standard +In order to avoid name clashes, each pathname created at node A follows + a convention: the node name A should be a suffix of the pathname. + Typically, internal MARS names follow the scheme +\family typewriter +/mars/ +\emph on +something +\emph default +/myname-A +\family default +. + When using the expert command +\family typewriter +marsadm {get,set}-link +\family default + (which will likely be replaced by something else in future MARS releases), + you should follow the best practice of systematically using pathnames like + +\family typewriter +/mars/userspace/myname-A +\family default + or similar. + As a result, each node will automatically get informed about the state + at any other node, like B when the corresponding information is recorded + on node B under the name +\family typewriter +/mars/userspace/myname-B +\family default + (context-dependent names). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Experts only: the symlink replication works generically.
+ You might use the +\family typewriter +/mars/userspace/ +\family default + directory in order to place your own symlink there (for whatever purpose, + which need not have to do with MARS). + However, the symlinks are likely to disappear. + Use +\family typewriter +marsadm {get,set}-link +\family default + instead. + There is a chance that these abstract commands (or variants thereof) will + be retained, by acting on the new data representation in future, even if + the old symlink format will vanish some day. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Important: the convention of placing the +\series bold +creator host name +\series default + inside your pathnames should be used wherever possible. + The name part is a kind of +\begin_inset Quotes eld +\end_inset + +ownership indicator +\begin_inset Quotes erd +\end_inset + +. + It is crucial that no other host writes any symlink not +\begin_inset Quotes eld +\end_inset + +belonging +\begin_inset Quotes erd +\end_inset + + to him. + Other hosts may read foreign information as often as they want, but never + modify them. + This way, your cluster nodes are able to +\emph on +communicate +\emph default + with each other via symlink / information updates. 
+\end_layout + +\begin_layout Standard +Although experts might create (and change) the current symlinks with userspace + tools like +\family typewriter +ln -s +\family default +, you should use the following marsadm commands instead: +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm set-link myvalue /mars/userspace/mykey-A +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm delete-file /mars/userspace/mykey-A +\end_layout + +\begin_layout Standard +There are many reasons for this: first, the +\family typewriter +marsadm set-link +\family default + command will automatically use the Lamport clock for symlink creation, + and therefore will avoid any errors resulting from a +\begin_inset Quotes eld +\end_inset + +wrong +\begin_inset Quotes erd +\end_inset + + system clock (as in +\family typewriter +ln -s +\family default +). + Second, the +\family typewriter +marsadm delete-file +\family default + (which also deletes symlinks) works on the +\emph on +whole cluster +\emph default +. + And finally, there is a chance that this will work in future versions of + MARS even after the symlinks have vanished. +\end_layout + +\begin_layout Standard +What's the difference? If you would try to remove your symlink locally by + hand via +\family typewriter +rm -f +\family default +, you will be surprised: since the symlink has been replicated to the other + cluster nodes, it will be re-transferred from there and will be resurrected + locally after some short time. + This way, you cannot delete any object reliably, because your whole cluster + (which may consist of many nodes) remembers all your state information + and will +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + it whenever +\begin_inset Quotes eld +\end_inset + +necessary +\begin_inset Quotes erd +\end_inset + +. 
+\end_layout + +\begin_layout Standard +In order to solve the deletion problem, MARS uses some internal deletion + protocol using auxiliary symlinks residing in +\family typewriter +/mars/todo-global/. + +\family default + The deletion protocol ensures that all replicas get deleted in the whole + cluster, and only thereafter the auxiliary symlinks in +\family typewriter +/mars/todo-global/ +\family default + are also deleted eventually. +\end_layout + +\begin_layout Standard +You may update your already existing symlink via +\family typewriter +marsadm set-link some-other-value /mars/userspace/mykey-A +\family default + . + The new value will be propagated throughout the cluster according to a + +\series bold +timestamp comparison protocol +\series default +: whenever node B notices that A has a +\emph on +newer +\emph default + version of some symlink (according to the Lamport timestamp), it will replace + its older version by the newer one. + The opposite does +\emph on +not +\emph default + work: if B notices that A has an older version, just nothing happens. + This way, the timestamps of symlinks can only progress in forward direction, + but never backwards in time. +\end_layout + +\begin_layout Standard +As a consequence, symlink updates made +\begin_inset Quotes eld +\end_inset + +by hand +\begin_inset Quotes erd +\end_inset + + via +\family typewriter +ln -sf +\family default + may get lost when the local system clock is much earlier than the + Lamport clock. +\end_layout + +\begin_layout Standard +When your cluster is fully connected by the network, the last timestamp + will finally win everywhere. + Only in case of network outages leading to +\emph on +network partitions +\emph default +, some information may be +\emph on +temporarily inconsistent +\emph default +, but only for the duration of the network outage.
+ The timestamp comparison protocol in combination with the Lamport clock + and with the persistence of the +\family typewriter +/mars/ +\family default + filesystem will automatically heal any temporary inconsistencies as soon + as possible, even in case of temporary node shutdown. +\end_layout + +\begin_layout Standard +The meaning of some internal MARS symlinks residing in +\family typewriter +/mars/ +\family default + will be hopefully documented in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Documentation-of-the" + +\end_inset + + some day. +\end_layout + +\begin_layout Section +Defending Overflow of +\family typewriter +/mars/ +\begin_inset CommandInset label +LatexCommand label +name "sec:Defending-Overflow" + +\end_inset + + +\end_layout + +\begin_layout Standard +This section describes an important difference to DRBD. + The metadata of DRBD is allocated +\emph on +statically +\emph default + at +\emph on +creation +\emph default + +\emph on +time +\emph default + of the resource. + In contrast, the MARS transaction logfiles are allocated +\emph on +dynamically +\emph default + at +\emph on +runtime +\emph default +. +\end_layout + +\begin_layout Standard +This leads to a potential risk from the perspective of a sysadmin: what + happens if the +\family typewriter +/mars/ +\family default + filesystem runs out of space? +\end_layout + +\begin_layout Standard +No risk, no fun. + If you want a system which survives long-lasting network outages while + keeping your replicas always consistent (anytime consistency), you +\emph on +need +\emph default + dynamic memory for that. + It is +\emph on +impossible +\emph default + to solve that problem using static memory +\begin_inset Foot +status open + +\begin_layout Plain Layout +The bitmaps used by DRBD don't preserve the +\emph on +order +\emph default + of write operations. 
+ They cannot do that, because their space is +\begin_inset Formula $O(k)$ +\end_inset + + for some constant +\begin_inset Formula $k$ +\end_inset + +. + In contrast, MARS preserves the order. + Preserving the order as such (even when only +\emph on +facts +\emph default + about the order were recorded without recording the actual data contents) + requires +\begin_inset Formula $O(n)$ +\end_inset + + space where +\begin_inset Formula $n$ +\end_inset + + is infinitely growing over time. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +Therefore, DRBD and MARS have different application areas. + If you just want a simple system for mirroring your data over short distances + like a crossover cable, DRBD will be a suitable choice. + However, if you need to replicate over longer distances, or if you need + higher levels of reliability even when multiple failures may accumulate + (such as network loss during a +\emph on +re +\emph default +sync of DRBD), the transaction logs of MARS can solve that, but at some + +\emph on +cost +\emph default +. +\end_layout + +\begin_layout Subsection +Countermeasures +\end_layout + +\begin_layout Subsubsection +Dimensioning of +\family typewriter +/mars/ +\begin_inset CommandInset label +LatexCommand label +name "subsec:Dimensioning-of-/mars/" + +\end_inset + + +\end_layout + +\begin_layout Standard +The first (and most important) measure against overflow of +\family typewriter +/mars/ +\family default + is simply to dimension it large enough to survive longer-lasting problems, + at least one weekend. +\end_layout + +\begin_layout Standard +Recommended size is at least one dedicated disk, residing at a hardware + RAID controller with BBU (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Preparation:-What-you" + +\end_inset + +). + During normal operation, that size is needed only for a small fraction, + typically a few percent or even less than one percent. 
+ However, it is your +\series bold +safety margin +\series default +. + Keep it high enough! +\end_layout + +\begin_layout Subsubsection +Monitoring +\end_layout + +\begin_layout Standard +The next (equally important) measure is +\series bold +monitoring in userspace +\series default +. +\end_layout + +\begin_layout Standard +Following is a list of countermeasures both in userspace and in kernelspace, + in the order of +\begin_inset Quotes eld +\end_inset + +defensive walling +\begin_inset Quotes erd +\end_inset + +: +\end_layout + +\begin_layout Enumerate +Regular userspace monitoring must throw an INFO if a certain freespace limit + +\begin_inset Formula $l_{1}$ +\end_inset + + of +\family typewriter +/mars/ +\family default + is undershot. + Typical values for +\begin_inset Formula $l_{1}$ +\end_inset + + are 30%. + Typical actions are automated calls of +\family typewriter +marsadm cron +\family default + (or +\family typewriter +marsadm log-rotate all +\family default + followed by +\family typewriter +marsadm log-delete-all all +\family default +). + You have to implement that yourself in sysadmin space. +\end_layout + +\begin_layout Enumerate +Regular userspace monitoring must throw a WARNING if a certain freespace + limit +\begin_inset Formula $l_{2}$ +\end_inset + + of +\family typewriter +/mars/ +\family default + is undershot. + Typical values for +\begin_inset Formula $l_{2}$ +\end_inset + + are 20%. + Typical actions are (in addition to +\family typewriter +log-rotate +\family default + and +\family typewriter +log-delete-all +\family default +) alarming human supervisors via SMS and/or further stronger automated actions. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Frequently large space is occupied by files stemming from debugging output, + or from other programs or processes. 
+ A hot candidate is +\begin_inset Quotes eld +\end_inset + +forgotten +\begin_inset Quotes erd +\end_inset + + removal of debugging output to +\family typewriter +/mars/ +\family default +. + Sometimes, an +\family typewriter +rm -rf $(find /mars/ -name +\begin_inset Quotes eld +\end_inset + +*.log +\begin_inset Quotes erd +\end_inset + +) +\family default + can work miracles. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Another source of space hogging is a +\begin_inset Quotes eld +\end_inset + +forgotten +\begin_inset Quotes erd +\end_inset + + +\family typewriter +pause-sync +\family default + or +\family typewriter +disconnect +\family default +. + Therefore, a simple +\family typewriter +marsadm connect-global all +\family default + followed by +\family typewriter +marsadm resume-replay-global all +\family default + may also work miracles (if you didn't want to freeze some mirror deliberately). +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +If you just wanted to freeze a mirror at an outdated state for a very long + time, you simply +\emph on +cannot +\emph default + do that without causing infinite growth of space consumption in +\family typewriter +/mars/ +\family default +. + Therefore, a +\family typewriter +marsadm leave-resource $res +\family default + at +\emph on +exactly that(!) +\emph default + secondary site where the mirror is frozen, can also work miracles. + If you want to automate this in userspace, be careful. + It is easy to get unintended effects when choosing the wrong site for +\family typewriter +leave-resource +\family default +. 
+\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Hint: you can / should start some of these measures even earlier at the + INFO level (see item 1), or even earlier. +\end_layout + +\begin_layout Enumerate +Regular userspace monitoring must throw an ERROR if a certain freespace + limit +\begin_inset Formula $l_{3}$ +\end_inset + + of +\family typewriter +/mars/ +\family default + is undershot. + Typical values for +\begin_inset Formula $l_{3}$ +\end_inset + + are 10%. + Typical actions are alarming the CEO via SMS and/or even stronger automated + actions. + For example, you may choose to automatically call +\family typewriter +marsadm leave-resource $res +\family default + on some or all secondary nodes, such that the primary will be left alone + and now has a chance to really delete its logfiles because no one else + is any longer potentially needing it. +\end_layout + +\begin_layout Enumerate +First-level kernelspace action, automatically executed when +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_4_gb +\end_layout + +\end_inset + + +\family default + + +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_3_gb +\end_layout + +\end_inset + + +\family default + + +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_2_gb +\end_layout + +\end_inset + + +\family default + + +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_1_gb +\end_layout + +\end_inset + + +\family default + is undershot: +\begin_inset Newline newline +\end_inset + +a warning will be issued. 
+\end_layout + +\begin_layout Enumerate +Second-level kernelspace action, automatically executed when +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_3_gb +\end_layout + +\end_inset + + +\family default + + +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_2_gb +\end_layout + +\end_inset + + +\family default + + +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_1_gb +\end_layout + +\end_inset + + +\family default + is undershot: +\begin_inset Newline newline +\end_inset + +all locally secondary resources will delete local copies of transaction + logfiles which are no longer needed locally. + This is a desperate action of the kernel module. +\end_layout + +\begin_layout Enumerate +Third-level kernelspace action, automatically executed when +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_2_gb +\end_layout + +\end_inset + + +\family default + + +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_1_gb +\end_layout + +\end_inset + + +\family default + is undershot: +\begin_inset Newline newline +\end_inset + +all locally secondary resources will stop fetching transaction logfiles. + This is a more desperate action of the kernel module. + You don't want to get there (except for testing). 
+\end_layout + +\begin_layout Enumerate +Last desperate kernelspace action when all else has failed and +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_free_space_1_gb +\end_layout + +\end_inset + + +\family default + is undershot: +\begin_inset Newline newline +\end_inset + +all locally primary resources will enter +\series bold +emergency mode +\series default + (see description below in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Emergency-Mode" + +\end_inset + +). + This is the most desperate action of the kernel module. + You don't want to get there (except for testing). +\end_layout + +\begin_layout Standard +In addition, the kernel module obeys a general global limit +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/required_total_space_0_gb +\end_layout + +\end_inset + + + +\family default + the sum of all of the above limits. + When the +\emph on +total size +\emph default + of +\family typewriter +/mars/ +\family default + undershoots that sum, the kernel module refuses to start at all, because + it assumes that it is senseless to try to operate MARS on a system with + such low storage resources. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +The current level of emergency kernel actions may be viewed at any time + via +\family typewriter + +\begin_inset Flex URL +status collapsed + +\begin_layout Plain Layout + +/proc/sys/mars/mars_emergency_mode +\end_layout + +\end_inset + + +\family default +. +\end_layout + +\begin_layout Subsubsection +Throttling +\end_layout + +\begin_layout Standard +The last measure for defense of overflow is +\series bold +throttling your performance pigs +\series default +. 
+\end_layout + +\begin_layout Standard +Motivation: in rare cases, some users with +\family typewriter +ssh +\family default + access can do +\emph on +very +\emph default + silly things. + For example, some of them are creating their own backups via user-cron + jobs, and they do it every 5 minutes. + Some example guy created a zip archive (almost 1GB) by regularly copying + his old zip archive into a new one, then appending deltas to the new one, + and finally deleting the old archive. + Every 5 minutes. + Yes, every 5 minutes, although almost never any new files were added to + the archive. + Essentially, he copied over his archive, for nothing. + This led to massive bulk write requests, for ridiculous reasons. +\end_layout + +\begin_layout Standard +In general, your hard disks (or even RAID systems) allow much higher write + IO rates than you can ever transport over a standard TCP network from your + primary site to your secondary, at least over longer distances (see use + cases for MARS in chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Use-Cases-for" + +\end_inset + +). + Therefore, it is easy to create a such a high write load that it will be + +\emph on +impossible +\emph default + to replicate it over the network, +\emph on +by construction +\emph default +. +\end_layout + +\begin_layout Standard +Therefore, we +\emph on +need +\emph default + some mechanism for throttling bulk writers whenever the network is weaker + than your IO subsystem. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Notice that DRBD will +\emph on +always +\emph default + throttle your writes whenever the network forms a bottleneck, due to its + synchronous operation mode. + In contrast, MARS allows for buffering of performance peaks in the transaction + logfiles. 
+ +\emph on +Only when +\emph default + your buffer in +\family typewriter +/mars/ +\family default + runs short (cf subsection +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Dimensioning-of-/mars/" + +\end_inset + +), MARS will start to throttle your application writes. +\end_layout + +\begin_layout Standard +There are a lot of tuning knobs named +\family typewriter +/proc/sys/mars/write_throttle_* +\family default + with the following meaning: +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_start_percent +\family default + Whenever the used space in +\family typewriter +/mars/ +\family default + is below this threshold, no throttling will occur at all. + Only when this threshold is exceeded, throttling will start +\emph on +slowly +\emph default +. + Typical values for this are 60%. +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_end_percent +\family default + Maximum throttling will occur once this space threshold is reached, i.e. + the throttling is now at its maximum effect. + Typical values for this are 90%. + When the actual space in +\family typewriter +/mars/ +\family default + lies between +\family typewriter +write_throttle_start_percent +\family default + and +\family typewriter +write_throttle_end_percent +\family default +, the strength of throttling will be interpolated linearly between the extremes. + In practice, this should lead to an equilibrium between new input flow into + +\family typewriter +/mars/ +\family default + and output flow over the network to secondaries. +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_size_threshold_kb +\family default + (readonly) This parameter shows the internal strength calculation of the + throttling. + Only write +\begin_inset Foot +status open + +\begin_layout Plain Layout +Read requests are never throttled at all. 
+\end_layout + +\end_inset + + requests exceeding this size (in KB) are throttled at all. + Typically, this will hurt the bulk performance pigs first, while leaving + ordinary users (issuing small requests) unaffected. +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_ratelimit_kb +\family default + Set the global IO rate in KB/s for those write requests which are throttled. + In case of strongest +\begin_inset Foot +status open + +\begin_layout Plain Layout +In case of lighter throttling, the input flow into +\family typewriter +/mars/ +\family default + may be higher because small requests are not throttled. +\end_layout + +\end_inset + + throttling, this parameters determines the input flow into +\family typewriter +/mars/ +\family default +. + The default value is 5.000 KB/s. + Please adjust this value to your application needs and to your environment. +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_rate_kb +\family default + (readonly) Shows the current rate of exactly those requests which are actually + throttled (in contrast to +\emph on +all +\emph default + requests). +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_cumul_kb +\family default + (logically readonly) Same as before, but the cumulative sum of all throttled + requests since startup / reset. + This value can be reset from userspace in order to prevent integer overflow. +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_count_ops +\family default + (logically readonly) Shows the cumulative number of throttled requests. + This value can be reset from userspace in order to prevent integer overflow. +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_maxdelay_ms +\family default + Each request is delayed at most for this timespan. 
+ Smaller values will improve the responsiveness of your userspace application, + but at the cost of potentially retarding the requests not sufficiently. +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_minwindow_ms +\family default + Set the minimum length of the measuring window. + The measuring window is the timespan for which the average (throughput) + rate is computed (see +\family typewriter +write_throttle_rate_kb +\family default +). + Lower values can increase the responsiveness of the controller algorithm, + but at the cost of accuracy. +\end_layout + +\begin_layout Description + +\family typewriter +write_throttle_maxwindow_ms +\family default + This parameter must be set sufficiently much greater than +\family typewriter +write_throttle_minwindow_ms +\family default +. + In case the flow of throttled operations pauses for some natural reason + (e.g. + switched off, low load, etc), this parameter determines when a completely + new rate calculation should be started over +\begin_inset Foot +status open + +\begin_layout Plain Layout +Motivation: if requests would pause for one hour, the measuring window could + become also an hour. + Of course, that would lead to completely meaningless results. + Two requests in one hour is +\begin_inset Quotes eld +\end_inset + +incorrect +\begin_inset Quotes erd +\end_inset + + from a human point of view: we just have to ensure that averages are computed + with respect to a reasonable maximum time window in the magnitude of 10s. +\end_layout + +\end_inset + +. 
+\end_layout + +\begin_layout Subsection +Emergency Mode and its Resolution +\begin_inset CommandInset label +LatexCommand label +name "subsec:Emergency-Mode" + +\end_inset + + +\end_layout + +\begin_layout Standard +When +\family typewriter +/mars/ +\family default + is almost full and there is really absolutely no chance of getting rid + of any local transaction logfile (or of freeing some space in any other way), + there is only one exit strategy: stop creating new logfile data. +\end_layout + +\begin_layout Standard +This means that the ability for replication gets lost. +\end_layout + +\begin_layout Standard +When entering emergency mode, the kernel module will execute the following + steps for all resources where the affected host is acting as a primary: +\end_layout + +\begin_layout Enumerate +Do a kind of +\begin_inset Quotes eld +\end_inset + +logrotate +\begin_inset Quotes erd +\end_inset + +, but create a +\emph on +hole +\emph default + in the sequence of transaction logfile numbers. + The +\begin_inset Quotes eld +\end_inset + +new +\begin_inset Quotes erd +\end_inset + + logfile is left empty, i.e. + no data is written to it (for now). + The hole in the numbering will prevent any secondaries from replaying any + logfiles behind the hole (should they ever contain some data, e.g. + because the emergency mode has been left again). + This works because the secondaries are regularly checking the logfile numbers + for contiguity, and they will refuse to replay anything which is not contiguous. + As a result, the secondaries will be left in a consistent, but outdated + state (at least if they already were consistent before that). +\end_layout + +\begin_layout Enumerate +The kernel module writes back all data present in the temporary memory buffer + (see figure in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Transaction-Logger" + +\end_inset + +). 
+ This may lead to a (short) delay of user write requests until that has + finished (typically fractions of a second or a few seconds). + The reason is that the temporary memory buffer must not be increased in + parallel during this phase (race conditions). +\end_layout + +\begin_layout Enumerate +After the temporary memory buffer is empty, all local IO requests (whether + reads or writes) are directly going to the underlying disk. + This has the same effect as if MARS would not be present anymore. + Transaction logging does no longer take place. +\end_layout + +\begin_layout Enumerate +Any sync from any secondary is stopped ASAP. + In case they are resuming their sync somewhen later, they will start over + from the beginning (position +\begin_inset Formula $0$ +\end_inset + +). +\end_layout + +\begin_layout Standard +In order to leave emergency mode, the sysadmin should do the following steps: +\end_layout + +\begin_layout Enumerate +Free enough space. + For example, delete any foreign files on +\family typewriter +/mars/ +\family default + which have nothing to do with MARS, or resize the +\family typewriter +/mars/ +\family default + filesystem, or whatever. +\end_layout + +\begin_layout Enumerate +If +\family typewriter + +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + +/proc/sys/mars/mars_reset_emergency +\end_layout + +\end_inset + + +\family default + is not set, now it is time to set it. + Normally, it should be already set. +\end_layout + +\begin_layout Enumerate +Notice: as long as not enough space has been freed, a message containing + +\family typewriter + +\begin_inset Quotes eld +\end_inset + +EMEGENCY MODE HYSTERESIS +\begin_inset Quotes erd +\end_inset + + +\family default + (or similar) will be displayed by +\family typewriter +marsadm view all +\family default +. + As a consequence, any sync will be automatically halted. 
+ This applies to freshly invoked syncs also, for example created by +\family typewriter +invalidate +\family default + or +\family typewriter +join-resource +\family default +. +\end_layout + +\begin_layout Enumerate +On the secondaries, use +\family typewriter +marsadm invalidate $res +\family default + in order to request updating your outdated mirrors. +\end_layout + +\begin_layout Enumerate +On the primary: +\family typewriter +marsadm log-delete-all all +\end_layout + +\begin_layout Enumerate +As soon as enough space has been freed everywhere to leave the +\family typewriter +EMEGENCY MODE HYSTERESIS +\family default +, sync should really start. + Until then, it remains halted. +\end_layout + +\begin_layout Enumerate +Recommendation: check at secondaries that state +\family typewriter +Orphan +\family default + has been left after a while. +\end_layout + +\begin_layout Standard +Alternatively, there is another method by roughly following the instructions + from appendix +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Alternative-Methods-for" + +\end_inset + +, but in a slightly different order. + In this case, do +\family typewriter +leave-resource +\family default + everywhere on +\emph on +all +\emph default + secondaries, but +\emph on +don't +\emph default + start the +\family typewriter +join-resource +\family default + phase +\emph on +for now +\emph default +. + Then clean up all your secondaries via +\family typewriter +log-purge-all +\family default +, and finally +\family typewriter +log-delete-all all +\family default + at the primary, and wait until the emergency has vanished everywhere. + Only after that, re- +\family typewriter +join-resource +\family default + your secondaries. 
+\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Expert advice for +\begin_inset Formula $k=2$ +\end_inset + + replicas: this means you had only 1 mirror per resource before the overflow + happened. + Provided that you have enough space on your LVMs and on +\family typewriter +/mars/ +\family default +, and provided that transaction logging has automatically restarted after + +\family typewriter +leave-resource +\family default + and +\family typewriter +log-purge-all +\family default +, you can recover redundancy by creating a +\emph on +new +\emph default + replica via +\family typewriter +marsadm join-resource $res +\family default + on a +\emph on +third +\emph default + node. + Only after the initial full sync has finished there, run +\family typewriter +join-resource +\family default + at your original mirror. + This way, you will always retain at least one +\series bold +consistent mirror +\series default + somewhere. + After all is up-to-date, you can delete the superfluous mirror by +\family typewriter +marsadm leave-resource $res +\family default + and reclaim the disk space from its underlying LVM disk. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +If you already have +\begin_inset Formula $k>2$ +\end_inset + + replicas in total, it may be a wise idea to prefer the +\family typewriter +leave-resource ; log-purge-all ; join-resource +\family default + method over +\family typewriter +invalidate +\family default + because it does not invalidate +\emph on +all +\emph default + your replicas at the same time (when handled properly in the right order). 
+\end_layout + +\begin_layout Chapter +The Macro Processor +\begin_inset CommandInset label +LatexCommand label +name "chap:The-Macro-Processor" + +\end_inset + + +\end_layout + +\begin_layout Standard + +\family typewriter +marsadm +\family default + comes with a customizable macro processor. + It can be used for high-level complex display of the state of MARS (so-called + +\emph on +complex macros +\emph default +), as well as for low-level display of lots of individual state values (so-calle +d +\emph on +primitive macros +\emph default +). +\end_layout + +\begin_layout Standard +From the commandline, any macro can be called via +\family typewriter +marsadm view- +\emph on +$macroname +\emph default + mydata +\family default +. + The short form +\family typewriter +marsadm view mydata +\family default + is equivalent to +\family typewriter +marsadm view-default mydata +\family default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In general, the command +\family typewriter +marsadm view- +\emph on +$macroname +\emph default + all +\family default + will first call the macro +\family typewriter +\emph on +$macroname +\family default +\emph default + in a loop for +\emph on +all +\emph default + resources we are a +\emph on +member locally +\emph default +. + Finally, a trailing macro +\family typewriter +\emph on +$macroname +\emph default +-global +\family default + will be called with an empty +\family typewriter +%{res} +\family default + argument, provided that such a macro is defined. + This way, you can produce per-resource output followed by global output + which does not depend on a particular resource. +\end_layout + +\begin_layout Section +Predefined Macros +\end_layout + +\begin_layout Standard +The macro processor is a very flexible and versatile tool for +\series bold +customizing +\series default +. 
+ You can create your own macros, but probably the rich set of predefined + macros is already sufficient for your needs. +\end_layout + +\begin_layout Subsection +Predefined Complex and High-Level Macros +\begin_inset CommandInset label +LatexCommand label +name "subsec:Predefined-Complex-and" + +\end_inset + + +\end_layout + +\begin_layout Standard +The following predefined complex macros try to address the information needs + of humans. + Use them in scripts only when you are prepared for the fact that the + output format may change during development of MARS. +\end_layout + +\begin_layout Standard +Notice: the definitions of predefined complex macros may be updated in the + course of the MARS project. + However, the primitive macros recursively called by the complex ones will + be hopefully rather stable in future (with the exception of bugfixes). + If you want to retain an old / outdated version of a complex macro, just + check it out from git, follow the instructions in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Creating-your-own" + +\end_inset + +, and preferably give it a different name in order to avoid confusion with + the newer version. + In general, it should be possible to use old macros with newer versions + of +\family typewriter +marsadm +\family default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +You might need to check out also old versions of further macros and adapt + their names, whenever complex macros call each other. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +default +\family default + This is equivalent to +\family typewriter +marsadm view mydata +\family default + without +\family typewriter +\emph on +-macroname +\family default +\emph default + suffix. 
+ It shows a one-line status summary for each resource, optionally followed + by informational lines such as progress bars whenever a sync or a fetch + of logfiles is currently running. + The status line has the following fields: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +%{res} +\family default + resource name. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +[ +\emph on +this_count +\emph default +/ +\emph on +total_count +\emph default +] +\family default + total number of replicas of this resource, out of total number of cluster + members. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +%include{diskstate} +\family default + see +\family typewriter +diskstate +\family default + macro below. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +%include{replstate} +\family default + see +\family typewriter +replstate +\family default + macro below. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +%include{flags} +\family default + see +\family typewriter +flags +\family default + macro below. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +%include{role} +\family default + see +\family typewriter +role +\family default + macro below. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +%include{primarynode} +\family default + see +\family typewriter +primarynode +\family default + macro below. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +%include{commstate} +\family default + see +\family typewriter +commstate +\family default + macro below. 
+\end_layout + +\end_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 +\begin_inset space ~ +\end_inset + + After that, optional lines such as progress bars are appearing only when + something unusual is happening. + These lines are subject to future changes. + For example, wasted disk space due to missing +\family typewriter +resize +\family default + is reported when +\family typewriter +%{threshold} +\family default + is exceeded. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +1and1 +\family default + +\begin_inset space ~ +\end_inset + +or +\begin_inset space ~ +\end_inset + + +\family typewriter +default-1and1 +\family default + A variant of +\family typewriter +default +\family default + for internal use by 1&1 Internet AG. + You may call this complex macro by saying +\family typewriter +marsadm view-1and1 all +\family default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Note: the +\family typewriter +marsadm view-1and1 +\family default + command has been intensely tested in Spring 2014 to produce exactly the + same output as the 1&1 internal +\begin_inset Foot +status open + +\begin_layout Plain Layout +In addition to allowing for customization, the macro processor is also meant + as an exit strategy for removing dependencies from non-free software. + +\series bold +Please put your future macros also under GPL! 
+\end_layout + +\end_inset + + tool +\family typewriter +marsview +\family default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +There are some subtle differences: numbers are displayed in a different + precision, some bug fixes in the macro version (which might have occurred + +\emph on +in the meantime +\emph default + ) may lead to different output as a side effect from bug fixes in +\emph on +predefined +\emph default + macros, because the original +\family typewriter +marsview +\family default + command is currently not actively maintained. + Documentation of +\family typewriter +marsview +\family default + can be found in the corresponding manpage, see +\family typewriter +man marsview +\family default +. + By construction, this is also the (unmaintained) documentation of +\family typewriter +marsadm view-1and1 +\family default + and other +\family typewriter +-1and1 +\family default + macros. + Notice that all +\family typewriter +*-1and1 +\family default + macros are not officially supported by the developer of MARS, and they + may disappear in a future major release. + However, they could be useful for your own customization macros. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Customization via your own macros (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Creating-your-own" + +\end_inset + +) is explicitly encouraged by the developer. + It would be nice if a vibrant user community would emerge, helping each + other by exchange of macros. 
+ +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Hint: in order to produce your own customized inspection / monitoring tools, + you may ask the author for an official reservation of a macro sub-namespace + such as +\family typewriter +*- +\emph on +yourcompanyname +\family default +\emph default +. + You will be fully responsible for your own reserved namespace and can do + with it whatever you want. + The official MARS release will guarantee that +\emph on +no name clashes +\emph default + with your reserved sub-namespace will occur in future. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +default-global +\family default + Currently, this just calls +\family typewriter +comminfo +\family default + (see below). + May be extended in future. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +diskstate +\family default + Shows the status of the underlying disk device, in the following order + of precedence +\begin_inset Foot +status open + +\begin_layout Plain Layout +When an earlier list item is displayed, no combinations with following items + are possible. + This kind of +\begin_inset Quotes eld +\end_inset + +hiding effect +\begin_inset Quotes erd +\end_inset + + can lead to an +\emph on +information loss +\emph default +. + In order to get a non-lossy picture from the state of your system, please + look at the +\family typewriter +flags +\family default + which are able to display cartesian combinations of more detailed internal + states. +\end_layout + +\end_inset + +: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +NotJoined +\family default + (cf +\family typewriter +%get-disk{} +\family default +) No underlying disk device is configured. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +NotPresent +\family default + (cf +\family typewriter +%disk-present{} +\family default +) The underlying disk device (as configured, see +\family typewriter +marsadm view-get-disk +\family default +) does not exist or the device node is not accessible. + Therefore MARS cannot work. + Check that LVM or other software is properly configured and running. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +Detached +\family default + (cf +\family typewriter +InConsistent +\family default +, +\family typewriter +NeedsReplay +\family default +, +\family typewriter +%todo-attach{} +\family default +, +\family typewriter +%is-attach{} +\family default +) The underlying disk is willingly switched off (see +\family typewriter +marsadm detach +\family default +), and it actually is no longer opened by MARS. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +Detaching +\family default + (cf +\family typewriter +%todo-attach{} +\family default + and +\family typewriter +%is-attach{} +\family default +) Access to the underlying disk is switched off, but actually not yet +\family typewriter +close() +\family default +d by MARS. + This can happen for a long time on a primary when other secondaries are + accessing the disk remotely for syncing. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +DefectiveLog[ +\emph on +description-text +\emph default +] +\family default + (cf +\family typewriter +%replay-code{} +\family default +) Typically this indicates an +\family typewriter +md5 +\family default + checksum error in a transaction logfile, or another (hardware / filesystem) + defect.
+ This occurs extremely rarely in practice, but has been observed more frequently + during a massive failure of air conditioning in a datacenter, when disk + temperatures rose to more than 80° Celsius. + Notice that a secondary +\series bold +refuses +\series default + to apply any knowingly defective logfile data to the disk. + Although this message is +\emph on +not directly +\emph default + referring to the underlying disk, it is mentioned here because of its superior + +\series bold +relevance +\series default + for the diskstate. + A damaged transaction logfile will always affect the +\emph on +actuality +\emph default + of the disk, but not its +\emph on +integrity +\emph default + (by itself). + What to do in such a case? +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Enumerate +When the damage is only at one of your secondaries, you should first ensure + that the primary has a good logfile after a +\family typewriter +marsadm log-rotate +\family default +, then try +\family typewriter +marsadm invalidate +\family default + at the damaged secondary. + It is crucial that the primary has a fresh correct logfile behind the error + position, and that it is continuing to operate correctly. +\end_layout + +\begin_layout Enumerate +When +\emph on +all +\emph default + of your secondaries are reporting +\family typewriter +DefectiveLog +\family default +, the primary could have +\emph on +produced +\emph default + a damaged logfile (e.g. + in RAM, in a DMA channel, etc) while continuing to operate, and all of + your secondaries got that defective logfile. + After +\family typewriter +marsadm log-delete-all all +\family default +, you can check this by comparing the +\family typewriter +md5sum +\family default + of the first primary logfile (having the lowest serial number) with the + versions on your replicas.
+ The problem is that you don't know whether the primary side has a silent + corruption on any of its disks, or not. + You will need to take an operational decision whether to switchover to + a secondary via +\family typewriter +primary --force +\family default +, or whether to continue operation at the primary and +\family typewriter +invalidate +\family default + your secondaries. +\end_layout + +\begin_layout Enumerate +When the original primary is affected in a very bad way, such that it crashed + badly and afterwards even recovery of the +\emph on +primary +\emph default + is impossible +\begin_inset Foot +status open + +\begin_layout Plain Layout +In such a rare case, the +\emph on +original primary +\emph default + (but not any other host) +\series bold +refuses +\series default + to come up during recovery with +\emph on +his own +\emph default + logfile originally produced by +\emph on +himself +\emph default +. + This is not a bug, but saves you from incorrectly assuming that your original + primary disk were consistent - it is +\emph on +known +\emph default + to be inconsistent, but recovery is impossible due to the damaged logfile. + Thus +\emph on +this one +\emph default + replica is trapped by defective hardware. + The other replicas should not be affected. +\end_layout + +\end_inset + + due to this error (which typically occurs extremely rarely, observed two + times during 7 million operating hours on defective hardware), you + need to take an operational decision between the following alternatives: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Enumerate +switchover to a former secondary via +\family typewriter +primary --force +\family default +, producing a split brain, and producing some (typically small) data loss. + However, integrity is more important than actuality in such an extreme + case.
+\end_layout + +\begin_layout Enumerate +deconstruction of the resource at +\emph on +all +\emph default + replicas via +\family typewriter +leave-resource --force +\family default +, running +\family typewriter +fsck +\family default + or similar tools by hand at the underlying disks, selecting the best replica + out of them, and finally re-constructing the resource again. +\end_layout + +\begin_layout Enumerate +restore your backup. +\end_layout + +\end_deeper +\end_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +Orphan +\family default + The secondary cannot replay data anymore, because it has been kicked out + for avoidance of emergency mode. + The data is not recent anymore. + Typically, +\family typewriter +marsadm invalidate +\family default + needs to be done. +\begin_inset Newline newline +\end_inset + +There is an exception: shortly after +\family typewriter +join-resource +\family default + or +\family typewriter +invalidate +\family default +, it may take some time until state +\family typewriter +Orphan +\family default + may be left, and until the newest logfile has appeared at your secondary + site (depending on the size of logfiles, and on your network). + In case of network problems, this may take very long. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + This state tells you that your replica is not current, and currently not + being updated at all. + Don't forget to +\series bold +monitor +\series default + for longer occurrences of this state! Otherwise you may get a big surprise + when you need a forceful emergency failover, but your replica is very old + or even does not really exist at all.
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +NoAttach +\family default + (cf +\family typewriter +%is-attach{} +\family default +) The underlying disk is currently not opened by MARS. + Reasons may be that the kernel module is not loaded, or an exclusive +\family typewriter +open() +\family default + is currently not possible because somebody else has already opened it. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +InConsistent +\family default + (cf +\family typewriter +%is-consistent{} +\family default +) A logfile replay and/or sync is known to be needed / or to complete (e.g. + after +\family typewriter +invalidate +\family default + has started) in order to restore local consistency (for details, look at + +\family typewriter +flags +\family default +). +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Hint: in the current implementation of MARS, this will never happen on secondari +es during ordinary replay (but only when either sync has not yet finished, + or when the +\emph on +initial +\emph default + logfile replay after the sync has not yet finished), because the ordinary + logfile replay always maintains anytime consistency once a consistent state + had been reached. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\emph on +Only +\emph default + in case of a primary node crash, and +\emph on +only +\emph default + after attempts have failed to become primary again (e.g. + IO errors, etc), this +\emph on +can +\emph default + (but need not) mean that something went wrong. 
+ Even in such an extremely unlikely event, chances are high that +\family typewriter +fsck +\family default +can fix any remaining problems (and, of course, you can also switchover + to a former secondary). +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +When this message appears, simply start MARS again (e.g. + +\family typewriter +modprobe mars; marsadm up all +\family default +), in whatever role you are intending. + This will +\emph on +automatically +\emph default + try to replay any necessary transaction logfile(s) in order to fix the + inconsistency. + Only if the automatic fix fails and this message persists for a long time + without progress, you +\emph on +might +\emph default + have a problem. + Typically, as observed at a large installation at 1&1, this happens extremely + rarely, and then typically indicates that your hardware is likely to be + defective. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +OutDated[FR] +\family default + (cf +\family typewriter +%work-reached{} +\family default +) Only at secondaries. + Tells whether it is +\emph on +currently known +\emph default + that the disk has any lag-behind when compared to the +\emph on +currently known +\emph default + state of the current designated primary (if there exists one). + Only meaningful if a current designated primary exists. + Notice that this kind of status display is subject to +\emph on +natural races +\emph default +, for example when new logfile data has been produced in parallel, or network + propagation is very slow. + Additional information is in brackets: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +[F] +\family default + Fetch is known to be needed. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +[R] +\family default + Replay is known to be needed. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +[FR] +\family default + Both are known to be needed. +\end_layout + +\end_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +WriteBack[ +\emph on +amount +\emph default +] +\family default + (cf +\family typewriter +%is-primary{} +\family default + and amount via +\family typewriter +%writeback-rest{} +\family default +) Appears only at actual primaries (whether designated or not), when the + writeback from the RAM buffer is active (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Transaction-Logger" + +\end_inset + +). + The +\emph on +amount +\emph default + is displayed in human readable form, and may be used for a very rough estimatio +n of recovery time after a primary crash. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +Recovery +\family default + (cf +\family typewriter +%todo-primary{} +\family default +) Appears only at the designated primary before it actually has become primary. + Similar to database recovery, this indicates the recovery phase after a + crash +\begin_inset Foot +status open + +\begin_layout Plain Layout +In some cases, +\family typewriter +primary --force +\family default + may also trigger this message. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +EmergencyMode +\family default + (cf +\family typewriter +%is-emergency{} +\family default +) A current designated primary exists, and it is known that this host has + entered emergency mode. + See section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Emergency-Mode" + +\end_inset + +. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +UpToDate +\family default + Displayed when none of the above has been detected. +\end_layout + +\end_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +diskstate-1and1 +\family default + A variant for internal use by 1&1 Internet AG. + See above note. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +replstate +\family default + Shows the status of the replication in the following order of precedence: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +ModuleNotLoaded +\family default + (cf +\family typewriter +%is-module-loaded{} +\family default +) No kernel module is loaded, and as a consequence no +\family typewriter +/proc/sys/mars/ +\family default + exists. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +UnResponsive +\family default + (cf +\family typewriter +%is-alive{%{host}} +\family default +) The main thread +\family typewriter +mars_light +\family default + did not do any noticeable work for more than +\family typewriter +%{window} +\family default + (default 60) seconds. + Notice that this may happen when deleting +\emph on +extremely +\emph default + large logfiles (up to hundreds of gigabytes or terabytes). + If this happens for a +\emph on +very +\emph default + long time, you should check whether you might need a reboot in order to + fix the hang. + The time window may be changed by +\family typewriter +--window=$seconds +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +NotJoined +\family default + (cf +\family typewriter +%get-disk{} +\family default +) No underlying disk device is configured for this resource.
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +NotStarted +\family default + (cf +\family typewriter +%todo-attach{} +\family default +) Replication has not been started. +\end_layout + +\begin_layout Itemize +When the current host is designated as a primary, the rest of the precedence + list looks as follows: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +EmergencyMode +\family default + (cf. + +\family typewriter +%is-emergency{} +\family default +) See section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Emergency-Mode" + +\end_inset + +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +Replicating +\family default + (cf. + +\family typewriter +%is-primary{} +\family default +) Primary mode has been entered. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +NotYetPrimary +\family default +(catchall) This means the current host +\emph on +should +\emph default + act as a primary (see +\family typewriter +marsadm primary +\family default + or +\family typewriter +marsadm primary --force +\family default +), but currently doesn't (yet). + This happens during logfile replay, before primary mode is actually entered. + Notice that replay of very big logfiles may take a long time. +\end_layout + +\end_deeper +\begin_layout Itemize +When the current host is +\emph on +not +\emph default + designated as a primary: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +PausedSync +\family default + (cf. + +\family typewriter +%sync-rest{} +\family default + and +\family typewriter +%todo-sync{} +\family default +) Some data needs to be synced, but sync is currently switched off. 
+ See +\family typewriter +marsadm {pause,resume}-sync +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +Syncing +\family default + (cf. + +\family typewriter +%is-sync{} +\family default +) Sync is currently running. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +PausedFetch +\family default + (cf. + +\family typewriter +%todo{fetch} +\family default +) Fetch is currently switched off. + See +\family typewriter +marsadm {pause,resume}-fetch +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +PausedReplay +\family default + (cf. + +\family typewriter +%todo{replay} +\family default +) Replay is currently switched off. + See +\family typewriter +marsadm {pause,resume}-replay +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +NoPrimaryDesignated +\family default + (cf. + +\family typewriter +%get-primary{} +\family default +) A +\family typewriter +secondary +\family default + command has been given somewhere in the cluster. + Thus no designated primary exists. + All resource members are in state +\family typewriter +Secondary +\family default + or try to approach it. + Sync and other operations are not possible. + This state is therefore not recommended. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +PrimaryUnreachable +\family default + (cf. + +\family typewriter +%is-alive{} +\family default +) A current designated primary has been set, but this host has not been + remotely updated for more than 60 seconds (see also +\family typewriter +--window=$seconds +\family default +). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +Orphan +\family default + The secondary cannot replay data anymore, because it has been kicked out + for avoidance of emergency mode. 
+ The data is not recent anymore. + Typically, +\family typewriter +marsadm invalidate +\family default + needs to be done. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +Replaying +\family default + (catchall) None of the previous conditions have triggered. +\end_layout + +\end_deeper +\end_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +replstate-1and1 +\family default + A variant for internal use by 1&1 Internet AG. + See above note. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +flags +\family default + For each of disk, consistency, attach, sync, fetch, and replay, show exactly + one character. + Each character is either a capital one, or the corresponding lowercase + one, or a dash. + The meaning is as follows: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 +disk/device: +\family typewriter +D +\family default + = the device +\family typewriter +/dev/mars/mydata +\family default + is present, +\family typewriter +d +\family default + = only the underlying disk +\family typewriter +/dev/lv-x/mydata +\family default + is present, +\family typewriter +- +\family default + = none present / configured. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +consistency: this relates to the +\emph on +underlying disk +\emph default +, not to +\family typewriter +/dev/mars/mydata +\family default +! +\family typewriter +C +\family default + = locally consistent, +\family typewriter +c +\family default + = maybe inconsistent (no guarantee), - = cannot determine. + Notice: this does not tell anything about +\emph on +actuality +\emph default +. + Notice: like the other flags, this flag is subject to races and therefore + should be relied on only in +\emph on +detached +\emph default + state! 
See also description of macro +\family typewriter +is-consistent +\family default + below. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +attach: +\family typewriter +A +\family default + = attached, +\family typewriter +a +\family default + = currently trying to attach/detach but not yet ready (intermediate state), + +\family typewriter +- +\family default + = attach is switched off. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +sync: +\family typewriter +S +\family default + = sync finished, +\family typewriter +s +\family default + = currently syncing, +\family typewriter +- +\family default + = sync is switched off. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +fetch: +\family typewriter +F +\family default + = according to current knowledge, fetched logfiles are up-to-date, +\family typewriter +f +\family default + = currently fetching (some parts of) a logfile, +\family typewriter +- +\family default + = fetch is switched off. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +replay: +\family typewriter +R +\family default + = all fetched logfiles are replayed, +\family typewriter +r +\family default + = currently replaying, +\family typewriter +- +\family default + = replay is switched off. +\end_layout + +\end_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +flags-1and1 +\family default + A variant for internal use by 1&1 Internet AG. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +todo-role +\family default + Shows the +\emph on +designated +\emph default + state: +\family typewriter +None +\family default +, +\family typewriter +Primary +\family default + or +\family typewriter +Secondary +\family default +.
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +role +\family default + Shows the +\emph on +actual +\emph default + state: +\family typewriter +None +\family default +, +\family typewriter +NotYetPrimary +\family default +, +\family typewriter +Primary +\family default +, +\family typewriter +RemainsPrimary +\family default +, or +\family typewriter +Secondary +\family default +. + Any differences to the designated state are indicated by a prefix to the + keyword +\family typewriter +Primary +\family default +: +\family typewriter +NotYet +\family default + means that it +\emph on +should +\emph default + become primary, but actually hasn't. + Vice versa, +\family typewriter +Remains +\family default + means that it +\emph on +should +\emph default + leave primary state in order to become secondary, but actually cannot do + that because the +\family typewriter +/dev/mars/mydata +\family default + device is currently in use . +\begin_inset Newline newline +\end_inset + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +%todo-primary{} == 0 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +%todo-primary{} == 1 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +%is-primary{} == 0 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +None +\family default + / +\family typewriter +Secondary +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +NotYetPrimary +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +%is-primary{} == 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter 
+RemainsPrimary +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +Primary +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +role-1and1 +\family default + A variant for internal use by 1&1 Internet AG. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +primarynode +\family default + Display +\family typewriter +(none) +\family default + or the hostname of the designated primary. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +primarynode-1and1 +\family default + A variant for internal use by 1&1 Internet AG. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +commstate +\family default + When the last metadata communication to the designated primary is longer + ago than +\family typewriter +${window} +\family default + (see also +\family typewriter +--window= +\emph on +seconds +\family default +\emph default + option), display that age in human readable form. + See also primitive macro +\family typewriter +%alive-age{} +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +syncinfo +\family default + Shows an informational progress bar when sync is running. + Intended for humans. + Scripts should not rely on any details from this. + Scripts may use this only as an +\emph on +approximate +\emph default + means for detecting progress (when comparing the +\emph on +full +\emph default + output text to a prior version and finding +\emph on +any +\emph default + difference, they may conclude that some progress has happened, how small + whatsoever). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +syncinfo-1and1 +\family default + A variant for internal use by 1&1 Internet AG. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +replinfo +\family default + Shows an informational progress bar when fetch is running. + This should not be used for scripting at all, because it contains realtime + information in human-readable form. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +replinfo-1and1 +\family default + A variant for internal use by 1&1 Internet AG. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +fetch-line +\family default + Additional details, called by +\family typewriter +replinfo +\family default +. + Shows the amount of data to be fetched, as well as the current transfer + rate and a very rough estimation of the future duration. + When primitive macros +\family typewriter +%fetch-age{} +\family default + or +\family typewriter +%fetch-lag{} +\family default + exceed +\family typewriter +${window} +\family default +, their values are also displayed for human informational purposes. + See description of these primitive macros. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +replay-line +\family default + Additional details, called by +\family typewriter +replinfo +\family default +. + Shows the amount of data to be replayed, as well as the current replay + rate and a very rough estimation of the future duration. + When primitive macro +\family typewriter +%replay-age{} +\family default + exceeds +\family typewriter +${window} +\family default +, it is also displayed for human informational purposes. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +comminfo +\family default + When the network communication is in an unusual condition, display it. + Otherwise, don't produce any output. 
+\end_layout + +\begin_layout Subsection +Predefined Primitive Macros +\begin_inset CommandInset label +LatexCommand label +name "subsec:Predefined-Trivial-Macros" + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Intended for Humans +\end_layout + +\begin_layout Standard +In the following, shell brace expansion notation +\family typewriter +{a,b} +\family default + is used to document similar variants of similar macros in a single place. + When you actually call the macro, you must choose one of the possible variants + (excluding the braces). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +the-err-msg +\family default + Show reported errors for a resource. + When the resource argument is missing or empty, show global error information. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +all-err-msg +\family default + Like before, but show all information including those which are +\family typewriter +OK +\family default +. + This way, you get a list +\begin_inset Foot +status open + +\begin_layout Plain Layout +The list may be extended in future versions of MARS. +\end_layout + +\end_inset + + of +\emph on +all +\emph default + potential error information present in the system. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{all,the}-wrn-msg +\family default + Show all / reported warnings in the system. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{all,the}-inf-msg +\family default + Show all / reported informational messages in the system. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{all,the}-msg +\family default + Show all / reported messages regardless of their classification.
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{all,the}-global-msg +\family default + Show global messages not associated with any resource (the resource argument + of the +\family typewriter +marsadm +\family default + command is ignored in this case). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{all,the}-global-{inf,wrn,err}-msg +\family default + Ditto, but more specific. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{all,the}-pretty-{global-,}{inf-,wrn-,err-,}msg +\family default + Ditto, but show numerical timestamps in a human readable form. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{all,the}-{global-,}{inf-,wrn-,err-,}count +\family default + Instead of showing the messages, show their count (number of lines). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +errno-text +\family default + This macro takes 1 argument, which must represent a Linux +\family typewriter +errno +\family default + number, and converts it to human readable form (similar to the C +\family typewriter +strerror() +\family default + function). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +todo-{attach,sync,fetch,replay,primary} +\family default + Shows a boolean value (0 or 1) indicating the current state of the correspondin +g todo switch (whether on or off). + The meaning of todo switches is illustrated in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-State-of" + +\end_inset + +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +get-resource-{fat,err,wrn} +\family default + Access to the internal error status files. + This is not an official interface and may thus change at any time without + notice.
+ Use this only for human inspection, not for scripting! +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + These macros, as well as the error status files, are likely to disappear + in future versions of MARS. + They should be used for debugging only. + At least when merging into the upstream Linux kernel, only the +\family typewriter +*-msg +\family default + macros will likely survive. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +get-resource-{fat,err,wrn}-count +\family default + Ditto, but get the number of lines instead of the text. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +replay-code +\family default + Indicate the current state of logfile replay / recovery: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 +(empty) Unknown. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +0 No replay is currently running. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +1 Replay is currently running. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +2 Replay has successfully stopped. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +<0 See Linux +\family typewriter +errno +\family default + code. + Typically this indicates a damaged logfile, or another filesystem error + at +\family typewriter +/mars +\family default +. +\end_layout + +\end_deeper +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +is-{attach,sync,fetch,replay,primary,module-loaded} +\family default + Shows a boolean value (0 or 1) indicating the +\emph on +actual +\emph default + state, whether the corresponding action has been actually carried out, + or not (yet). 
+ Notice that the values indicated by +\family typewriter +is-* +\family default + may differ from the +\family typewriter +todo-* +\family default + values when something is not (yet) working. + More explanations can be found in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-State-of" + +\end_inset + +. + +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +is-split-brain +\family default + Shows whether split brain (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + +) has been detected, or not. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +is-consistent +\family default + Shows whether the +\emph on +underlying disk +\emph default + is in a locally consistent state, i.e. + whether it +\emph on +could +\emph default + be (potentially) detached and then used for read-only test-mounting +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that the +\emph on +writeback +\emph default + at the primary side is out-of-order by default, for performance reasons. + Therefore, the underlying disk is only guaranteed to be consistent when + there is no data left to be written back. + Notice that this condition is racy by construction. + When your primary node crashes during writeback and then comes up again, + you must do a +\family typewriter +modprobe mars +\family default + first in order to automatically replay the transaction logfiles, which + will automatically heal such temporary inconsistencies. +\end_layout + +\end_inset + +. 
+ Don't confuse this with the consistency of +\family typewriter +/dev/mars/mydata +\family default +, which is by construction +\emph on +always +\emph default + locally consistent once it has appeared +\begin_inset Foot +status open + +\begin_layout Plain Layout +Exceptions are possible when using +\family typewriter +marsadm fake-sync +\family default +. + Even in split brain situations, +\family typewriter +marsadm primary --force +\family default + tries to prevent any further potential exception as best as it can, by + not letting +\family typewriter +/dev/mars/mydata +\family default + to appear and by insisting on split brain resolution first. + In future implementations, this might change if more pressure is put on + the developer to sacrifice consistency in preference to not waiting for + a full logfile replay. +\end_layout + +\end_inset + +. + By construction of MARS, the disk of secondaries will +\emph on +always +\emph default + remain in a locally consistent state once the initial sync has finished + as well as the initial logfile replay. + Notice that local consistency does not necessarily imply actuality (see + high-level explanation in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Behaviour-of-MARS" + +\end_inset + +). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +is-emergency +\family default + Shows whether emergency mode (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Emergency-Mode" + +\end_inset + +) has been entered for the named resource, or not. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +rest-space +\family default + (global, no resource argument necessary) Shows the +\emph on +logically +\emph default + available space in +\family typewriter +/mars/ +\family default +, which may deviate from the physically available space as indicated by + the +\family typewriter +df +\family default + command. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +get-{disk,device} +\family default + Show the name of the underlying disk, or of the +\family typewriter +/dev/mars/mydata +\family default + device (if it is available). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{disk,device}-present +\family default + Show (as a boolean value) whether the underlying disk, or the +\family typewriter +/dev/mars/mydata +\family default + device, is available. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +device-opened +\family default + Show (as a number) how often +\family typewriter +/dev/mars/mydata +\family default + has been actually opened, e.g. + by +\family typewriter +mount +\family default + or by some processes like +\family typewriter +dd +\family default +, or by iSCSI, etc. +\end_layout + +\begin_layout Subsubsection +Intended for Scripting +\end_layout + +\begin_layout Standard +While complex macros may output a whole bunch of information, the following + primitive macros are outputting exactly one value. + They are intended for script use (cf. + section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Scripting-HOWTO" + +\end_inset + +). + Of course, curious humans may also try them :) +\end_layout + +\begin_layout Standard +In the following, shell glob notation +\family typewriter +{a,b} +\family default + is used to document similar variants of similar macros in a single place. 
+ When you actually call the macro, you must choose one of the possible variants + (excluding the braces). +\end_layout + +\begin_layout Paragraph +Name Querying +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +cluster-members +\family default + Show a newline-separated list of all host names participating in the cluster. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +resource-members +\family default + Show a newline-separated list of all host names participating in the particular + resource +\family typewriter +%{res} +\family default +. + Notice that this may be a subset of +\family typewriter +%cluster-members{} +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{my,all}-resources +\family default + Show a newline-separated list of either all resource names existing in + the cluster, or only those where the current host +\family typewriter +%{host} +\family default + is member. + Optionally, you may specify the hostname as a parameter, e.g. + +\family typewriter +%my-resources{ +\emph on +otherhost +\emph default +} +\family default +. 
+\end_layout + +\begin_layout Paragraph +Amounts of Data Inquiry +\end_layout + +\begin_layout Standard +\begin_inset Float figure +placement h +wide false +sideways false +status open + +\begin_layout Plain Layout +\noindent +\align center +\begin_inset Graphics + filename images/fetch-replay-total.fig + width 80col% + +\end_inset + + +\end_layout + +\begin_layout Plain Layout +\begin_inset Caption Standard + +\begin_layout Plain Layout +overview on amounts / cursors +\begin_inset CommandInset label +LatexCommand label +name "fig:overview-on-amounts" + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The following macros are meaningful for both primary and secondary nodes: +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +deletable-size +\family default + Show the total amount of +\emph on +locally present +\emph default + logfile data which +\emph on +could +\emph default + be deleted by +\family typewriter +marsadm log-delete-all mydata +\family default +. + This differs almost always from both +\family typewriter +replay-pos +\family default + and +\family typewriter +occupied-size +\family default + due to granularity reasons (only whole logfiles can be deleted). + Units are +\emph on +bytes +\emph default +, not kilobytes. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +occupied-size +\family default + Show the total amount of +\emph on +locally present +\emph default + logfile data (sum of all file sizes). 
+ This is often roughly approximate to +\family typewriter +fetch-pos +\family default +, but it may differ vastly (in both directions) when logfiles are not completely + transferred, when some are damaged, during split brain, after a +\family typewriter +join-resource +\family default + / +\family typewriter +invalidate +\family default +, or when the resource is in emergency mode (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Emergency-Mode" + +\end_inset + +). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +disk-size +\family default + Show the size of the underlying local disk in bytes. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +resource-size +\family default + Show the logical size of the resource in bytes. + When this value is lower than +\family typewriter +disk-size +\family default +, you are wasting space. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +device-size +\family default + At a primary node, this may differ from +\family typewriter +resource-size +\family default + only for a very short time during the +\family typewriter +resize +\family default + operation. + At secondaries, there will be no difference. +\end_layout + +\begin_layout Standard +\noindent +The following macros are only meaningful for resources in primary mode: +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +writeback-rest +\family default + Show the amount of data which is already in the transaction logfile, but + has not yet been written back to the underlying disk. + This may be used for estimation of recovery time after a potential primary + crash. + The writeback buffer is explained by the graphics at +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:The-Transaction-Logger" +plural "false" +caps "false" +noprefix "false" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\noindent +The following macros are only meaningful for resources in secondary mode. + By information theoretic limits, they can only tell what is +\emph on +locally known +\emph default +. + They +\series bold +cannot +\series default + reflect the +\begin_inset Quotes eld +\end_inset + +true (global) state +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that according to Einstein's law, and according to observations by + Lamport, the concept of +\begin_inset Quotes eld +\end_inset + +true state +\begin_inset Quotes erd +\end_inset + + does not exist at all in a distributed system. + Anything you can know in a distributed system is always local knowledge, + which races with other (remote) knowledge, and may be outdated at +\emph on +any +\emph default + time. +\end_layout + +\end_inset + + +\begin_inset Quotes erd +\end_inset + + of a cluster, in particular during network partitions. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{sync,fetch,replay,work}-size +\family default + Show the total amount of data which is / was to be processed by either + sync, fetch, or replay. + +\family typewriter +work-size +\family default + is equivalent to +\family typewriter +fetch-size +\family default +. + +\family typewriter +replay-size +\family default + is equivalent to +\family typewriter +fetch-pos +\family default + (see below). + Units are +\emph on +bytes +\emph default +, not kilobytes. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{sync,fetch,replay,work}-pos +\family default + Show the total amount of data which is already processed (current +\begin_inset Quotes eld +\end_inset + +cursor +\begin_inset Quotes erd +\end_inset + + position). + +\family typewriter +work-pos +\family default + is equivalent to +\family typewriter +replay-pos +\family default +. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +The 0% point is the +\emph on +locally contiguous +\emph default + amount of data since the last +\family typewriter +create-resource +\family default +, +\family typewriter +join-resource +\family default +, or +\family typewriter +invalidate +\family default +, or since the last emergency mode, but possibly shortened by +\family typewriter +log-delete +\family default +s. + Notice that the 0% point may be different on different cluster nodes, because + their resource history may be different or non-contiguous during split + brain, or after a +\family typewriter +join-resource +\family default +, or after +\family typewriter +invalidate +\family default +, or during / after emergency mode. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{sync,fetch,replay,work}-rest +\family default + Shows the difference between +\family typewriter +*-size +\family default + and +\family typewriter +*-pos +\family default + (amount of work to do). + +\family typewriter +work-rest +\family default + is therefore the difference between +\family typewriter +fetch-size +\family default + and +\family typewriter +replay-pos +\family default +, which is the +\emph on +total +\emph default + amount of work to do (regardless whether to be fetched and/or to be replayed). 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{sync,fetch,replay,work}-reached +\family default + Boolean value indicating whether +\family typewriter +*-rest +\family default + dropped down to zero +\begin_inset Foot +status open + +\begin_layout Plain Layout +Recall from chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Use-Cases-for" + +\end_inset + + that MARS (in its current stage of development) does only guarantee local + consistency, but cannot guarantee actuality in all imaginable situations. + Notice that a general notion of +\begin_inset Quotes eld +\end_inset + +actuality +\begin_inset Quotes erd +\end_inset + + is +\emph on +undefinable +\emph default + in a widely distributed system at all, according to Einstein's laws. +\end_layout + +\begin_layout Plain Layout +Let's look at an example. + In case of a node crash, and after the node is up again, a +\family typewriter +modprobe mars +\family default + has to occur, in order to replay the transaction logs of MARS again. + However, at the recovery phase before, the journalling +\family typewriter +ext4 +\family default + filesystem +\family typewriter +/mars/ +\family default + +\emph on +may +\emph default + have rolled back some internal symlink updates which have occurred immediately + before the crash. + MARS is relying on the fact that journalling filesystems like +\family typewriter +ext4 +\family default + should do their recovery in a consistent way, possibly by sacrificing actuality + a little bit. + Therefore, the above macros cannot guarantee to deliver true information + about what is persisted at the moment. +\end_layout + +\begin_layout Plain Layout +Notice that there are further potential caveats. +\end_layout + +\begin_layout Plain Layout +In case of +\family typewriter +{sync,fetch}-reached +\family default +, MARS uses +\family typewriter +bio +\family default + callbacks resp. 
+ +\family typewriter +fdatasync() +\family default + by default, thus the underlying storage layer has +\emph on +told +\emph default + us that it +\emph on +believes +\emph default + it has committed the data in a reboot-safe way. + Whether this is +\emph on +really +\emph default + true does not depend on MARS, but on the lower layers of the storage hierarchy. + There exists hardware where this claim is known to be wrong under certain + circumstances, such as certain hard disk drives in certain modes of operation. + Please check the hardware for any violations of storage semantics under + certain circumstances such as power loss, and check information sources + like magazines about the problem area. + Please notice that such a problem, if it exists at all, is independent + from MARS. + It would also exist if you wouldn't use MARS on the same system. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{fetch,replay,work}-threshold-reached +\family default + Boolean value indicating whether +\family typewriter +*-rest +\family default + dropped down to +\family typewriter +%{threshold} +\family default +, which is pre-settable by the +\family typewriter +--threshold= +\emph on +size +\family default +\emph default + command line option (default is 10 MiB). + In asynchronous use cases of MARS, this should be preferred over +\family typewriter +*-reached +\family default + for +\emph on +human display +\emph default +, because it produces less flickering by the inevitable replication delay. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{fetch,replay,work}-almost-reached +\family default + Boolean value indicating whether +\family typewriter +*-rest +\family default + +\emph on +almost +\emph default + / +\emph on +approximately +\emph default + dropped down to zero. + The default is that at least 990 permille are reached. 
+ In asynchronous use cases of MARS, this can be preferred over +\family typewriter +*-reached +\family default + for +\emph on +human display +\emph default + only, because it produces less flickering by the inevitable replication + delay. + However, don't base any decisions on this! +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{sync,fetch,replay,work}-percent +\family default + The cursor position +\family typewriter +*-pos +\family default + as a percentage of +\family typewriter +*-size +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{sync,fetch,replay,work}-permille +\family default + The cursor position +\family typewriter +*-pos +\family default + as permille of +\family typewriter +*-size +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{sync,fetch,replay,work}-rate +\family default + Show the current throughput in bytes +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that the internal granularity reported by the kernel may be coarser, + such as KiB. + This interface abstracts away from kernel internals and thus presents + everything in byte units. +\end_layout + +\end_inset + + per second. + +\family typewriter +work-rate +\family default + is the +\emph on +maximum +\emph default + of +\family typewriter +fetch-rate +\family default + and +\family typewriter +replay-rate +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{sync,fetch,replay,work}-remain +\family default + Show the +\emph on +estimated +\emph default + remaining time for completion of the respective operation. + This is just a very raw guess. + Units are seconds. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +summary-vector +\family default + Show the colon-separated CSV value +\family typewriter +%replay-pos{}:%fetch-pos{}:%fetch-size{} +\family default +. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +replay-basenr +\family default +Get currently first reachable logfile number (see figure +\begin_inset CommandInset ref +LatexCommand vref +reference "fig:overview-on-amounts" + +\end_inset + +). + Only for curious humans or for debugging / monitoring - don't base any + decisions on this. + Use the +\family typewriter +*-{pos,size} +\family default + macros instead. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{replay,fetch,work}-lognr +\family default +Get current logfile number of replay or fetch position, or of the currently + known last reachable number (see figure +\begin_inset CommandInset ref +LatexCommand vref +reference "fig:overview-on-amounts" + +\end_inset + +). + Only for curious humans or for debugging / monitoring - don't base any + decisions on this. + Use the +\family typewriter +*-{pos,size} +\family default + macros instead. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{replay,fetch,work}-logcount +\family default +Get current number of logfiles which are already replayed, or are already + fetched, or are to be applied in total (see figure +\begin_inset CommandInset ref +LatexCommand vref +reference "fig:overview-on-amounts" + +\end_inset + +). + Only for curious humans or for debugging / monitoring - don't base any + decisions on this. + Use the +\family typewriter +*-{rest} +\family default + macros instead. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +alive-timestamp +\family default + Tell the Lamport Unix timestamp (seconds since 1970) of the last metadata + communication to the designated primary (or to any other host given by + the first argument). + Returns +\begin_inset Formula $-1$ +\end_inset + + if no such host exists. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{fetch,replay,work}-timestamp +\family default + Tell the Lamport Unix timestamp (seconds since 1970) when the last progress + has been made. + When no such action exists, +\begin_inset Formula $-1$ +\end_inset + + is returned. + +\family typewriter +%work-timestamp{ +\emph on +hostname +\emph default +} +\family default + is the maximum of +\family typewriter +%fetch-timestamp{ +\emph on +hostname +\emph default +} +\family default + and +\family typewriter +%replay-timestamp{ +\emph on +hostname +\emph default +} +\family default +. + When the parameter +\family typewriter +\emph on +hostname +\family default +\emph default + is empty, the local host will be reported (default). + Example usage: +\family typewriter +marsadm view all --macro= +\begin_inset Quotes erd +\end_inset + +%replay-timestamp{%todo-primary{}} +\begin_inset Quotes erd +\end_inset + + +\family default + shows the timestamp of the last reported +\begin_inset Foot +status open + +\begin_layout Plain Layout +Updates of this information are occurring with lower frequency than actual + writebacks, for performance reasons. + The metadata network update protocol will add further delays. + Therefore, the accuracy is only in the range of minutes. +\end_layout + +\end_inset + + writeback action at the designated primary. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{alive,fetch,replay,work}-age +\family default + Tell the number of seconds since the last respective action, or +\begin_inset Formula $-1$ +\end_inset + + if none exists. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +{alive,fetch,replay,work}-lag +\family default + Report the time difference (in seconds) between the last +\emph on +known +\emph default + action at the local host and at the designated primary (or between any + other hosts when 2 parameters are given). + Returns +\begin_inset Formula $-1$ +\end_inset + + if no such action exists at any of the two hosts. + Attention! This need not reflect the +\emph on +actual +\emph default + state in case of networking problems. + Don't draw wrong conclusions from a high +\family typewriter +{fetch,replay}-lag +\family default + value: it could also mean that simply no write operation at all has occurred + at the primary side for a long time. + Conversely, a low lag value does not imply that the replication is recent: + it may refer to +\emph on +different +\emph default + write operations at each of the hosts; therefore it only tells that +\emph on +some +\emph default + progress has been made, but says nothing about the amount of the progress. +\end_layout + +\begin_layout Paragraph +Misc Informational Status +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +get-primary +\family default + Return the name of the current designated primary node as locally known. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +actual-primary +\family default + (deprecated) try to determine the name of the node which +\emph on +appears +\emph default + to be the actual primary. 
+ This is only a +\series bold +\emph on +guess +\series default +\emph default +, because it is not generally unique in split brain situations! Don't use + this macro. + Instead, use +\family typewriter +is-primary +\family default + on those nodes you are interested in. + The explanations from section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-State-of" + +\end_inset + + also apply to +\family typewriter +get-primary +\family default + versus +\family typewriter +actual-primary +\family default + analogously. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +is-alive +\family default + Boolean value indicating whether all other nodes participating in +\family typewriter +mydata +\family default + are reachable / healthy. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +uuid +\family default + (global) Show the unique identifier created by +\family typewriter +create-cluster +\family default + or by +\family typewriter +create-uuid +\family default +. + Hint: this is immutable, and it is firmly bound to the +\family typewriter +/mars/ +\family default + filesystem. + It can only be destroyed by deleting the whole filesystem (see section + +\begin_inset CommandInset ref +LatexCommand ref +reference "leave-cluster" + +\end_inset + +). +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +tree +\family default + (global) Indicate symlink tree version (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Symlink-Tree" + +\end_inset + +). +\end_layout + +\begin_layout Paragraph +Experts Only +\end_layout + +\begin_layout Standard +The following is for hackers who know what they are doing. + The following is not officially supported. 
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +wait-{is,todo}-{attach,sync,fetch,replay,primary}-{on,off} +\family default + This may be used to program some useful waiting conditions in advanced + macro scripts. + Use at your own risk! +\end_layout + +\begin_layout Section +Creating your own Macros +\begin_inset CommandInset label +LatexCommand label +name "subsec:Creating-your-own" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to create your own macros, you could start writing them from scratch + with your favorite ASCII text editor. + However, it is much easier to take an existing macro and to customize it + to your needs. + In addition, you can learn something about macro programming by looking + at the existing macro code. +\end_layout + +\begin_layout Standard +Go to a new empty directory and say +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm dump-macros +\end_layout + +\begin_layout Standard +in order to get the most interesting complex macros, or say +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm dump-all-macros +\end_layout + +\begin_layout Standard +in order to additionally get some primitive macros which could be customized + if needed. + This will write lots of files +\family typewriter +*.tpl +\family default + into your current working directory. +\end_layout + +\begin_layout Standard +Any modified or new macro file should be placed either into the current working + directory +\family typewriter +./ +\family default + , or into +\family typewriter +$HOME/.marsadm/ +\family default + , or into +\family typewriter +/etc/marsadm/ +\family default + . + They will be searched in this order, and the first match will win. + When no macro file is found, the built-in version will be used if it exists. + This way, you may override builtin macros. 
+\end_layout + +\begin_layout Standard +Example: if you have a file +\family typewriter +./mymacro.tpl +\family default + you just need to say +\family typewriter +marsadm view-mymacro mydata +\family default + in order to invoke it in the resource context +\family typewriter +mydata +\family default +. +\end_layout + +\begin_layout Subsection +General Macro Syntax +\end_layout + +\begin_layout Standard +Macros are simple ASCII text, enriched with calls to other macros. +\end_layout + +\begin_layout Standard +ASCII text outside of comments is copied to the output verbatim. + Comments are skipped. + Comments may have one of the following well-known forms: +\end_layout + +\begin_layout Itemize + +\family typewriter +# skipped text until / including next newline character +\end_layout + +\begin_layout Itemize + +\family typewriter +// skipped text until / including next newline character +\end_layout + +\begin_layout Itemize + +\family typewriter +/* skipped text including any newline characters */ +\end_layout + +\begin_layout Itemize +denoted as Perl regex: +\family typewriter + +\backslash + +\backslash + +\backslash +n +\backslash +s* +\family default +(single backslash directly followed by a newline character, and eating up + any whitespace characters at the beginning of the next line) Hint: this + may be fruitfully used to structure macros in a more readable form / indentatio +n. +\end_layout + +\begin_layout Standard +Special characters are always initiated by a backslash. 
+ The following pre-defined special character sequences are recognized: +\end_layout + +\begin_layout Itemize + +\family typewriter + +\backslash +n +\family default + newline +\end_layout + +\begin_layout Itemize + +\family typewriter + +\backslash +r +\family default + return (useful for DOS compatibility) +\end_layout + +\begin_layout Itemize + +\family typewriter + +\backslash +t +\family default + tab +\end_layout + +\begin_layout Itemize + +\family typewriter + +\backslash +f +\family default + formfeed +\end_layout + +\begin_layout Itemize + +\family typewriter + +\backslash +b +\family default + backspace +\end_layout + +\begin_layout Itemize + +\family typewriter + +\backslash +a +\family default + alarm (bell) +\end_layout + +\begin_layout Itemize + +\family typewriter + +\backslash +e +\family default + escape (e.g. + for generating ANSI escape sequences) +\end_layout + +\begin_layout Itemize + +\family typewriter + +\backslash + +\family default + followed by anything else: assure that the next character is taken verbatim. + Although possible, please don't use this for escaping letters, because + further escape sequences might be pre-defined in future. + Best practice is to use this only for escaping the backslash itself, or + for escaping the percent sign when you don't want to call a macro (protect + against evaluation), or to escape a brace directly after a macro call (verbatim + brace not to be interpreted as a macro parameter). +\end_layout + +\begin_layout Itemize +All other characters stand for their own. + If you like, you should be able to produce XML, HTML, JSON and other ASCII-base +d output formats this way. 
+\end_layout + +\begin_layout Standard +Macro calls have the following syntax: +\end_layout + +\begin_layout Itemize + +\family typewriter +% +\emph on +macroname +\emph default +{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +}{ +\emph on +argn +\emph default +} +\end_layout + +\begin_layout Itemize +Of course, arguments may be empty, denoted as +\family typewriter +{} +\end_layout + +\begin_layout Itemize +It is possible to supply more arguments than required. + These are simply ignored. +\end_layout + +\begin_layout Itemize +There must be always at least 1 argument, even for parameterless macros. + In such a case, it is good style to leave it empty (even if it is actually + ignored). + Just write +\family typewriter +%parameterlessmacro{} +\family default + in such a case. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{ +\emph on +varname +\emph default +} +\family default + syntax: As a special case, the macro name may be empty, but then the first + argument must denote a previously defined variable (such as assigned via + +\family typewriter +%let{varname}{myvalue} +\family default +, or a pre-defined standard variable like +\family typewriter +%{res} +\family default + for the current resource name, see later paragraph +\begin_inset CommandInset ref +LatexCommand ref +reference "par:Predefined-Variables" + +\end_inset + +). +\end_layout + +\begin_layout Itemize +Of course, parameter calls may be (almost) arbitrarily nested. +\end_layout + +\begin_layout Itemize +Of course, the +\emph on +correctness +\emph default + of nesting of braces must be generally obeyed, as usual in any other macro + processor language. + General rule: for each opening brace, there must be exactly one closing + brace somewhere afterwards. +\end_layout + +\begin_layout Standard +These rules are hopefully simple and intuitive. + There are currently no exceptions. 
+ In particular, there is no special infix operator syntax for arithmetic + expressions, and therefore no operator precedence rules are necessary. + You have to write nested arithmetic expressions always in the above prefix + syntax, like +\family typewriter +%*{7}{%+{2}{3}} +\family default + (similar to non-inverse polish notation). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +When deeply nesting macros and their braces, you may easily find yourself + in a feeling like in the good old days of Lisp. + Use the above backslash-newline syntax to indent your macros in a readable + and structured way. + Fortunately, modern text editors like (x)emacs or vim have modes for dealing + with the correctness of nested braces. +\end_layout + +\begin_layout Subsection +Calling Builtin / Primitive Macros +\end_layout + +\begin_layout Standard +Primitive macros can be called in two alternate forms: +\end_layout + +\begin_layout Itemize + +\family typewriter +%primitive- +\emph on +macroname +\emph default +{ +\emph on +something +\emph default +} +\end_layout + +\begin_layout Itemize + +\family typewriter +% +\emph on +macroname +\emph default +{ +\emph on +something +\emph default +} +\end_layout + +\begin_layout Standard +When using the +\family typewriter +%primitive-*{} +\family default + form, you +\emph on +explicitly disallow +\emph default + interception of the call by a +\family typewriter +*.tpl +\family default + file. + Otherwise, you may override the standard definition even of primitive macros + by your own template files. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Notice that +\family typewriter +%call{} +\family default + conventions are used in such a case. 
+ The parameters are passed via +\family typewriter +%{0} +\family default + +\begin_inset Formula $\ldots$ +\end_inset + + +\family typewriter +%{n} +\family default + variables (see description below). +\end_layout + +\begin_layout Paragraph +Standard MARS State Inspection Macros +\end_layout + +\begin_layout Standard +These are already described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Predefined-Trivial-Macros" + +\end_inset + +. + When calling one of them, the call will simply expand to the corresponding + value. +\end_layout + +\begin_layout Standard +Example: +\family typewriter +%get-primary{} +\family default + will expand to the hostname of the current designated primary node. +\end_layout + +\begin_layout Paragraph +Further MARS State Inspection Macros +\end_layout + +\begin_layout Paragraph +Variable Access Macros +\end_layout + +\begin_layout Itemize + +\family typewriter +%let{ +\emph on +varname +\emph default +}{ +\emph on +expression +\emph default +} +\family default +Evaluates both +\family typewriter +\emph on +varname +\family default +\emph default + and the +\family typewriter +\emph on +expression +\family default +\emph default +. + The +\family typewriter +\emph on +expression +\family default +\emph default + is then assigned to +\family typewriter +varname +\family default +. +\end_layout + +\begin_layout Itemize + +\family typewriter +%append{ +\emph on +varname +\emph default +}{ +\emph on +expression +\emph default +} +\family default +Evaluates both +\family typewriter +\emph on +varname +\family default +\emph default + and the +\family typewriter +\emph on +expression +\family default +\emph default +. + The +\family typewriter +\emph on +expression +\family default +\emph default + is then appended to +\family typewriter +varname +\family default + (concatenation).
+\end_layout + +\begin_layout Itemize + +\family typewriter +%{ +\emph on +varname +\emph default +} +\family default +Evaluates +\family typewriter +\emph on +varname +\family default +\emph default +, and outputs the value of the corresponding variable. + When the variable does not exist, the empty string is returned. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{++}{ +\emph on +varname +\emph default +} +\family default +or +\family typewriter +%{ +\emph on +varname +\emph default +}{++} +\family default + Has the obvious well-known side effect e.g. + from C or Java. + You may also use +\family typewriter +-- +\family default + instead of +\family typewriter +++ +\family default +. + This is handy for programming loops (see below). +\end_layout + +\begin_layout Itemize + +\family typewriter +%dump-vars{} +\family default +Writes all currently defined variables (from the currently active scope) + to +\family typewriter +stderr +\family default +. + This is handy for debugging. +\end_layout + +\begin_layout Paragraph +CSV Array Macros +\end_layout + +\begin_layout Itemize + +\family typewriter +%{ +\emph on +varname +\emph default +}{ +\emph on +delimiter +\emph default +}{ +\emph on +index +\emph default +} +\family default +Evaluates all arguments. + The contents of +\family typewriter +\emph on +varname +\family default +\emph default + is interpreted as a comma-separated list, delimited by +\family typewriter +\emph on +delimiter +\family default +\emph default +. + The +\family typewriter +\emph on +index +\family default +\emph default +'th list element is returned. +\end_layout + +\begin_layout Itemize + +\family typewriter +%set{ +\emph on +varname +\emph default +}{ +\emph on +delimiter +\emph default +}{ +\emph on +index +\emph default +}{ +\emph on +expression +\emph default +} +\family default +Evaluates all arguments. 
+ The contents of the old +\family typewriter +\emph on +varname +\family default +\emph default + is interpreted as a comma-separated list, delimited by +\family typewriter +\emph on +delimiter +\family default +\emph default +. + The +\family typewriter +\emph on +index +\family default +\emph default +'th list element is then assigned to, or substituted by, +\family typewriter +\emph on +expression +\family default +\emph default +. +\end_layout + +\begin_layout Paragraph +Arithmetic Expression Macros +\end_layout + +\begin_layout Standard +The following macros can also take more than two arguments, carrying out + the corresponding arithmetic operation in sequence (it depends on the operator + whether this accords to the associative law). +\end_layout + +\begin_layout Itemize + +\family typewriter +%+{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Evaluates the arguments, interprets them as numbers, and adds them together. +\end_layout + +\begin_layout Itemize + +\family typewriter +%-{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Subtraction. +\end_layout + +\begin_layout Itemize + +\family typewriter +%*{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Multiplication. +\end_layout + +\begin_layout Itemize + +\family typewriter +%/{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Division. +\end_layout + +\begin_layout Itemize + +\family typewriter +%%{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Modulus. +\end_layout + +\begin_layout Itemize + +\family typewriter +%&{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Bitwise Binary And. +\end_layout + +\begin_layout Itemize + +\family typewriter +%|{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Bitwise Binary Or.
+\end_layout + +\begin_layout Itemize + +\family typewriter +%^{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Bitwise Binary Exclusive Or. +\end_layout + +\begin_layout Itemize + +\family typewriter +%<<{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Binary Shift Left. +\end_layout + +\begin_layout Itemize + +\family typewriter +%>>{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Binary Shift Right. +\end_layout + +\begin_layout Itemize + +\family typewriter +%min{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Compute the arithmetic minimum of the arguments. +\end_layout + +\begin_layout Itemize + +\family typewriter +%max{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Compute the arithmetic maximum of the arguments. +\end_layout + +\begin_layout Paragraph +Boolean Condition Macros +\end_layout + +\begin_layout Itemize + +\family typewriter +%=={ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Numeral Equality. +\end_layout + +\begin_layout Itemize + +\family typewriter +%!={ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Numeral Inequality. +\end_layout + +\begin_layout Itemize + +\family typewriter +%<{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Numeral Less Than. +\end_layout + +\begin_layout Itemize + +\family typewriter +%<={ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Numeral Less or Equal. +\end_layout + +\begin_layout Itemize + +\family typewriter +%>{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Numeral Greater Than.
+\end_layout + +\begin_layout Itemize + +\family typewriter +%>={ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Numeral Greater or Equal. +\end_layout + +\begin_layout Itemize + +\family typewriter +%eq{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default + +\begin_inset space ~ +\end_inset + +String Equality. +\end_layout + +\begin_layout Itemize + +\family typewriter +%ne{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +String Inequality. +\end_layout + +\begin_layout Itemize + +\family typewriter +%lt{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +String Less Than. +\end_layout + +\begin_layout Itemize + +\family typewriter +%le{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +String Less or Equal. +\end_layout + +\begin_layout Itemize + +\family typewriter +%gt{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +String Greater Than. +\end_layout + +\begin_layout Itemize + +\family typewriter +%ge{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +String Greater or Equal. +\end_layout + +\begin_layout Itemize + +\family typewriter +%=~{ +\emph on +string +\emph default +}{ +\emph on +regex +\emph default +}{ +\emph on +opts +\emph default +} +\family default +or +\family typewriter +%match{ +\emph on +string +\emph default +}{ +\emph on +regex +\emph default +}{ +\emph on +opts +\emph default +} +\family default + Checks whether +\family typewriter +\emph on +string +\family default +\emph default + matches the Perl regular expression +\family typewriter +\emph on +regex +\family default +\emph default +. + Modifiers can be given via +\family typewriter +\emph on +opts +\family default +\emph default +.
+\end_layout + +\begin_layout Paragraph +Shortcut Evaluation Operators +\end_layout + +\begin_layout Standard +The following operators evaluate their arguments only when needed (like + in C). +\end_layout + +\begin_layout Itemize + +\family typewriter +%&&{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Logical And. +\end_layout + +\begin_layout Itemize + +\family typewriter +%and{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Alias for +\family typewriter +%&&{} +\family default +. +\end_layout + +\begin_layout Itemize + +\family typewriter +%||{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Logical Or. +\end_layout + +\begin_layout Itemize + +\family typewriter +%or{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +} +\family default +Alias for +\family typewriter +%||{} +\family default +. +\end_layout + +\begin_layout Paragraph +Unary Operators +\end_layout + +\begin_layout Itemize + +\family typewriter +%!{ +\emph on +arg +\emph default +} +\family default +Logical Not. +\end_layout + +\begin_layout Itemize + +\family typewriter +%not{ +\emph on +arg +\emph default +} +\family default +Alias for +\family typewriter +%!{} +\family default +. +\end_layout + +\begin_layout Itemize + +\family typewriter +%~{ +\emph on +arg +\emph default +} +\family default +Bitwise Negation. +\end_layout + +\begin_layout Paragraph +String Functions +\end_layout + +\begin_layout Itemize + +\family typewriter +%length{ +\emph on +string +\emph default +} +\family default +Return the number of ASCII characters present in +\family typewriter +\emph on +string +\family default +\emph default +. +\end_layout + +\begin_layout Itemize + +\family typewriter +%toupper{ +\emph on +string +\emph default +} +\family default +Return all ASCII characters converted to uppercase.
+\end_layout + +\begin_layout Itemize + +\family typewriter +%tolower{ +\emph on +string +\emph default +} +\family default +Return all ASCII characters converted to lowercase. +\end_layout + +\begin_layout Itemize + +\family typewriter +%append{ +\emph on +varname +\emph default +}{ +\emph on +string +\emph default +} +\family default +Equivalent to +\family typewriter +%let{ +\emph on +varname +\emph default +}{%{ +\emph on +varname +\emph default +} +\emph on +string +\emph default +} +\family default +. +\end_layout + +\begin_layout Itemize + +\family typewriter +%subst{ +\emph on +string +\emph default +}{ +\emph on +regex +\emph default +}{ +\emph on +subst +\emph default +}{ +\emph on +opts +\emph default +} +\family default +Perl regex substitution. +\end_layout + +\begin_layout Itemize + +\family typewriter +%sprintf{ +\emph on +fmt +\emph default +}{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +}{ +\emph on +argn +\emph default +} +\family default +Perl +\family typewriter +sprintf() +\family default + operator. + Details see Perl manual. +\end_layout + +\begin_layout Itemize + +\family typewriter +%human-number{ +\emph on +unit +\emph default +}{ +\emph on +delim +\emph default +}{ +\emph on +unit-sep +\emph default +}{ +\emph on +number +\emph default +1}{ +\emph on +number +\emph default +2} +\begin_inset Formula $\ldots$ +\end_inset + + +\family default +Convert a number or a list of numbers into human-readable +\family typewriter +B +\family default +, +\family typewriter +KiB +\family default +, +\family typewriter +MiB +\family default +, +\family typewriter +GiB +\family default +, +\family typewriter +TiB +\family default +, as given by +\family typewriter +\emph on +unit +\family default +\emph default +. + When +\family typewriter +\emph on +unit +\family default +\emph default + is empty, a reasonable unit will be guessed automatically from the maximum + of all given numbers. 
+ A single result string is produced, where multiple numbers are separated + by +\family typewriter +\emph on +delim +\family default +\emph default + when necessary. + When +\family typewriter +\emph on +delim +\family default +\emph default + is empty, the slash symbol +\family typewriter +/ +\family default + is used by default (the most obvious use case is result strings like +\family typewriter + +\begin_inset Quotes eld +\end_inset + +17/32 KiB +\begin_inset Quotes erd +\end_inset + + +\family default +). + The final unit text is separated from the previous number(s) by +\family typewriter +\emph on +unit-sep +\family default +\emph default +. + When +\family typewriter +\emph on +unit-sep +\family default +\emph default + is empty, a single blank is used by default. +\end_layout + +\begin_layout Itemize + +\family typewriter +%human-seconds{ +\emph on +number +\emph default +} +\family default +Convert the given number of seconds into +\family typewriter +hh:mm:ss +\family default + format. +\end_layout + +\begin_layout Paragraph +Complex Helper Macros +\end_layout + +\begin_layout Itemize + +\family typewriter +%progress{20} +\family default +Return a string containing a progress bar showing the values from +\family typewriter +%summary-vector{} +\family default +. + The default width is 20 characters plus two braces. +\end_layout + +\begin_layout Itemize + +\family typewriter +%progress{20}{ +\emph on +minvalue +\emph default +}{ +\emph on +midvalue +\emph default +}{ +\emph on +maxvalue +\emph default +} +\family default +Instead of taking the values from +\family typewriter +%summary-vector{} +\family default +, use the supplied values. + +\family typewriter +minvalue +\family default + and +\family typewriter +midvalue +\family default + indicate two different intermediate points, while +\family typewriter +maxvalue +\family default + will determine the 100% point. 
+\end_layout + +\begin_layout Paragraph +Control Flow Macros +\end_layout + +\begin_layout Itemize + +\family typewriter +%if{ +\emph on +expression +\emph default +}{ +\emph on +then-part +\emph default +} +\family default + or +\family typewriter +%if{ +\emph on +expression +\emph default +}{ +\emph on +then-part +\emph default +}{ +\emph on +else-part +\emph default +} +\family default + Like in any other macro or programming language, this evaluates the +\family typewriter +expression +\family default + once, not copying its outcome to the output. + If the result is non-empty and is not a string denoting the number +\family typewriter +0 +\family default +, the +\family typewriter +\emph on +then-part +\family default +\emph default + is evaluated and copied to the output. + Otherwise, the +\family typewriter +else-part +\family default + is evaluated and copied, provided that one exists. +\end_layout + +\begin_layout Itemize + +\family typewriter +%unless{ +\emph on +expression +\emph default +}{ +\emph on +then-part +\emph default +} +\family default + or +\family typewriter +%unless{ +\emph on +expression +\emph default +}{ +\emph on +then-part +\emph default +}{ +\emph on +else-part +\emph default +} +\family default + Like +\family typewriter +%if{} +\family default +, but the expression is logically negated. + Essentially, this is a shorthand for +\family typewriter +%if{%not{expression}}{...} +\family default + or similar. 
+\end_layout + +\begin_layout Itemize + +\family typewriter +%elsif{ +\emph on +expr1 +\emph default +}{ +\emph on +then1 +\emph default +}{ +\emph on +expr2 +\emph default +}{ +\emph on +then2 +\emph default +} +\family default + +\begin_inset Formula $\ldots$ +\end_inset + + or +\family typewriter +%elsif{ +\emph on +expr1 +\emph default +}{ +\emph on +then1 +\emph default +}{ +\emph on +expr2 +\emph default +}{ +\emph on +then2 +\emph default +} +\family default + +\begin_inset Formula $\ldots$ +\end_inset + + +\family typewriter +{ +\emph on +odd-else-part +\emph default +} +\family default + This is for simplification of boring if-else-if chains. + The classical if-syntax (as shown above) has the drawback that inner if-parts + need to be nested into outer else-parts, so rather deep nestings may occur + when you are programming longer chains. + This is an alternate syntax for avoidance of deep nesting. + When giving an odd number of arguments, the last argument is taken as final + else-part. +\end_layout + +\begin_layout Itemize + +\family typewriter +%elsunless +\family default + +\begin_inset Formula $\ldots$ +\end_inset + + Like +\family typewriter +%elsif +\family default +, but +\emph on +all +\emph default + conditions are negated. +\end_layout + +\begin_layout Itemize + +\family typewriter +%while{ +\emph on +expression +\emph default +}{ +\emph on +body +\emph default +} +\family default +Evaluates the +\family typewriter +\emph on +expression +\family default +\emph default + in a while loop, like in any other macro or programming language. + The +\family typewriter +\emph on +body +\family default +\emph default + is evaluated exactly as many times as the +\family typewriter +\emph on +expression +\family default +\emph default + holds. 
+ Notice that endless loops can only be avoided by calling a non-pure macro + inspecting external state information, or by creating (and checking) another + side effect somewhere, like assigning to a variable somewhere. +\end_layout + +\begin_layout Itemize + +\family typewriter +%until{ +\emph on +expression +\emph default +}{ +\emph on +body +\emph default +} +\family default +Like +\family typewriter + %while{ +\emph on +expression +\emph default +}{ +\emph on +body +\emph default +} +\family default +, but negate the expression. +\end_layout + +\begin_layout Itemize + +\family typewriter +%for{ +\emph on +exp +\emph default +r1}{ +\emph on +exp +\emph default +r2}{ +\emph on +exp +\emph default +r3}{ +\emph on +body +\emph default +} +\family default + As you will expect from the corresponding C, Perl, Java, or (add your favorite + language) construct. + Only the syntactic sugar is a little bit different. +\end_layout + +\begin_layout Itemize + +\family typewriter +%foreach{ +\emph on +varname +\emph default +}{ +\emph on +CSV-delimited-string +\emph default +}{ +\emph on +delimiter +\emph default +}{ +\emph on +body +\emph default +} +\family default + As you can expect from similar +\family typewriter +foreach +\family default + constructs in other languages like Perl. + Currently, the macro processor has no arrays, but can use comma-separated + strings as a substitute. +\end_layout + +\begin_layout Itemize + +\family typewriter +%eval{ +\emph on +count +\emph default +}{ +\emph on +body +\emph default +} +\family default + Evaluates the +\family typewriter +\emph on +body +\family default +\emph default + exactly as many times as indicated by the numeric argument +\family typewriter +\emph on +count +\family default +\emph default +. + This may be used to re-evaluate the output of other macros once again.
+\end_layout + +\begin_layout Itemize + +\family typewriter +%protect{ +\emph on +body +\emph default +} +\family default + Equivalent to +\family typewriter +%eval{0}{ +\emph on +body +\emph default +} +\family default +, which means that the body is not evaluated at all, but copied to the output + verbatim +\begin_inset Foot +status open + +\begin_layout Plain Layout +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +TeX +\end_layout + +\end_inset + + +\begin_inset space ~ +\end_inset + +or +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +LaTeX +\end_layout + +\end_inset + + +\begin_inset space ~ +\end_inset + +fans usually know what this is good for ;) +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Itemize + +\family typewriter +%eval-down{ +\emph on +body +\emph default +} +\family default + Evaluates the +\family typewriter +\emph on +body +\family default +\emph default + in a loop until the result does not change any more +\begin_inset Foot +status open + +\begin_layout Plain Layout +Mathematicians knowing Banach's fixedpoint theorem will know what this is + good for ;) +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Itemize + +\family typewriter +%tmp{ +\emph on +body +\emph default +} +\family default + Evaluates the +\family typewriter +\emph on +body +\family default +\emph default + once in a temporary scope which is thrown away afterwards. +\end_layout + +\begin_layout Itemize + +\family typewriter +%call{ +\emph on +macroname +\emph default +}{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +}{ +\emph on +argn +\emph default +} +\family default + Like in many other macro languages, this evaluates the named macro in + a new scope. + This means that any side effects produced by the called macro, such as + variable assignments, will be reverted after the call, and therefore not + influence the old scope.
+ However notice that the arguments +\family typewriter +\emph on +arg1 +\family default +\emph default + to +\family typewriter +\emph on +argn +\family default +\emph default + are evaluated in the +\emph on +old +\emph default + scope before the call actually happens (possibly producing side effects + if they contain some), and their result is respectively assigned to +\family typewriter +%{1} +\family default + until +\family typewriter +%{ +\emph on +n +\emph default +} +\family default + in the new scope, analogously to the Shell or to Perl. + In addition, the new +\family typewriter +%{0} +\family default + gets the +\family typewriter +\emph on +macroname +\family default +\emph default +. + Notice that the argument evaluation happens non-lazily in the old scope + and therefore differs from other macro processors like +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +TeX +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Itemize + +\family typewriter +%include{ +\emph on +macroname +\emph default +}{ +\emph on +arg1 +\emph default +}{ +\emph on +arg2 +\emph default +}{ +\emph on +argn +\emph default +} +\family default + Like +\family typewriter +%call{} +\family default +, but evaluates the named macro in the +\emph on +current +\emph default + scope (similar to the +\family typewriter +source +\family default + command of the bourne shell). + This means that any side effects produced by the called macro, such as + variable assignments, will +\emph on +not +\emph default + be reverted after the call. + Even the +\family typewriter +%{0} +\family default + until +\family typewriter +%{ +\emph on +n +\emph default +} +\family default + variables will continue to exist (and may lead to confusion if you aren't + aware of that). +\end_layout + +\begin_layout Itemize + +\family typewriter +%callstack{} +\family default + Useful for debugging: show the current chain of macro invocations.
+\end_layout + +\begin_layout Paragraph +Time Handling Macros +\end_layout + +\begin_layout Itemize + +\family typewriter +%time{} +\family default + Return the current Lamport timestamp (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Lamport-Clock" + +\end_inset + +), in units of seconds since the Unix epoch. +\end_layout + +\begin_layout Itemize + +\family typewriter +%real-time{} +\family default + Return the current system clock timestamp, in units of seconds since the + Unix epoch. +\end_layout + +\begin_layout Itemize + +\family typewriter +%sleep{ +\emph on +seconds +\emph default +} +\family default + Pause the given number of seconds. +\end_layout + +\begin_layout Itemize + +\family typewriter +%timeout{ +\emph on +seconds +\emph default +} +\family default + Like +\family typewriter +%sleep{ +\emph on +seconds +\emph default +} +\family default +, but abort the +\family typewriter +marsadm +\family default + command after the total waiting time has exceeded the timeout given by + the +\family typewriter +--timeout= +\family default + parameter. +\end_layout + +\begin_layout Paragraph +Misc Macros +\end_layout + +\begin_layout Itemize + +\family typewriter +%warn{ +\emph on +text +\emph default +} +\family default + Show a WARNING: +\end_layout + +\begin_layout Itemize + +\family typewriter +%die{ +\emph on +text +\emph default +} +\family default + Abort execution with an error message. +\end_layout + +\begin_layout Paragraph +Experts Only - Risky +\end_layout + +\begin_layout Standard +The following macros are unstable and may change at any time without notice. +\end_layout + +\begin_layout Itemize + +\family typewriter +%get-msg{ +\emph on +name +\emph default +} +\family default + Low-level access to system messages. + You should not use this, since this is not extensible (you must know the + name in advance). 
+\end_layout + +\begin_layout Itemize + +\family typewriter +%readlink{ +\emph on +path +\emph default +} +\family default + Low-level access to symlinks. + Don't misuse this for circumvention of the abstraction macros from the + symlink tree! +\end_layout + +\begin_layout Itemize + +\family typewriter +%setlink{ +\emph on +value +\emph default +}{ +\emph on +path +\emph default +} +\family default + Low-level creation of symlinks. + Don't misuse this for circumvention of the abstraction macros for the symlink + tree! +\end_layout + +\begin_layout Itemize + +\family typewriter +%fetch-info{} +\family default +etc. + Low-level access to internal symlink formats. + Don't use this in scripts! Only for curious humans. +\end_layout + +\begin_layout Itemize + +\family typewriter +%is-almost-consistent{} +\family default + Whatever you guess what this could mean, don't use it, at least never in + place of +\family typewriter +%is-consistent{} +\family default + - it is risky to base decisions on this. + Mostly for historical reasons. +\end_layout + +\begin_layout Itemize + +\family typewriter +%does{ +\emph on +name +\emph default +} +\family default +Equivalent to +\family typewriter +%is- +\emph on +name +\emph default +{} +\family default + (just more handy for computing the macro name). + Use with care! +\end_layout + +\begin_layout Subsection +Predefined Variables +\begin_inset CommandInset label +LatexCommand label +name "par:Predefined-Variables" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\family typewriter +%{cmd} +\family default +The command argument of the invoked +\family typewriter +marsadm +\family default + command. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{res} +\family default +The resource name given to the +\family typewriter +marsadm +\family default + command as a command line parameter (or, possibly expanded from +\family typewriter +all +\family default +). 
+\end_layout + +\begin_layout Itemize + +\family typewriter +%{resdir} +\family default +The corresponding resource directory. + The current version of MARS uses +\family typewriter +/mars/resource-%{res}/ +\family default +, but this may change in future. + Normally, you should not need this, since anything should be already abstracted + for you. + In case you +\emph on +really +\emph default + need low-level access to something, please prefer this variable over +\family typewriter +%{mars}/resource-%{res} +\family default + because it is a bit more abstracted. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{mars} +\family default +Currently the fixed string +\family typewriter +/mars +\family default +. + This may change in future, probably with the advent of MARS Full. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{host} +\family default +The hostname of the local node. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{ip} +\family default +The IP address of the local node. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{timeout} +\family default +The value given by the +\family typewriter +--timeout= +\family default + option, or the corresponding default value. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{threshold} +\family default +The value given by the +\family typewriter +--threshold= +\family default + option, or the corresponding default value. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{window} +\family default +The value given by the +\family typewriter +--window= +\family default + option, or the corresponding default value (60s). +\end_layout + +\begin_layout Itemize + +\family typewriter +%{force} +\family default +The number of times the +\family typewriter +--force +\family default + option has been given.
+\end_layout + +\begin_layout Itemize + +\family typewriter +%{dry-run} +\family default +The number of times the +\family typewriter +--dry-run +\family default + option has been given. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{verbose} +\family default +The number of times the +\family typewriter +--verbose +\family default + option has been given. +\end_layout + +\begin_layout Itemize + +\family typewriter +%{callstack} +\family default +Same as the +\family typewriter +%callstack{} +\family default + macro. + The latter gives you an opportunity for overriding, while the former is + firmly built in. +\end_layout + +\begin_layout Section +Scripting HOWTO +\begin_inset CommandInset label +LatexCommand label +name "sec:Scripting-HOWTO" + +\end_inset + + +\end_layout + +\begin_layout Standard +Both the +\series bold +asynchronous communication model +\series default + of MARS (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Lamport-Clock" + +\end_inset + +) including the Lamport clock, and the +\series bold +state model +\series default + (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-State-of" + +\end_inset + +) is something you +\emph on +definitely +\emph default + should have in mind when you want to do some scripting. + Here is some further concrete advice: +\end_layout + +\begin_layout Itemize +Don't access anything on +\family typewriter +/mars/ +\family default + directly, except for debugging purposes. + Use +\family typewriter +marsadm +\family default +. +\end_layout + +\begin_layout Itemize +Avoid running scripts in parallel, other than for inspection / monitoring + purposes. + When you give two +\family typewriter +marsadm +\family default + commands in parallel (whether on the same host, or on different hosts belonging + to the same cluster), it is very likely to produce a mess. + +\family typewriter +marsadm +\family default + has no internal locking. 
+ There is no cluster-wide locking at all. + Unfortunately, some systems like Pacemaker are violating this in many cases + (depending on their configuration). + Best is if you have a dedicated / more or less centralized +\series bold +control machine +\series default + which controls masses of your georedundant working servers. + This reduces the risk of running interfering actions in parallel. + Of course, you need backup machines for your control machines, and in different + locations. + Not obeying this advice can easily lead to problems such as complex races + which are very difficult to solve in long-distance distributed systems, + even in general (not limited to MARS). +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm wait-cluster +\family default + is your friend. + Whenever your (near-)central script has to switch between different hosts + +\family typewriter +A +\family default + and +\family typewriter +B +\family default + (of the same cluster), use it in the following way: +\begin_inset Newline newline +\end_inset + + +\family typewriter +ssh A +\begin_inset Quotes eld +\end_inset + +marsadm action1 +\begin_inset Quotes erd +\end_inset + +; ssh B +\begin_inset Quotes eld +\end_inset + +marsadm wait-cluster; marsadm action2 +\begin_inset Quotes erd +\end_inset + + +\begin_inset Newline newline +\end_inset + + +\family default + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Don't ignore this advice! Interference is almost +\emph on +sure +\emph default +! As a rule of thumb, precede almost any action command with some appropriate + waiting command! +\end_layout + +\begin_layout Itemize +Further friends are any +\family typewriter +marsadm wait-* +\family default + commands, such as +\family typewriter +wait-umount +\family default +. +\end_layout + +\begin_layout Itemize +In some places, busy-wait loops might be needed, e.g. 
+ for waiting until a specific resource is +\family typewriter +UpToDate +\family default + or matches some other condition. + Examples of waiting conditions can be found under +\family typewriter +\family default + in subdirectory +\family typewriter +mars/modules/ +\family default +, specifically +\family typewriter +\family default + or similar. +\end_layout + +\begin_layout Itemize +In case of network problems, some command may hang (forever), if you don't + set the +\family typewriter +--timeout= +\family default + option. + Don't forget to check the return state of any failed / timed-out commands, + and to take appropriate measures! +\end_layout + +\begin_layout Itemize +Test your scripts in failure scenarios! +\end_layout + +\begin_layout Chapter +The Sysadmin Interface ( +\family typewriter +marsadm +\family default + and +\family typewriter +/proc/sys/mars/ +\family default +) +\family typewriter + +\begin_inset CommandInset label +LatexCommand label +name "chap:The-Sysadmin-Interface" + +\end_inset + + +\end_layout + +\begin_layout Standard +In general, the term +\begin_inset Quotes eld +\end_inset + +after a while +\begin_inset Quotes erd +\end_inset + + means that other cluster nodes will take notice of your actions according + to the +\begin_inset Quotes eld +\end_inset + +eventually consistent +\begin_inset Quotes erd +\end_inset + + propagation protocol described in sections +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Lamport-Clock" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Symlink-Tree" + +\end_inset + +. + Please be aware that this +\begin_inset Quotes eld +\end_inset + +while +\begin_inset Quotes erd +\end_inset + + may last very long in case of network outages or bad firewall rules. +\end_layout + +\begin_layout Standard +In the following tables, column +\begin_inset Quotes eld +\end_inset + +Cmp +\begin_inset Quotes erd +\end_inset + + means compatibility with DRBD.
+ Please note that 100% exact compatibility is not possible, because of the + asynchronous communication paradigm. +\end_layout + +\begin_layout Standard +The following table documents common options which work with (almost) any + command: +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Option +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--dry-run +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Run the command without actually creating symlinks or touching files or + executing rsync. + This option +\emph on +should +\emph default + be used first at any dangerous command, in order to check what would happen. 
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Don't use in scripts! Only use by hand! +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +This option does not change the waiting logic. + Many commands are waiting until the desired effect has taken place. + However, with +\family typewriter +--dry-run +\family default + the desired effect will never happen, so the command may wait forever (or + abort with a timeout). +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +In addition, this option can lead to additional aborts of the commands due + to unmet conditions, which cannot be met because the symlinks are not actually + created / altered. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Thus this option can give only a +\series bold +rough estimate +\series default + of what would happen later! +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--force +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +almost +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" 
+backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Some preconditions are skipped, i.e. + the command will / should work although some (more or less) vital preconditions + are violated. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Instead of giving +\family typewriter +--force +\family default +, you may alternatively prefix your command with +\family typewriter +force- +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + THIS OPTION IS DANGEROUS! +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Use it only when you are absolutely sure that you know what you are doing! +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Use it only as a last resort if the same command without +\family typewriter +--force +\family default + has failed +\emph on +for no good reason +\emph default +! +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--ignore-sync +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +almost +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" 
+framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Use this for a +\emph on +planned +\emph default + handover instead of +\family typewriter +--force +\family default +. + Only one precondition is relaxed: some sync may be running somewhere. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Careful when using this on extremely huge LVs where the sync may take several + days, or weeks. + It is your sysadmin decision what you want to prefer: restarting the sync, + or planned handover. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--verbose +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Some (few) commands will become more verbose.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--timeout=$seconds +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Some commands require response from either the local kernel module, or from + other cluster nodes. + In order to prevent infinite waiting in case of network outages or other + problems, the command will fail after the given timeout has been reached. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +When $seconds is -1, the command will wait forever. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +When $seconds is 0, the command will not wait in case any precondition is + not met, and abort without performing an action. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +The default timeout is 5s.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--window=$seconds +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +The time window for checking the aliveness of other nodes in the network. + When no symlink updates have occurred during the last window, the node + is considered dead. + Default is 60s. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--threshold=$size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +The macros containing the substring +\family typewriter +-threshold- +\family default + or +\family typewriter +-almost- +\family default + are using this as a default value for approximation whether something has + been approximately reached. + Default is 10MiB. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +The $size argument may be a number optionally followed by one of the lowercase + characters k m g t p for indicating kilo mega giga tera or peta bytes as + multiples of 1000. + When using the corresponding uppercase character, multiples of 1024 are + formed instead.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--host=$host +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +The command acts as if the command were executed on another host $host. + This option should not be used regularly, because the local information + in the symlink tree may be outdated or even wrong. + Additionally, some local information like remote sizes of physical devices + (e.g. + remote disks) is not present in the symlink tree at all, or is wrong (reflectin +g only the +\emph on +local +\emph default + state). +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + THIS OPTION IS DANGEROUS! +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Use it only for final destruction of dead cluster nodes, see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Final-Destroy-of" + +\end_inset + +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--ip=$ip +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +By default, +\family typewriter +marsadm +\family default + always uses the IP for +\family typewriter +$host +\family default + as stored in the symlink tree (directory +\family typewriter +/mars/ips/ +\family default +). + When such an IP entry does not (yet) exist (e.g. + +\family typewriter +create-cluster +\family default + or +\family typewriter +join-cluster +\family default +), all local network interfaces are automatically scanned for IPv4 addresses, + and the first one is taken. + This may lead to wrong decisions if you have multiple network interfaces. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +In order to override the automatic IP detection and explicitly tell the + IP address of your storage network, use this option.
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +Usually you will need this only at +\family typewriter +{create,join}-cluster +\family default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +--verbose +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Some (few) commands will become more verbose.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +Cluster Operations +\begin_inset CommandInset label +LatexCommand label +name "sec:Cluster-Operations" + +\end_inset + + +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +create-cluster +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the +\family typewriter +/mars/ +\family default + filesystem must be mounted and it must be empty ( +\family typewriter +mkfs.ext4 +\family default +, see instructions in section +\begin_inset CommandInset ref +LatexCommand ref +reference 
"subsec:Setup-your-Cluster" + +\end_inset + +). + The kernel module must +\emph on +not +\emph default + be loaded. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the initial symlink tree is created in +\family typewriter +/mars/ +\family default +. + Additionally, the +\family typewriter +/mars/uuid +\family default + symlink is created for later distribution in the cluster. + It uniquely indentifies the cluster in the world. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +This must be called exactly once at the initial primary. + +\end_layout + +\begin_layout Plain Layout +Hint: use the +\family typewriter +--ip= +\family default + option if you have multiple interfaces. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +join-cluster +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$host +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + 
+\begin_layout Plain Layout + +\size scriptsize +Precondition: the +\family typewriter +/mars/ +\family default + filesystem must be mounted and it must be empty ( +\family typewriter +mkfs.ext4 +\family default +, see instructions in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Setup-your-Cluster" + +\end_inset + +). + The kernel module must +\emph on +not +\emph default + be loaded. + The cluster must have been already created at another node +\family typewriter +$host +\family default +. + A working ssh connection to $host as root must exist (without password). + +\family typewriter +rsync +\family default + must be installed at all cluster nodes. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the initial symlink tree +\family typewriter +/mars/ +\family default + is replicated from the remote host +\family typewriter +$host +\family default +, and the local host has been added as another cluster member. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +This must be called exactly once at every initial secondary node. +\end_layout + +\begin_layout Plain Layout +Hint: use the +\family typewriter +--ip= +\family default + option if you have multiple interfaces.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +leave-cluster +\begin_inset CommandInset label +LatexCommand label +name "leave-cluster" + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the +\family typewriter +/mars/ +\family default + filesystem must be mounted and it must contain a valid MARS symlink tree + produced by the other +\family typewriter +marsadm +\family default + commands. + The local node must no longer be member of any resource (see +\family typewriter +marsadm leave-resource +\family default +). + The kernel module should be loaded and the network should be operating + in order to also propagate the effect to the other nodes. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the local node is removed from the replicated symlink tree + +\family typewriter +/mars/ +\family default + such that other nodes will cease to communicate with it after a while.
+ The converse is not true: the local node may continue +\begin_inset Foot +status open + +\begin_layout Plain Layout + +\size scriptsize +Reason: +\family typewriter +leave-cluster +\family default + removes only its +\emph on +own +\emph default + IP address from +\family typewriter +/mars/ips/ +\family default +, but does not destroy the usual symmetry of the symlink tree by leaving + the other IPs intact. + Therefore, the local node will continue fetching updates from all nodes + present in +\family typewriter +/mars/ips/ +\family default +. + As an effect, the local node will +\emph on +passively +\emph default + mirror the symlinks of other cluster members, but not vice versa. + There is no communication from the local node to the other ones, turning + the local node into a +\series bold +witness +\series default + according to some terminology from Distributed Systems. + This is a feature, not a bug. + It could be used for post-mortem analysis, or for monitoring purposes. + However, +\emph on +deletions +\emph default + of symlinks are not guaranteed to take place, so your witness may +\emph on +accumulate +\emph default + thousands of old symlinks over a long time. + If you want to eventually stop all communication to the local node, just + run +\family typewriter +rmmod +\family default +. +\end_layout + +\end_inset + + passively fetching the symlink tree. + In order to really stop all communication, the kernel module should be + unloaded afterwards. + The local +\family typewriter +/mars/ +\family default + filesystem may be manually destroyed after that (at least if you need to + reuse it). +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +In case of an eventual node loss (e.g. + fire, water, ...) this command should be used on another node $helper in order + to finally remove $damaged from the cluster via the command +\family typewriter +marsadm leave-cluster --host=$damaged --force +\family default +.
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +In case you cannot use +\family typewriter +leave-resource +\family default + for any reason, you may do the following: just destroy the +\family typewriter +/mars/ +\family default + filesystem on the host +\family typewriter +$deadhost +\family default + you want to remove (e.g. + by +\family typewriter +mkfs +\family default +), or take other measures to +\emph on +ensure +\emph default + that it cannot be accidentally re-used in any way (e.g. + physical destruction of the underlying RAID, +\family typewriter +lvremove +\family default +, etc). + On all other hosts, do +\family typewriter +rmmod mars +\family default +, then delete the symlink +\family typewriter +/mars/ips/ip-$deadhost +\family default + everywhere by hand, and finally +\family typewriter +modprobe mars +\family default + again. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +Notice that the last +\family typewriter +leave-resource +\family default + operation does not delete the cluster as such. + It just creates an +\emph on +empty +\emph default + cluster which has no longer any members. + In particular, the cluster ID +\family typewriter +/mars/uuid +\family default + is +\emph on +not +\emph default + removed, deliberately +\begin_inset Foot +status open + +\begin_layout Plain Layout + +\size scriptsize +This is a feature, not a bug. + The +\family typewriter +uuid +\family default + is created once, but never altered anywhere. + The only way to get rid of it is +\emph on +external +\emph default + deletion (not by +\family typewriter +marsadm +\family default +) +\emph on +together(!) +\emph default + with all other contents of +\family typewriter +/mars/ +\family default +.
+ This prevents you from accidentally merging half-dead remains which could + have survived a disaster for any reason, such as snapshotting filesystems + / VMs or whatever. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +Before you can re-use +\emph on +any +\emph default + left-over +\family typewriter +/mars/ +\family default + filesystem for creating / joining a new / different cluster, you +\emph on +must +\emph default + obey the instructions in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Setup-your-Cluster" + +\end_inset + + and use +\family typewriter +mkfs.ext4 +\family default + accordingly. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +merge-cluster +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$host +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" 
+shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the set of resources at the local cluster (transitively) and + at the cluster of +\family typewriter +$host +\family default + (transitively) must be disjoint. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Create the union of both clusters, consisting of the union of all participating + machines (transitively). + Resource memberships are unaffected. + This is useful for creating a +\begin_inset Quotes eld +\end_inset + +virtual LVM cluster +\begin_inset Quotes erd +\end_inset + + where resources can be migrated later via +\family typewriter +join-resource +\family default + / +\family typewriter +leave-resource +\family default + operations. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +Attention! The mars branch +\family typewriter +0.1.y +\family default + does not scale well in number of cluster members, because it evolved from + a lab prototype with +\begin_inset Formula $O(n^{2})$ +\end_inset + + behaviour at metadata exchange. + Never exceed the maximum cluster members as described in appendix +\begin_inset CommandInset ref +LatexCommand vref +reference "chap:Technical-Data-MARS" + +\end_inset + +. + For safety, you should better stay at 1/2 of the numbers mentioned there. + Use +\family typewriter +split-cluster +\family default + for going back to smaller clusters again after your background data migration + has completed. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +Future versions of MARS, starting with branch +\family typewriter +0.1b.y +\family default + will be constructed for very big clusters in the range of thousands of + nodes.
+ Development has not yet stabilized there, and operational experiences are + missing at the moment. + Be careful until official announcements are appearing in the ChangeLog, + reporting of operational experiences from the 1&1 big cluster at metadata + level. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +merge-cluster-check +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$host +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Check in advance whether the set of resources at the local cluster and at + the other cluster +\family typewriter +$host +\family default + are disjoint. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +split-cluster +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +This is almost the inverse operation of +\family typewriter +merge-cluster +\family default +: it determines the minimum sub-cluster groups participating in some common + resources. + Then it splits the cluster memberships such that unnecessary connections + between non-related nodes are interrupted. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Use this for avoidance of too big clusters. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +wait-cluster +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +See section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Waiting" + +\end_inset + +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +create-uuid +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +Deprecated. + Only for compatibility with old version light0.1beta05 or earlier. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the +\family typewriter +/mars/ +\family default + filesystem must be mounted. + A +\family typewriter +uuid +\family default + (such as automatically created by recent versions of +\family typewriter +marsadm create-cluster +\family default +) must not already exist; i.e. + you have a very old and outdated symlink tree. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the +\family typewriter +/mars/uuid +\family default + symlink is created for later distribution in the cluster. + It uniquely identifies the cluster in the world. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +This must be called at most once at the current primary.
+ +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +Resource Operations +\begin_inset CommandInset label +LatexCommand label +name "sec:Resource-Operations" + +\end_inset + + +\end_layout + +\begin_layout Standard +Common precondition for all resource operations is that the +\family typewriter +/mars/ +\family default + filesystem is mounted, that it contains a valid MARS symlink tree produced + by other +\family typewriter +marsadm +\family default + commands (including a unique +\family typewriter +uuid +\family default +), that your current node is a valid member of the cluster, and that the + kernel module is loaded. + When communication is impossible due to network outages or bad firewall + rules, most commands will succeed, but other cluster nodes may take a long + time to notice your changes. +\end_layout + +\begin_layout Standard +Instead of executing +\family typewriter +marsadm +\family default + commands several times for each resource argument, you may give the special + resource argument +\family typewriter +all +\family default +. + This works even when combined with +\family typewriter +--force +\family default +, but be cautious when giving dangerous command combinations like +\family typewriter +marsadm delete-resource --force all +\family default +. +\end_layout + +\begin_layout Standard +In newer versions of +\family typewriter +marsadm +\family default +, you may give a comma-separated list of resource names in place of +\family typewriter +all +\family default +. + This way, you have more fine-grained control over the set of resource names + you want to use. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Beware when combining this with +\family typewriter +--host=somebody +\family default +.
+ In some very rare cases, like final destruction of a whole datacenter after + an earthquake, you might need a combination like +\family typewriter +marsadm --host=defective delete-resource --force all +\family default +. + Don't use such combinations if you don't need them +\emph on +really +\emph default +! You can easily shoot yourself in your head if you are not carefully operating + such commands! +\end_layout + +\begin_layout Subsection +Resource Creation / Deletion / Modification +\begin_inset CommandInset label +LatexCommand label +name "subsec:Resource-Creation" + +\end_inset + + +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +create-resource +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$disk_dev +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + 
+\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +[$mars_name] +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +[$size] +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the resource argument +\family typewriter +$res +\family default + must not denote an already existing resource name in the cluster. + The argument +\family typewriter +$disk_dev +\family default + must denote an absolute path to a usable local block device, its size must + be greater than zero. + When the optional +\family typewriter +$mars_name +\family default + is given, that name must not already exist on the local node; when not + given, +\family typewriter +$mars_name +\family default + defaults to +\family typewriter +$res +\family default +.
+ When the optional +\family typewriter +$size +\family default + argument is given, it must be a number, optionally followed by a lowercase + suffix +\family typewriter +k +\family default +, +\family typewriter +m +\family default +, +\family typewriter +g +\family default +, +\family typewriter +t +\family default +, or +\family typewriter +p +\family default + (denoting size factors as multiples of 1000), or an uppercase suffix +\family typewriter +K +\family default +, +\family typewriter +M +\family default +, +\family typewriter +G +\family default +, +\family typewriter +T +\family default + or +\family typewriter +P +\family default + (denoting size factors as multiples of 1024). + The given size must not exceed the actual size of +\family typewriter +$disk_dev +\family default +. + It will specify the future resource size as shown by +\family typewriter +marsadm view-resource-size $res +\family default +. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the resource +\family typewriter +$res +\family default + is created, the initial role of the current node is primary. + The corresponding symlink tree information is asynchronously distributed + in the cluster (in the background). + The device +\family typewriter +/dev/mars/$mars_name +\family default + should appear after a while. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Notice: when +\family typewriter +$size +\family default + is strictly smaller than the size of +\family typewriter +$disk_dev +\family default +, you will unnecessarily waste some space. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +This must be called exactly once for any new resource.
+ +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +join-resource +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$disk_dev +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +[$mars_name] +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the resource argument +\family typewriter +$res +\family default + must denote an already existing resource in the cluster (i.e. + its symlink tree information must have been received). + The resource must have a designated primary, and it must not be in emergency + mode.
+ There must not exist a split brain in the cluster. + The local node must not be already member of that resource. + The argument +\family typewriter +$disk_dev +\family default + must denote an absolute path to a usable (but currently unused) local block + device, its size must be greater or equal to the logical size of the resource. + When the optional +\family typewriter +$mars_name +\family default + is given, that name must not already exist on the local node; when not + given, +\family typewriter +$mars_name +\family default + defaults to +\family typewriter +$res +\family default +. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the current node becomes a member of resource +\family typewriter +$res +\family default +, the initial role is secondary. + The initial full sync should start after a while. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Notice: when the size of $disk_dev is strictly greater than the size of + the resource, you will unnecessarily waste some space. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +After a while, state +\family typewriter +Orphan +\family default + should be left. + Don't forget to regularly monitor for longer occurrences of +\family typewriter +Orphan +\family default +!
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +leave-resource +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local node must be a member of the resource +\family typewriter +$res +\family default +; its current role must be secondary. + Sync, fetch and replay must be paused (see commands +\family typewriter +pause-{sync,fetch,replay} +\family default + or their abbreviation +\family typewriter +down +\family default +). + The disk must be detached (see commands +\family typewriter +detach +\family default + or +\family typewriter +down +\family default +). + The kernel module should be loaded and the network should be operating + in order to also propagate the effect to the other nodes.
+\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the local node is no longer a member of +\family typewriter +$res +\family default +. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Notice: as a side effect for other nodes, their +\family typewriter +log-delete +\family default + may now become possible, since the current node does no longer count as + a candidate for logfile application. + In addition, a split brain situation may be (partly) resolved by this. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + Please notice that this command +\emph on +may +\emph default + lead to (but does not guarantee) split-brain resolution. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +The contents of the disk is not changed by this command. + Before issuing this command, check whether the disk appears to be locally + consistent (see +\family typewriter +view-is-consistent +\family default +)! After giving this command, any internal information indicating the consistenc +y state will be gone, and you will no longer be able to guess consistency + properties. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + When you are +\emph on +sure +\emph default + that the disk was consistent before (or is now by manually checking it), + you may re-create a new resource out of it via +\family typewriter +create-resource +\family default +. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +In case of an eventual node loss (e.g. + fire, water, ...)
this command may be used on another node $helper in order + to finally remove all the resources $damaged from the cluster via the command + +\family typewriter +marsadm leave-resource $res --host=$damaged --force +\family default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +delete-resource +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the resource must be empty (i.e. + all members must have left via +\family typewriter +leave-resource +\family default +). + This precondition is overridable by +\family typewriter +--force +\family default +, increasing the danger to maximum! 
It is even possible to combine +\family typewriter +--force +\family default + with an invalid resource argument and an invalid +\family typewriter +--host=somebodyelse +\family default + argument in order to desperately try to destroy remains of incomplete or + physically damaged hardware. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: all cluster members will somewhen be forcefully removed from + +\family typewriter +$res +\family default +. + In case of network interruptions, the forced removal may take place far + in the future. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + THIS COMMAND IS +\emph on +VERY +\emph default + DANGEROUS! +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Use this only in desperate situations, and only manually. + Don't call this from scripts. + You are forcefully using a sledgehammer, even without +\family typewriter +--force +\family default +! The danger is that the +\emph on +true +\emph default + state of other cluster nodes need not be known in case of network problems. + Even if it were known, it could be compromised by +\series bold +byzantine failures +\series default +. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +It is strongly advised to try this command with +\family typewriter +--dry-run +\family default + first. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +When combined with +\family typewriter +--force +\family default +, this command will definitely +\series bold +murder +\series default + other cluster nodes, possibly after a long while, and even when they are + operating in primary mode / having split brains / etc.
+ However, there is no guarantee that other cluster nodes will be +\emph on +really +\emph default + dead – it is (theoretically) possible that they remain only +\emph on +half +\emph default + +\emph on +dead +\emph default +. + For example, a half dead node may continue to write data to +\family typewriter +/mars/ +\family default + and thus lead to overflow somewhen. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + +This command implies a forceful detach, possibly destroying consistency. + +\size scriptsize +It is similar in spirit to a +\series bold +STONITH +\series default +. + In particular, when a cluster node was operating in primary mode ( +\family typewriter +/dev/mars/mydata +\family default + being continuously in use), the forceful detach cannot be carried out until + the device is completely unused. + In the meantime, the current transaction logfile will be appended to, but + the file +\emph on +might +\emph default + be already unlinked (orphan file filling up the disk). + After the forceful detach, the underlying disk need not be consistent (although + MARS does its best). + Since this command deletes any symlinks which normally would indicate the + consistency state, no guarantees about consistency can be given after this + +\emph on +in general +\emph default +! Always check consistency by hand! +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +When possible / as soon as possible, check the local state on the other + nodes in order to +\emph on +really +\emph default + shutdown the resource everywhere (e.g. + to +\emph on +really +\emph default + unuse the +\family typewriter +/dev/mars/mydata +\family default + device, etc). 
+\end_layout + +\begin_layout Plain Layout + +\size scriptsize +After this command, you +\emph on +should +\emph default + rebuild the resource under a different name, in order to avoid any clashes + caused by unexpected resurrection of +\begin_inset Quotes eld +\end_inset + +dead +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +half-dead +\begin_inset Quotes erd +\end_inset + + nodes (beware of snapshot / restores on virtual machines!!). + MARS does its best to avoid problems even in case the new resource name + should equal the old one, but there can be +\emph on +no guarantee +\emph default + in all possible failure scenarios / usage scenarios. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +When possible, prefer +\family typewriter +leave-resource +\family default + over this! +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +wait-resource +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +{is-,}{attach, +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash
+strut +\backslash +hfill +\end_layout + +\end_inset + + primary, +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + + device}{-off,} +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +See section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Waiting" + +\end_inset + +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsection +Operation of the Resource +\begin_inset CommandInset label +LatexCommand label +name "subsec:Operation-of-the" + +\end_inset + + +\end_layout + +\begin_layout Standard +Common preconditions are the preconditions from section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Resource-Operations" + +\end_inset + +, plus the respective resource +\family typewriter +$res +\family default + must exist, and the local node must be a member of it. + With the single exception of +\family typewriter +attach +\family default + itself, all other operations must be started in +\family typewriter +attached +\family default + state. 
+\end_layout + +\begin_layout Standard +When +\family typewriter +$res +\family default + has the special reserved value +\family typewriter +all +\family default +, the following operations will work on all resources where the current + node is a member (analogously to DRBD). +\end_layout + +\begin_layout Standard +With newer versions of +\family typewriter +marsadm +\family default +, you can also give a list of comma-separated resource names in place of + +\family typewriter +all +\family default +. +\end_layout + +\begin_layout Standard +\noindent + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +attach +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" 
+height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local disk belonging to $res is not in use by anyone else. + Its contents have not been altered in the meantime since the last +\family typewriter +detach +\family default +. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +Mounting +\emph on +read-only +\emph default + is allowed during the detached phase. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +However, be careful! If you +\emph on +accidentally +\emph default + forget to give the right readonly-mount flags, if you use +\family typewriter +fsck +\family default + in repair mode in between, or alter the disk content in any other way (beware + of LVM snapshots / restores etc), you will almost certainly produce an + +\series bold +unnoticed inconsistency +\series default + (not reported by +\family typewriter +view-is-consistent +\family default +)! MARS has +\emph on +no chance +\emph default + to notice suchlike! +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: MARS uses the local disk and is able to work with it (e.g. + replay logfiles on it). +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Note: the local disk is opened in exclusive read-write mode. + This should protect against most common misuse, such as opening the disk + in parallel to MARS. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +However, this does not necessarily protect against non-exclusive openers.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +detach +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local +\family typewriter +/dev/mars/mydata +\family default + device (when present) is no longer opened by anybody. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the local disk belonging to $res is no longer in use. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +In contrast to DRBD, you need not explicitly pause syncing, fetching, or + replaying +\emph on +to +\emph default + (as opposed to +\emph on +from +\emph default +) the local disk. + These processes are automatically paused.
+ As another contrast to DRBD, the respective processes will usually +\emph on +automatically +\emph default + resume after re-attach, as far as possible in the respective new situation. + This will usually work even over +\family typewriter +rmmod +\family default + or reboot cycles, since the internal symlink tree will automatically persist + all todo switches for you (c.f. + section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-State-of" + +\end_inset + +). +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +Notice: only +\emph on +local +\emph default + transfer operations +\emph on +to +\emph default + the local disk are paused by a detach. + When another node is remotely running a sync +\emph on +from +\emph default + your local disk, it will likely remain in use for remote reading. + The reason is that the server part of MARS is operating purely passively, + in order to serve all remote requests as best as possible (similar to the + original Unix philosophy). + In order to really stop all accesses, do a +\family typewriter +pause-sync +\family default + on all other resource members where a sync is currently running. + You may also try +\family typewriter +pause-sync-global +\family default +. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +WARNING! After this, and after having paused any remote data access, you + might use the underlying disk for your own purposes, such as test-mounting + it in +\emph on +readonly +\emph default + mode. + +\series bold +Don't modify +\series default + its contents in any way!
Not even by an +\family typewriter +fsck +\family default + +\begin_inset Foot +status open + +\begin_layout Plain Layout + +\size scriptsize +Some (but not all) +\family typewriter +fsck +\family default + tools for some filesystems have options to start only a test repair / verify + mode / dry run, without doing actual modifications to the data. + Of course, these modes +\emph on +can +\emph default + be used. + But be really sure! Double-check for the right options! +\end_layout + +\end_inset + +! Otherwise, you will have inconsistencies +\emph on +guaranteed +\emph default +. + MARS has no way of knowing about any modifications to your disk when bypassing + +\family typewriter +/dev/mars/* +\family default +. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +In case you accidentally modified the underlying disk at the +\emph on +primary +\emph default + side, you may choose to resolve the inconsistencies by +\family typewriter +marsadm invalidate $res +\family default + on +\emph on +each +\emph default + secondary.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-sync +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +pause-sync-local +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-sync-local +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: none additionally. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: any sync operation targeting the local disk (when not yet + completed) is paused after a while (cf section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-State-of" + +\end_inset + +). + When successfully completed, this operation will remember the switch state + forever and automatically become relevant if a sync is needed again (e.g. + +\family typewriter +invalidate +\family default + or +\family typewriter +resize +\family default +). 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-sync-global +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Like +\family typewriter +*-local +\family default +, but operates on all members of the resource. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-sync +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +resume-sync-local +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-sync-local +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: additionally, a primary must be designated, and it must not + be in emergency mode. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: any sync operation targeting the local disk (when not yet + completed) is resumed after a while. + When completed, this operation will remember the switch state forever and + become relevant if a sync is needed again (e.g. + +\family typewriter +invalidate +\family default + or +\family typewriter +resize +\family default +). 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-sync-global +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Like +\family typewriter +*-local +\family default +, but operates on all members of the resource. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-fetch +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +pause-fetch-local +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-fetch-local +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: none additionally. + The resource +\emph on +should +\emph default + be in secondary role. + Otherwise the switch has +\emph on +no +\emph default + +\emph on +immediate +\emph default + effect, but will come (possibly unexpectedly) into effect whenever secondary + role is entered later for whatever reason. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: any transfer of (parts of) transaction logfiles which are + present at another primary host to the local +\family typewriter +/mars/ +\family default + storage are paused at their current stage. 
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +This switch works independently from +\family typewriter +{pause,resume}-replay +\family default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-fetch-global +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Like +\family typewriter +*-local +\family default +, but operates on all members of the resource. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-fetch +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +resume-fetch-local +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-fetch-local +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: none additionally. + The resource +\emph on +should +\emph default + be in secondary role. + Otherwise the switch has +\emph on +no +\emph default + +\emph on +immediate +\emph default + effect, but will come (possibly unexpectedly) into effect whenever secondary + role is entered later for whatever reason. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: any (parts of) transaction logfiles which are present at + another primary host should be transferred to the local +\family typewriter +/mars/ +\family default + storage as far as not yet locally present.
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +This works independently from +\family typewriter +{pause,resume}-replay +\family default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-fetch-global +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Like +\family typewriter +*-local +\family default +, but operates on all members of the resource. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-replay +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +pause-replay-local +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-replay-local +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: none additionally. + The resource +\emph on +should +\emph default + be in secondary role. + Otherwise the switch has +\emph on +no +\emph default + +\emph on +immediate +\emph default + effect, but will come (possibly unexpectedly) into effect whenever secondary + role is entered later for whatever reason. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: any local replay operations of transaction logfiles to the + local disk are paused at their current stage. 
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +This works independently from +\family typewriter +{pause,resume}-fetch +\family default + resp. + +\family typewriter +{dis,}connect +\family default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +pause-replay-global +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Like +\family typewriter +*-local +\family default +, but operates on all members of the resource. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-replay +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +resume-replay-local +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-replay-local +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status collapsed + +\begin_layout Plain Layout + +\size scriptsize +Precondition: must be in secondary role. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: any (parts of) locally existing transaction logfiles (whether + replicated from other hosts or produced locally) are started for replay + to the local disk, as far as they have not yet been applied. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resume-replay-global +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Like +\family typewriter +*-local +\family default +, but operates on all members of the resource. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +connect +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +connect-local +\family default + and to +\family typewriter +resume-fetch-local +\family default +. 
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +Note: although this sounds similar to DRBD's +\family typewriter +drbdadm connect +\family default +, there are subtle differences. + DRBD has exactly one connection per resource, which is associated with + +\emph on +pairs +\emph default + of nodes. + In contrast, MARS may create multiple connections per resource at runtime, + and these are associated with the +\emph on +target +\emph default + host (not with +\emph on +pairs +\emph default + of hosts). + As a consequence, the fetch may +\emph on +potentially +\emph default + occur from any other source host which happens to be reachable (although + the current implementation prefers the current designated primary, but + this may change in future). + In addition, +\family typewriter +marsadm disconnect +\family default + does not stop +\emph on +all +\emph default + communication. + It only stops fetching logfiles. + The symlink update running in background is +\emph on +not +\emph default + stopped, in order to always propagate as much metadata as possible in the + cluster. + In case of a later incident, chances are higher for a better knowledge + of the +\emph on +real +\emph default + state of the cluster. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +connect-local +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +resume-fetch-local +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +connect-global +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +resume-fetch-global +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +disconnect +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +disconnect-local +\family default + and to +\family typewriter +pause-fetch-local +\family default +. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +See above note at +\family typewriter +connect +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +disconnect-local +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +pause-fetch-local +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +disconnect-global +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +partly +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +pause-fetch-global +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +up +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +attach +\family default + followed by +\family typewriter +resume-fetch +\family default + followed by +\family typewriter +resume-replay +\family default + followed by +\family typewriter +resume-sync +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +down +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +pause-sync +\family default + followed by +\family typewriter +pause-fetch +\family default + followed by +\family typewriter +pause-replay +\family default + followed by +\family typewriter +detach +\family default +. 
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize +Hint: consider preferring plain +\family typewriter +detach +\family default + over this, because +\family typewriter +detach +\family default + will remember the last state of all switches, while +\family typewriter +down +\family default + will +\emph on +not +\emph default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +primary +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +almost +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout 
+ +\size scriptsize +Precondition: sync must have finished at any resource member. + All relevant transaction logfiles must be either already locally present, + or be fetchable (see +\family typewriter +resume-fetch +\family default + and +\family typewriter +resume-replay +\family default +). + When some logfile data is locally missing, there must be enough space on + +\family typewriter +/mars/ +\family default + to fetch it. + Any replay must not have been interrupted by a replay error (see macro + %replay-code{} or diskstate +\family typewriter +DefectiveLog +\family default +). + The current designated primary must be reachable over network. + When there is no designated primary (i.e. + +\family typewriter +marsadm secondary +\family default + had been executed before, which is explicitly +\emph on +not recommended +\emph default +), +\emph on +all +\emph default + other members of the resource must be reachable (since we have no memory + who was the old primary before), and then they must also match the same + preconditions. + When another host is currently primary (whether designated or not), it + must match the preconditions of +\family typewriter +marsadm secondary +\family default + (that means, its local +\family typewriter +/dev/mars/mydata +\family default + device must not be in use any more). + A split brain must not already exist. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: +\family typewriter +/dev/mars/$dev_name +\family default + appears locally and is usable; the current host is in primary role. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Switches the +\series bold +designated primary +\series default +. 
+ There are three variants: +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +1) +\series bold +Handover +\series default + when +\emph on +not +\emph default + giving +\family typewriter +--force +\family default +: when another host is currently primary, it is first asked to leave its + primary role, and it is waited until it actually has become secondary. + After that, the local host is asked to become primary. + Before actually becoming primary, all relevant logfiles are transferred + over the network and replayed, in order to avoid accidental creation of + split brain as best as possible +\begin_inset Foot +status open + +\begin_layout Plain Layout + +\size scriptsize +Note that split brain avoidance is +\series bold +best effort +\series default + and cannot be guaranteed in general. + For example, it may be impossible to avoid split brain in case of long-lasting + network outages. +\end_layout + +\end_inset + +. + Only after that, +\family typewriter +/dev/mars/$dev_name +\family default + will appear. + When network transfers of the symlink tree are very slow (or currently + impossible), this command may take a very long time. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +In case a split brain is already detected at the initial situation, the + local host will refuse to switch the designated primary without +\family typewriter +--force +\family default +. 
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + In case of +\begin_inset Formula $k>2$ +\end_inset + + replicas: if you want to handover between host +\family typewriter +A +\family default + and +\family typewriter +B +\family default + while a sync is currently running at host +\family typewriter +C +\family default +, you have the following options: +\end_layout + +\begin_layout Enumerate + +\size scriptsize +wait until the sync has finished (see macro +\family typewriter +sync-rest +\family default +, or +\family typewriter +marsadm view +\family default + in general). +\end_layout + +\begin_layout Enumerate + +\size scriptsize +do a +\family typewriter +leave-resource +\family default + on host +\family typewriter +C +\family default +, and later +\family typewriter +join-resource +\family default + after the handover completed successfully. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +2) +\series bold +Handover ignoring running syncs, +\series default + by adding the option +\family typewriter +--ignore-sync +\family default +. + Any running syncs will restart from scratch, in order to ensure consistency. + Use this only when the planned handover is more important than the sync + time. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +3) +\series bold +Forced switching +\series default +: by giving +\family typewriter +--force +\family default + while +\family typewriter +pause-fetch +\family default + is active (but not +\family typewriter +pause-replay +\family default +), most preconditions are ignored, and MARS does its best to actually become + primary even if some logfiles are missing or incomplete or even defective. 
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\family typewriter +\size scriptsize +primary --force +\family default + is a potentially harmful variant, because it will provoke a split brain + in most cases, and therefore in turn will lead to +\series bold +data loss +\series default + because one of your split brain versions must be discarded later in order + to resolve the split brain (see section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + +). +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\series bold +\size scriptsize +Never +\series default + call +\family typewriter +primary --force +\family default + when +\family typewriter +primary +\family default + without +\family typewriter +--force +\family default + is sufficient! If +\family typewriter +primary +\family default + without +\family typewriter +--force +\family default + complains that the device is in use at the former primary side, take it + seriously! Don't override with +\family typewriter +--force +\family default +, but rather umount +\begin_inset Foot +status open + +\begin_layout Plain Layout + +\size scriptsize +A common misconception is when people think that they can keep their filesystem + mounted without provoking a split brain, because they have their application + stopped and thus don't write any data into the filesystem. + This is a wrong idea, because filesystems may write some metadata, like + bookkeeping information, even after hours or days of inactivity. + Therefore MARS insists that the device is no longer in use before any handover + can take place. +\end_layout + +\end_inset + + the device at the other side! 
+\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + Only use +\family typewriter +primary --force +\family default + when something is +\emph on +already broken +\emph default +, such as a network outage, or a node crash, etc. + During ordinary operations (network OK, nodes OK), you should never need + +\family typewriter +primary --force +\family default +! +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + If you umount +\family typewriter +/dev/mars/mydata +\family default + on the old primary +\family typewriter +A +\family default +, and then wait until +\family typewriter +marsadm view +\family default + (or another suitable macro) on the target host +\family typewriter +B +\family default + shows that everything is +\family typewriter +UpToDate +\family default +, you can prevent a split brain by yourself even when giving +\family typewriter +primary --force +\family default + afterwards. + However, checking / assuring this is +\emph on +your +\emph default + responsibility! +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\family typewriter +\size scriptsize + primary --force +\family default + switches the +\emph on +designated +\emph default + primary. + In some extremely rare cases, when +\emph on +multiple +\emph default + faults have accumulated in a +\emph on +weird +\emph default + situation, it +\emph on +might +\emph default + be impossible to become the / an actual primary. + Typically you may be +\emph on +already +\emph default + in a split brain situation.
+ This has not been observed for a long time of operation on recent versions + of MARS, but in general becoming primary via +\family typewriter +--force +\family default + cannot always be guaranteed, although MARS does its best. + In split brain situations, or if you ever encounter such a problem, you + +\emph on +must +\emph default + resolve the split brain immediately after giving this command (see section + +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + +). +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + Hint in case of +\begin_inset Formula $k>2$ +\end_inset + + replicas: +\family typewriter +marsadm invalidate +\family default + cannot always resolve a split brain at other secondaries (which are neither + the old nor the new designated primary). + Therefore, prefer the +\family typewriter +leave-resource +\family default + method described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + +, starting with a +\family typewriter +leave-resource +\family default + phase at the old primary, and proceeding to +\begin_inset Quotes eld +\end_inset + +unrelated +\begin_inset Quotes erd +\end_inset + + secondaries step by step, until the split brain is gone. + Don't +\family typewriter +join-resource +\family default + again before the split brain is gone! This way, all these replicas will + remain consistent for now, but of course outdated (or potentially even + a +\begin_inset Quotes eld +\end_inset + +wrong +\begin_inset Quotes erd +\end_inset + + split-brain version, but +\emph on +potentially usable +\emph default + in case you get under pressure in some way).
+ In the hopefully unlikely case that you should later discover that you + accidentally forced the +\emph on +wrong +\emph default + replica via +\family typewriter +primary --force +\family default +, you will have a chance to recover by either forcing the +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + host to primary (if it did not already leave the resource), or by creating + a completely fresh resource out of the +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + local disk. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + Generally: in case of +\family typewriter +primary --force +\family default +, the preconditions are different. + The fetch +\emph on +must +\emph default + be switched off (see +\family typewriter +pause-fetch +\family default +), in order to get stable logfile positions. + See section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Forced-Switching" + +\end_inset + +. + For your safety, +\family typewriter +--force +\family default + does not work in newer marsadm (after mars0.1stable52) when your replica + is a current sync target. + More explanations see section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Forced-Switching" + +\end_inset + +.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +secondary +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +almost +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local +\family typewriter +/dev/mars/$dev_name +\family default + is no longer in use (e.g. + umounted). +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: There exists no designated primary any more. + During split brain and when the network is OK (again), all actual primaries + (including the local host) will leave primary ASAP (i.e. + when their +\family typewriter +/dev/mars/mydata +\family default + is no longer in use). + Any secondary will start following (old) logfiles (even from backlogs) + by replaying transaction logs if it is +\emph on +uniquely +\emph default + possible (which is often violated during split brain). 
+ On any secondary, +\family typewriter +/dev/mars/$dev_name +\family default + will have disappeared. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + Notice: in contrast to DRBD, you +\series bold +don't need +\series default + this command during normal operation, including handover. + Any resource member which is +\emph on +not +\emph default + designated as primary will +\emph on +automatically +\emph default + go into secondary role. + For example, if you have +\begin_inset Formula $k=4$ +\end_inset + + replicas, only +\emph on +one of them +\emph default + can be designated as a primary. + When the network is OK, all other 3 nodes will know this fact, and they + will +\emph on +automatically +\emph default + go into secondary mode, following the transaction logs from the (new) primary. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +Hint: avoid this command. + It turns off +\emph on +any +\emph default + primary, +\series bold +globally +\series default + +\begin_inset Foot +status open + +\begin_layout Plain Layout + +\size scriptsize +A serious +\series bold +misconception +\series default + among some people is when they believe that they can switch +\begin_inset Quotes eld +\end_inset + +a certain node to secondary +\begin_inset Quotes erd +\end_inset + +. + It is not possible to switch individual nodes to secondary, without affecting + other nodes! The concept of +\begin_inset Quotes eld +\end_inset + +designated primary +\begin_inset Quotes erd +\end_inset + + is +\series bold +global +\series default + throughout a resource! +\end_layout + +\end_inset + +. + You cannot start a sync after that (e.g.
+ +\family typewriter +invalidate +\family default + or +\family typewriter +join-resource +\family default + or +\family typewriter +resume-sync +\family default +), because it is +\emph on +not unique +\emph default + wherefrom the data shall be fetched. + In split brain situations (when the network is OK again), this may have + further drawbacks. + It is much better / easier to +\series bold +\emph on +directly +\emph default + switch the designated primary +\series default + from one node to another via the +\family typewriter +primary +\family default + command. + See also section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Forced-Switching" + +\end_inset + +. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\size scriptsize + There is only one valid use case where you +\emph on +really +\emph default + need this command: before finally destroying a resource via the +\emph on +last +\emph default + +\family typewriter +leave-resource +\family default + (or the dangerous +\family typewriter +delete-resource +\family default +), you will need this before you can do that.
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +wait-umount +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +See section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Waiting" + +\end_inset + +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +log-purge-all +\begin_inset CommandInset label +LatexCommand label +name "log-purge-all$res" + +\end_inset + + +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: none additionally. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: all locally known logfiles and version links are removed, + whenever they are not / no longer reachable by any split brain version. 
+\end_layout + +\begin_layout Plain Layout +Rationale: remove hindering split-brain / +\family typewriter +leave-resource +\family default + leftovers. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Use this only when split brain does not go away by means of +\family typewriter +leave-resource +\family default + (which +\emph on +could +\emph default + happen in very weird scenarios such as MARS running on virtual machines + doing a restore of their snapshots, or otherwise unexpected resurrection + of dead or half-dead nodes). +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + THIS IS POTENTIALLY DANGEROUS! +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +This command +\emph on +might +\emph default + destroy some valuable logfiles / other information in case the local informatio +n is outdated or otherwise incorrect. + MARS does its best for checking anything, but there is no guarantee. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Hint: use +\family typewriter +--dry-run +\family default + beforehand for checking! 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +resize +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +[$size] +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +almost +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: The local host must be primary. + All disks in the cluster participating in +\family typewriter +$res +\family default + must be physically larger than the logical resource size (e.g, by use of + +\family typewriter +lvm +\family default +; can be checked by macros +\family typewriter +%disk-size{} +\family default + and +\family typewriter +%resource-size{} +\family default +). 
+ When the optional +\family typewriter +$size +\family default + argument is present, it must be smaller than the minimum of all physical + sizes, but larger than the current logical size of the resource. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the logical size of +\family typewriter +/dev/mars/$dev_name +\family default + will reflect the new size after a while. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsection +Logfile Operations +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +cron +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + 
+\begin_layout Plain Layout + +\size scriptsize +Do all necessary housekeeping tasks. + See +\family typewriter +log-rotate +\family default + and +\family typewriter +log-delete-all +\family default + for details. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +This should be regularly called by an external cron job or similar. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +log-rotate +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local node +\family typewriter +$host +\family default + must be primary at +\family typewriter +$res +\family default +. 
+\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: after a while, a new transaction logfile +\family typewriter +/mars/resource-$res/log-$new_nr-$host +\family default + will be used instead of +\family typewriter +/mars/resource-$res/log-$old_nr-$host +\family default + where +\family typewriter +$new_nr +\family default + = +\family typewriter +$old_nr +\family default + + 1. + Without +\family typewriter +--force +\family default +, this will only carry out actions at the primary side since it makes no + sense on secondaries. + With +\family typewriter +--force +\family default +, secondaries are +\emph on +trying +\emph default + to +\emph on +remotely +\emph default + trigger a log-rotate, but without any guarantee (likely even a split-brain + may result instead, so use this only if you are +\emph on +really +\emph default + desperate). +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +log-delete +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height 
"1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local node must be a member of +\family typewriter +$res +\family default +. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: when there exists some old transaction logfiles +\family typewriter +/mars/resource-$res/log-*-$some_host +\family default + which are no longer referenced by any of the symlinks +\family typewriter +/mars/resource-$res/replay-* +\family default + , those logfiles are marked for deletion in the whole cluster. + When no such logfiles exist, nothing will happen. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +log-delete-one +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status 
open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local node must be a member of +\family typewriter +$res +\family default +. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: when there exists an old transaction logfile +\family typewriter +/mars/resource-$res/log-$old_nr-$some_host +\family default + where +\family typewriter +$old_nr +\family default + is the minimum existing number and that logfile is no longer referenced + by any of the symlinks +\family typewriter +/mars/resource-$res/replay-* +\family default + , that logfile is marked for deletion in the whole cluster. + When no such logfile exists, nothing will happen. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +log-delete-all +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size 
scriptsize +Alias for +\family typewriter +log-delete +\family default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsection +Consistency Operations +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +invalidate +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local node must be in secondary role at +\family typewriter +$res +\family 
default +. + A +\emph on +designated +\emph default + primary must exist. + When having +\begin_inset Formula $k>2$ +\end_inset + + replicas, no split brain must exist (otherwise, or when +\family typewriter +invalidate +\family default + does not work in case of +\begin_inset Formula $k=2$ +\end_inset + +, use the +\family typewriter +leave-resource +\family default + ; +\family typewriter +join-resource +\family default + method described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + +). +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the local disk is marked as inconsistent, and a fast fullsync + from the designated primary will start after a while. + Notice that +\family typewriter +marsadm {pause,resume}-sync +\family default + will influence whether the sync really starts. + When the fullsync has finished successfully, the local node will be consistent + again. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +After a while, state +\family typewriter +Orphan +\family default + should be left. + Don't forget to regularly monitor for longer occurrences of +\family typewriter +Orphan +\family default +! 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +fake-sync +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local node must be in secondary role at +\family typewriter +$res +\family default +. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: when a fullsync is running, it will stop after a while, and + the local node will be +\emph on +marked +\emph default + as consistent as if it were consistent again. +\end_layout + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +ONLY USE THIS IF YOU REALLY KNOW WHAT YOU ARE DOING! 
+\begin_inset Newline newline +\end_inset + +See the WARNING in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Creating-and-Maintaining" + +\end_inset + + +\begin_inset Newline newline +\end_inset + +Use this only +\emph on +before +\emph default + creating a fresh filesystem inside +\family typewriter +/dev/mars/$res +\family default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +set-replay +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize +ONLY FOR ADVANCED HACKERS WHO KNOW WHAT THEY ARE DOING! +\begin_inset Newline newline +\end_inset + +This command is deliberately not documented. + You need the competence level RTFS ( +\begin_inset Quotes eld +\end_inset + +read the fucking sources +\begin_inset Quotes erd +\end_inset + +). 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +Further Operations +\end_layout + +\begin_layout Subsection +Inspection Commands +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +view- +\emph on +macroname +\begin_inset Newline newline +\end_inset + + +\emph default + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Display the output of a macro evaluation. 
+ See section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Inspecting-the-State" + +\end_inset + + for a thorough description. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +view +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Equivalent to +\family typewriter +view-default +\family default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +role +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Use +\family typewriter +view-role +\family default + instead. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +state +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Use +\family typewriter +view-state +\family default + instead. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +cstate +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Use +\family typewriter +view-cstate +\family default + instead. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +dstate +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Use +\family typewriter +view-dstate +\family default + instead. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +status +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Use +\family typewriter +view-status +\family default + instead. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +show-state +\end_layout + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Don't use it. + Use +\family typewriter +view-state +\family default + instead, or other macros. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +show-info +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Don't use it. + Use +\family typewriter +view-info +\family default + instead, or other macros. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +show +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Don't use it. + Use or implement some macros instead. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +show-errors +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Deprecated. + Use +\family typewriter +view-the-err-msg +\family default + or +\family typewriter +view-resource-err +\family default + or similar macros. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +cat +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$file +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Write the file content to stdout, but with all occurrences of numeric + timestamps converted to a human-readable format. + This is most useful for inspection of status and log files, e.g. 
+\family typewriter +marsadm cat /mars/ +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsection +Setting Parameters +\begin_inset CommandInset label +LatexCommand label +name "subsec:Setting-Parameters" + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Per-Resource Parameters +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +set-emergency-limit $res +\emph on +n +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +The argument +\emph on +n +\emph default + must be a percentage between 0 and 100 %. 
+ When the remaining store space in +\family typewriter +/mars/ +\family default + undershoots the given percentage, the resource will go +\emph on +earlier +\emph default + into emergency mode than by the global computation described in section + +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Defending-Overflow" + +\end_inset + +. + 0 means unlimited. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +get-emergency-limit $res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Inquiry of the preceding value. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Global Parameters +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +set-sync-limit-value +\emph on +n +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Limit the concurrency of sync operations to some maximum number. 
+ 0 means unlimited. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +get-sync-limit-value +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Inquiry of the preceding value. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +set-connect-pref-list host1,host2,hostn +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Set the order of preferences for connections when there are more than 2 + hosts participating in a cluster. + The argument must be a comma-separated list of node names. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +get-connect-pref-list +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Inquiry of the preceding value. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsection +Waiting +\begin_inset CommandInset label +LatexCommand label +name "subsec:Waiting" + +\end_inset + + +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +wait-cluster +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the 
+\family typewriter +/mars/ +\family default + filesystem must be mounted and it must contain a valid MARS symlink tree + produced by the other +\family typewriter +marsadm +\family default + commands. + The kernel module must be loaded. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: none. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Wait until +\emph on +all +\emph default + nodes in the cluster have sent a message, or until timeout. + The default timeout is 30 s (exceptionally) and +\size default + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\size scriptsize + may be changed by +\family typewriter + --timeout=$seconds +\family default +. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +wait-resource +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +{is-,}{attach, +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + + primary, +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash 
+hfill +\end_layout + +\end_inset + + device}{-off,} +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: the local node must be a member of the resource +\family typewriter +$res +\family default +. + +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: none. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Wait until the local node reaches a specified condition on +\family typewriter +$res +\family default +, or until timeout. + The default timeout of 60 s may be changed by +\family typewriter + --timeout=$seconds +\family default +. + The last argument denotes the condition. + The condition is inverted if suffixed by +\family typewriter +-off +\family default +. + When preceded by +\family typewriter +is- +\family default + (which is the most useful case), it is checked whether the condition is + actually reached. + When the +\family typewriter +is- +\family default + prefix is left off, the check is whether another +\family typewriter +marsadm +\family default + command has been already given which +\emph on +tries +\emph default + to achieve the intended result (typically, you may use this after the + +\family typewriter +is- +\family default + variant has failed). 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +wait-connect +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +almost +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +This is an alias for +\family typewriter +wait-cluster +\family default + waiting until only those nodes are reachable which belong to +\family typewriter +$res +\family default + (instead of waiting for the +\emph on +full +\emph default + cluster). 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +wait-umount +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$res +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Precondition: none additionally. +\end_layout + +\begin_layout Plain Layout + +\size scriptsize +Postcondition: the local +\family typewriter +/dev/mars/$dev_name +\family default + is no longer in use (e.g. + umounted). +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsection +Low-Level Expert Commands +\end_layout + +\begin_layout Standard +These commands are for experts and advanced sysadmins only. + The interface is not stable, i.e. + the meaning may change at any time. + Use at your own risk! 
+\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +set-link +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +RTFS. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +get-link +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +RTFS. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +delete-file +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +RTFS. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +The following commands are for manual setup / repair of cluster membership. + Only to be used by experts who know what they are doing! In general, cluster-wi +de operations on IP addresses may need to be repeated at all hosts in the + cluster iff the communication is not (yet) possible and/or not (yet) actually + working (e.g. + firewalling problems etc). 
+\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "30col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +lowlevel-ls-host-ips +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "50col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +List all configured cluster members together with their currently configured + IP addresses, as known +\emph on +locally +\emph default +. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "30col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +lowlevel-set-host-ip +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$hostname +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$ip +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "50col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Change the assignment of IP addresses +\emph on +locally +\emph default +. + May be used when hosts are moved to different network locations, or when + different network interfaces are to be used for replication (e.g. + dedicated replication IPs). + Notice that the names of hosts must not change at all, only their IP addresses + may be changed. + Check active connections with +\family typewriter +netstat +\family default + & friends. 
+ Updates may need some time to proceed (socket timeouts etc). +\begin_inset Newline newline +\end_inset + +Hint: for safety, call this on +\emph on +all +\emph default + members of a cluster to ensure consistency. + Otherwise it may happen that some cluster members do not know the +\emph on +new +\emph default + IP address where to fetch the +\emph on +new +\emph default + information from. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "30col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +lowlevel-delete-host +\begin_inset Newline newline +\end_inset + + +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +strut +\backslash +hfill +\end_layout + +\end_inset + +$hostname +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "50col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Remove a host from the cluster membership +\emph on +locally +\emph default +, together with its IP address assignment. + This does not remove any further information. + In particular, resource memberships are untouched. 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsection +Senseless Commands (from DRBD) +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +syncer +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +new-current-uuid +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout 
Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +create-md +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +dump-md +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +dump +\end_layout + +\end_inset + + 
+\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +get-gi +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +show-gi +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize 
"4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +outdate +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +adjust +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +yes +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +Implemented as NOP (not necessary with MARS). 
+\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +hidden-commands +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Subsection +Forbidden Commands (from DRBD) +\end_layout + +\begin_layout Standard +These commands are not implemented because they would be dangerous in MARS + context: +\end_layout + +\begin_layout Standard + +\size scriptsize +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Command / Params +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Cmp +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +Description +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +invalidate-remote +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + 
+\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +This would be too dangerous in case you have multiple secondaries. + A similar effect can be achieved with the +\family typewriter +--host= +\family default + option. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "20col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\family typewriter +\size scriptsize +verify +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +no +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\size scriptsize +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "60col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout + +\size scriptsize +This would cause unintended side effects due to races between logfile transfer + / application and block-wise comparison of the underlying disks. 
+ However, +\family typewriter +marsadm join-resource +\family default + or +\family typewriter +invalidate +\family default + will do the same as DRBD verify followed by DRBD resync, i.e. + this will automatically correct any found errors. + Note that the fast-fullsync algorithm of MARS will minimize network traffic. +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +The +\family typewriter +/proc/sys/mars/ +\family default + and other Expert Tweaks +\begin_inset CommandInset label +LatexCommand label +name "sec:The-/proc/sys/mars/-Expert" + +\end_inset + + +\end_layout + +\begin_layout Standard +In many cases, you will not need to deal with tweaks in +\family typewriter +/proc/sys/mars/ +\family default + because everything should already default to reasonable predefined values. + This interface allows access to some internal kernel variables of the +\family typewriter +mars.ko +\family default + kernel module at +\emph on +runtime +\emph default +. + This means, the values will be reset to default at +\family typewriter +rmmod mars +\family default + or at reboot. + If you need some persistence, implement it by yourself, e.g. + in startup scripts. +\end_layout + +\begin_layout Standard + +\family typewriter +/proc/sys/mars/ +\family default + is +\emph on +not +\emph default + a stable interface. + It is not only specific for MARS, but may also change between releases + without notice. +\end_layout + +\begin_layout Standard +This section describes only those tweaks intended for sysadmins, not those + for developers / very deep internals. 
+\end_layout + +\begin_layout Subsection +Tuning Network Performance +\begin_inset CommandInset label +LatexCommand label +name "subsec:Tuning-Network-Performance" + +\end_inset + + +\end_layout + +\begin_layout Standard +Starting with MARS Light series 0.2, a new feature called +\begin_inset Quotes eld +\end_inset + +socket bundling +\begin_inset Quotes erd +\end_inset + + is available. +\end_layout + +\begin_layout Standard +It is mostly intended for lines showing high packet loss. + By using multiple TCP sockets in parallel for emulating a single logical + connection, throughput can be significantly increased. +\end_layout + +\begin_layout Standard +Example for setting the socket parallelism to 4: +\end_layout + +\begin_layout Itemize + +\family typewriter +echo 4 > /proc/sys/mars/parallel_connections +\end_layout + +\begin_layout Standard +The following graphic shows the throughput of a non-fast +\begin_inset Foot +status open + +\begin_layout Plain Layout +The fast fullsync algorithm would not saturate the +\family typewriter +eth0 +\family default + link with traffic from a single resource. +\end_layout + +\end_inset + + fullsync of a +\emph on +single +\emph default + 100GiB resource over a loaded long-distance line between Europe/Germany + and USA/Midwest. + In order to compensate for highly varying load on the line, all the experiments + were repeated more than 10 times and averaged. + Each bar shows the throughput for a particular socket parallelism. +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/socket-bundling-long-summary.png + width 70col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Notice that the uplinks of the two servers are only 1 GBit/s respectively. + When the uplink is saturated, about 100 MByte/s is the maximum possible + peak throughput in theory. 
+ You can easily recognize that the peak throughput is almost reached with + a parallelism degree of 2, but using even more sockets appears to be slightly + counter-productive. + One of the reasons is that more sockets will increase contention on the + line, thus increasing packet loss. + Another potential reason is that higher parallelism at sockets will lead + to higher parallelism in disk reads, in turn leading to more permutations + of disk read positions (more +\emph on +random +\emph default + reads instead of purely sequential reads), which is counter-productive + for disk readahead strategies. +\end_layout + +\begin_layout Standard +The next graphic shows the same, but over a medium distance of about 50km. + This line is even more heavily loaded with respect to the number of TCP + connections running in parallel (probably some 10,000 or even 100,000 if + not more), and there is some kind of +\begin_inset Quotes eld +\end_inset + +traffic shaping +\begin_inset Quotes erd +\end_inset + + at some intermediate network gear which will +\begin_inset Quotes eld +\end_inset + +punish +\begin_inset Quotes erd +\end_inset + + those traffic sources disproportionally increasing overall packet loss. + This can explain the even higher counter-productive effect of using too + many sockets and thus injecting additional packet loss: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/socket-bundling-short-summary.png + width 70col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In general, the optimum value for +\family typewriter +/proc/sys/mars/parallel_connections +\family default + may depend on many runtime factors such as other load running over some + (parts of) physical equipment. 
+ You will need to determine optimum values yourself. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Notice that socket bundling is conceptually the +\begin_inset Quotes eld +\end_inset + +opposite +\begin_inset Quotes erd +\end_inset + + of traffic shaping. + You are trying to get +\emph on +more +\emph default + bandwidth, at the cost of +\emph on +other +\emph default + traffic competing for the same network resources. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + If you are operating masses of servers, don't set the MARS socket parallelism + +\series bold +too high +\series default +everywhere. + You might +\begin_inset Quotes eld +\end_inset + +steal +\begin_inset Quotes erd +\end_inset + + too much bandwidth from other applications when starting masses of syncs + in parallel, e.g. + after an incident. + Best practice is to start with a default value of 1, and to increase it + only +\emph on +on demand +\emph default +, and/or preferably +\emph on +only +\emph default + at those servers where high load really occurs or where some urgent actions + need a +\emph on +temporary +\emph default + boost. 
+\end_layout + +\begin_layout Subsection +Syslogging +\end_layout + +\begin_layout Standard +All internal messages produced by the kernel module belong to one of the + following classes: +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +0 debug messages +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +1 info messages +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +2 warnings +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +3 error messages +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +4 fatal error messages +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 +5 any message (summary of 0 to 4) +\end_layout + +\begin_layout Subsubsection +Logging to Files +\end_layout + +\begin_layout Standard +This feature will likely disappear when MARS goes to kernel upstream. + It was mostly intended for debugging during early beta phases and is no + longer needed for stable operation. + Developers may use it for spotting potential problems. +\end_layout + +\begin_layout Standard +The classes may be used to produce status files +\family typewriter +$class.*.status +\family default + in the +\family typewriter +/mars/ +\family default + and/or in the +\family typewriter +/mars/resource- +\emph on +mydata +\emph default +/ +\family default + directory / directories. +\end_layout + +\begin_layout Standard +When you create a file +\family typewriter +$class.*.log +\family default + in parallel to any +\family typewriter +$class.*.status +\family default +, the +\family typewriter +*.log +\family default + file will be appended forever with the same messages as in +\family typewriter +*.status +\family default +. + The difference is that *.status is regenerated anew from an empty starting + point, while *.log can (potentially) increase indefinitely unless you remove + it, or rename it to something else. 
+\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Beware, any permanently present +\family typewriter +*.log +\family default + file can easily fill up your +\family typewriter +/mars/ +\family default + partition until the problems described in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Defending-Overflow" + +\end_inset + + will appear. + Use +\family typewriter +*.log +\family default + only for a +\series bold +limited time +\series default +, and +\series bold +only for debugging! +\end_layout + +\begin_layout Subsubsection +Logging to Syslog +\end_layout + +\begin_layout Standard +The classes also play a role in the following +\family typewriter +/proc/sys/mars/ +\family default + tweaks: +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +syslog_min_class +\family default + (rw) The +\emph on +minimum +\emph default + class number for +\emph on +permanent +\emph default + syslogging. + By default, this is set to -1 in order to switch off permanent logging completely. + Permanent logging can easily flood your syslog with such huge amounts of + messages (in particular when class=0), that your system as a whole may + become unusable (because vital kernel threads may be blocked too long or + too often by the userspace syslog daemon). + Instead, please use the flood-protected syslogging described below! +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +syslog_max_class +\family default + (rw) The +\emph on +maximum +\emph default + class number for +\emph on +permanent +\emph default + syslogging. + Please use the flood-protected version instead. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +syslog_flood_class +\family default + (rw) The minimum class of flood-protected syslogging. + The maximum class is always 4.
+\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +syslog_flood_limit +\family default + (rw) The maximum number of messages after which the flood protection will + start. + This is a hard limit for the number of messages written to the syslog. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +syslog_flood_recovery_s +\family default + (rw) The number of seconds after which the internal flood counter is reset + (after flood protection state has been reached). + When no new messages appear after this time, the flood protection will + start over at count 0. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +The rationale behind flood protected syslogging: sysadmins are usually only + interested in the point in time where some problems / incidents / etc have + +\emph on +started +\emph default +. + They are usually not interested in capturing +\emph on +each +\emph default + and +\emph on +every +\emph default + single error message (in particular when they are flooding the system logs). +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +If you +\emph on +really +\emph default + need complete error information, use the +\family typewriter +*.log +\family default + files described above, compress them and save them to somewhere else +\emph on +regularly +\emph default + by a cron job. + This bears much less overhead than filtering via the syslog daemon, or + even remote syslogging in real time which will almost surely screw up your + system in case of network problems co-inciding with flood messages, such + as caused in turn by those problems. + Don't rely on real-time concepts, just do it the old-fashioned batch job + way.
+\end_layout + +\begin_layout Subsubsection +Tuning Verbosity of Logging +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +show_debug_messages +\family default + Boolean switch, 0 or 1. + Mostly useful only for developers. + This can easily flood your logs if you are not careful. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +show_log_messages +\family default + Boolean switch, 0 or 1. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +show_connections +\family default + Boolean switch, 0 or 1. + Show detailed internal statistics on sockets. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +show_statistics_local +\begin_inset space ~ +\end_inset + +/ +\begin_inset space ~ +\end_inset + +show_statistics_global +\family default + Only useful for kernel developers. + Shows some internal information on internal brick instances, memory usage, + etc. +\end_layout + +\begin_layout Subsection +Tuning the Sync +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +sync_flip_interval_sec +\family default + (rw) The sync process must not run in parallel to logfile replay, in order + to easily guarantee consistency of your disk. + If logfile replay would be paused for the full duration of very large or + long-lasting syncs (which could take some days over very slow networks), + your +\family typewriter +/mars/ +\family default + filesystem could overflow because no replay would be possible in the meantime. + Therefore, MARS regularly flips between actually syncing and actually replaying, + if both are enabled. + You can set the time interval for flipping here. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +sync_limit +\family default + (rw) When > 0, this limits the maximum number of sync processes actually + running in parallel.
+ This is useful if you have a large number of resources, and you don't want + to overload the network with sync processes. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +sync_nr +\family default + (ro) Passive indicator for the number of sync processes currently running. +\end_layout + +\begin_layout Labeling +\labelwidthstring 00.00.0000 + +\family typewriter +sync_want +\family default + (ro) Passive indicator for the number of sync processes which +\emph on +demand +\emph default + running. +\end_layout + +\begin_layout Subsection +Lowlevel TCP Tuning (Networking Experts Only) +\begin_inset CommandInset label +LatexCommand label +name "subsec:TCP-Tuning" + +\end_inset + + +\end_layout + +\begin_layout Standard +When +\family typewriter +CONFIG_MARS_SEPARATE_PORTS +\family default + and +\family typewriter +CONFIG_MARS_IPv4_TOS +\family default + are enabled, MARS uses the following types of traffic: +\end_layout + +\begin_layout Description + +\family typewriter +MARS_TRAFFIC_META +\family default + (by default on port 7777 with +\family typewriter +IPTOS_LOWDELAY +\family default +) This can be tuned in directory +\family typewriter +/proc/sys/mars/tcp_tuning_0_meta_traffic/ +\family default +. +\end_layout + +\begin_layout Description + +\family typewriter +MARS_TRAFFIC_REPLICATION +\family default + (by default on port 7778 with +\family typewriter +IPTOS_RELIABILITY +\family default +) This can be tuned in directory +\family typewriter +/proc/sys/mars/tcp_tuning_1_replication_traffic/ +\family default +. +\end_layout + +\begin_layout Description + +\family typewriter +MARS_TRAFFIC_SYNC +\family default + (by default on port 7779 with +\family typewriter +IPTOS_MINCOST +\family default +) This can be tuned in directory +\family typewriter +/proc/sys/mars/tcp_tuning_2_sync_traffic/ +\family default +. 
+ Attention: since the advent of +\family typewriter +DSCP +\family default +, this bit (hex +\family typewriter +0x2 +\family default + in host byte order) is suppressed by the kernel, and yields +\family typewriter +DS0 +\family default +. +\end_layout + +\begin_layout Standard +In each of these directories, the following tunables are available (only + for networking experts who know what they are doing): +\end_layout + +\begin_layout Description + +\family typewriter +ip_tos +\family default + As explained above. + Notice: hex constants from +\family typewriter +/usr/include/linux/ip.h +\family default + must be converted to decimal before forwarding to the +\family typewriter +/proc +\family default + interface. +\end_layout + +\begin_layout Description + +\family typewriter +tcp_window_size +\family default + Current default is 8 * 1024 * 1024. +\end_layout + +\begin_layout Description + +\family typewriter +tcp_nodelay +\family default + Current default is 0. +\end_layout + +\begin_layout Description + +\family typewriter +tcp_timeout +\family default + Current default is 2. +\end_layout + +\begin_layout Description + +\family typewriter +tcp_keepcnt +\family default + Current default is 3. +\end_layout + +\begin_layout Description + +\family typewriter +tcp_keepintvl +\family default + Current default is 3. +\end_layout + +\begin_layout Description + +\family typewriter +tcp_keepidle +\family default + Current default is 4. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Further tuning parameters are in the standard Linux kernel. 
+ Notice that +\family typewriter +IP_TOS +\family default + is internally converted to +\family typewriter +DSCP +\family default +, which in turn can be further manipulated by +\family typewriter +netfilter +\family default + / +\family typewriter +iptables +\family default + and/or by +\family typewriter +qdisc +\family default + ( +\family typewriter +tc +\family default +) and/or by further (external) networking components. + The ancient TOS settings are meant as a default +\emph on +starting point +\emph default + for further customization to your needs. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Typically, +\emph on +public +\emph default + internet transports are flattening / ignoring or otherwise manipulating +\begin_inset Foot +status open + +\begin_layout Plain Layout +DSCP markings can only be made reliable on private networks (possibly requiring + some effort). + Public Internet service and transit providers do not necessarily treat + the TOS values or DSCP markings with any form of priority and may also + remove or change them without any notice. + Some internet service or transit providers also do use specific DSCP markings + to mark packets for being dropped, which may result in hard to find transmissio +n errors. +\end_layout + +\begin_layout Plain Layout +If you want to use MARS on a public internet connection, you should use +\series bold +encrypted +\series default + +\series bold +VPN +\series default + with different DSCP markings, and coordinate them with your network services + provider. +\end_layout + +\end_inset + + the TOS / DSCP fields. + There it will not work. + Anyway, you should never route unencrypted MARS traffic over public transports, + for obvious security reasons.
+ Notice: MARS replication is meant for company- +\emph on +internal +\emph default + networks like +\emph on +internal +\emph default + +\series bold +replication networks +\series default + (or storage networks) where some networking department has control of. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Playing with the above settings can easily tear down your whole (replication) + network if you don't know exactly what you are doing. + Please test any changes in the lab first. + Mass rollout should be done in incremental phases, each in power of 10 + units. + There might be unexpected effects like packet storms, or packet loss, etc. + Some of these effects may only show up when a certain number of hosts is + exceeded, or when certain load conditions are hammering the overall Distributed + System. + Some very old routers / switches are known to break down unexpectedly when + overloaded in certain ways. + Be careful in a production environment! +\end_layout + +\begin_layout Chapter +Tips and Tricks +\end_layout + +\begin_layout Section +IO Performance Tuning +\begin_inset CommandInset label +LatexCommand label +name "sec:IO-Performance-Tuning" + +\end_inset + + +\end_layout + +\begin_layout Standard +There +\emph on +exist +\emph default + some use cases where MARS +\emph on +can +\emph default + deliver better IO performance than a raw block device. + However, this cannot be expected +\emph on +in general +\emph default +. + In some +\emph on +other +\emph default + cases the performance may be +\emph on +lower +\emph default + than with a +\emph on +single +\emph default + local raw device. 
+\end_layout + +\begin_layout Standard +For demonstration, we use the +\family typewriter +blkreplay +\family default + tool from +\begin_inset Flex URL +status open + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + and a load which has been captured from a +\series bold +real datacenter +\series default + (1&1 Ionos ShaHoLin = Shared Hosting Linux). + The load already contains a parallelism degree of 20 LXC containers running + in parallel at the same iron. + This corresponds to about 60,000 web spaces running on 20 Apache instances, + already in parallel. + In difference to artificial benchmarks (like pure random IO or pure sequential + IO), this benchmark is much more close to real server operations, while + artificial benchmarks are not meaningful for practice in general, because + they can deviate from real server operations by +\emph on +factors +\emph default + or even by +\series bold +orders of magnitude +\series default +. +\end_layout + +\begin_layout Standard +In order to determine the limits of the test candidates, the timing of the + original workload was converted to a linear ramp-up, simulating an +\series bold +overloaded +\series default + system. + Otherwise benchmarking would not be possible. +\end_layout + +\begin_layout Standard +The following +\family typewriter +blkreplay +\family default + benchmarks were executed on an otherwise unloaded Dell R630 with 40 CPU + threads on 2 sockets, 192 GB RAM, a Dell R730 hardware RAID controller + with 2 GB BBU cache, and 10 spindles Dell 1.8 TB 2.5 inch SAS disks configured + as RAID-6. + All data, including the +\family typewriter +/mars +\family default + directory, was located on the hardware RAID via LVM2. + +\family typewriter +/dev/vginfong/lv-0 +\family default + was assigned a size of 8 TiB. + For testing, vanilla kernel 4.9.x with the MARS pre-patch and +\family typewriter +mars0.1astable72 +\family default + was used. 
+\end_layout + +\begin_layout Standard +The +\family typewriter +blkreplay +\family default + parameters were as follows: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +output_label="MARS" +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +# input description +\end_layout + +\begin_layout Plain Layout + +input_file_list=" +HoLin_from_bare_metal/x20/shaholin-x20-ramped/shaholin-x20.adjacent.ramped-100.load. +gz" +\end_layout + +\begin_layout Plain Layout + +replay_duration=110 +\end_layout + +\begin_layout Plain Layout + +speedup=10 +\end_layout + +\begin_layout Plain Layout + +threads=512 +\end_layout + +\begin_layout Plain Layout + +cmode=with-conflicts +\end_layout + +\begin_layout Plain Layout + +scheduler="noop" +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +# hardware setup +\end_layout + +\begin_layout Plain Layout + +replay_host_list="icpu5133" +\end_layout + +\begin_layout Plain Layout + +replay_device_list="/dev/vginfong/lv-0" +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +# output description +\end_layout + +\begin_layout Plain Layout + +enable_graph=1 +\end_layout + +\begin_layout Plain Layout + +graph_options="--no-static --dynamic" +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +We start with the +\series bold +raw +\series default + device +\family typewriter +/dev/vginfong/lv-0 +\family default + which had a size of 8 TiB. 
+ The throughput is about 1418 IOPS, and the latency diagram shows that the + system is overloaded, but can cope with that overload: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/blkreplay/ + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +As you can see in the filename, the NOOP kernel IO scheduler was used, and + the kernel parameter +\family typewriter +nr_requests +\family default + was left at its default value of 128. + When you read the specs of the Dell R730 hardware RAID controller, you + will notice that it can handle a much higher IO request parallelism of + almost 1024 requests in parallel. +\end_layout + +\begin_layout Standard +So the first natural tuning attempt is +\family typewriter +nr_requests=1020 +\family default +, in order to release the +\begin_inset Quotes eld +\end_inset + +kernel IO handbrake +\begin_inset Quotes erd +\end_inset + +. + This results in an improved throughput of 1562 IOPS, and even the +\emph on +maximum +\emph default + latencies are improved, but the +\emph on +average +\emph default + latencies are becoming a little bit worse: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/blkreplay/ + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +It is well known since decades that there is a principal tradeoff between + throughput and latencies in IO systems. + Thus it is not a surprising result. +\end_layout + +\begin_layout Standard +On servers, overload situations should be rare, and during overload throughput + is typically much more important than latencies, as long as latencies are + not exceedingly high. + Thus we can recommend +\family typewriter +nr_requests=1000 +\family default + for production. +\end_layout + +\begin_layout Standard +However, some sysadmins might be tempted to question why the NOOP scheduler + has been used. 
+ On the internet, there are a ton of claims that CFQ is much better. +\end_layout + +\begin_layout Standard +Well, testing with CFQ instead of NOOP is no problem for +\family typewriter +blkreplay +\family default +. + However, the result is very surprising. + While the IOPS are 1539, which is only a slight decrease which could result + from measurement tolerances, the latencies are now turning almost into + a disaster: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/blkreplay/ + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +In production, you should never encounter IO latencies of almost 15 seconds. + So what is going wrong here? +\end_layout + +\begin_layout Standard +Here is an explanation. + A hardware RAID controller +\emph on +already +\emph default + has an +\emph on +internal +\emph default + IO scheduler. + This IO scheduler is hidden in a black box, such that many sysadmins don't + know of its existence. + If you add another IO scheduler at kernel level, you will have +\series bold +two different +\series default + IO schedulers running in parallel, and sometimes taking +\series bold +contradictory decisions +\series default +. +\end_layout + +\begin_layout Standard +These contradictory scheduling decisions may lead to problems in certain + cases and scenarios. +\end_layout + +\begin_layout Standard +While kernel-level IO schedulers like CFQ certainly have their merits at + improving your workstation's IO behaviour, they are counter-productive + at servers with hardware RAID controllers. +\end_layout + +\begin_layout Standard +So the advice is clear: +\series bold +switch them off +\series default + +\emph on +in such a case +\emph default +. +\end_layout + +\begin_layout Standard +Even if you have a software RAID, check with +\family typewriter +blkreplay +\family default + that any IO schedulers are +\emph on +really +\emph default + improving things. 
+ When possible, use your real workload, captured with +\family typewriter +blktrace +\family default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Never use a benchmark which only delivers IOPS! As demonstrated, inappropriate + IOPS tuning (or choice of inappropriate components) can worsen latencies + so much that production can be endangered! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Always look at +\emph on +both +\emph default + IOPS +\emph on +and +\emph default + latencies! +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + +\emph on +Average +\emph default + latencies, even when enriched with +\emph on +standard deviation +\emph default +, are not enough. + Classical statistics does not clearly describe operational problems like + +\series bold +hangs +\series default + and +\series bold +exceptionally high latency requests +\series default +, which may occur only rarely, but can then lead to +\series bold +serious incidents +\series default +. + Use a tool which can clearly display +\emph on +any +\emph default + faulty behaviour, such as +\family typewriter +blkreplay +\family default +'s +\series bold +latency diagrams +\series default +! +\end_layout + +\begin_layout Standard +Now we come to benchmarking +\family typewriter +/dev/mars/lv-0 +\family default + placed on top of +\family typewriter +/dev/vginfong/lv-0 +\family default +. + Notice that MARS needs to write all write requests twice: once into the + transaction logfile, and a second time by writeback into +\family typewriter +/dev/vginfong/lv-0 +\family default +. 
+\end_layout + +\begin_layout Standard +So you might expect that performance of +\family typewriter +/dev/mars/lv-0 +\family default + could be worse than at the underlying raw device. +\end_layout + +\begin_layout Standard +Nevertheless, the +\series bold +throughput +\series default + is now measured at 4338 IOPS, which means that performance has +\series bold +more than doubled +\series default +. + You can also see it by the duration of the benchmark at the x axis. + Even the latencies have improved in many cases: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/blkreplay/ + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +How is it possible to be faster than a RAW device? How can this be explained? +\end_layout + +\begin_layout Standard +Look at the graphics and at the explanations from section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:The-Transaction-Logger" +plural "false" +caps "false" +noprefix "false" + +\end_inset + +. + The key to local IO performance is the +\series bold +re-ordering of writeback +\series default + according to ascending sector numbers. + This can reduce mechanical seek times of hard disks considerably, and even + by factors, such that it can over-compensate the doubled writes to the + transaction logfile, and even when both are residing at the same RAID set. +\end_layout + +\begin_layout Standard +Notice: this effect is not only dependent on total RAM size and on the + maximum size of the MARS temporary memory buffer (tuning parameter +\family typewriter +/proc/sys/mars/mars_mem_percent +\family default + which defaults to a limit of 20%). + It is also highly dependent on the actual seek behaviour of the +\series bold +workload +\series default +.
+\end_layout + +\begin_layout Standard +For example, if you use +\family typewriter +dd +\family default + for sequentially overwriting /dev/mars/lv-0 with a parallelism degree of + 1, the writeback optimization of MARS cannot be exploited. + However, +\family typewriter +dd +\family default + is not an appropriate benchmarking tool, and has almost nothing to do with + real workloads occurring in datacenters, which typically are neither sequential, + nor do they have a parallelism degree of only 1. + Please don't try to lead any discussions about this: simply use +\family typewriter +blktrace +\family default + to capture your real server workload, and compare it to a run of dd. + Only if you encounter the same behaviour as +\family typewriter +dd +\family default +, only then you can really claim that your workload is like +\family typewriter +dd +\family default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Any assumptions about workloads are very dangerous: they can deviate from + practice not only by factors, but sometimes even by +\emph on +orders of magnitude +\emph default +. +\end_layout + +\begin_layout Standard +Notice: the writeback optimization of MARS can typically only improve performanc +e of HDDs, but not of SSDs. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + By placing +\family typewriter +/mars +\family default + onto its own physical device with appropriate speed, you can compensate + the doubled writes to some degree.
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Depending on the workload and on RAID parameters, +\family typewriter +/mars +\family default + may be better placed onto SSDs, or better be placed on HDDs. + There is no general rule. + Just use +\family typewriter +blktrace +\family default + on your real workload, and check several configuration alternatives (also + different RAID levels etc) with +\family typewriter +blkreplay +\family default +. +\end_layout + +\begin_layout Section +Avoiding Inappropriate Clustermanager Types for Medium and Long-Distance + Replication +\begin_inset CommandInset label +LatexCommand label +name "sec:Inappropriate-Clustermanger" + +\end_inset + + +\end_layout + +\begin_layout Standard +This section addresses some widespread misconceptions. + Its main target audience is developers, but sysadmins will profit from + +\series bold +detailed explanations of problems and pitfalls +\series default +. + When the problems described in this section are solved at some point in the future, + this section will be shortened and some relevant parts moved to the appendix. +\end_layout + +\begin_layout Standard +Doing +\series bold +High Availability (HA) +\series default + wrong at +\emph on +concept level +\emph default + may easily get you into trouble, and may cost you several millions of € + or $ in larger installations, or even knock you out of business when disasters + are badly dealt with at higher levels such as clustermanagers.
+\end_layout + +\begin_layout Subsection +General Cluster Models +\end_layout + +\begin_layout Standard +The most commonly known cluster model is called +\series bold +shared-disk +\series default +, and typically controlled by clustermanagers like +\family typewriter +PaceMaker +\family default +: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/shared-disk-model.fig + width 50col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The most important property of shared-disk is that there exists only a single + disk instance. + Nowadays, this disk often has some +\emph on +internal +\emph default + redundancy such as RAID. + At +\emph on +system +\emph default + architecture layer / network level, there exists no redundant disk at all. + Only the application cluster is built redundant. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + It should be immediately clear that shared-disk clusters are only suitable + for short-distance operations in the same datacenter. + Although running one of the data access lines over short distances between + very near-by datacenters (e.g. + 1 km) would be theoretically possible, there would be no sufficient protection + against failure of a whole datacenter.
+\end_layout + +\begin_layout Standard +Both DRBD and MARS belong to a different architectural model called +\series bold +shared-nothing +\series default +: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/shared-nothing-model.fig + width 50col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The characteristic feature of a shared-nothing model is (additional) +\series bold + redundancy at network level +\series default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Shared-nothing +\begin_inset Quotes eld +\end_inset + +clusters +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that the term +\begin_inset Quotes eld +\end_inset + +cluster computing +\begin_inset Quotes erd +\end_inset + + usually refers to short-distance only. + Long-distance coupling should be called +\begin_inset Quotes eld +\end_inset + +grid computing +\begin_inset Quotes erd +\end_inset + + in preference. + As known from the scientific literature, grid computing requires different + concepts and methods in general. + Only for the sake of simplicity, we use +\begin_inset Quotes eld +\end_inset + +cluster +\begin_inset Quotes erd +\end_inset + + and +\begin_inset Quotes eld +\end_inset + +grid +\begin_inset Quotes erd +\end_inset + + interchangeably. +\end_layout + +\end_inset + + +\begin_inset Quotes erd +\end_inset + + could theoretically be built for +\emph on +any +\emph default + distances, from short to medium to long distances. + However, concrete technologies of disk coupling such as synchronous operation + may pose practical limits on the distances (see chapter +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Use-Cases-for" + +\end_inset + +). 
+\end_layout + +\begin_layout Standard +In general, clustermanagers must fit to the model. + Some clustermanagers can be configured to fit to multiple models. + If so, this must be done properly, or you may get into serious trouble. +\end_layout + +\begin_layout Standard +Some people don't know, or they don't believe, that different architectural + models like shared-disk or shared-nothing will +\emph on +require +\emph default + an +\emph on +appropriate +\emph default + type of clustermanager and/or a different configuration. + Failing to do so, by selection of an inappropriate clustermanager type + and/or an inappropriate configuration may be hazardous. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Selection of the right model alone is not sufficient. + Some, if not many, clustermanagers have not been designed for long distances. + As explained in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Special-Requirements-for" + +\end_inset + +, long distances have further +\series bold +hard requirements +\series default +. + Disregarding them may also be hazardous! +\end_layout + +\begin_layout Subsection +Handover / Failover Reasons and Scenarios +\end_layout + +\begin_layout Standard +From a sysadmin perspective, there exist a number of different +\series bold +reasons +\series default + why the application workload must be switched from the currently active + side A to the currently passive side B: +\end_layout + +\begin_layout Enumerate +Some +\series bold +defect +\series default + has occurred at cluster side A or at some corresponding part of the network. +\end_layout + +\begin_layout Enumerate +Some +\series bold +maintenance +\series default + has to be done at side A which would cause a longer downtime (e.g.
+ security kernel update or replacement of core network equipment or maintenance + of UPS or of the BBU cache etc - hardware isn't 24/7/365 in practice, although + some vendors +\emph on +claim +\emph default + it - it is either not really true, or it becomes +\emph on +extremely +\emph default + expensive). +\end_layout + +\begin_layout Standard +Both reasons are valid and must be automatically handled in larger installations. + In order to deal with all of these reasons, the following basic mechanisms + can be used in either model: +\end_layout + +\begin_layout Enumerate + +\series bold +Failover +\series default + (triggered either manually or automatically) +\end_layout + +\begin_layout Enumerate + +\series bold +Handover +\series default + (triggered manually +\begin_inset Foot +status open + +\begin_layout Plain Layout +Automatic triggering could be feasible for prophylactic treatments. +\end_layout + +\end_inset + +) +\end_layout + +\begin_layout Standard +It is important to not confuse handover with failover at concept level. + Not only the reasons / preconditions are very different, but also the +\emph on +requirements +\emph default +. + Example: precondition for handover is that +\emph on +both +\emph default + cluster sides are healthy, while precondition for failover is that +\emph on +some relevant(!) +\emph default + failure has been +\emph on +detected +\emph default + somewhere (whether this is +\emph on +really +\emph default + true is another matter). + Typically, failover must be able to run in masses, while planned handover + often has lower scaling requirements. +\end_layout + +\begin_layout Standard +Not all existing clustermanagers are dealing with all of these cases (or + their variants) equally well, and some are not even dealing with some of + these cases / variants +\emph on +at all +\emph default +.
+ +\end_layout + +\begin_layout Standard +Some clustermanagers cannot easily express the concept of +\begin_inset Quotes eld +\end_inset + +automatic triggering +\begin_inset Quotes erd +\end_inset + + versus +\begin_inset Quotes eld +\end_inset + +manual triggering +\begin_inset Quotes erd +\end_inset + + of an action. + There exists simply no cluster-global switch which selects either +\begin_inset Quotes eld +\end_inset + +manual mode +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +automatic mode +\begin_inset Quotes erd +\end_inset + + (except when you start to hack the code and/or write new plugins; then + you might notice that there is almost no architectural layering / sufficient + separation between mechanism and strategy). + Being forced to permanently use an automatic mode for several hundreds + or even thousands of clusters is not only boring, but bears a considerable + risk when automatics make a wrong decision at hundreds of instances in parallel. +\end_layout + +\begin_layout Subsection +Granularity and Layering Hierarchy for Long Distances +\begin_inset CommandInset label +LatexCommand label +name "subsec:Granularity-and-Layering" + +\end_inset + + +\end_layout + +\begin_layout Standard +Many existing clustermanager solutions are dealing with a single cluster + instance, as the term +\begin_inset Quotes eld +\end_inset + + +\emph on +cluster +\emph default +manager +\begin_inset Quotes erd +\end_inset + + suggests. + However, when running several hundreds or thousands of cluster instances, + you likely will not want to manage each of them individually. + In addition, failover should +\emph on +not only +\emph default + be +\emph on +triggered +\emph default + (not to be confused with +\emph on +executed +\emph default +) individually at cluster level, but likely +\emph on +also +\emph default + at a higher granularity such as a room, or a whole datacenter. + Otherwise, some chaos is likely to happen.
+\end_layout + +\begin_layout Standard +Here is what you probably will +\series bold +need +\series default +, possibly in difference to what you may find on the market (whether OpenSource + or not). + For simplicity, the following diagram shows only two levels of granularity, + but can be easily extended to multiple layers of granularity, or to some + concept of various +\emph on +subsets of clusters +\emph default +: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/clustermanager-hierarchy.fig + width 70col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Notice that many existing clustermanager solutions are not addressing the + datacenter granularity at all. + Typically, they use concepts like +\series bold +quorums +\series default + for determining failures +\emph on +at cluster level +\emph default + solely, and then immediately executing failover of the cluster, sometimes + without clean architectural distinction between trigger and execution (similar + to the +\begin_inset Quotes eld +\end_inset + +separation of concerns +\begin_inset Quotes erd +\end_inset + + between +\series bold +mechanism +\series default + and +\series bold +strategy +\series default + in Operating Systems). + Sometimes there is even no internal software layering / modularization + according to this separation of concerns at all. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + When there is no distinction between different levels of granularity, you + are hopelessly bound to a non-extensible and thus non-adaptable system + when you need to operate masses of clusters. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + A lacking distinction between automatic mode and manual mode, and/or lack + of corresponding +\series bold +architectural software layers +\series default + is not only a blatant ignoration of well-established best practices of + +\series bold +software engineering +\series default +, but will bind you even more firmly to an inflexible system. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Terminology: for practical reasons, we use the general term +\begin_inset Quotes eld +\end_inset + +clustermanager +\begin_inset Quotes erd +\end_inset + + also for speaking about layers dealing with higher granularity, such as + datacenter layers, and also for long-distance replication scenarios, although + some terminology from grid computing would be more appropriate in a scientific + background. +\end_layout + +\begin_layout Standard +Please consider the following: when it comes to long-distance HA, the above + layering architecture is also motivated by vastly different numbers of + instances for each layer. + Ideally, the topmost automatics layer should be able to overview several + datacenters in parallel, in order to cope with (almost) global network + problems such as network partitions. + Additionally, it should also detect single cluster failures, or intermediate + problems like +\begin_inset Quotes eld +\end_inset + +rack failure +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +room failure +\begin_inset Quotes erd +\end_inset + +, as well as various types of (partial / intermediate) (replication) network + failures. + Incompatible decisions at each of the different granularities would be + a no-go in practice. 
+ Somewhere and somehow, you need one single +\begin_inset Foot +status open + +\begin_layout Plain Layout +If you have +\emph on +logical pairs of datacenters +\emph default + which are firmly bound together, you could also have several topmost automatics + instances, e.g. + for each +\emph on +pair +\emph default + of datacenters. + However, that would be very +\series bold +inflexible +\series default +, because then you cannot easily mix locations or migrate your servers between + datacenters. + Using +\begin_inset Formula $k>2$ +\end_inset + + replicas with MARS would also become a nightmare. + In your own interest, please don't create any concepts where masses of + hardware are firmly bound to fixed constants at some software layers. +\end_layout + +\end_inset + + top-most +\emph on +logical +\emph default + problem detection / ranking instance, which should be +\emph on +internally distributed +\emph default + of course, typically using some +\series bold +distributed consensus protocol +\series default +; but in difference to many published distributed consensus algorithms it + should be able to work with multiple granularities at the same time. +\end_layout + +\begin_layout Subsection +Methods and their Appropriateness +\end_layout + +\begin_layout Subsubsection +Failover Methods +\begin_inset CommandInset label +LatexCommand label +name "subsec:Failover-Methods" + +\end_inset + + +\end_layout + +\begin_layout Standard +Failover methods are only needed in case of an incident. + They should not be used for regular handover. +\end_layout + +\begin_layout Paragraph +STONITH-like Methods +\end_layout + +\begin_layout Standard +STONITH = Shoot The Other Node In The Head +\end_layout + +\begin_layout Standard +These methods are widely known, although they have several serious drawbacks. + Some people even believe that +\emph on +any +\emph default + clustermanager must +\emph on +always +\emph default + have some STONITH-like functionality. + This is wrong. 
+ There +\emph on +exist +\emph default + alternatives, as shown in the next paragraph. +\end_layout + +\begin_layout Standard +The most obvious drawback is that STONITH will always create a +\series bold +damage +\series default +, by definition. +\end_layout + +\begin_layout Standard +Example: a typical contemporary STONITH implementation uses IPMI for automatical +ly powering off your servers, or at least pushes the (virtual) reset button. + This will +\emph on +always +\emph default + create a certain type of damage: the affected systems will definitely not + be available, at least for some time until they have (manually) rebooted. +\end_layout + +\begin_layout Standard +This is a conceptual contradiction: the reason for starting failover is + that you want to restore availability as soon as possible, but in order + to do so you will first +\emph on +destroy +\emph default + the availability of a particular +\emph on +component +\emph default +. + This may be counter-productive. +\end_layout + +\begin_layout Standard +Example: when your hot standby node B does not work as expected, or if it + works even +\emph on +worse +\emph default + than A before, you will lose some time until you +\emph on +can +\emph default + become operational again at the old side A. +\end_layout + +\begin_layout Standard +Here is an example method for handling a failure scenario. + The old active side A is assumed to be no longer healthy. + The method uses a sequential state transition chain with a STONITH-like + step: +\end_layout + +\begin_layout Description +Phase1 Check whether the hot standby B is currently usable. + If this is violated (which may happen during certain types of disasters), + abort the failover for any affected resources. +\end_layout + +\begin_layout Description +Phase2 +\emph on +Try +\emph default + to shut down the damaged side A (in the +\emph on +hope +\emph default + that there is no +\emph on +serious +\emph default + damage).
+\end_layout + +\begin_layout Description +Phase3 In case phase2 did not work during a grace period / after a timeout, + assume that A is badly damaged and therefore STONITH it. +\end_layout + +\begin_layout Description +Phase4 Start the application at the hot standby B. +\end_layout + +\begin_layout Standard +Notice: any cleanup actions, such as +\series bold +repair +\series default + of defective hard- or software etc, are outside the scope of failover processes. + Typically, they are executed much later when restoring redundancy. +\end_layout + +\begin_layout Standard +Also notice: this method is a +\emph on +heavily +\emph default + distributed one, in the sense that sequential actions are alternated multiple + times on different hosts. + This is known to be cumbersome in distributed systems, in particular in + presence of network problems. +\end_layout + +\begin_layout Standard +\begin_inset CommandInset label +LatexCommand label +name "Phase4-in-more" + +\end_inset + +Phase4 in more detail for DRBD, augmented with some pseudo code for application + control: +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +drbdadm disconnect all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +drbdadm primary --force all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +applicationmanager start all +\end_layout + +\begin_layout Standard +The same phase4 using MARS: +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +marsadm pause-fetch all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +marsadm primary --force all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +applicationmanager start all +\end_layout + +\begin_layout Standard +This sequential 4-phase method is far from optimal, for the following reasons: +\end_layout + +\begin_layout Itemize +The method tries to handle both failover and handover scenarios with one + single sequential 
recipe. + In case of a true failover scenario where it is +\emph on +already known for sure +\emph default + that side A is badly damaged, this method will unnecessarily waste time + for phase 2. + This could be fixed by introduction of a conceptual distinction between + handover and failover, but it would not fix the following problems. +\end_layout + +\begin_layout Itemize +Before phase4 is started (which will re-establish the service from a user's + perspective), a lot of time is wasted by +\emph on +both +\emph default + phases 2 +\emph on +and +\emph default + 3. + Even if phase 2 were skipped, phase 3 would unnecessarily cost some + time. + In the next paragraph, an alternative method is explained which eliminates + any unnecessary waiting time at all. +\end_layout + +\begin_layout Itemize +The above method is adapted to the shared-disk model. + It does not take advantage of the shared-nothing model, where further possibili +ties for better solutions exist. +\end_layout + +\begin_layout Itemize +In case of long-distance network partitions and/or sysadmin / system management + subnetwork outages, you may not even be able to (remotely) start STONITH + at all. + Thus the above method misses an important failure scenario. +\end_layout + +\begin_layout Standard +Some people seem to have a +\emph on +binary +\emph default + view of the healthiness of a system: in their view, a system is either + operational, or it is damaged. + This kind of view is ignoring the fact that some systems may be half-alive, + showing only +\emph on +minor +\emph default + problems, or occurring only from time to time. +\end_layout + +\begin_layout Standard +It is obvious that damaging a healthy system is a bad idea by itself.
+ Even +\emph on +generally +\emph default + damaging a half-alive system in order to +\begin_inset Quotes eld +\end_inset + +fix +\begin_inset Quotes erd +\end_inset + + problems is not generally a good idea, because it may increase the damage + when you don't know the +\emph on +real +\emph default + reason +\begin_inset Foot +status open + +\begin_layout Plain Layout +Example, occurring in masses: an incorrectly installed bootloader, or a + wrong BIOS boot priority order which unexpectedly leads to hangs or infinite + reboot cycles once the DHCP or BOOTP servers are no longer available / + reachable. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +Even worse: in a distributed system +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice: the STONITH concept is more or less associated with short-distance + scenarios where +\series bold +crossover cables +\series default + or similar equipment are used. + The assumption is that crossover cables can't go defective, or at least + it would be an extremely unlikely scenario. + For long-distance replication, this assumption is simply not true. +\end_layout + +\end_inset + + you sometimes +\emph on +cannot(!) +\emph default + know whether a system is healthy, or to what degree it is healthy. + Typical STONITH methods as used in some contemporary clustermanagers are + +\series bold +assuming a worst case +\series default +, even if that worst case is currently not for real. +\end_layout + +\begin_layout Standard +Therefore, avoid the following +\series bold +fundamental flaws +\series default + in failover concepts and healthiness models, which apply to implementors + / configurators of clustermanagers: +\end_layout + +\begin_layout Itemize +Don't mix up knowledge with conclusions about a (sub)system, and also don't + mix this up with the real state of that (sub)system. + In reality, you don't have any knowledge about a complex distributed system.
+ You only may have +\emph on +some +\emph default + knowledge about +\emph on +some +\emph default + parts of the system, but you cannot +\begin_inset Quotes eld +\end_inset + +see +\begin_inset Quotes erd +\end_inset + + a complex distributed system as a whole. + What you think is your knowledge, isn't knowledge in reality: in many cases, + it is +\emph on +conclusion +\emph default +, not knowledge. + Don't mix this up! +\end_layout + +\begin_layout Itemize +Some systems are more complex than your model of them. + Don't neglect important parts (such as networks, routers, switches, cables, + plugs) which may lead you to wrong conclusions! +\end_layout + +\begin_layout Itemize +Don't restrict your mind to boolean models of healthiness. + Doing so can easily create unnecessary damage by construction, and even + at concept level. + You should know from software engineering that defects in concepts or models + are much more serious than simple bugs in implementations. + Choosing the wrong model cannot be fixed as easily as a typical bug or + a typo. +\end_layout + +\begin_layout Itemize +Try to deduce the state of a system as +\series bold +reliably +\series default + as possible. + If you don't know something for sure, don't generally assume that it has + gone wrong. + Don't confuse missing knowledge with the conclusion that something is bad. + Boolean algebra restricts your mind to either +\begin_inset Quotes eld +\end_inset + +good +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +bad +\begin_inset Quotes erd +\end_inset + +. + Use at least +\series bold +tri-state algebra +\series default + which has a means for expressing +\series bold + +\begin_inset Quotes eld +\end_inset + +unknown +\begin_inset Quotes erd +\end_inset + + +\series default +. + Even better: attach a probability to anything you (believe to) know. + Errare humanum est: nothing is absolutely sure.
+\end_layout + +\begin_layout Itemize +Oversimplification: don't report an +\begin_inset Quotes eld +\end_inset + +unknown +\begin_inset Quotes erd +\end_inset + + or even a +\begin_inset Quotes eld +\end_inset + +broken +\begin_inset Quotes erd +\end_inset + + state for a complex system whenever a smaller subsystem exists for which + you have some knowledge (or you can conclude something about it with reasonable + evidence). + Otherwise, your users / sysadmins may draw wrong conclusions, and assume + that the whole system is broken, while in reality only some minor part + has some minor problem. + Users could then likely make wrong decisions, which may then easily lead + to bigger damages. +\end_layout + +\begin_layout Itemize +Murphy's law: +\series bold +never assume that something can't go wrong! +\series default + Doing so is a blatant misconception at topmost level: the +\emph on +purpose +\emph default + of a clustermanager is creating High Availability (HA) out of more or less + +\begin_inset Quotes eld +\end_inset + +unreliable +\begin_inset Quotes erd +\end_inset + + components. + It is the damn duty of both a clustermanager and its configurator to try + to compensate for +\emph on +any +\emph default + failures, +\emph on +regardless of their probability +\emph default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +Never claim that something has only low probability (and therefore it were + not relevant). + In the HA area, you simply +\series bold +cannot know +\series default + that, because you typically have +\emph on +sporadic +\emph default + incidents. + In extreme cases, the +\emph on +purpose +\emph default + of your HA solution is protection against 1 failure per 10 years. + You simply don't have the time to wait for creating incident statistics + about that! +\end_layout + +\end_inset + +, as best as possible.
+\end_layout + +\begin_layout Itemize +Never confuse +\series bold +probability +\series default + with +\series bold +expectancy value! +\series default + If you don't know the mathematical term +\begin_inset Quotes eld +\end_inset + +expectancy value +\begin_inset Quotes erd +\end_inset + +, or if you don't know what this means +\emph on +in practice +\emph default +, don't take responsibility for millions of € or $. +\end_layout + +\begin_layout Itemize +When operating masses of hard- and software: never assume that a particular + failure can occur only at a low number of instances. + There are +\series bold +\emph on +unknown(!) +\emph default + systematic errors +\series default + which may pop up at the wrong time and in huge masses when you don't expect + them. +\end_layout + +\begin_layout Itemize +Multiple layers of fallback: +\emph on +any +\emph default + action can fail. + Be prepared to have a plan B, and even a plan C, and even better a plan + D, wherever possible. +\end_layout + +\begin_layout Itemize +Never increase any damage anywhere, unnecessarily! Always try to +\emph on +minimize +\emph default + any damage! It can be mathematically proven that in deterministic probabilistic + systems having finite state, increases of a damage level +\emph on +at the wrong place +\emph default + will +\emph on +introduce +\emph default + an +\emph on +additional +\emph default + +\emph on +risk +\emph default + of getting into an +\series bold +endless loop +\series default +. + This is also true for nondeterministic systems, as known from formal language + theory +\begin_inset Foot +status open + +\begin_layout Plain Layout +Finite automatons are known to be transformable to deterministic ones, usually + by an exponential increase in the number of states. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Itemize +Use the +\series bold +best effort principle +\series default +.
+ You should be aware of the following fact: in general, it is impossible + to create an +\emph on +absolutely reliable system +\emph default + out of unreliable components. + You can +\emph on +lower +\emph default + the risk of failures to any +\begin_inset Formula $\epsilon>0$ +\end_inset + + by investing a lot of resources and of money, but whatever you do: +\begin_inset Formula $\epsilon=0$ +\end_inset + + is impossible. + Therefore, be careful with boolean algebra. + Prefer approximation methods / optimizing methods instead. + Always do +\emph on +your +\emph default + best, instead of trying to reach a +\emph on +global +\emph default + optimum which likely does not exist at all (because the +\begin_inset Formula $\epsilon$ +\end_inset + + can only +\emph on +converge +\emph default + to an optimum, but will never actually reach it). + The best effort principle means the following: if you discover a method + for improving your operating state by reduction of a (potential) damage + in a reasonable time and with reasonable effort, then +\series bold +simply do it +\series default +. + Don't argue that a particular step is no 100% solution for all of your + problems. + +\emph on +Any +\emph default + +\emph on +improvement +\emph default + is valuable. + +\series bold +Don't miss any valuable step +\series default + having reasonable costs with respect to your budget. + Missing valuable measures which have low costs are certainly a violation + of the best effort principle, because you are not doing +\emph on +your +\emph default + best. + Keep that in mind. +\begin_inset Newline newline +\end_inset + +If you have +\emph on +understood +\emph default + this (e.g. + deeply think at least one day about it), you will no longer advocate STONITH + methods +\emph on +in general +\emph default +, when there are alternatives. 
+ STONITH methods are only valuable when you +\emph on +know in advance +\emph default + that the final outcome (after reboot) will most likely be better, and that + waiting for reboot will most likely +\emph on +pay off +\emph default +. + In general, this condition is +\emph on +not true +\emph default + if you have a healthy hot standby system. + This should be easy to see. + But there exist well-known clustermanager solutions / configurations blatantly + ignoring +\begin_inset Foot +status open + +\begin_layout Plain Layout +For some +\emph on +special(!) +\emph default + cases of the shared-disk model, there exist some justifications for doing + STONITH +\emph on +before +\emph default + starting the application at the hot standby. + Under certain circumstances, it can happen that system A running amok could + destroy the data on your single shared disk (example: a filesystem doubly + mounted +\emph on +in parallel +\emph default +, which will certainly destroy your data, except you are using +\family typewriter +ocfs2 +\family default + or suchalike). + This argument is only valid for +\emph on +passive +\emph default + disks which are +\emph on +directly +\emph default + attached to +\emph on +both +\emph default + systems A and B, such that there is no +\emph on +external +\emph default + means for fencing the disk. + In case of iSCSI running over ordinary network equipment such as routers + or switches, the argument +\begin_inset Quotes eld +\end_inset + +fencing the disk is otherwise not possible +\begin_inset Quotes erd +\end_inset + + does not apply. + You can interrupt iSCSI connection at the network gear, or you can often + do it at cluster A or at the iSCSI target. + Even commercial storage appliances speaking iSCSI can be remotely controlled + for forcefully aborting iSCSI sessions. + In modern times, the STONITH method has no longer such a justification. 
+ The justification stems from ancient times when a disk was a purely passive + mechanical device, and its disk controller was part of the server system. +\end_layout + +\end_inset + + this. + Only when the former standby system does not work as expected (this means + that +\emph on +all +\emph default + of your redundant systems are not healthy enough for your application), + +\emph on +only then +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that STONITH may be needed for (manual or partially automatic) +\emph on +repair +\emph default + in some cases, e.g. + when you know that a system has a kernel crash. + Don't mix up the repair phase with failover or handover phases. + Typically, they are executed at different times. + The repair phase is outside the scope of this section. +\end_layout + +\end_inset + + +\emph default + STONITH is inevitable as a +\emph on +last resort +\emph default + option. +\begin_inset Newline newline +\end_inset + +In short: blindly using STONITH without true need during failover is a violation + of the best effort principle. + You are simply not doing your best. +\end_layout + +\begin_layout Itemize +When your budget is limited, carefully select those improvements which make + your system +\series bold +as reliable as possible +\series default +, given your fixed budget. +\end_layout + +\begin_layout Itemize +Create statistics on the duration of your actions. + Based on this, try to get a +\emph on +balanced +\emph default + optimum between time and costs. +\end_layout + +\begin_layout Itemize +Whatever actions you can +\series bold +start in parallel +\series default + for saving time, do it. + Otherwise you are disregarding the best effort principle, and your solution + will be sub-optimal. + You will require deep knowledge of parallel systems, as well as experience + with dealing with problems like (distributed) races.
+ Notice that +\emph on +any +\emph default + distributed system is +\emph on +inherently parallel +\emph default +. + Don't believe that sequential methods can deliver an optimum solution in + such a difficult area. +\end_layout + +\begin_layout Itemize +If you don't have the +\series bold +necessary skills +\series default + for (a) recognizing already existing parallelism, (b) dealing with parallelism + at concept level, (c) programming and/or configuring parallelism race-free + and deadlock-free (or if you even don't know what a race condition is and + where it may occur in practice), then don't take responsibility for millions + of € or $. +\end_layout + +\begin_layout Itemize +Avoid hard timeouts wherever possible. + Use +\series bold +adaptive timeouts +\series default + instead. + Reason: depending on hardware or workload, the same action A may take a + very short time on cluster 1, but take a very long time on cluster 2. + If you need to guard action A from hanging (which is almost always the + case because of Murphy's law), don't configure any fixed timeout for it. + When having several hundreds of clusters, you would need to use the +\emph on +worst case value +\emph default +, which is the longest time occurring somewhere at the very slow clusters + / slow parts of the network. + This wastes a lot of time in case one of the fast clusters is hanging. + Adaptive timeouts work differently: they use a kind of +\begin_inset Quotes eld +\end_inset + +progress bar +\begin_inset Quotes erd +\end_inset + + to monitor the +\emph on +progress +\emph default + of an action. + They will abort only if there is +\emph on +no progress +\emph default + for a certain amount of time. + Hint: among others, +\family typewriter +marsadm view-*-rest +\family default + commands or macros are your friend. 
+\end_layout + +\begin_layout Paragraph +ITON = Ignore The Other Node +\end_layout + +\begin_layout Standard +This means +\series bold +fencing from application traffic +\series default +, and can be used as an alternative to STONITH when done properly. +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/fencing-hierarchy.fig + width 60col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Fencing from application traffic is best suited for the shared-nothing model, + but can also be adapted to the shared-disk model with some quirks. +\end_layout + +\begin_layout Standard +The idea is simple: always route your application network traffic to the + current (logically) active side, whether it is currently A or B. + Just don't route any application requests to the current (logically) passive + side at all. +\end_layout + +\begin_layout Standard +For failover (and +\emph on +only +\emph default + for that), you +\emph on +should not care about +\emph default + any split brain occurring at the low-level generic block device: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/split-brain-history.fig + width 50col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Although having a split brain at the generic low-level block device, you + now define the +\begin_inset Quotes eld +\end_inset + +logically active +\begin_inset Quotes erd +\end_inset + + and +\begin_inset Quotes eld +\end_inset + +logically passive +\begin_inset Quotes erd +\end_inset + + side by yourself by +\emph on +logically ignoring +\emph default + the +\begin_inset Quotes eld +\end_inset + +wrong +\begin_inset Quotes erd +\end_inset + + side as defined by yourself: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent 
+\align center +\begin_inset Graphics + filename images/split-brain-resolved.fig + width 50col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +This is possible because the generic block devices provided by DRBD or MARS + are completely +\series bold +agnostic +\series default + of the +\begin_inset Quotes eld +\end_inset + +meaning +\begin_inset Quotes erd +\end_inset + + of either version A or B. + Higher levels such as clustermanagers (or humans like sysadmins) can assign + them a meaning like +\begin_inset Quotes eld +\end_inset + +relevant +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +not relevant +\begin_inset Quotes erd +\end_inset + +, or +\begin_inset Quotes eld +\end_inset + +logically active +\begin_inset Quotes erd +\end_inset + + or +\begin_inset Quotes eld +\end_inset + +logically passive +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +As a result of fencing from application traffic, the +\begin_inset Quotes eld +\end_inset + +logically passive +\begin_inset Quotes erd +\end_inset + + side will +\emph on +logically +\emph default + cease any actions such as updating user data, even if it is +\begin_inset Quotes eld +\end_inset + +physically active +\begin_inset Quotes erd +\end_inset + + during split-brain (when two primaries exist in DRBD or MARS sense +\begin_inset Foot +status open + +\begin_layout Plain Layout +Hint: some clustermanagers and/or some people seem to define the term +\begin_inset Quotes eld +\end_inset + +split-brain +\begin_inset Quotes erd +\end_inset + + differently from DRBD or MARS. + In the context of generic block devices, split brain means that the +\emph on +history +\emph default + of both versions has been split to a Y-like +\series bold +fork +\series default + (for whatever reason), such that re-joining them +\emph on +incrementally +\emph default + by ordinary write operations is no longer guaranteed to be possible. 
+ As a slightly simplified definition, you might alternatively use the definition + +\begin_inset Quotes eld +\end_inset + +two incompatible primaries are existing in parallel +\begin_inset Quotes erd +\end_inset + +, which means almost the same in practice. + Details of formal semantics are not the scope of this treatment. +\end_layout + +\end_inset + +). +\end_layout + +\begin_layout Standard +If you already have some load balancing, or BGP, or another +\emph on +mechanism +\emph default + for dynamic routing, you already have an important part for the ITON method. + Additionally, ensure by an appropriate +\emph on +strategy +\emph default + that your balancer status / BGP announcement etc does always coincide with + the +\begin_inset Quotes eld +\end_inset + +logically active +\begin_inset Quotes erd +\end_inset + + side (recall that even during split-brain +\emph on +you +\emph default + must define +\begin_inset Quotes eld +\end_inset + +logically active +\begin_inset Quotes erd +\end_inset + + +\series bold +uniquely +\series default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +A possible strategy is to use a Lamport clock for route changes: the change + with the most recent Lamport timestamp will always win over previous changes. +\end_layout + +\end_inset + + by yourself). +\end_layout + +\begin_layout Standard +Example: +\end_layout + +\begin_layout Description +Phase1 Check whether the hot standby B is currently usable. + If this is violated (which may happen during certain types of disasters), + abort the failover for any affected resources. +\end_layout + +\begin_layout Description +Phase2 Do the following +\emph on +in parallel +\begin_inset Foot +status open + +\begin_layout Plain Layout +For database applications where no transactions should get lost, you should + slightly modify the order of operations: first fence the old side A, then + start the application at standby side B. 
+ However, be warned that even this cannot guarantee that no transaction + is lost. + When the network between A and B is interrupted +\emph on +before +\emph default + the incident happens, DRBD will automatically disconnect, and MARS will + show a lagbehind. + In order to fully eliminate this possibility, you can either use DRBD and + configure it to hang forever during network outages (such that users will + be unable to commit any transactions at all), or you can use the shared-disk + model instead. + But in the latter case, you are introducing a SPOF at the single shared + disk. + The former case is logically almost equivalent to shared-disk, but avoiding + some parts of the physical SPOF. + In a truly distributed system, the famous CAP theorem is limiting your + possibilities. + Therefore, no general solution exists fulfilling all requirements at the + same time. +\end_layout + +\end_inset + +: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Itemize +Start all affected applications at the hot standby B. + This can be done with the same DRBD or MARS procedure as described +\begin_inset CommandInset ref +LatexCommand vpageref +reference "Phase4-in-more" + +\end_inset + +. +\end_layout + +\begin_layout Itemize +Fence A by fixedly routing all affected application traffic to B. +\end_layout + +\end_deeper +\begin_layout Standard +That's all which has to be done for a shared-nothing model. + Of course, this will likely produce a split-brain (even when using DRBD + in place of MARS), but that will not matter from a user's perspective, + because the users will no longer +\begin_inset Quotes eld +\end_inset + +see +\begin_inset Quotes erd +\end_inset + + the +\begin_inset Quotes eld +\end_inset + +logically passive +\begin_inset Quotes erd +\end_inset + + side A through their network. 
+ Only during the relatively small time period where application traffic + was going to the old side A while not replicated to B due to the incident, + a very small number of updates +\emph on +could +\emph default + have gone lost. + In fields like webhosting, this is taken into account. + Users will usually not complain when some (smaller amount of) data is lost + due to split-brain. + They will complain when the service is unavailable. +\end_layout + +\begin_layout Standard +This method is the fastest for restoring availability, because it doesn't + try to execute any (remote) action at side A. + Only from a sysadmin's perspective, there remain some cleanup tasks to + be done during the following repair phase, such as split-brain resolution, + which are outside the scope of this treatment. +\end_layout + +\begin_layout Standard +By running the application fencing step +\emph on +sequentially +\emph default + (including wait for its partial successfulness such that the old side A + can no longer be reached by any users) in front of the failover step, you + may minimize the amount of lost data, but at the cost of total duration. + Your service will take longer to be available again, while the amount of + lost data is typically somewhat smaller. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + A few people might clamour when some data is lost. + In long-distance replication scenarios with high update traffic, there + is +\emph on +simply no way at all +\emph default + for guaranteeing that no data can be lost ever. + According to the laws of Einstein and the laws of Distributed Systems like + the famous CAP theorem, this isn't the fault of DRBD+proxy or MARS, but + simply the +\emph on +consequence +\emph default + of having long distances. 
+ If you want to protect against data loss as best as possible, then don't + use +\begin_inset Formula $k=2$ +\end_inset + + replicas. + Use +\begin_inset Formula $k\geq4$ +\end_inset + +, and spread them over different distances, such as mixed small + medium + + long distances. + Future versions of MARS will support adaptive pseudo-synchronous modes, + which will allow individual adaptation to network latencies / distances. +\end_layout + +\begin_layout Standard +The ITON method can be adapted to shared-disk by additionally fencing the + common disk from the (presumably) failed cluster node A. +\end_layout + +\begin_layout Subsubsection +Handover Methods +\end_layout + +\begin_layout Standard +Planned handover is conceptually simpler, because both sides must be (almost) + healthy as a +\emph on +precondition +\emph default +. + There are simply no pre-existing failures to deal with. +\end_layout + +\begin_layout Standard +Here is an example using DRBD, some application commands denoted as pseudo + code: +\end_layout + +\begin_layout Enumerate +at side A: +\family typewriter +applicationmanager stop all +\end_layout + +\begin_layout Enumerate +at side A: +\family typewriter +drbdadm secondary all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +drbdadm primary all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +applicationmanager start all +\end_layout + +\begin_layout Standard +MARS already has a conceptual distinction between handover and failover. 
+ With MARS, it becomes even simpler, because a generic handover procedure + is already built in: +\end_layout + +\begin_layout Enumerate +at side A: +\family typewriter +applicationmanager stop all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +marsadm primary all +\end_layout + +\begin_layout Enumerate +at side B: +\family typewriter +applicationmanager start all +\end_layout + +\begin_layout Subsubsection +Hybrid Methods +\end_layout + +\begin_layout Standard +In general, a planned handover may fail at any stage. + Notice that such a failure is also a failure, but (partially) caused by + the planned handover. + You have the following alternatives for automatically dealing with such + cases: +\end_layout + +\begin_layout Enumerate +In case of a failure, switch back to the old side A. +\end_layout + +\begin_layout Enumerate +Instead, forcefully switch to the new side B, similar to the methods described + in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Failover-Methods" + +\end_inset + +. +\end_layout + +\begin_layout Standard +Similar options exist for a failed failover (at least in theory), but chances + are lower for actually recovering if you have only +\begin_inset Formula $k=2$ +\end_inset + + replicas in total. +\end_layout + +\begin_layout Standard +Whatever you decide to do in what case in whatever priority order, whether + you decide it in advance or during the course of a failing action: it simply + means that according to the best effort principle, you should +\series bold +never leave your system in a broken state +\series default + when there exists a chance to recover availability with any method. +\end_layout + +\begin_layout Standard +Therefore, you should +\emph on +implement +\emph default + neither handover nor failover in their pure forms. + Always implement hybrid forms following the best effort principle. 
+\end_layout + +\begin_layout Subsection +Special Requirements for Long Distances +\begin_inset CommandInset label +LatexCommand label +name "subsec:Special-Requirements-for" + +\end_inset + + +\end_layout + +\begin_layout Standard +Most contemporary clustermanagers have been constructed for short distance + shared-nothing clusters, or even for +\emph on +local +\emph default + shared-nothing clusters (c.f. + DRBD over crossover cables), or even for shared-disk clusters ( +\emph on +originally +\emph default +, when their +\emph on +concepts +\emph default + were developed). + Blindly using them for long-distance replication without modification / + adaptation bears some additional risks. +\end_layout + +\begin_layout Itemize +Notice that long-distance replication always +\emph on +requires +\emph default + a +\series bold +shared-nothing +\series default + model. +\end_layout + +\begin_layout Itemize +As a consequence, +\series bold +split brain +\series default + can appear +\emph on +regularly +\emph default + during failover. + There is no way for preventing it! This is an +\emph on +inherent property +\emph default + of distributed systems, not limited to MARS (e.g. + also occurring with DRBD if you try to use it over long distances). + Therefore, you +\emph on +must +\emph default + deal with occurrences of split-brain as a +\emph on +requirement +\emph default +. +\end_layout + +\begin_layout Itemize +The probability of +\series bold +network partitions +\series default + is much higher: although you should have been required by Murphy's law + to deal with network partitions already in short-distance scenarios, it + now becomes +\emph on +mandatory +\emph default +. +\end_layout + +\begin_layout Itemize +Be prepared that in case of certain types of (more or less global) internet + partitions, you may not be able to trigger STONITH actions +\emph on +at all +\emph default +. 
+ Therefore, +\series bold +fencing of application traffic +\series default + is +\emph on +mandatory +\emph default +. +\end_layout + +\begin_layout Section + +\family typewriter +systemd +\family default + Templates +\begin_inset CommandInset label +LatexCommand label +name "sec:systemd-Templates" + +\end_inset + + +\end_layout + +\begin_layout Standard +Starting with +\family typewriter +mars0.1stable57 +\family default + (resp. + +\family typewriter +mars0.1abeta9 +\family default +), you may use +\family typewriter +systemd +\family default + as a cluster manager at the Mechanics Layer as explained in section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Granularity-and-Layering" + +\end_inset + +. + MARS will replicate some +\family typewriter +systemd +\family default +-relevant state information across the (big) cluster, so there is some limited + remote operation support. + In particular, automated handover via +\family typewriter +marsadm primary $resource +\family default + is supported. + More features will be added to future releases. +\end_layout + +\begin_layout Subsection +Why +\family typewriter +systemd +\family default +? +\end_layout + +\begin_layout Standard +All major Linux distributions are now +\family typewriter +systemd +\family default + based. + It is the new quasi standard. + Although there have been some discussions in the community about its merits + and shortcomings, it appears to be accepted now in large parts of the Linux + world. +\end_layout + +\begin_layout Standard +Systemd has a few advantages: +\end_layout + +\begin_layout Enumerate +It is running as +\family typewriter +init +\family default + process under the reserved +\family typewriter +pid=1 +\family default +. + If it would ever die, then your system would die. + There is no need for adding a new MARS clustermanager daemon or similar, + which could fail independently from other parts of the system. 
+\end_layout + +\begin_layout Enumerate +Although +\family typewriter +systemd +\family default + has been criticised as being +\begin_inset Quotes eld +\end_inset + +monolithic +\begin_inset Quotes erd +\end_inset + + (referring to its internal software architecture), its +\emph on +usage +\emph default + by sysadmins is easily decomposable into many plugins called +\series bold +units +\series default +. +\end_layout + +\begin_layout Enumerate +Local LXC containers, local VMs, +\family typewriter +iSCSI +\family default + exports, +\family typewriter +nfs +\family default + exports and many other parts of the system are often already controlled + by +\family typewriter +systemd +\family default +. + Together with +\family typewriter +udev +\family default + and other parts, it already controls devices, LVM, mountpoints, etc. + Since MARS is only a particular +\emph on +component +\emph default + in a bigger complicated stack, it is an advantage to use the same (more + or less standardized and well-integrated) tools for managing the whole + stack. +\end_layout + +\begin_layout Standard +Systemd has also a few disadvantages: +\end_layout + +\begin_layout Enumerate +It is not accepted everywhere. + Therefore the +\family typewriter +systemd +\family default + template extensions of +\family typewriter +marsadm +\family default + are not mandatory for MARS operations. + You can implement your own alternatives when necessary. +\end_layout + +\begin_layout Enumerate +It can be messy to deal with. + In particular, it can sometimes +\emph on +believe +\emph default + that the system +\emph on +were +\emph default + in a particular state, although in reality it isn't. + Compensation is hairy. +\end_layout + +\begin_layout Enumerate +Usability / reporting: it is less usable for getting an overview over a + bigger local system, and is practically unusable (out-of-the-box) for managing + a bigger cluster at cluster level. + Monitoring needs to be done separately. 
+\end_layout + +\begin_layout Subsection +Working Principle of the +\family typewriter +systemd +\family default + Template Engine +\begin_inset CommandInset label +LatexCommand label +name "subsec:Working-Principle-of" + +\end_inset + + +\end_layout + +\begin_layout Standard +Systemd already has some very basic templating capabilities. + It is possible to create unit names containing the +\family typewriter +@ +\family default + symbol, which can then be expanded under certain circumstances, e.g. + to tty names etc. + However, automatic expansion is only done when somebody knows the instance + name already +\emph on +in advance +\emph default +. + The author has not found any way for creating instance names out of +\begin_inset Quotes eld +\end_inset + +thin air +\begin_inset Quotes erd +\end_inset + +, such as from dynamically created MARS resource names. + Essentially, an +\emph on +inference machine +\emph default + for systemd templates does not yet exist. +\end_layout + +\begin_layout Standard +This lacking functionality is completed with the following macro processing + capabilities of +\family typewriter +marsadm +\family default +: +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +sloppy +\end_layout + +\end_inset + + Some ordinary or templated +\family typewriter +systemd +\family default + unit files (see +\family typewriter +man systemd.unit +\family default +) can be installed into one of the following directories: +\family typewriter +./systemd-templates +\family default +, +\family typewriter +$HOME/.marsadm/systemd-templates/ +\family default +, +\family typewriter +/etc/marsadm/systemd-templates/ +\family default +, +\family typewriter +/usr/lib/marsadm/systemd-templates/ +\family default +, +\family typewriter +/usr/local/lib/marsadm/systemd-templates/ +\family default +. 
+ Further places can be defined by overriding the $ +\family typewriter +MARS_PATH +\family default + environment variable. +\end_layout + +\begin_layout Standard +From these directories, ordinary systemd unit files will be just copied + into +\family typewriter +/run/systemd/system/ +\family default + (configurable via +\family typewriter +$SYSTEMD_TARGET_DIR +\family default +) and then picked up by +\family typewriter +systemd +\family default + as ordinary unit files. +\end_layout + +\begin_layout Standard +Template unit files are nothing but unit files containing +\family typewriter +@{ +\emph on +varname +\emph default +} +\family default + parts or other macro definitions in their filename, and possibly also in + their bodies, at arbitrary places. + These +\family typewriter +@{...} +\family default + parts are substituted by a +\family typewriter +marsadm +\family default + macro processing engine. +\end_layout + +\begin_layout Standard +The following macro capabilities are currently defined: +\end_layout + +\begin_layout Description + +\family typewriter +@{ +\emph on +varname +\emph default +} +\family default + Expands to the value of the variable. + This can be used both in template filenames and in the content of template + files. + Predefined are the following variables: +\end_layout + +\begin_deeper +\begin_layout Description + +\family typewriter +@{res} +\family default + The MARS resource name. +\end_layout + +\begin_layout Description + +\family typewriter +@{resdir} +\family default + The MARS resource directory +\family typewriter +/mars/resource-$res/ +\family default +. +\end_layout + +\begin_layout Description + +\family typewriter +@{host} +\family default + The local host name as determined by +\family typewriter +marsadm +\family default +, or as overridden by the +\family typewriter +--host= +\family default + parameter. 
+\end_layout + +\begin_layout Description + +\family typewriter +@{cmd} +\family default + The +\family typewriter +marsadm +\family default + command as given on the command line (only reasonable for debugging or + for error messages). +\end_layout + +\begin_layout Description + +\family typewriter +@{ +\emph on +varname +\emph default +} +\family default + Further variables as defined by the macro processor, see section +\begin_inset CommandInset ref +LatexCommand vref +reference "par:Predefined-Variables" + +\end_inset + +, and as definable by +\family typewriter +%let{ +\emph on +varname +\emph default +}{...} +\family default + statements, see also sections +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Predefined-Complex-and" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Predefined-Trivial-Macros" + +\end_inset + +. +\end_layout + +\end_deeper +\begin_layout Description + +\family typewriter +@eval{ +\emph on +text +\emph default +} +\family default + Calls the MARS macro processor as explained in chapter +\begin_inset CommandInset ref +LatexCommand vref +reference "chap:The-Macro-Processor" + +\end_inset + +, and substitutes its output. + Notice that systemd template variables occurring in the macro processor + +\family typewriter +\emph on +text +\family default +\emph default + must be accessed via the macro processor syntax +\family typewriter +%{varname} +\family default +, because the macro processor uses +\family typewriter +% +\family default + as an escape symbol, while the systemd template engine uses +\family typewriter +@ +\family default + instead. + This is necessary for distinction of both layers. 
+ Notice that variables defined via the macro processor syntax +\family typewriter +%let{varname}{value} +\family default + can be afterwards accessed by the template engine via +\family typewriter +@{varname} +\family default + syntax, once the macro engine has finished working on +\family typewriter +\emph on +text +\family default +\emph default +. +\end_layout + +\begin_layout Description + +\family typewriter +^ +\emph on +{varname +\emph default +} +\family default + +\begin_inset space ~ +\end_inset + +or +\begin_inset space ~ +\end_inset + +, +\family typewriter +^ +\emph on +{varname +\emph default +}{ +\emph on +regex +\emph default +} +\family default + This can be used in template filenames only. + The +\family typewriter +\emph on +regex +\family default +\emph default + denotes a delimiter for scanning the filename until the delimiter is reached. + The matching part of the filename is assigned to +\family typewriter +\emph on +varname +\family default +\emph default +, and can be used at any following +\family typewriter + @{ +\emph on +varname +\emph default +} +\family default + substitutions, both in the rest of the filename, and in the content of + the file. + When +\family typewriter +\emph on +regex +\family default +\emph default + is omitted or empty, it defaults to +\family typewriter +- +\family default + (a single minus symbol) which is suitable for matching paths of mountpoints + as written in systemd syntax. +\end_layout + +\begin_layout Description + +\family typewriter +@esc{ +\emph on +text +\emph default +} +\family default + Calls the +\family typewriter +systemd-escape +\family default + tool for conversion of pathnames following the +\family typewriter +systemd +\family default + naming conventions (see +\family typewriter +man systemd-escape +\family default +). + For example, a dash is converted to +\family typewriter + +\backslash +x2d +\family default +. 
+\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Omitting this can lead to problems when your resource names are containing + special characters like dashes or other special symbols (in the sense of + +\family typewriter +systemd +\family default +). + Bugs of this kind are hard to find and to debug. + Either forbid special characters in your installation, or don't forget + to test everything with some crude resource names! +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + Example snippet from a +\family typewriter +.path +\family default + unit. + Please notice where escaping is needed and where it must not be used (also + notice that a dash is sometimes a legal part of the +\family typewriter +.mount +\family default + unit name, but except from the resource name part): +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +[Path] +\end_layout + +\begin_layout Plain Layout + +PathExists=/dev/mars/@{res} +\end_layout + +\begin_layout Plain Layout + +Unit=vol-@escvar{res}.mount +\end_layout + +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Another source of crude bugs is the backslash character in the +\family typewriter +systemd-escape +\family default + substitution, such as from +\family typewriter + +\backslash +x2d +\family default +. + When passed to a shell, such as in certain +\family typewriter +ExecStart= +\family default + statements, the backslash will be removed. + Therefore, don't forget to either replace any single backslash with two + backslashes, or to put the whole pathname in single quotes, or similar. + Always check the result of your substitutions! 
It depends on the +\emph on +target +\emph default + (such as +\family typewriter +bash +\family default +, as opposed to +\family typewriter +systemd +\family default +) whether further escaping of the escapes is needed, or whether it +\emph on +must not +\emph default + be applied. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Become a master of the escaping hell by inserting debug code into your scripts + (reporting to +\family typewriter +/dev/stderr +\family default + or to log files) and do thorough testing like a devil. +\end_layout + +\begin_layout Description + +\family typewriter +@escvar{ +\emph on +varname +\emph default +} +\family default + Equivalent to +\family typewriter +@esc{@{ +\emph on +varname +\emph default +}} +\family default +. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +When creating a new resource via +\family typewriter +marsadm create-resource +\family default +, or when adding a new replica via +\family typewriter +marsadm join-resource +\family default + or similar, the template system will automatically create new instances + for the new resource or its replicas. + Conversely, +\family typewriter +marsadm leave-resource +\family default + and its friends like +\family typewriter +delete-resource +\family default + etc will automatically remove the corresponding template instances from + +\family typewriter +/run/systemd/system/ +\family default +. +\end_layout + +\begin_layout Subsection +Example +\family typewriter +systemd +\family default + Templates +\begin_inset CommandInset label +LatexCommand label +name "subsec:Example-systemd-Templates" + +\end_inset + + +\end_layout + +\begin_layout Standard +These can be found in the MARS repo in the +\family typewriter +systemd/ +\family default + subdirectory. 
+ At the moment, the following are available (subject to further extension + and improvements without notice): +\end_layout + +\begin_layout Description + +\family typewriter +mars.path +\family default + This ensures that the mountpoint +\family typewriter +/mars/ +\family default + is already mounted before +\family typewriter +mars.service +\family default + is started. +\end_layout + +\begin_layout Description + +\family typewriter +mars.service +\family default + This starts and stops the MARS kernel module, provided that +\family typewriter +/mars +\family default + is (somehow) mounted. + The latter can be ensured by classical +\family typewriter +/etc/fstab +\family default + methods, or by +\family typewriter +.mount +\family default + units like your own hand-crafted +\family typewriter +mars.mount +\family default + unit. +\end_layout + +\begin_layout Description + +\family typewriter +mars-trigger.path +\family default + This is used for remote triggering of the marsadm template engine from + another MARS cluster member, e.g. + when initiating a handover. + Local triggering is also possible via +\family typewriter +touch /mars/userspace/systemd-trigger +\family default +. + When triggered, the command +\family typewriter +marsadm systemd-trigger +\family default + is executed. + In turn, this will re-compute all +\family typewriter +systemd +\family default + templates and start those units where the local host is in primary role. +\end_layout + +\begin_layout Description + +\family typewriter +dev-mars-@{res}.path +\family default + This is used for generic triggering of any +\family typewriter +systemd +\family default + unit as set by +\family typewriter +marsadm set-systemd-unit $res $unit +\family default + (see below in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Handover-using-systemd" + +\end_inset + +). 
+\end_layout + +\begin_layout Description + +\family typewriter +^{unit}-@{res}.mount +\family default + This is one of the possible sub-ordinate targets which depend on +\family typewriter +dev-mars-@{res}.path +\family default +. + For fully automatic activation of this target, use something like +\family typewriter +marsadm set-systemd-unit mydata vol-mydata.mount +\family default + or similar. + This will automatically mount +\family typewriter +/dev/mars/mydata +\family default + to the mountpoint +\family typewriter +/vol/mydata +\family default +. + Notice that the template notation +\family typewriter +^{unit} +\family default + can be used for mounting to an arbitrary mountpoint, such as +\family typewriter + /another/mountdir/mydata +\family default +, by using the corresponding systemd template syntax in +\family typewriter +marsadm set-systemd-unit mydata another-mountdir-mydata.mount +\family default +. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + In general, it is good practice to have a +\emph on +consistent +\emph default + name scheme. + Always use the same name for the underlying LV (called disk in MARS terminology +), equal to the MARS resource name, equal to the last part of the mountpoint, + equal to the IQN of an iSCSI export, equal to the NFS share name, equal + to the LXC container name, equal to the KVM/qemu virtual machine name, + and so on. + Messing around with non-systematic naming conventions can easily result + in a hell. 
+\end_layout + +\begin_layout Subsection +Handover involving +\family typewriter +systemd +\begin_inset CommandInset label +LatexCommand label +name "subsec:Handover-using-systemd" + +\end_inset + + +\end_layout + +\begin_layout Standard +First, you need to install your systemd templates into one of the template + directories mentioned in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Working-Principle-of" + +\end_inset + +. + In case you have never used the template engine before, you can create + the first instantiation via +\family typewriter +marsadm systemd-trigger +\family default +. + Afterwards, inspect +\family typewriter +/run/systemd/system/ +\family default + for newly created template instances and check them. +\end_layout + +\begin_layout Standard +For each resource +\family typewriter +$res +\family default +, you should set (potentially different) systemd targets via +\family typewriter +marsadm set-systemd-unit $res +\begin_inset Quotes eld +\end_inset + +$start_unit +\begin_inset Quotes erd +\end_inset + + +\begin_inset Quotes eld +\end_inset + +$stop_unit +\family default + +\begin_inset Quotes erd +\end_inset + +. + Notice that +\family typewriter +$start_unit +\family default + and +\family typewriter +$stop_unit +\family default + are typically denoting different targets (with few exceptions) for the + following reason: +\end_layout + +\begin_layout Description +Example: assume your stack consists of +\family typewriter +vol-@{res}.mount +\family default + and +\family typewriter +nfs-export-@{res}.service +\family default +. + Before the filesystem can be exported via +\family typewriter +nfs +\family default +, it +\emph on +first +\emph default + needs to be mounted. + At startup, +\family typewriter +systemd +\family default + can do this easily for you: just add a +\family typewriter +Requires= +\family default + dependency between both targets, or similar. 
+ However, the situation can become tricky upon shutdown. + Theoretically, +\family typewriter +systemctl stop nfs-export-@{res}.service +\family default + +\emph on +could +\emph default + work in some cases, but in general it is not reliable. + Reason: there might be other +\emph on +sister +\emph default + units which +\emph on +also +\emph default + depend on the mount. + In some cases, you will not necessarily notice those sisters, because systemd + can add further (internal) targets +\emph on +automatically +\emph default +. + The problem is easily solvable by +\family typewriter +systemctl stop vol-@{res}.mount +\family default +, which will automatically tear down all dependencies in reverse order. +\end_layout + +\begin_layout Standard +For maximum safety, +\family typewriter +$start_unit +\family default + should always point at the +\emph on +tip +\emph default + of your stack, while +\family typewriter +$stop_unit +\family default + should point at the +\emph on +bottom +\emph default + (but one level higher than +\family typewriter +/dev/mars/$res +\family default +). +\end_layout + +\begin_layout Standard +Removing any systemd targets is also possible via +\family typewriter +marsadm set-systemd-unit $res +\begin_inset Quotes eld +\end_inset + + +\begin_inset Quotes erd +\end_inset + + +\family default + . + +\end_layout + +\begin_layout Standard +When everything is set up properly, the following should work: +\end_layout + +\begin_layout Enumerate +Issue +\family typewriter +marsadm primary $res +\family default + on another node which is currently in secondary role. +\end_layout + +\begin_layout Enumerate +As a consequence, +\family typewriter +systemctl stop +\begin_inset Quotes eld +\end_inset + +$stop_unit +\begin_inset Quotes erd +\end_inset + + +\family default + should be automatically executed at the old primary side. 
+ +\end_layout + +\begin_layout Enumerate +After a while, the MARS kernel module will notice that +\family typewriter +/dev/mars/$res +\family default + is no longer opened. + You can check this manually via +\family typewriter + marsadm view-device-opened $res +\family default + which will tell you a boolean result. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + In case the device is not closed, ordinary handover cannot proceed, because + somebody could (at least potentially) write some data into it, even after + the handover, which would lead to a split brain. + Therefore MARS +\emph on +must +\emph default + insist that the device is closed before ordinary handover will proceed. + In case it is not closed, you can (a) use +\family typewriter +primary --force +\family default + which will likely provoke a split brain, or (b) check your +\family typewriter +systemd +\family default + configuration or other sources of error why the device is not closed. + Possible reasons could be hanging processes or hanging sessions which might + need a +\family typewriter +kill +\family default + or a +\family typewriter +kill -9 +\family default + or similar. + Notice that +\family typewriter +lsof +\family default + does not catch +\emph on +all +\emph default + possible sources like (recursive or bind-) mounts. +\end_layout + +\begin_layout Enumerate +Once +\family typewriter +/dev/mars/$res +\family default + has disappeared, the ordinary MARS handover from the old primary to the + new site should proceed as usual. +\end_layout + +\begin_layout Enumerate +After +\family typewriter +/dev/mars/$res +\family default + has appeared at the new site, +\family typewriter +systemctl start +\begin_inset Quotes eld +\end_inset + +$start_unit +\begin_inset Quotes erd +\end_inset + + +\family default + should be automatically executed. 
+ +\end_layout + +\begin_layout Standard +The rest depends on your +\family typewriter +systemd +\family default + and its configuration. + For example, you can configure systemd targets for activation of VMs, or + for +\family typewriter +LXC +\family default + containers, or for +\family typewriter +iSCSI +\family default + exports, or for +\family typewriter +nfs +\family default + exports, or for +\family typewriter +glusterfs +\family default + exports, or for whatever you need. + For true geo-redundancy, you will likely have to include some +\family typewriter +quagga +\family default + or +\family typewriter +bird +\family default + or other BGP configurations into your stack. +\end_layout + +\begin_layout Section +Creating Backups via Pseudo Snapshots +\end_layout + +\begin_layout Standard +When all your secondaries are homogeneously located in a standby datacenter, + they will be almost idle all the time. + This is a waste of computing resources. +\end_layout + +\begin_layout Standard +Since MARS is no substitute for a full-fledged backup system, and since + backups may put high system load onto your active side, you may want to + utilize your passive hardware resources in a better way. +\end_layout + +\begin_layout Standard +MARS supports this thanks to its ability to switch the +\family typewriter +pause-replay +\family default + +\emph on +independently +\emph default + from +\family typewriter +pause-fetch +\family default +. +\end_layout + +\begin_layout Standard +The basic idea is simple: just use +\family typewriter +pause-replay +\family default + at your secondary site, but leave the replication of transaction logfiles + intact by deliberately +\emph on +not +\emph default + saying +\family typewriter +pause-fetch +\family default +. 
+ This way, your secondary replica (block device) will stay frozen for a + limited time, without losing your redundancy: since the transaction logs + will continue to replicate in the meantime, you can start +\family typewriter +resume-replay +\family default + at any time, in particular when a primary-side incident should happen unexpecte +dly. + The former secondary will just catch up by replaying the outstanding parts + of the transaction logs in order to become recent. +\end_layout + +\begin_layout Standard +However, some +\emph on +details +\emph default + have to be obeyed. + In particular, the current version of MARS needs an additional +\family typewriter +detach +\family default + operation, in order to release exclusive access to the underlying disk + +\family typewriter +/dev/vg/$res +\family default +. + Future versions of MARS are planned to support this more directly, without + need for an intermediate +\family typewriter +detach +\family default + operation. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Beware: +\family typewriter +mount -o ro /dev/vg/$res +\family default + can lead to +\series bold +unnoticed write operations +\series default + if you are not careful! Some journalling filesystems like +\family typewriter +xfs +\family default + or +\family typewriter +ext4 +\family default + may replay their journals onto the disk, leading to +\emph on +binary +\emph default + differences and thus +\series bold +destroying your consistency +\series default + later when you re-enable +\family typewriter +resume-replay +\family default +! +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Therefore, you may use small LVM snapshots (only in such cases). 
+ Typically, +\family typewriter +xfs +\family default + journal replay will require only a few megabytes. + Therefore you typically don't need much temporary space for this. + Here is a more detailed description of steps: +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm pause-replay $res +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm detach $res +\end_layout + +\begin_layout Enumerate + +\family typewriter +lvcreate --size 100m --snapshot --name ro-$res /dev/vg/$res +\end_layout + +\begin_layout Enumerate + +\family typewriter +mount -o ro /dev/vg/ro-$res /mnt/tmp +\end_layout + +\begin_layout Enumerate +Now draw your backup from +\family typewriter +/mnt/tmp/ +\end_layout + +\begin_layout Enumerate + +\family typewriter +umount /mnt/tmp +\end_layout + +\begin_layout Enumerate + +\family typewriter +lvremove -f /dev/vg/ro-$res +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm up $res +\end_layout + +\begin_layout Standard +Hint: during the backup, the transaction logs will accumulate on +\family typewriter +/mars/ +\family default +. + In order to avoid overflow of +\family typewriter +/mars/ +\family default + (c.f. + section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Defending-Overflow" + +\end_inset + +), don't unnecessarily prolong the backup duration. 
+\end_layout + +\begin_layout Chapter +LV Football / VM Football / Container Football +\begin_inset CommandInset label +LatexCommand label +name "chap:LV-Football" + +\end_inset + + +\end_layout + +\begin_layout Standard +The Football scripts can be obtained in two different ways: +\end_layout + +\begin_layout Enumerate + +\family typewriter +git clone --recurse-submodules +\begin_inset Newline newline +\end_inset + + +\family default +then +\family typewriter +cd mars/football/ +\end_layout + +\begin_layout Enumerate + +\family typewriter +git clone +\end_layout + +\begin_layout Standard +The +\family typewriter +--recurse-submodules +\family default + method is the preferred way for non-developers because the main repo contains + a link to the right version of Football. +\end_layout + +\begin_layout Standard +When switching branches, you should use +\family typewriter +git submodule update +\family default + for synchronizing the Football submodule with the MARS main checkout. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Recommended MARS branch for playing Football is +\family typewriter +mars0.1a.y +\family default +. + Although the old stable branch +\family typewriter +mars0.1.y +\family default + has been updated for the most important +\family typewriter +marsadm +\family default + features +\family typewriter +merge-cluster +\family default + and +\family typewriter +split-cluster +\family default +, it does not scale well for Football and can cause operational problems + when merging too many hosts together, showing some +\begin_inset Formula $O(n^{2})$ +\end_inset + + metadata update behaviour where +\begin_inset Formula $n$ +\end_inset + + is the number of machines in a MARS cluster. 
+ The future branch +\family typewriter +mars0.1b.y +\family default + will contain more scalability improvements; in particular the +\family typewriter +split-cluster +\family default + operation should no longer be needed at all because it is planned to scale + with +\begin_inset Formula $O(k)$ +\end_inset + + where +\begin_inset Formula $k$ +\end_inset + + is the number of resources at a +\emph on +single +\emph default + host. + This should allow creation of a +\emph on +virtual(!) +\emph default + +\family typewriter +BigCluster +\family default + pool at +\emph on +metadata +\emph default + level (where metadata transfer rates are typically measured in KiB/s), + consisting of thousands of machines, while at the same time creating a + +\family typewriter +LocalSharding +\family default + or +\family typewriter +FlexibleSharding +\family default + model at the realtime IO paths (where some petabytes are pumped through + thick pipelines). + Please check the other branches regularly at the github repo whether some + newer branches will be marked +\begin_inset Quotes eld +\end_inset + +stable +\begin_inset Quotes erd +\end_inset + +, or at least +\begin_inset Quotes eld +\end_inset + +beta +\begin_inset Quotes erd +\end_inset + +. + At the moment (spring 2018), +\family typewriter +mars0.1a.y +\family default + is marked +\begin_inset Quotes eld +\end_inset + +beta +\begin_inset Quotes erd +\end_inset + + although it is in production at several thousands of machines for several + months. 
+\end_layout + +\begin_layout Standard +Low-level documentation is available by calling any of the scripts with + +\family typewriter +--help +\family default + parameter (see also appendix +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:football-–help" + +\end_inset + + ff): +\end_layout + +\begin_layout Itemize + +\family typewriter +./ --help +\end_layout + +\begin_layout Itemize + +\family typewriter +./ --help +\end_layout + +\begin_layout Standard +By adding +\family typewriter +--verbose +\family default +, you can get a list of parameters for configuring and tweaking. +\end_layout + +\begin_layout Section +Football Overview +\begin_inset CommandInset label +LatexCommand label +name "sec:Football-Overview" + +\end_inset + + +\end_layout + +\begin_layout Standard +Topmost architectural level (not yet implemented): +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/pool-optimizer.fig + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The planned heart of the Football system is the generic pool optimizer, + which aims to provide functionality similar to Kubernetes, but working + on a sharding architecture. + Instead of controlling +\emph on +stateless +\emph default + Docker containers, its designated goal is to control masses of LVs on thousands + of machines, creating a +\begin_inset Quotes eld +\end_inset + +Virtually Distributed LVM pool +\begin_inset Quotes erd +\end_inset + + (petabytes of total storage), and doing things similar to Software Defined + Storage (SDS) on the virtual pool. +\end_layout + +\begin_layout Standard +In addition to load balancing of storage space (and its special cases like + hardware lifecycle), there will be designated plugins for dealing with + CPU and RAM dimensions. + Further dimensions and a variety of goal functions could be added via future + plugins. 
+ The optimizer itself aims to be as generic as possible, while functionality + and interfaces can be added via plugins and/or drivers. + Future versions might even support DRBD in addition to MARS. + The first version may use a simple greedy algorithm for solving the underlying + +\begin_inset Formula ${\cal NP}$ +\end_inset + +-complete problem, but could be augmented with more sophisticated problem + solvers in future. +\end_layout + +\begin_layout Standard +The automatic operations generated by pool-optimizer will be customizable + by dozens of parameters, and also extendable by action plugins. + At the moment, the following +\family typewriter +\family default + actions are planned: +\end_layout + +\begin_layout Description + +\family typewriter +migrate +\family default + This will move an LV (together with its VM / LXC container / etc) to a + different machine in the machine pool. + This is the classical Football +\begin_inset Quotes eld +\end_inset + +kick +\begin_inset Quotes erd +\end_inset + + operation. +\end_layout + +\begin_layout Description + +\family typewriter +shrink +\family default + This decreases the occupied LV space of a filesystem (currently only +\family typewriter +xfs +\family default + implemented, but easily extendable) via creation of a smaller temporary + LV at the hypervisor, then transferring all data during operations via + local +\family typewriter +rsync +\family default +, then shutting down the VM for a short period, doing a final incremental + +\family typewriter +rsync +\family default +, renaming the copied temporary LV to its original name, restarting the + VM on the new version (which contains the same data as before but wastes + less space), and finally re-establishing the MARS replicas (but of course + with smaller LV size). 
+\end_layout + +\begin_layout Description + +\family typewriter +extend +\family default + This is much easier than shrinking: it first increases the underlying LV + size dynamically on all replicas, then +\family typewriter +marsadm resize +\family default +, and finally calls +\family typewriter +xfs_growfs +\family default + while the filesystem remains mounted and while the VM / container is running. +\end_layout + +\begin_layout Description + +\family typewriter +migrate+shrink +\family default + Similar to +\family typewriter +migrate +\family default + immediately followed by +\family typewriter +shrink +\family default +, but produces less network traffic and runs faster. +\end_layout + +\begin_layout Description + +\family typewriter +migrate+shrink+back +\family default + Use this when there is not enough local temporary space for shrinking. + The LV is first migrated to a temporary host, then shrunk, and finally + migrated back to its original position. +\end_layout + +\begin_layout Standard +By running the overall system in an endless loop, a control loop for permanent + optimization can be established. + Typical periods are every few days, or once a week. + In addition, manual triggering is also possible. +\end_layout + +\begin_layout Standard +The result of an (incremental) pool-optimizer run is a CSV file, which may + be automatically forwarded to the execution engine +\family typewriter +\family default + for +\emph on +manual +\emph default + execution, or to +\family typewriter +\family default + for mass execution on a common control machine. + Alternatively, intermediate steps like manual checking, filtering etc may + be inserted into the processing pipeline. +\end_layout + +\begin_layout Standard +The execution engine +\family typewriter +\family default + resp. + its 1&1-internal variant +\family typewriter +\family default + is already in production at 1&1, and already reached more than 300 migrations + per week. 
+ Architecture of the execution engine: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/football.fig + width 90col% + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The so-called Screener is simply a generic program allowing mass execution + of arbitrary scripts in background +\family typewriter +screen +\family default + sessions. + This allows masses (several hundreds, possibly thousands) of long-lasting + processes (hours or days) to run +\emph on +unattended +\emph default + in background, while allowing a (larger) group of sysadmins to attach / + detach to +\family typewriter +screen +\family default + sessions at any time for corrective by-hand actions, e.g. + in case of failures or other problems, or for supervision, etc. +\end_layout + +\begin_layout Standard +When Screener is combined with the Football execution engine +\family typewriter +\family default +, more specialized functionality is available (via a variety of plugins): +\end_layout + +\begin_layout Itemize +Optional waiting for sysadmin confirmation before some customer downtime + is initiated. +\end_layout + +\begin_layout Itemize +Automatic generation of +\family typewriter +motd +\family default + status reporting to other sysadmins. +\end_layout + +\begin_layout Itemize +Automatic sending of email alerts or status reports, e.g. + on errors or critical errors, etc. + By sending email to SMS gateways, real-time alerting can be configured + (e.g. + over the weekend). +\end_layout + +\begin_layout Itemize +Generic interfacing to external scripts with configurable parameters, e.g. + for triggering monitoring systems, feeding external databases, etc. 
+\end_layout + +\begin_layout Standard +Screener can detect and will automatically manage the following states (in + this example, all state lists are empty): +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +$common_user> ./ list +\end_layout + +\begin_layout Plain Layout + +List of waiting: +\end_layout + +\begin_layout Plain Layout + +List of delayed: +\end_layout + +\begin_layout Plain Layout + +List of condition: +\end_layout + +\begin_layout Plain Layout + +List of running: +\end_layout + +\begin_layout Plain Layout + +List of critical: +\end_layout + +\begin_layout Plain Layout + +List of serious: +\end_layout + +\begin_layout Plain Layout + +List of interrupted: +\end_layout + +\begin_layout Plain Layout + +List of illegal: +\end_layout + +\begin_layout Plain Layout + +List of failed: +\end_layout + +\begin_layout Plain Layout + +List of timeouted: +\end_layout + +\begin_layout Plain Layout + +List of done: +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Screener can discriminate the +\emph on +severity +\emph default + of errors as follows: +\end_layout + +\begin_layout Description + +\family typewriter +failed +\family default + An error occurred +\emph on +outside +\emph default + of critical sections, e.g. + during preparation of LV space etc. + During ordinary operations, VMs / containers are usually running continuously, + and there is no customer impact to be expected. + Typically, +\family typewriter +./ restart $resource +\family default + should fix the problem if it is only a temporary problem. + However, for maximum safety, manual inspection via . +\family typewriter +/ attach $resource +\family default + or inspection of the logfile via . +\family typewriter +/ show $resource +\family default + is recommended before trying an automatic restart. 
+\end_layout + +\begin_layout Description + +\family typewriter +serious +\family default + An error occurred while a VM / container was temporarily stopped, which + +\series bold +would +\series default + normally lead to customer downtime, but Football was able to +\emph on +compensate +\emph default + the problem +\emph on +for now +\emph default + by +\emph on +automatically +\emph default + restarting the VM. + Thus no long-lasting customer impact has likely occurred. + However, manual inspection and repair by sysadmins is likely necessary. +\end_layout + +\begin_layout Description + +\family typewriter +critical +\family default + An +\emph on +uncompensated +\emph default + error occurred during customer downtime. + The VM / container is likely down. + This will need manual sysadmin actions ASAP, such as hardware replacement, + networking fixes, etc. +\end_layout + +\begin_layout Description + +\family typewriter +timeouted +\family default + This means that the script is assumed to hang because it did not produce + any output for more than +\family typewriter +$session_timeout +\family default + seconds (default 3600 * 3 = 3 hours). +\end_layout + +\begin_layout Description + +\family typewriter +illegal +\family default + This means that a precondition is not met. + For example, there is not enough space at the target LVM. +\end_layout + +\begin_layout Description + +\family typewriter +interrupted +\family default + Somebody has pressed +\family typewriter +Ctrl-c +\family default + in a +\family typewriter +screen +\family default + session, or has otherwise sent a signal to the running script. + As a result, a signal +\family typewriter +trap +\family default + has been executed. +\end_layout + +\begin_layout Standard +\noindent +Ordinary Screener states during execution: +\end_layout + +\begin_layout Description + +\family typewriter +running +\family default + This means that a (background) process is currently running. 
+ You can attach to the screen session either manually via +\family typewriter +screen -x $pid.$resource +\family default +, or more comfortably via +\family typewriter +./ attach $resource +\family default +. + Then you can use +\family typewriter +screen +\family default + as documented in +\family typewriter +man screen +\family default +. + The most important operation is detaching via keystrokes +\family typewriter +Ctrl-a d +\family default +. + +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Notice: don't press +\family typewriter +Ctrl-c +\family default + unless you know what you are doing. + In most cases, this will terminate the running process, and in consequence + lead to +\family typewriter +\series bold +interrupted +\family default +\series default + or +\family typewriter +\series bold +failed +\family default +\series default + or even +\family typewriter +\series bold +critical +\family default +\series default + state (depending on the moment of keypress). + Depending on parameter +\family typewriter +drop_shell +\family default +, the Screener session will also terminate, or you will get an interactive + shell for manual repair. +\end_layout + +\begin_layout Description + +\family typewriter +waiting +\family default + When the plugins +\family typewriter +football-waiting +\family default + and +\family typewriter +screener-waiting +\family default + are configured properly (which is +\emph on +not +\emph default + the default), the script execution will pause immediately before a customer + downtime action would be started. + Now any sysadmin from the larger group has a chance to +\family typewriter +./screener attach $resource +\family default + and to press RETURN to continue the waiting script and to personally watch + the course of the critical section. 
+ There are some more comfortable variants like +\family typewriter +./screener continue $resource +\family default + for background continuation of a single session, or +\family typewriter +./screener continue 100 +\family default + which can be used for continuing masses of waiting sessions. + There are further variants which are automatically attaching to sessions, + see Appendix +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:screener–help" + +\end_inset + +. +\end_layout + +\begin_layout Description + +\family typewriter +delayed +\family default + This state is only entered before +\family typewriter +lvremove $resource +\family default + is executed (which will destroy your old internal backup copy), and when + configured appropriately. + Typically, you also need to configure the +\family typewriter +$wait_before_cleanup +\family default + variable in order to avoid endless waiting. + Notice that old LV data gets soon outdated after a while, so please don't + unnecessarily prolong the running time of your scripts by choosing too + long +\family typewriter +$wait_before_cleanup +\family default + values. +\end_layout + +\begin_layout Description + +\family typewriter +condition +\family default + Special case of delay: some condition is currently not met, such as the + +\family typewriter +$business_hours +\family default + feature, where you can configure when customer downtimes are allowed, and + when not. +\end_layout + +\begin_layout Description + +\family typewriter +done +\family default + This means that the script reported successful execution by exit status + +\family typewriter +0 +\family default +. + The background screen session terminated automatically. + You can inspect the logfile manually via +\family typewriter +./ show $resource +\family default +, or by looking into the directory +\family typewriter +$screener_logdir/done/ +\family default +. 
+\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Logfiles of other states can also be inspected (or monitored by standard + tools like +\family typewriter +grep +\family default +) by looking into sister directories, such as +\family typewriter +$screener_logdir/running/ +\family default +. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +When running Screener for several months or years, old logfiles will accumulate + in these directories over time. + Call +\family typewriter +./ purge +\family default + or +\family typewriter +./ cron +\family default + regularly via a cron job, or archive your old logfiles from time to time + via another method. +\end_layout + +\begin_layout Section +HOWTO instantiate / customize Football +\begin_inset CommandInset label +LatexCommand label +name "sec:HOWTO-instantiate-Football" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to install and operate Football, the recommended +\emph on +deployment +\emph default + strategy is bottom-up, layer by layer. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Top-down strategies should be used +\emph on +only +\emph default +, and +\emph on +only +\emph default +, for planning. + An Egyptian pyramid can never be built, even if you had some billions of + workers, by starting at the tip and by creating the foundations as the + very last step. + Such an attempt would end in disaster. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + + +\series bold +Testing +\series default + of each layer +\series bold +separately +\series default + is very important. 
+ Before proceeding to the next higher layer, first ensure that any lower + layer is working +\emph on +correctly +\emph default +. + Otherwise debugging can become tricky. +\end_layout + +\begin_layout Subsection +Block Device Layer +\end_layout + +\begin_layout Standard +Step-by-step instructions can be found in chapter +\begin_inset CommandInset ref +LatexCommand vref +reference "chap:Quick-Start-Guide" + +\end_inset + +. +\end_layout + +\begin_layout Standard +Please ensure that your hardware (including RAID controllers and LVM and + so on), and your operating system, and your network / setup, and MARS is + working correctly before proceeding to the next layer. +\end_layout + +\begin_layout Subsection +Mechanics Layer of Cluster Operations +\begin_inset CommandInset label +LatexCommand label +name "subsec:Mechanics-Layer of Cluster" + +\end_inset + + +\end_layout + +\begin_layout Standard +In the following example, it is assumed that +\family typewriter +systemd +\family default + is used, as explained in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:systemd-Templates" + +\end_inset + +, and now applied to +\family typewriter +vm4711 +\family default + supposed to run on hypervisors +\family typewriter +hyper1234a +\family default + (primary role) and +\family typewriter +hyper1234b +\family default + (secondary role), which is assumed to be controllable via the following + +\family typewriter +systemd +\family default + start and stop units: +\end_layout + +\begin_layout Itemize + +\family typewriter +marsadm set-systemd-unit vm4711 vol-vm4711.mount +\end_layout + +\begin_layout Standard +Test the cluster mechanics layer like in the following example: +\end_layout + +\begin_layout Itemize +On host +\family typewriter +hyper1234b +\family default +, the following must work: +\family typewriter +marsadm primary vm4711 +\end_layout + +\begin_layout Standard +This must result in an automatic handover of +\family typewriter +vm4711 
+\family default + from the current primary site +\family typewriter +hyper1234a +\family default + to the new primary +\family typewriter +hyper1234b +\family default +, as explained in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:systemd-Templates" + +\end_inset + +. + Please check that +\family typewriter +vm4711 +\family default + is running correctly at the new location. + It must be reachable via network. + In case you are using BGP because +\family typewriter +hyper1234a +\family default + and +\family typewriter +hyper1234b +\family default + are located in different datacenters, ensure that BGP is also controlled + by your +\family typewriter +systemd +\family default + unit dependencies, and test it. +\end_layout + +\begin_layout Subsection +Mechanics Layer of Football Operations +\begin_inset CommandInset label +LatexCommand label +name "subsec:Mechanics-Layer-of-Football" + +\end_inset + + +\end_layout + +\begin_layout Standard +At the moment, there are two alternative plugins already implemented in + the Football sub-project (see subdirectory +\family typewriter +football/plugins/ +\family default +). + Of course, you can implement some further plugins. + Please put them under GPL, and share them. + Please contact the author of MARS for inclusion into the official MARS + release. +\end_layout + +\begin_layout Description + +\family typewriter +\family default + This plugin can be only used at Shared Hosting Linux (ShaHoLin) at 1&1, + since it is bound to a specific +\emph on +proprietary +\emph default + instance. + However, the +\emph on +sourcecode +\emph default + of the +\emph on +plugin +\emph default + itself (not the code called by the plugin, e.g. + over REST interfaces) is under GPL. 
+ You can (and +\emph on +should +\emph default +) +\emph on +inspect +\emph default + the plugin code, and +\series bold +learn +\series default + how a real-world system (which has grown over some decades and bears a + lot of history) is actually working at certain points. +\begin_inset Newline newline +\end_inset + +This plugin is automatically activated when called via the symlink +\family typewriter +\family default + instead of directly calling +\family typewriter +\family default +. + This has historic reasons. +\end_layout + +\begin_layout Description + +\family typewriter +\family default + This plugin uses the new +\family typewriter +systemd +\family default + interface of +\family typewriter +marsadm +\family default + for controlling the mechanics. + See section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:systemd-Templates" + +\end_inset + +. + You should be familiar with commands like +\family typewriter +marsadm set-systemd-unit +\family default +. + Manual handover via +\family typewriter +marsadm primary $resource +\family default + must be already working (with high reliability +\begin_inset Formula $\leadsto$ +\end_inset + + check that any +\family typewriter +umount +\family default + works everywhere without hangups) before you can start using this plugin + for +\family typewriter +\family default +. +\begin_inset Newline newline +\end_inset + +This plugin is automatically activated when calling + It can be deactivated by overriding variable +\family typewriter +enable_basic +\family default +=0. +\end_layout + +\begin_layout Subsubsection +Configuring and Overriding Variables +\end_layout + +\begin_layout Standard +A detailed list of all available customization options can be obtained via + +\family typewriter +./ --help --verbose +\family default +. + Each option is documented by some help text, and you can always see the + default settings. 
+ See also section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:football-help-verbose" + +\end_inset + +. +\end_layout + +\begin_layout Standard +If you create any new plugin for Football, or if you modify an existing + one, please follow these standards. + Try to describe any option as concisely as possible. +\end_layout + +\begin_layout Standard +Configuring is possible in the following ways, in order of precedence: +\end_layout + +\begin_layout Itemize +at the command line via +\family typewriter +./ --$variable_name=$value $arguments +\family default +. +\end_layout + +\begin_layout Itemize +via environment variables, e.g. + globally via +\family typewriter +export $variable_name=$value && ./ $arguments +\family default +, or locally via +\family typewriter +$variable_name=$value ./ $arguments +\family default +. +\end_layout + +\begin_layout Itemize +by adding some small +\family typewriter +football-*.conf +\family default + files into one of the directories +\family typewriter +/usr/lib/mars/plugins +\family default + +\family typewriter +/etc/mars/plugins +\family default + +\family typewriter +$script_dir/plugins +\family default + +\family typewriter +$HOME/.mars/plugins +\family default + +\family typewriter +./plugins +\family default +, in this order of precedence. + This list of directories can be modified externally over the environment + variable +\family typewriter +football_includes +\family default + (but not during already running inclusions of +\family typewriter +football-*.conf +\family default + files). +\end_layout + +\begin_layout Subsubsection + +\family typewriter +\family default + Customization +\end_layout + +\begin_layout Standard +Here is a brief summary of the most important configuration tasks and options: +\end_layout + +\begin_layout Description + +\family typewriter +initial_hostname_file +\family default + Somehow, the +\family typewriter +\family default + plugin must know the hostnames of your pool.
+ Once Football is working, the hostname will be +\emph on +automatically +\emph default + maintained whenever +\family typewriter +marsadm join-cluster +\family default + or +\family typewriter +marsadm merge-cluster +\family default + is executed somewhere. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +For your hardware deployment strategy, this means the following: just deploy + any new hardware, or remove your old one (after Football has emptied all + of your former LV resources). + It does not matter how you are doing this, e.g. + via OpenStack, or via the proprietary +\family typewriter +Schlunix +\family default + methods used at ShaHoLin, or whatever. + Then you have the following options for adding the new machines to the + Football hostname cache (see variable +\family typewriter +hostname_cache +\family default +): +\end_layout + +\begin_deeper +\begin_layout Enumerate +Write the pure hostname(s) into the file as configured with +\family typewriter +initial_hostname_file +\family default + (by default: +\family typewriter +./hostnames.input +\family default +). + Each hostname must be on its own ASCII line. + Not only these new hosts will be picked up automatically, but also... +\end_layout + +\begin_layout Enumerate +...any further hosts reported anywhere (at the already known hosts) by +\family typewriter +marsadm view-cluster-members +\family default +, +\series bold +transitively +\series default +. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Consequence: if you are running the new +\family typewriter +mars0.1b.y +\family default + (or newer) branch of MARS, you don't need +\family typewriter +marsadm split-cluster +\family default + anymore. 
+ Then you can operate several thousands of machines as a big +\series bold +virtual +\series default + cluster, even if their storage is local (see +\family typewriter +LocalSharding +\family default + model described in section +\begin_inset CommandInset ref +LatexCommand vref +reference "subsec:Variants-of-Sharding" + +\end_inset + +). +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Previous versions of MARS, like +\family typewriter +mars0.1.y +\family default + and +\family typewriter +mars0.1a.y +\family default +, are not yet scalable at their +\series bold +metadata +\series default + exchange level. + Trying to +\family typewriter +join-cluster +\family default + or +\family typewriter +merge-cluster +\family default + several tens or even hundreds of machines with those versions will surely + lead to a disaster. + Always use +\family typewriter +marsadm split-cluster +\family default + at those versions, regularly. + First upgrade to the future +\family typewriter +mars0.1b.y +\family default + (or later versions) before creating big clusters at +\emph on +metadata +\emph default + level! +\end_layout + +\begin_layout Enumerate +Use +\family typewriter +./ basic_add_host $hostname +\family default + for adding a single new host manually. + Afterwards, the transitive closure of all reachable hosts is computed as + usual. + This may also be used for the very first initialization of a fresh Football + installation, provided you already have a big cluster at metadata level. 
+\end_layout + +\end_deeper +\begin_layout Standard +Test the Football mechanics like one of the following example command sequences, + where it is assumed that +\family typewriter +hyper4321a +\family default + and +\family typewriter +hyper4321b +\family default + are already +\emph on +newly +\emph default + deployed hypervisors having enough local LVM storage, and have been already + added to the MARS cluster via +\family typewriter +marsadm join-cluster +\family default +, or have been at least added to +\family typewriter +hostname_cache +\family default + as explained above: +\end_layout + +\begin_layout Itemize + +\family typewriter +ssh-add; ./ migrate vm4711 hyper4321a hyper4321b +\end_layout + +\begin_layout Itemize + +\family typewriter +ssh-add; ./ migrate vm4711 hyper4321a hyper4321b --screener; ./screener.s +h attach vm4711 +\end_layout + +\begin_layout Standard +Check the automatically produced logfile (via +\family typewriter +./ show vm4711 +\family default +) that Football has automatically determined the old hypervisor where +\family typewriter +vm4711 +\family default + was running before, that it has automatically executed +\family typewriter +marsadm merge-cluster +\family default + when necessary, and has created the LV replicas at the new hypervisors, + and has executed some +\family typewriter +marsadm join-resource +\family default + commands, has automatically waited for MARS fast fullsync to finish, then + successfully executed an automatic handover to the new primary hypervisor, + and finally has destructed the old MARS replicas including their old LVs. + Check that +\family typewriter +vm4711 +\family default + is running correctly at the new hypervisor pair, and that handover between + the new hypervisor sites +\family typewriter +*a +\family default + and +\family typewriter +*b +\family default + is working correctly. 
+ +\end_layout + +\begin_layout Standard +A larger group of sysadmins can co-work over a central common control machine + via ssh agent forwarding (which must be enabled in +\family typewriter +/etc/ssh/sshd_config +\family default +) in the following way: +\end_layout + +\begin_layout Itemize +At the workstation: +\family typewriter +ssh-add; ssh -A +\family default + +\begin_inset Newline newline +\end_inset + +Then +\family typewriter +cd $script_dir +\family default + and run your +\family typewriter +./ +\family default + or +\family typewriter +./ +\family default + commands as usual. + The automatically generated logfiles will be tagged with the +\emph on +real +\emph default + usernames from your original workstation login, as reported by +\family typewriter +ssh-add -l +\family default +, even transitively when using ssh agent forwarding. + Thus you may use a common username like +\family typewriter +football +\family default + on the common +\begin_inset Foot +status open + +\begin_layout Plain Layout +Of course, it is also possible to maintain individual accounts for the same + Unix group, and set +\family typewriter +umask +\family default + and common directory permissions accordingly, such that the classical group-wis +e working concept from the 1970s will do the rest. + This is much more work, but can establish more fine-grained access control. + Even more sophisticated methods could involve ACLs, but suchalike is probably + only necessary at extremely high-sensitive installations. +\end_layout + +\end_inset + + control machine. +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Hint: use +\family typewriter +./ list +\family default + (or one of its more specific variants like +\family typewriter +./ list-running +\family default +) for determining what's currently going on in a larger group of sysadmins. 
+\end_layout + +\begin_layout Chapter +MARS for Developers +\end_layout + +\begin_layout Standard +This chapter is organized strictly top-down. +\end_layout + +\begin_layout Standard +If you are a sysadmin and want to inform yourself about internals (useful + for debugging), the relevant information is at the beginning, and you don't + need to dive into all technical details at the end. +\end_layout + +\begin_layout Standard +If you are a kernel developer and want to contribute code to the emerging + MARS community, please read it (almost) all. + Due to the top-down organization, sometimes you will need to follow some + forward references in order to understand details. + Therefore I recommend reading this chapter twice in two different reading + modes: in the first reading pass, you just get a raw network of principles + and structures in your brain (you don't want to grasp details, therefore + don't strive for a full understanding). + In the second pass, you will exploit your knowledge from the first pass + for a deeper understanding of the details. +\end_layout + +\begin_layout Standard +Alternatively, you may first read the sections about general architecture, + and then start a bottom-up scan by first reading the last section about + generic objects and aspects, and working in reverse +\emph on +section +\emph default + order (but read +\emph on +sub +\emph default +sections in-order) until you finally reach the kernel interfaces / symlink + trees. +\end_layout + +\begin_layout Section +Motivation / Politics +\end_layout + +\begin_layout Standard +MARS is not yet upstream in the Linux kernel. + This section tries to clear up some potential doubts.
+ Some people have asked why MARS uses its own internal framework instead + of +\emph on +directly +\emph default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that +\emph on +indirect +\emph default + use of pre-existing Linux infrastructure is not only possible, but actually + implemented, by using it +\emph on +internally +\emph default + in brick +\emph on +implementations +\emph default + (black-box principle). + However, such bricks are not portable to other environments like userspace. +\end_layout + +\end_inset + + being based on some already existing Linux kernel infrastructures like + the device mapper. + Here is a list of technical reasons: +\end_layout + +\begin_layout Enumerate +The existing device mapper infrastructure is based on +\family typewriter +struct bio +\family default +. + In contrast, the new XIO personality of the generic brick infrastructure + is based on the concept of AIO (Asynchronous IO), which is a +\series bold +true superset +\series default + of block IO. +\end_layout + +\begin_layout Enumerate +In particular, +\family typewriter +struct bio +\family default + is firmly referencing to +\family typewriter +struct page +\family default + (via intermediate +\family typewriter +struct bio_vec +\family default +), using types like +\family typewriter +sector_t +\family default + in the field +\family typewriter +bi_sector +\family default +. + Basic transfer units are blocks, or sectors, or pages, or the like.
+ In contrast, +\family typewriter +struct aio_object +\family default + used by the XIO personality can address +\series bold +arbitrary granularity +\series default + memory with byte resolution even at odd +\begin_inset Foot +status open + +\begin_layout Plain Layout +Some brick +\emph on +implementations +\emph default + (as opposed to the capabilities of the +\emph on +interface +\emph default +) may be (and, in fact, +\emph on +are +\emph default +) restricted to +\family typewriter +PAGE_SIZE +\family default + operations or the like. + This is no general problem, because IOP can automatically insert some translato +r bricks extending the capabilities to universal granularity (of course + at some performance costs). +\end_layout + +\end_inset + + positions in (virtual) files / devices, similar to classical Unix file + IO, but +\emph on +asynchronously +\emph default +. + Practical experience shows that even non-functional properties like performance + of many datacenter workloads are profiting from that +\begin_inset Foot +status open + +\begin_layout Plain Layout +The current transaction logger uses variable-sized headers at +\begin_inset Quotes eld +\end_inset + +odd +\begin_inset Quotes erd +\end_inset + + addresses. + Although this increases +\family typewriter +memcpy() +\family default + load due to +\begin_inset Quotes eld +\end_inset + +misalignment +\begin_inset Quotes erd +\end_inset + +, the +\emph on +overall performance +\emph default + was provably better than in variants where sector / page alignment was + strictly obeyed, but space was wasted for alignments. + Such functionality is only possible if the XIO infrastructure +\emph on +allows +\emph default + +\emph on +for +\emph default + (but doesn't force) +\begin_inset Quotes eld +\end_inset + +mis-aligned +\begin_inset Quotes erd +\end_inset + + IO operations. + In future, many different transaction logfile formats showing different + runtime behaviour (e.g. 
+ optimized for high-throughput SSD loads) may co-exist in parallel. + Note that properly aligned XIO operations bear no noticeable overhead compared + to classical block IO, at least in typical datacenter RAID scenarios. +\end_layout + +\end_inset + +. + The AIO/XIO abstraction contains no fixed link to kernel abstractions and + should be +\series bold +easily portable +\series default + to other environments. + In summary, the new personality provides a uniform abstraction which abstracts + away from multiple different kernel interfaces; it is designed to be useful + even in userspace. +\end_layout + +\begin_layout Enumerate +Kernel infrastructures for the concept of +\emph on +direct IO +\emph default + are different from those for +\emph on +buffered IO +\emph default +. + The XIO personality used by MARS subsumes both concepts as use case +\emph on +variants +\emph default +. + +\series bold +Buffering +\series default + is an optional internal property of XIO bricks (almost non-functional property + with support for consistency guarantees). +\end_layout + +\begin_layout Enumerate +The AIO/XIO personality is generically designed for remote operations over + networks, at arbitrary places in the IO stack, with (almost +\begin_inset Foot +status open + +\begin_layout Plain Layout +By default, automatic network connection re-establishment and infinite network + retries are already implemented in the +\family typewriter +xio_client +\family default + and +\family typewriter +xio_server +\family default + bricks to provide fully transparent semantics. + However, this may be undesirable in case of fatal crashes. + Therefore, abort operations are also configurable, as well as network timeouts + which are then mapped to classical IO errors. +\end_layout + +\end_inset + +) no semantic differences to local operations (built-in +\series bold + network transparency +\series default +). 
+ There are universal provisions for mixed operation of different versions + ( +\series bold +rolling software updates +\series default + in clusters / grids). +\end_layout + +\begin_layout Enumerate +The generic brick infrastructure (as well as its personalities like XIO + or any other future personality) supports +\series bold +dynamic re-wiring / re-configuration +\series default + +\emph on +during +\emph default + operation (even while parallel IO requests are flying, some of them taking + different paths in the IO stack in parallel). + This is absolutely needed for MARS logfile rotation. + In the long term, this would be useful for many advanced new features and + products, not limited to multipathing. +\end_layout + +\begin_layout Enumerate +The generic brick infrastructure (and in turn all personalities) provide + +\series bold +additional comfort +\series default + to the programmer while enabling +\series bold +increased functionality +\series default +: by use of a generalization of +\series bold +aspect orientation +\series default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +Similar to AOP, insertion of IOP bricks for checking / debugging etc is + one of the key advantages of the generic brick infrastructure. + In contrast to AOP where debugging is usually {en,dis}abled statically + at compile time, IOP allows for +\emph on +dynamic +\emph default + (re-)configuration of debugging bricks, automatic repair, and many more + features promoted by +\emph on +organic computing +\emph default +. +\end_layout + +\end_inset + +, the programmer need no longer worry about dynamic memory allocations for + +\emph on +local state +\emph default + in a brick instance. + MARS is +\series bold +automating local state +\series default + even when dynamically instantiating new bricks (possibly having the same + brick type) at runtime. 
+ Specifically, XIO is automating +\series bold +request stacking +\series default + at the completion path this way, even while dynamically reconfiguring the + IO stack +\begin_inset Foot +status open + +\begin_layout Plain Layout +The generic aspect orientation approach leads to better +\series bold +separation of concerns +\series default +: local state needed by brick implementations is not visible from outside + by default. + In other words, local state is also +\series bold +private state +\series default +. + Accidental hampering of internal operations is impeded. +\end_layout + +\begin_layout Plain Layout +Example from the kernel: in +\family typewriter +include/linux/blkdev.h +\family default + the definition of +\family typewriter +struct request +\family default + contains the following comment: +\family typewriter +/* the following two fields are internal, NEVER access directly */ +\family default +. + It appears that +\family typewriter +struct request +\family default + contains not only fields relevant for the caller, but also +\series bold +internal fields +\series default + needed only in +\emph on +some +\emph default + +\emph on +specific +\emph default + callees. + For example, +\family typewriter +rb_node +\family default + is documented to be used only in IO schedulers. +\end_layout + +\begin_layout Plain Layout +XIO goes one step further: there need not exist exactly one IO scheduler + instance in the IO stack for a single device. + Future +\family typewriter +xio_scheduler_{deadline,cfq,...} +\family default + brick types could be each instantiated many times, and in arbitrary places, + even for the same (logical) device. + The equivalent of +\family typewriter +rb_node +\family default + would then be automatically instantiated multiple times for the same IO + request, by automatically instantiating the right local aspect instances. +\end_layout + +\end_inset + +.
+ A similar automation +\begin_inset Foot +status open + +\begin_layout Plain Layout +DM can achieve stacking and dynamic routing by a workaround called +\emph on +request cloning +\emph default +, potentially leading to mass creation of temporary / intermediate object + instances. +\end_layout + +\end_inset + + does not exist in the rest of the Linux kernel. +\end_layout + +\begin_layout Enumerate +The generic brick infrastructure, together with personalities like XIO, + enables +\series bold +new long-term functional and non-functional opportunities +\series default + by use of concepts from instance-oriented programming (IOP +\begin_inset Foot +status open + +\begin_layout Plain Layout +See +\begin_inset Flex URL +status collapsed + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + + +\end_layout + +\end_inset + +). + The application area is +\series bold +not limited to device drivers +\series default +. + For example, a new personality for +\emph on +stackable filesystems +\emph default + could be developed in future. +\end_layout + +\begin_layout Standard +In summary, anyone who would insist that MARS should be +\emph on +directly +\begin_inset Foot +status open + +\begin_layout Plain Layout +Notice that kernel-specific structures like +\family typewriter +struct bio +\family default + are of course used by MARS, but only +\emph on +inside +\emph default + the blackbox implementation of bricks like +\family typewriter +mars_bio +\family default + or +\family typewriter +mars_if +\family default + which act as +\series bold +adaptors +\series default + to/from that structure. + It is possible to write further adaptors, e.g. + for direct interfacing to the device mapper infrastructure. +\end_layout + +\end_inset + + +\emph default + based on pre-existing kernel structures / frameworks instead of contributing + a new framework would cause a +\emph on +massive regression of functionality +\emph default +. 
+\end_layout + +\begin_layout Itemize +On one hand, all code contributed by the MARS project is +\series bold +non-intrusive +\series default + into the rest of the Linux kernel. + From the viewpoint of other parts of the kernel, the whole addition +\emph on +behaves +\emph default + +\emph on +like +\emph default + a driver (although its infrastructure is much more than a driver). +\end_layout + +\begin_layout Itemize +On the other hand, if people are interested, the contributed infrastructure + +\emph on +may +\emph default + be used to +\emph on +add +\emph default + to the power of the Linux kernel. + It is designed to be +\series bold +open for contributions +\series default +. +\end_layout + +\begin_layout Itemize +A +\emph on +possible +\emph default + (but not the only possible) way to do this is giving the generic brick + framework / the XIO personality as well as future personalities / the MARS + application the status of a +\emph on +subsystem +\emph default + inside the kernel (in the long term), similar to the SCSI subsystem or + the network subsystem. + No one is forced to use it, but anybody may use it if he/she likes. +\end_layout + +\begin_layout Itemize +Politically, the author is a FOSS advocate willing to collaborate and to + support anyone interested in contributions. + The author's personal interest is long-term and is open for both in-tree + and out-of-tree extensions of both the framework and MARS by any other + party obeying the GPL and not hazarding FOSS by patents (instead supporting + organizations like the Open Invention Network). + The author is open to closer relationships with the Linux Foundation and + other parts of the Linux ecosystem.
+\end_layout + +\begin_layout Section +Architecture Overview +\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MARS_Framework_Architecture.pdf + width 100col% + +\end_inset + + +\end_layout + +\begin_layout Section +Some Architectural Details +\end_layout + +\begin_layout Standard +The following pictures show some +\begin_inset Quotes eld +\end_inset + +zones of responsibility +\begin_inset Quotes erd +\end_inset + +, not necessarily a strict hierarchy (although Dijkstra's famous layering + rules from THE are tried to be respected as much as possible). + The construction principle follows the concept of +\series bold +Instance Oriented Programming +\series default + (IOP) described in +\begin_inset Flex URL +status collapsed + +\begin_layout Plain Layout + + +\end_layout + +\end_inset + +. + Please note that MARS is only instance- +\emph on +based +\emph default + +\begin_inset Foot +status open + +\begin_layout Plain Layout +Similar to OOP, where +\begin_inset Quotes eld +\end_inset + +object-based +\begin_inset Quotes erd +\end_inset + + means a weaker form of +\begin_inset Quotes eld +\end_inset + +object-oriented +\begin_inset Quotes erd +\end_inset + +, the term +\begin_inset Quotes eld +\end_inset + +instance-based +\begin_inset Quotes erd +\end_inset + + means that the +\emph on +strategy +\emph default + brick layer need not be fully modularized according to the IOP principles, + but the +\emph on +worker +\emph default + brick layer already is. +\end_layout + +\end_inset + +, while MARS Full is planned to be fully instance- +\emph on +oriented +\emph default +. 
+\end_layout + +\begin_layout Subsection +MARS Architecture +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/mars-light-architecture.fig + width 40col% + +\end_inset + + +\end_layout + +\begin_layout Subsection +MARS Full Architecture (planned) +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename images/mars-full-architecture.fig + width 80col% + +\end_inset + + +\end_layout + +\begin_layout Section +Documentation of the Symlink Trees +\begin_inset CommandInset label +LatexCommand label +name "sec:Documentation-of-the" + +\end_inset + + +\end_layout + +\begin_layout Standard +The +\family typewriter +/mars/ +\family default + symlink tree is serving the following purposes, all at the same time: +\end_layout + +\begin_layout Enumerate +For +\series bold +communication +\series default + between cluster nodes, see sections +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Lamport-Clock" + +\end_inset + + and +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:The-Symlink-Tree" + +\end_inset + +. + This communication is even the +\emph on +only +\emph default + communication between cluster nodes (apart from the +\emph on +contents +\emph default + of transaction logfiles and sync data). +\end_layout + +\begin_layout Enumerate + +\series bold +\emph on +Internal +\emph default + interface +\series default + between the kernel module and the userspace tool +\family typewriter +marsadm +\family default +. +\end_layout + +\begin_layout Enumerate + +\series bold +\emph on +Internal +\emph default + persistent repository +\series default + which keeps state information between reboots (also in case of node crashes). + It is even the +\emph on +only +\emph default + place where state information is kept. + There is no other place like +\family typewriter +/etc/drbd.conf +\family default +. 
+\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + +Because of its internal character, its representation and semantics may + change at any time without notice (e.g. + via an +\emph on +internal +\emph default + upgrade procedure between major releases). + It is +\emph on +not +\emph default + an external interface to the outer world. + Don't build anything on it. +\end_layout + +\begin_layout Standard +However, knowledge of the symlink tree is useful for advanced sysadmins, + for +\series bold +human inspection +\series default + and for +\series bold +debugging +\series default +. + And, of course, for developers. +\end_layout + +\begin_layout Standard +As an +\begin_inset Quotes eld +\end_inset + +official +\begin_inset Quotes erd +\end_inset + + interface from outside, only the +\family typewriter +marsadm +\family default + command should be used. +\end_layout + +\begin_layout Subsection +Documentation of the MARS Symlink Tree +\end_layout + +\begin_layout Section +XIO Worker Bricks +\end_layout + +\begin_layout Section +StrategY Worker Bricks +\end_layout + +\begin_layout Standard +NYI +\end_layout + +\begin_layout Section +The XIO Brick Personality +\end_layout + +\begin_layout Section +The Generic Brick Infrastructure Layer +\end_layout + +\begin_layout Section +The Generic Object and Aspect Infrastructure +\end_layout + +\begin_layout Chapter +\start_of_appendix +Technical Data MARS +\begin_inset CommandInset label +LatexCommand label +name "chap:Technical-Data-MARS" + +\end_inset + + +\end_layout + +\begin_layout Standard +MARS has some built-in limitations which should be overcome +\begin_inset Foot +status open + +\begin_layout Plain Layout +Some internal algorithms are quadratic. 
+ The reason is that MARS evolved from a lab prototype which wasn't originally + intended for enterprise grade usage, but should have been succeeded by + the fully instance-oriented MARS Full much earlier. +\end_layout + +\end_inset + + by the future MARS Full. + Please don't exceed the following limits: +\end_layout + +\begin_layout Itemize +maximum 10 nodes per cluster +\end_layout + +\begin_layout Itemize +maximum 10 resources per cluster +\end_layout + +\begin_layout Itemize +maximum 100 logfiles per resource +\end_layout + +\begin_layout Chapter +Handout for Midnight Problem Solving +\begin_inset CommandInset label +LatexCommand label +name "chap:Handout-for-Midnight" + +\end_inset + + +\end_layout + +\begin_layout Standard +Here are generic instructions for the generic +\family typewriter +marsadm +\family default + and commandline level. + Other levels (e.g. + different types of cluster managers, PaceMaker, control scripts / +\family typewriter +rc +\family default + scripts / +\family typewriter +upstart +\family default + scripts, etc.) should be described elsewhere. +\end_layout + +\begin_layout Section +Inspecting the State of MARS +\end_layout + +\begin_layout Standard +For manual inspection, please prefer the new +\family typewriter +marsadm view all +\family default + over the old +\family typewriter +marsadm view-1and1 all +\family default +. + It shows more appropriate / detailed information. +\end_layout + +\begin_layout Standard +Hint: this might change in future when somebody will program better macros + for the +\family typewriter +view-1and1 +\family default + variant, or create even better other macros.
+\end_layout + +\begin_layout Quotation + +\family typewriter +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +# watch marsadm view all +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +Checking the low-level network connections at runtime: +\end_layout + +\begin_layout Quotation + +\family typewriter +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +# watch "netstat --tcp | grep 777" +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +Meaning of the port numbers (as currently configured into the kernel module, + may change in future): +\end_layout + +\begin_layout Itemize +7777 = metadata / symlink propagation +\end_layout + +\begin_layout Itemize +7778 = transfer of transaction logfiles +\end_layout + +\begin_layout Itemize +7779 = transfer of sync traffic +\end_layout + +\begin_layout Standard +7777 must be always active on a healthy cluster. + 7778 and 7779 will appear only on demand, when some data is transferred. +\end_layout + +\begin_layout Standard +Hint: when one of the columns Send-Q or Recv-Q is constantly at high values, + you might have a network bottleneck. +\end_layout + +\begin_layout Section +Replication is Stuck +\end_layout + +\begin_layout Standard +Indications for a stuck replication: +\end_layout + +\begin_layout Itemize +One of the flags shown by +\family typewriter +marsadm view all +\family default + or +\family typewriter +marsadm view-flags all +\family default + contains a symbol +\family typewriter +"-" +\family default + (dash). + This means that some switch is currently switched off (deliberately). + Please check whether there is a valid reason why somebody else switched + it off.
+ If the switch-off is just by accident, use the following command to fix + the stuck: +\family typewriter + +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +# marsadm up all +\end_layout + +\end_inset + + +\family default +(or replace +\family typewriter +all +\family default + by a particular resource name if you want to start only a specific one). +\begin_inset Newline newline +\end_inset + +Note: +\family typewriter +up +\family default + is equivalent to the sequence +\family typewriter +attach; resume-fetch; resume-replay; resume-sync +\family default +. + Instead of switching each individual knob, use +\family typewriter +up +\family default + as a shortcut for switching on anything which is currently off. +\end_layout + +\begin_layout Itemize + +\family typewriter +netstat --tcp | grep 7777 +\family default + does not show anything. + Please check the following: +\begin_inset Separator latexpar +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Itemize +Is the kernel module loaded? Check +\family typewriter +lsmod | grep mars +\family default +. + When necessary, run +\family typewriter +modprobe mars +\family default +. +\end_layout + +\begin_layout Itemize +Is the network interface down? Check +\family typewriter +ifconfig +\family default +, and/or +\family typewriter +ethtool +\family default + and friends, and fix it when necessary. +\end_layout + +\begin_layout Itemize +Is a +\family typewriter +ping +\family default + possible? If not, fix the network / routing / firewall / etc. + When fixed, the MARS connections should automatically appear after about + 1 minute. +\end_layout + +\begin_layout Itemize +When +\family typewriter +ping +\family default + is possible, but a MARS connection to port 7777 does not appear after a + few minutes, try to connect to remote port 7777 by hand via +\family typewriter +telnet +\family default +. 
+ But don't type anything, just abort the connection immediately when it + works! Typing anything will almost certainly throw a harsh error message + at the other server, which could unnecessarily alarm other people. +\end_layout + +\end_deeper +\begin_layout Itemize +Check whether +\family typewriter +marsadm view all +\family default + shows some progress bars somewhere. + Example: +\family typewriter +\size scriptsize + +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +istore-test-bap1:~# marsadm view all +\end_layout + +\begin_layout Plain Layout + +--------- resource lv-0 +\end_layout + +\begin_layout Plain Layout + + lv-0 OutDated[F] PausedReplay dCAS-R Secondary istore-test-bs1 +\end_layout + +\begin_layout Plain Layout + + replaying: [>...................] 1.21% (12/1020)MiB logs: [2..3] +\end_layout + +\begin_layout Plain Layout + + > fetch: 1008.198 MiB rate: 0 B/sec remaining: --:--:-- hrs +\end_layout + +\begin_layout Plain Layout + + > replay: 0 B rate: 0 B/sec remaining: 00:00:00 hrs +\end_layout + +\end_inset + + +\family default +\size default +At least one of the +\family typewriter +rate: +\family default + values should be greater than 0. + When none of the +\family typewriter +rate: +\family default + values indicate any progress for a longer time, try +\family typewriter +marsadm up all +\family default + again. + If it doesn't help, check and repair the network. + If even this does not help, check the hardware for any IO hangups, or kernel + hangups. + First, check the RAID controllers. + Often (but not certainly), a stuck kernel can be recognized when many processes + are +\emph on +permanently +\emph default + in state "D", for a long time: +\family typewriter +ps ax | grep " D" | grep -v grep +\family default + or similar. + Please check whether there is just an overload, or +\emph on +really +\emph default + a true kernel problem. 
+ Discrimination is not easy, and requires experience (as with any other + system; not limited to MARS). + A truly stuck kernel can only be resurrected by rebooting. + The same holds for any hardware problems. +\end_layout + +\begin_layout Itemize +Check whether +\family typewriter +marsadm view all +\family default + reports any lines like +\family typewriter +WARNING: SPLIT BRAIN at '' detected +\family default +. + In such a case, check that there is +\emph on +really +\emph default + a split brain, before obeying the instructions in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Resolution-of-Split" + +\end_inset + +. + Notice that network outages or missing +\family typewriter +marsadm log-delete-all all +\family default + or +\family typewriter +cron +\family default + may continue to report an old split brain which has gone in the meantime. +\end_layout + +\begin_layout Itemize +Check whether +\family typewriter +/mars/ +\family default + is too full. + For a rough impression, +\family typewriter +df /mars/ +\family default + may be used. + For getting authoritative values as internally used by the MARS emergency-mode + computations, use +\family typewriter +marsadm view-rest-space +\family default + (the unit is GiB). + In practice, the differences are only marginal, at least on bigger +\family typewriter +/mars/ +\family default + partitions. + When there is only few rest space (or none at all), please obey the instruction +s in section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Resolution-of-Emergency" + +\end_inset + +. +\end_layout + +\begin_layout Section +Resolution of Emergency Mode +\begin_inset CommandInset label +LatexCommand label +name "sec:Resolution-of-Emergency" + +\end_inset + + +\end_layout + +\begin_layout Standard +Emergency mode occurs when +\family typewriter +/mars/ +\family default + runs out of space, such that no new logfile data can be written anymore. 
+\end_layout + +\begin_layout Standard +In emergency mode, the primary will write any write requests +\emph on +directly +\emph default + to the underlying disk, as if MARS were not present at all. + Thus, your application will continue to run. + Only the +\emph on +replication +\emph default + as such is stopped. +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Notice: emergency mode means that your secondary nodes are usually in a + +\emph on +consistent +\emph default +, but +\emph on +outdated +\emph default + state (exception: when a sync was running in parallel to the emergency + mode, then the sync will be automatically started over again). + You can check consistency via +\family typewriter +marsadm view-flags all +\family default +. + Only when a local disk shows a lower-case letter +\family typewriter +"d" +\family default + instead of an uppercase +\family typewriter +"D" +\family default +, it is known to be inconsistent (e.g. + during a sync). + When there is a dash instead, it usually means that the disk is detached + or misconfigured or the kernel module is not started. + Please fix these problems first before believing that your local disk is + unusable. + Even if it is really inconsistent (which is very unlikely, typically occurring + only as a consequence of hardware failures, or of the above-mentioned exception +), you have a big chance to recover most of the data via +\family typewriter +fsck +\family default + and friends. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +A currently existing Emergency mode can be detected by +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +primary:~# marsadm view-is-emergency all +\end_layout + +\begin_layout Plain Layout + +secondary:~# marsadm view-is-emergency all +\end_layout + +\end_inset + + Notice: this delivers the current state, telling nothing about the past.
+\end_layout + +\begin_layout Standard +Currently, emergency mode will also show something like +\family typewriter +WARNING: SPLIT BRAIN at '' detected +\family default +. + This ambiguity will be resolved in a future MARS release. + It is however not crucial: the resolution methods for both cases are very + similar. + If in doubt, start emergency resolution first, and only proceed to split + brain resolution if it did not help. +\end_layout + +\begin_layout Standard +Preconditions: +\end_layout + +\begin_layout Itemize +Only current version of MARS: the space at the primary side should have + been already released, and the emergency mode should have been already + left. + Otherwise, you might need the split-brain resolution method from section + +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Resolution-of-Split" + +\end_inset + +. +\end_layout + +\begin_layout Itemize +The network +\series bold +must +\series default + be working. + Check that the following gives an entry for each secondary: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +primary:~# netstat --tcp | grep 7777 +\end_layout + +\end_inset + +When necessary, fix the network first (see instructions above). +\end_layout + +\begin_layout Standard +Emergency mode should now be resolved via the following instructions: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +primary:~# marsadm view-is-emergency all +\end_layout + +\begin_layout Plain Layout + +primary:~# du -s /mars/resource-* | sort -n +\end_layout + +\end_inset + +Remember the affected resources.
+ Best practice is to do the following, starting with the +\emph on +biggest +\emph default + resource as shown by the +\family typewriter +du | sort +\family default + output in reverse order, but +\emph on +starting +\emph default + the following only with the +\emph on +affected +\emph default + resources in the first place: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +secondary1:~# marsadm invalidate +\end_layout + +\begin_layout Plain Layout + +secondary1:~# marsadm log-delete-all all +\end_layout + +\begin_layout Plain Layout + +... + dito with all resources showing emergency mode +\end_layout + +\begin_layout Plain Layout + +... + dito on all other secondaries +\end_layout + +\begin_layout Plain Layout + +primary:~# marsadm log-delete-all all +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +Hint: during the resolution process, some other resources might have gone + into emergency mode concurrently. + In addition, it is possible that some secondaries are stuck at particular + resources while the corresponding primary has +\emph on +not yet +\emph default + entered emergency mode. + Please repeat the steps in such a case, and look for emergency modes at + secondaries additionally. + When necessary, extend your list of +\emph on +affected +\emph default + resources. +\end_layout + +\begin_layout Standard +Hint: be patient. + Deleting large bulks of logfile data may take a long time, at least on + highly loaded systems. + You should give the cleanup processes at least 5 minutes before concluding + that an +\family typewriter +invalidate +\family default + followed by +\family typewriter +log-delete-all +\family default + had no effect! Don't forget to give the +\family typewriter +log-delete-all +\family default + at all cluster nodes, even when seemingly unaffected. 
+\end_layout + +\begin_layout Standard +In very complex scenarios, when the primary roles of different resources + are spread over different hosts (aka mixed operation), you may need to repeat + the whole cycle iteratively for a few cycles until the jam is resolved. +\end_layout + +\begin_layout Standard +If it does not go away, you have another chance by the following split-brain + resolution process, which will also clean up emergency mode as a side effect. +\end_layout + +\begin_layout Section +Resolution of Split Brain and of Emergency Mode +\begin_inset CommandInset label +LatexCommand label +name "sec:Resolution-of-Split" + +\end_inset + + +\end_layout + +\begin_layout Standard +Hint: in many cases (but not guaranteed), the previous recipe for resolution + of emergency mode will also clean up split brain. + Good chances are in case of +\begin_inset Formula $k=2$ +\end_inset + + total replicas. + Please collect your own experiences which method works better for you! +\end_layout + +\begin_layout Standard +Precondition: the network must be working. + Check that the following gives an entry for each secondary: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +primary:~# netstat --tcp | grep 7777 +\end_layout + +\end_inset + + When necessary, fix the network first (see instructions above). +\end_layout + +\begin_layout Standard +Inspect the split brain situation: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +primary:~# marsadm view all +\end_layout + +\begin_layout Plain Layout + +primary:~# du -s /mars/resource-* | sort -n +\end_layout + +\end_inset + +Remember those resources where a message like +\family typewriter +WARNING: SPLIT BRAIN at '' detected +\family default + appears. + Do the following only for +\emph on +affected +\emph default + resources, starting with the biggest one (before proceeding to the next + one).
+\end_layout + +\begin_layout Standard +Do the following with only +\emph on +one +\emph default + resource at a time (before proceeding to the next one), and repeat the + actions on that resource at every secondary (if there are multiple secondaries) +: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +secondary1:~# marsadm leave-resource $res1 +\end_layout + +\begin_layout Plain Layout + +secondary1:~# marsadm log-delete-all all +\end_layout + +\end_inset + +Check whether the split brain has vanished everywhere. + Start over with other resources at their secondaries when necessary. +\end_layout + +\begin_layout Standard +Finally, when no split brain is reported at any (former) secondary, do the + following on the primary: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +primary:~# marsadm log-delete-all all +\end_layout + +\begin_layout Plain Layout + +primary:~# sleep 30 +\end_layout + +\begin_layout Plain Layout + +primary:~# marsadm view all +\end_layout + +\end_inset + + Now, the split brain should be gone even at the primary. + If not, repeat this step. +\end_layout + +\begin_layout Standard +In case even this should fail on some +\family typewriter +$res +\family default + (which is very unlikely), read the PDF manual before using +\family typewriter +marsadm log-purge-all $res +\family default +.
+ +\end_layout + +\begin_layout Standard +Finally, when the split brain is gone everywhere, rebuild the redundancy + at every secondary via +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +secondary1:~# marsadm join-resource $res1 /dev//$res1 +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +If even this method does not help, setup the whole cluster afresh by +\family typewriter +rmmod mars +\family default + everywhere, and creating a fresh +\family typewriter +/mars/ +\family default + filesystem everywhere, followed by the same procedure as installing MARS + for the first time (which is outside the scope of this handout). +\end_layout + +\begin_layout Section +Handover of Primary Role +\end_layout + +\begin_layout Standard +When there exists a method for primary handover in higher layers such as + cluster managers, please prefer that method (e.g. + +\family typewriter +cm3 +\family default + or other tools). +\end_layout + +\begin_layout Standard +If suchalike doesn't work, or if you need to handover some resource +\family typewriter +$res1 +\family default + by hand, do the following: +\end_layout + +\begin_layout Itemize +Stop the load / application corresponding to +\family typewriter +$res1 +\family default + on the old primary side. +\end_layout + +\begin_layout Itemize + +\family typewriter +umount /dev/mars/$res1 +\family default +, or otherwise close any openers such as iSCSI. +\end_layout + +\begin_layout Itemize +At the new primary: +\family typewriter +marsadm primary $res1 +\end_layout + +\begin_layout Itemize +Restart the application at the new site (in reverse order to above). + In case you want to switch +\emph on +all +\emph default + resources which are not yet at the new side, you may use +\family typewriter +marsadm primary all +\family default +. 
+\end_layout + +\begin_layout Section +Emergency Switching of Primary Role +\end_layout + +\begin_layout Standard +Emergency switching is necessary when your primary is no longer reachable + over the network for a +\emph on +longer +\emph default + time, or when the hardware is defective. +\end_layout + +\begin_layout Standard +Emergency switching will very often lead to a split brain, which requires + lots of manual actions to resolve (see above). + Therefore, try to avoid emergency switching when possible! +\end_layout + +\begin_layout Standard +Hint: MARS can automatically recover after a primary crash / reboot, as + well as after secondary crashes, just by executing +\family typewriter +modprobe mars +\family default + after +\family typewriter +/mars/ +\family default + had been mounted. + Please consider to wait until your system comes up again, instead of risking + a split brain. +\end_layout + +\begin_layout Standard +The decision between emergency switching and continuing operation at the + same primary side is an operational one. + MARS can support your decision by the following information at the potentially + new primary side (which was in secondary mode before): +\family typewriter +\size scriptsize + +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +istore-test-bap1:~# marsadm view all +\end_layout + +\begin_layout Plain Layout + +--------- resource lv-0 +\end_layout + +\begin_layout Plain Layout + +lv-0 InConsistent Syncing dcAsFr Secondary istore-test-bs1 +\end_layout + +\begin_layout Plain Layout + +syncing: [====>..............] 
27.84% (567/2048)MiB rate: 72583.00 KiB/sec remaining: 00:00:20 + hrs +\end_layout + +\begin_layout Plain Layout + +> sync: 567.293/2048 MiB rate: 72583 KiB/sec remaining: 00:00:20 hrs +\end_layout + +\begin_layout Plain Layout + +replaying: [>:::::::::::::::::::] 0.00% (0/12902)KiB logs: [1..1] +\end_layout + +\begin_layout Plain Layout + +> fetch: 0 B rate: 38 KiB/s remaining: 00:00:00 +\end_layout + +\begin_layout Plain Layout + +> replay: 12902.047 KiB rate: 0 B/s remaining: --:--:-- +\end_layout + +\end_inset + + +\family default +\size default +When your target is syncing (like in this example), you cannot switch to + it (same as with DRBD). + When you had an emergency mode before, you should first resolve that (whenever + possible). + When a split brain is reported, try to resolve it first (same as with DRBD). + Only in case you +\emph on +know +\emph default + that the primary is really damaged, or it is really impossible to run + the application there for some reason, emergency switching is desirable. +\end_layout + +\begin_layout Standard +Hint: in case the secondary is inconsistent for some reason, e.g. + because of an incremental fast full-sync, you have a last chance to recover + most data after forceful switching by using a filesystem check or suchalike. + This might be even faster than restoring data from the backup. + But use it only if you are +\emph on +really +\emph default + desperate! +\end_layout + +\begin_layout Standard +The amount of data which is +\emph on +known +\emph default + to be missing at your secondary is shown after the +\family typewriter +> fetch: +\family default + in human-readable form. + However, in cases of networking problems this information may be outdated. + You +\emph on +always +\emph default + need to consider further facts which cannot be known by MARS.
+\end_layout + +\begin_layout Standard +When there exists a method for emergency switching of the primary in higher + layers such as cluster managers, please prefer that method in front of + the following one. +\end_layout + +\begin_layout Standard +If suchalike doesn't work, or when a handover attempt has failed several + times, or if you +\emph on +really need +\emph default + forceful switching of some resource +\family typewriter +$res1 +\family default + by hand, you can do the following: +\end_layout + +\begin_layout Itemize +When possible, stop the load / application corresponding to +\family typewriter +$res1 +\family default + on the old primary side. +\end_layout + +\begin_layout Itemize +When possible, +\family typewriter +umount /dev/mars/$res1 +\family default +, or otherwise close any openers such as iSCSI. +\end_layout + +\begin_layout Itemize +When possible (if you have some time), wait until as much data has been + propagated to the new primary as possible (watch the +\family typewriter +fetch: +\family default + indicator). +\end_layout + +\begin_layout Itemize +At the new primary: +\family typewriter +marsadm disconnect $res1; marsadm primary --force $res1 +\end_layout + +\begin_layout Itemize +Restart the application at the new site (in reverse order to above). +\end_layout + +\begin_layout Itemize +After the application is known to run reliably, check for split brains and + cleanup them when necessary. +\end_layout + +\begin_layout Chapter +Alternative Methods for Split Brain Resolution +\begin_inset CommandInset label +LatexCommand label +name "chap:Alternative-Methods-for" + +\end_inset + + +\end_layout + +\begin_layout Standard +Instead of +\family typewriter +marsadm invalidate +\family default +, the following steps may be used. 
+ In preference, start with the old +\begin_inset Quotes eld +\end_inset + +wrong +\begin_inset Quotes erd +\end_inset + + primaries first: +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm leave-resource mydata +\end_layout + +\begin_layout Enumerate +After having done this on one cluster node, check whether the split brain + is already gone (e.g. + by saying +\family typewriter +marsadm view mydata +\family default +). + There are chances that you don't need this on all of your nodes. + Only in very rare +\begin_inset Foot +status open + +\begin_layout Plain Layout +When your network had partitioned in a very awkward way for a long time, + and when your partitioned primaries did several +\family typewriter +log-rotate +\family default + operations independently from each other, there is a small chance that +\family typewriter +leave-resource +\family default + does not clean up +\emph on +all +\emph default + remains of such an awkward situation. + Only in such a case, try +\family typewriter +log-purge-all +\family default +. +\end_layout + +\end_inset + + cases, it might happen that the preceding +\family typewriter +leave-resource +\family default + operations were not able to clean up all logfiles produced in parallel + by the split brain situation. + +\end_layout + +\begin_layout Enumerate +Read the documentation about +\family typewriter +log-purge-all +\family default + (see page +\begin_inset CommandInset ref +LatexCommand pageref +reference "log-purge-all$res" + +\end_inset + +) and use it. +\end_layout + +\begin_layout Enumerate +If you want to restore redundancy, you can follow-up a +\family typewriter +join-resource +\family default + phase to the old resource name (using the correct device name, double-check + it!). This will restore your redundancy by overwriting your bad split brain + version with the correct one.
+\end_layout + +\begin_layout Standard +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +It is important to resolve the split brain +\emph on +before +\emph default + you can start the +\family typewriter +join-resource +\family default + reconstruction phase! In order to keep as many +\begin_inset Quotes eld +\end_inset + +good +\begin_inset Quotes erd +\end_inset + + versions as possible (e.g. + for emergency cases), don't re-join them all in parallel, but rather start + with the oldest / most outdated / worst / inconsistent version first. + It is recommended to start the next one only when the previous one has + successfully finished. +\end_layout + +\begin_layout Chapter +Alternative De- and Reconstruction of a Damaged Resource +\begin_inset CommandInset label +LatexCommand label +name "chap:Alternative-De--and" + +\end_inset + + +\end_layout + +\begin_layout Standard +In case +\family typewriter +leave-resource --host= +\family default + does not work, you may use the following fallback. + On the surviving new designated primary, give the following commands: +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm disconnect-all mydata +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm down mydata +\end_layout + +\begin_layout Enumerate +Check by hand whether your local disk is consistent, e.g. + by test-mounting it readonly, +\family typewriter +fsck +\family default +, etc. +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm delete-resource mydata +\end_layout + +\begin_layout Enumerate +Check whether the other vital cluster nodes don't report the dead resource + any more, e.g. + +\family typewriter +marsadm view all +\family default + at +\emph on +each +\emph default + of them.
+ In case the resource has not disappeared anywhere (which may happen during + network problems), do the +\family typewriter +down ; delete-resource +\family default + steps also there (optionally again with +\family typewriter +--force +\family default +). +\end_layout + +\begin_layout Enumerate +Be sure that the resource has disappeared +\emph on +everywhere +\emph default +. + When necessary, repeat the +\family typewriter +delete-resource +\family default + with +\family typewriter +--force +\family default +. +\end_layout + +\begin_layout Enumerate + +\family typewriter +marsadm create-resource newmydata ... + +\family default + at the +\emph on +correct +\emph default + node using the +\emph on +correct +\emph default + disk device containing the +\emph on +correct +\emph default + version, and further steps to setup your resource from scratch, preferably + under a different name to minimize any risk. +\end_layout + +\begin_layout Standard +\noindent +In any case, +\series bold +manually check +\series default + whether a split brain is reported for any resource on any of your +\emph on +surviving +\emph default + cluster nodes. + If you find one there (and only then), please (re-)execute the split brain + resolution steps on the affected node(s). +\end_layout + +\begin_layout Chapter +Cleanup in case of Complicated Cascading Failures +\begin_inset CommandInset label +LatexCommand label +name "subsec:Cleanup-in-case" + +\end_inset + + +\end_layout + +\begin_layout Standard +MARS does its best to recover even from multiple failures (e.g. + +\series bold +rolling disasters +\series default +). 
+ Chances are high that the instructions from sections +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Split-Brain-Resolution" + +\end_inset + + +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Final-Destroy-of" + +\end_inset + + or appendix +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Alternative-Methods-for" + +\end_inset + + +\begin_inset CommandInset ref +LatexCommand ref +reference "chap:Alternative-De--and" + +\end_inset + + will work even in case of multiple failures, such as a network failure + plus local node failure at only 1 node (even if that node is the former + primary node). +\end_layout + +\begin_layout Standard +However, in general (e.g. + when more than 1 node is damaged and/or when the filesystem +\family typewriter +/mars/ +\family default + is badly damaged) there is no general guarantee that recovery will +\emph on +always +\emph default + succeed under +\emph on +any +\emph default + (weird) circumstances. + That said, your chances for recovery are +\emph on +very +\emph default + high when some disk remains usable at least at one of your surviving secondarie +s. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +It should be very hard to finally trash a secondary, because the transaction + logfiles contain +\family typewriter +md5 +\family default + checksums for all data records. + Any attempt to replay corrupted logfiles is refused by MARS. + In addition, the sequence numbers of +\family typewriter +log-rotate +\family default +d logfiles are checked for contiguity.
+ Finally, the +\emph on +sequence path +\emph default + of logfile applications (consisting of logfile names plus their respective + length) is additionally secured by a +\family typewriter +git +\family default +-like incremental checksum over the whole path history (so-called +\begin_inset Quotes eld +\end_inset + +version links +\begin_inset Quotes erd +\end_inset + +). + This should detect split brains even if logfiles are appended / modified + +\emph on +after +\emph default + a (forceful) switchover has already taken place. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresToxiques.png + lyxscale 50 + scale 17 + +\end_inset + + That said, your risk of final data loss is very high if you remove the + +\series bold +BBU +\series default + from your hardware RAID controller before all hot data has been flushed + to the physical disks. + Therefore, never try to +\begin_inset Quotes eld +\end_inset + +repair +\begin_inset Quotes erd +\end_inset + + a seemingly dead node before your replication is up again somewhere else! + Only unplug the network cables when advised, but never try to repair the + hardware instantly! +\end_layout + +\begin_layout Standard +In case of desperate situations where none of the previous instructions + have succeeded, your last chance is rebuilding all your resources from + intact disks as follows: +\end_layout + +\begin_layout Enumerate +Do +\family typewriter +rmmod mars +\family default + on all your cluster nodes and/or reboot them. 
+ Note: if you are less desperate, chances are high that the following will + also work when the kernel module remains active and everywhere a +\family typewriter +marsadm down +\family default + is given instead, but for an +\emph on +ultimate +\emph default + instruction you should eliminate +\emph on +potential +\emph default + kernel problems by +\family typewriter +rmmod +\family default + / +\family typewriter +reboot +\family default +, at least if you can afford the downtime on concurrently operating resources. +\end_layout + +\begin_layout Enumerate +For safety, physically remove the storage network cables on +\emph on +all +\emph default + your cluster nodes. + Note: the same disclaimer holds. + MARS really does its best, even when +\family typewriter +delete-resource +\family default + is given while the network is fully active and multiple split-brain primaries + are actively using their local device in parallel (approved by some testcases + from the automatic test suite, but note that it is impossible to catch + all possible failure scenarios). + Don't challenge your fate if you are desperate! Don't +\emph on +rely +\emph default + on this! Nothing is absolutely fail-safe! +\end_layout + +\begin_layout Enumerate + +\series bold +Manually +\series default + check which surviving disk is usable, and which is the +\begin_inset Quotes eld +\end_inset + +best +\begin_inset Quotes erd +\end_inset + + one for your purpose. +\end_layout + +\begin_layout Enumerate +Do +\family typewriter +modprobe mars +\family default + +\emph on +only +\emph default + on that node. + If that fails, +\family typewriter +rmmod +\family default + and/or reboot again, and start over with a completely fresh +\family typewriter +/mars/ +\family default + partition ( +\family typewriter +mkfs.ext4 /mars/ +\family default + or similar) +\emph on +everywhere +\emph default + on +\emph on +all +\emph default + cluster nodes, and continue with step 7. 
+\end_layout + +\begin_layout Enumerate +If your old +\family typewriter +/mars/ +\family default + works, and you did not already (forcefully) switch your designated primary + to the final destination, do it now (see description in section +\begin_inset CommandInset ref +LatexCommand ref +reference "subsec:Forced-Switching" + +\end_inset + +). + Wait until any old logfile data has been replayed. +\end_layout + +\begin_layout Enumerate +Say +\family typewriter +marsadm delete-resource mydata --force +\family default +. + This will clean up all internal symlink tree information for the resource, + but will leave your disk data intact. +\end_layout + +\begin_layout Enumerate +Locally build up the new resource(s) as usual, out of the underlying disks. +\end_layout + +\begin_layout Enumerate +Check whether the new resource(s) work in standalone mode. +\end_layout + +\begin_layout Enumerate +When necessary, repeat these steps with other resources. +\end_layout + +\begin_layout Standard +Now you can choose how to rebuild your cluster. + If you rebuilt +\family typewriter +/mars/ +\family default + anywhere, you +\emph on +must +\emph default + rebuild it on +\emph on +all +\emph default + new cluster nodes and start over with a fresh +\family typewriter +join-cluster +\family default + on each of them, from scratch. + It is not possible to mix the old cluster with the new one. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +begin{enumerate} +\backslash +setcounter{enumi}{9} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +item +\end_layout + +\end_inset + + Finally, do all the necessary +\family typewriter +join-resource +\family default +s on the respective cluster nodes, according to your new redundancy scenario + after the failures (e.g. + after activating spare nodes, etc). 
+ If you have +\begin_inset Formula $k>2$ +\end_inset + + replicas, start +\family typewriter +join-resource +\family default + on the worst / most damaged version first, and start the next preferably + only after the previous sync has completed successfully. + This way, you will be permanently retaining some (old and outdated, but + hopefully potentially usable) replicas while a sync is running. + Don't start too many syncs in parallel. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +end{enumerate} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Never use +\family typewriter +delete-resource +\family default + twice on the same resource name, after you have already a working standalone + primary +\begin_inset Foot +status open + +\begin_layout Plain Layout +Of course, when you don't have created the +\emph on +same +\emph default + resource anew, you may repeat +\family typewriter +delete-resource +\family default + on other cluster nodes in order to get rid of local files / symlinks which + had not been propagated to other nodes before. +\end_layout + +\end_inset + +. + You might accidentally destroy your again-working copy! You +\emph on +can +\emph default + issue +\family typewriter +delete-resource +\family default + multiple times on different nodes, e.g. + when the network has problems, but doing so +\emph on +after +\emph default + re-establishment of the initial primary bears some risk. + Therefore, the safest way is first deleting the resources everywhere, and + then starting over afresh. +\end_layout + +\begin_layout Standard +Before re-connecting any network cable on any non-primary (new secondaries), + ensure that all +\family typewriter +/dev/mars/mydata +\family default + devices are no longer in use (e.g. 
+ from an old primary role before the incident happened), and that each local + disk is detached. + Only after that, you should be able to safely re-connect the network. + The +\family typewriter +delete-resource +\family default + given at the new primary should propagate now to each of your secondaries, + and your local disk should be usable for a re- +\family typewriter +join-resource +\family default +. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +When you did not rebuild your cluster from scratch with fresh +\family typewriter +/mars/ +\family default + filesystems, and one of the old cluster nodes is supposed to be removed + permanently, use +\family typewriter +leave-resource +\family default + (optionally with +\family typewriter +--host= +\family default + and/or +\family typewriter +--force +\family default +) and finally +\family typewriter +leave-cluster +\family default +. +\end_layout + +\begin_layout Chapter +Experts only: Special Trick Switching and Rebuild +\begin_inset CommandInset label +LatexCommand label +name "chap:Experts-only:-Special" + +\end_inset + + +\end_layout + +\begin_layout Standard +The following is a further alternative for +\series bold +experts +\series default + who really know what they are doing. + The method is very simple and therefore well-suited for coping with mass + failures, e.g. + +\series bold +power blackout of whole datacenters +\series default +. +\end_layout + +\begin_layout Standard +In case a primary datacenter fails as a whole for whatever reason and you + have a backup datacenter, do the following steps in the backup datacenter: +\end_layout + +\begin_layout Enumerate +Fencing step: by means of firewalling, +\series bold +ensure +\series default + that the (virtually) damaged datacenter nodes +\series bold +cannot +\series default + be reached over the network. 
+ For example, you may place REJECT rules into all of your local iptables + firewalls at the backup datacenter. + Alternatively / additionally, you may block the routes at the appropriate + central router(s) in your network. +\end_layout + +\begin_layout Enumerate +Run the sequence +\family typewriter +marsadm disconnect all; marsadm primary --force all +\family default + on all nodes in the backup datacenter. +\end_layout + +\begin_layout Enumerate +Restart your services in the backup datacenter (as far as necessary). + Depending on your network setup, further steps like switching BGP routes + etc may be necessary. +\end_layout + +\begin_layout Enumerate +Check that +\emph on +all +\emph default + your services are +\emph on +really +\emph default + up and running, before you try to repair anything! Failing to do so may + result in data loss when you execute the following restore method for +\emph on +experts +\emph default +. +\end_layout + +\begin_layout Standard +Now your backup datacenter should continue servicing your clients. + The final reconstruction of the originally primary datacenter works as + follows: +\end_layout + +\begin_layout Enumerate +At the damaged primary datacenter, ensure that nowhere the MARS kernel module + is running. + In case of a power blackout, you shouldn't have executed an automatic +\family typewriter +modprobe mars +\family default + anywhere during reboot, so you should be already done when all your nodes + are up again. + In case some nodes had no reboot, execute +\family typewriter +rmmod mars +\family default + everywhere. + If +\family typewriter +rmmod +\family default + refuses to run, you may need to umount the +\family typewriter +/dev/mars/mydata +\family default + device first. + When nothing else helps, you may just mass reboot your hanging nodes. 
+\end_layout + +\begin_layout Enumerate +At the failed side, do +\family typewriter +rm -rf /mars/resource-$mydata/ +\family default + for all those resources which had been primary before the blackout. + Do this +\emph on +only +\emph default + for those cases, otherwise you will need unnecessary +\family typewriter +leave-resource +\family default +s or +\family typewriter +invalidate +\family default +s later (e.g. + when half of your nodes were already running at the surviving side). + In order to avoid unnecessary traffic, please do this only as far as really + necessary. + Don't remove any other directories. + In particular, +\family typewriter +/mars/ips/ +\family default + +\emph on +must +\emph default + remain intact. + In case you accidentally deleted them, or you had to re-create +\family typewriter +/mars/ +\family default + from scratch, try +\family typewriter +rsync +\family default + with the correct options. +\begin_inset Newline newline +\end_inset + + +\begin_inset Graphics + filename images/MatieresCorrosives.png + lyxscale 50 + scale 17 + +\end_inset + + Caution! Before doing this, check that the corresponding directory exists + at the backup datacenter, and that it is +\emph on +really +\emph default + healthy! +\end_layout + +\begin_layout Enumerate +Un-Fencing: restore your network firewall / routes and check that they work + ( +\family typewriter +ping +\family default + etc). +\end_layout + +\begin_layout Enumerate +Do +\family typewriter +modprobe mars +\family default + everywhere. + All missing directories and their missing symlinks should be automatically + fetched from the backup datacenter. +\end_layout + +\begin_layout Enumerate +Run +\family typewriter +marsadm join-resource $res +\family default +, but only at those places where the directory was removed previously, while + using the same disk devices as before. + This will minimize actual traffic thanks to the fast full sync algorithm. 
+\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +It is +\series bold +crucial +\series default + that the fencing step +\series bold +must +\series default + be executed +\emph on +before +\emph default + any +\family typewriter +primary --force +\family default +! This way, no split brain will be +\emph on +visible +\emph default + at the backup datacenter side, because there is simply no chance for transferri +ng different versions over the network. + It is also crucial to remove any (potentially diverging) resource directories + +\emph on +before +\emph default + the +\family typewriter +modprobe +\family default +! This way, the backup datacenter never runs into split brain. + This saves you a lot of detail work for split brain resolution when you + have to restore bulks of nodes in a short time. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +In case the repair of a full datacenter should take so extremely long that + some +\family typewriter +/mars/ +\family default + partitions are about to run out of space at the surviving side, you may + use the +\family typewriter +leave-resource --host=failed-node +\family default + trick described earlier, followed by +\family typewriter +log-delete-all +\family default +. + Best if you have prepared a fully automatic script long before the incident, + which executes suchalike only as far as necessary in each individual case. +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Graphics + filename images/lightbulb_brightlit_benj_.png + lyxscale 12 + scale 7 + +\end_inset + +Even better: train such scenarios in advance, and prepare scripts for mass + automation. + Look into section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Scripting-HOWTO" + +\end_inset + +. 
+\end_layout + +\begin_layout Chapter +Mathematical Model of Architectural Reliability +\begin_inset CommandInset label +LatexCommand label +name "chap:Mathematical-Model-of" + +\end_inset + + +\end_layout + +\begin_layout Standard +The assumptions used in the model are explained in detail in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sub:Detailed-explanation" + +\end_inset + +. + Here is a quick recap of the main parameters: +\end_layout + +\begin_layout Itemize +\begin_inset Formula $n$ +\end_inset + + is the number of basic storage units. + It is also used for the number of application units, assumed to be the + same. +\end_layout + +\begin_layout Itemize +\begin_inset Formula $k$ +\end_inset + + is the replication degree, or number of replicas. + In general, you will have to deploy +\begin_inset Formula $N=k*n$ +\end_inset + + storage servers for getting +\begin_inset Formula $n$ +\end_inset + + basic storage units. + This applies to any of the competing architectures. + +\end_layout + +\begin_layout Itemize +\begin_inset Formula $s$ +\end_inset + + is the architecture-dependent spread exponent: it tells whether a storage + incident will spread to the application units. + Examples: +\begin_inset Formula $s=0$ +\end_inset + + means that there is no spread between storage unit failures and application + unit failures, other than a local 1:1 one. + +\begin_inset Formula $s=1$ +\end_inset + + means that an uncompensated storage node incident will cause +\begin_inset Formula $n$ +\end_inset + + application incidents. +\end_layout + +\begin_layout Itemize +\begin_inset Formula $p$ +\end_inset + + is the probability of a storage server incident. 
+ In the examples at section +\begin_inset CommandInset ref +LatexCommand vref +reference "sec:Reliability-Arguments-from" + +\end_inset + +, a fixed +\begin_inset Formula $p=0.0001$ +\end_inset + + was used for easy understanding, but the following formulae should also + hold for any other +\begin_inset Formula $p\in(0,1)$ +\end_inset + +. +\end_layout + +\begin_layout Itemize +\begin_inset Formula $T$ +\end_inset + + is the observational period, introduced for convenience of understanding. + The following can also be computed independently from any +\begin_inset Formula $T$ +\end_inset + +, as long as the probability +\begin_inset Formula $p$ +\end_inset + + does not change over time, which is assumed. + Because +\begin_inset Formula $T$ +\end_inset + + is only here for convenience of understanding, we set it to +\begin_inset Formula $T=1/p$ +\end_inset + +. + In the examples from section +\begin_inset CommandInset ref +LatexCommand vref +reference "sub:Detailed-explanation" + +\end_inset + +, a fixed +\begin_inset Formula $T=10,000$ +\end_inset + + hours was used. +\end_layout + +\begin_layout Section +Formula for DRBD / MARS +\end_layout + +\begin_layout Standard +We need not discriminate between a storage failure probability S and an applicati +on failure probability A because applications are run locally at the storage + servers 1:1. + The probability for failure of a single shard consisting of +\begin_inset Formula $k$ +\end_inset + + nodes is +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +A_{p}(k)=p^{k} +\] + +\end_inset + +because all +\begin_inset Formula $k$ +\end_inset + + shard members have to be down all at the same time. + In section +\begin_inset CommandInset ref +LatexCommand vref +reference "sub:Detailed-explanation" + +\end_inset + + we assumed that there is no cross-communication between shards. 
+ Therefore they are completely independent from each other, and the total + downtime of +\begin_inset Formula $n$ +\end_inset + + shards during the observational period +\begin_inset Formula $T$ +\end_inset + + is +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +A_{p,T}(k,n)=T*n*p^{k} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +When introducing the spread exponent +\begin_inset Formula $s$ +\end_inset + +, the formula turns into +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +A_{s,p,T}(k,n)=T*n^{s+1}*p^{k} +\] + +\end_inset + + +\end_layout + +\begin_layout Section +Formula for Unweighted BigCluster +\end_layout + +\begin_layout Standard +This is based on the Bernoulli formula. + The probability that exactly +\begin_inset Formula $\bar{k}$ +\end_inset + + storage nodes out of +\begin_inset Formula $N=k*n$ +\end_inset + + total storage nodes are down is +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +\bar{S}_{p}(\bar{k},N)=\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Similarly, the probability for getting +\begin_inset Formula $k$ +\end_inset + + or more storage node failures (up to +\begin_inset Formula $N$ +\end_inset + +) at the same time is +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +S_{p}(k,N)=\sum_{\bar{k}=k}^{N}\bar{S}_{p}(\bar{k},N)=\sum_{\bar{k}=k}^{N}\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +By replacing +\begin_inset Formula $N$ +\end_inset + + with +\begin_inset Formula $k*n$ +\end_inset + + (for conversion of the x axis into basic storage units) and by introducing + +\begin_inset Formula $T$ +\end_inset + + we get +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +S_{p,T}(k,n)=T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}} +\] + +\end_inset + + 
+\end_layout + +\begin_layout Standard +\noindent +For comparability with DRBDorMARS, we have to compute the application downtime + A instead of the storage downtime S, which depends on the spread exponent + +\begin_inset Formula $s$ +\end_inset + + as follows: +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +A_{s,p,T}(k,n)=n^{s+1}*S_{p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Notice that at +\begin_inset Formula $s=0$ +\end_inset + + we have introduced a factor of +\begin_inset Formula $n$ +\end_inset + +, which corresponds to the hashing effect (teardown of +\begin_inset Formula $n$ +\end_inset + + application instances by a single uncompensated storage incident) as described + in section +\begin_inset CommandInset ref +LatexCommand vref +reference "sub:Detailed-explanation" + +\end_inset + +. +\end_layout + +\begin_layout Section +Formula for SizeWeighted BigCluster +\end_layout + +\begin_layout Standard +In difference to above, we need to introduce a correction factor by the + fraction of affected objects, relative to basic storage units. + Otherwise the y axis would not stay comparable due to different units. +\end_layout + +\begin_layout Standard +For the special case of +\begin_inset Formula $k=1$ +\end_inset + +, there is no difference to above. +\end_layout + +\begin_layout Standard +For the special case of +\begin_inset Formula $k=2$ +\end_inset + + replica, the correction factor is +\begin_inset Formula $1/(N-1)$ +\end_inset + +, because we assume that all the replica of the affected first node are + uniformly spread to all other nodes, which is +\begin_inset Formula $N-1$ +\end_inset + +. + The probability for hitting the intersection of the first node with the + second node is thus +\begin_inset Formula $1/(N-1)$ +\end_inset + +. 
+\end_layout + +\begin_layout Standard +For higher values of +\begin_inset Formula $k$ +\end_inset + +, and with a similar argument (never put another replica of the same object + onto the same storage node) we get the correction factor as +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +C(k,N)=\prod_{l=1}^{k-1}\frac{1}{N-l} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +Hint: there are at most +\begin_inset Formula $k$ +\end_inset + + physical replicas on the disks. + For higher values of +\begin_inset Formula $\bar{k}\geq k$ +\end_inset + +, there are +\begin_inset Formula $\binom{\bar{k}}{k}$ +\end_inset + + combinations of object intersections (when assuming that the number of + objects on a node is very large such that no further object repetition can + occur except for the +\begin_inset Formula $k$ +\end_inset + +-fold replica placement). + Thus the generalization to +\begin_inset Formula $\bar{k}\geq k$ +\end_inset + + is +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +C(k,\bar{k},N)=\binom{\bar{k}}{k}\prod_{l=1}^{k-1}\frac{1}{N-l} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +By inserting this into the above formula, we get +\end_layout + +\begin_layout Standard +\begin_inset Formula +\[ +A_{s,p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}C(k,\bar{k},k*n)*\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}} +\] + +\end_inset + + +\end_layout + +\begin_layout Chapter +Command Documentation for Userspace Tools +\begin_inset CommandInset label +LatexCommand label +name "chap:Command-Documentation-for" + +\end_inset + + +\end_layout + +\begin_layout Section + +\family typewriter +marsadm --help +\begin_inset CommandInset label +LatexCommand label +name "sec:marsadm-–help" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +input{} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section 
+ +\family typewriter --help +\begin_inset CommandInset label +LatexCommand label +name "sec:football-–help" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +input{} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section + +\family typewriter --help --verbose +\begin_inset CommandInset label +LatexCommand label +name "sec:football-help-verbose" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +input{} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section + +\family typewriter --help +\begin_inset CommandInset label +LatexCommand label +name "sec:screener–help" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +input{} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section + +\family typewriter --help --verbose +\begin_inset CommandInset label +LatexCommand label +name "sec:screener-help-verbose" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +input{} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Chapter +Football Redundancy Diagrams +\begin_inset CommandInset label +LatexCommand label +name "chap:Football-Redundancy-Diagrams" + +\end_inset + + +\end_layout + +\begin_layout Standard +The following tables are showing the number of replicas during Football. + We focus at the common case of starting with 2 replicas, and ending up + in a total of another 2 replicas at another machines. + Further cases, involving multiple secondaries, should go analogously. +\end_layout + +\begin_layout Standard +Active primaries are colored in red. 
+\end_layout + +\begin_layout Standard +Backup or shadow replicas (which are present at LVM level, but currently + not used by MARS) are in parentheses. + In case of emergency, they could be activated again. +\end_layout + +\begin_layout Standard +Replicas which are not in parentheses are kept in +\family typewriter +UpToDate +\family default + state all the time, until they are retired into backup replicas. +\end_layout + +\begin_layout Section +Parallel +\family typewriter +migrate +\end_layout + +\begin_layout Standard +This creates two additional replicas in parallel, at the target pair. + After handover to the new site, and after some configurable waiting time, + the old replicas are deleted. +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "14col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +migrate +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special 
"totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + 
+\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate x 2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +After Cleanup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +Stepwise +\family typewriter +migrate +\end_layout + +\begin_layout Standard +This variant is useful for hardware lifecycle. + The uplink of the old hardware is only loaded with creation of 1 replica + in migration step 1. + Step 2 creates then another replica at the new hardware, which should have + a better replication network (e.g. + better uplinks and/or better capacity for cross-traffic between datacenters). +\end_layout + +\begin_layout Standard +This variant is selected by parameter +\family typewriter +migrate_two_phase=1 +\family default +. 
+\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "14col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +2-Step +\begin_inset Newline newline +\end_inset + +migrate +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain 
Layout +DST +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + 
+\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +3 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate 2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +After Cleanup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +Plain +\family typewriter +shrink +\end_layout + +\begin_layout Standard +Here we need to discriminate between replicas with the old size, and the + new size (which is typically smaller than the old size). 
+\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "14col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +shrink +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset 
+ +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +old_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +new_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\color inherit ++ (1) +\end_layout + +\end_inset + + +\begin_inset 
Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Working +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(2) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Finished +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(2) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +After Cleanup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + 
+\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +Full +\family typewriter +migrate+shrink +\end_layout + +\begin_layout Standard +This variant is +\emph on +almost +\emph default + equivalent to +\family typewriter +migrate +\family default + followed by +\family typewriter +shrink +\family default +. + The only difference is that cleanup is done +\emph on +later +\emph default +. + This means that more replicas are kept for a longer time. + Thus this variant is safer than doing +\family typewriter +migrate +\family default + and +\family typewriter +shrink +\family default + separately. +\end_layout + +\begin_layout Standard +This variant is selected by parameters +\family typewriter +migrate_two_phase=0 +\family default + and +\family typewriter +migrate_always_all=1 +\family default + and +\family typewriter +migrate_early_cleanup=0 +\family default +: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "14col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +FULL +\begin_inset Newline newline +\end_inset + +migrate+shrink +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC
+\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout 
Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +old_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +new_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate x 2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout 
Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\color inherit ++ (1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Working +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(4) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Finished +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(4) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +After Cleanup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- 
+\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +There is a variant which does early cleanup, which is roughly equivalent + to doing a standalone +\family typewriter +migrate +\family default + followed by a standalone +\family typewriter +shrink +\family default +. + +\end_layout + +\begin_layout Standard +This variant is selected by parameters +\family typewriter +migrate_two_phase=0 +\family default + and +\family typewriter +migrate_always_all=1 +\family default + and +\family typewriter +migrate_early_cleanup=1 +\family default +. + It is less safe because it keeps fewer replicas and is thus less recommended: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "14col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +Sequential +\begin_inset Newline newline +\end_inset + +migrate+shrink +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open
+ +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor 
"none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +old_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +new_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate x 2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Early Cleanup +\end_layout + +\end_inset + + 
+\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\color inherit ++ (1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Working +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(2) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Finished +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain
Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(2) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +After Cleanup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +Stepwise +\family typewriter +migrate+shrink +\end_layout + +\begin_layout Standard +This variant is useful for hardware lifecycle. + The uplink of the old hardware is only loaded with creation of 1 replica + in migration step 1. + Step 2 then creates another replica at the new hardware, which should have + a better replication network.
+ +\end_layout + +\begin_layout Standard +This variant is selected by parameters +\family typewriter +migrate_two_phase=1 +\family default + and +\family typewriter +migrate_always_all=1 +\family default + and +\family typewriter +migrate_early_cleanup=0 +\family default +: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "14col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +2-Step +\begin_inset Newline newline +\end_inset + +migrate+shrink +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" 
+hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +old_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +new_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Start 
+\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +3 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate 2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 
+\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\color inherit ++ (1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Working +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(4) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Finished +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(4) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +After Cleanup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 
+\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +This variant can also be combined with early cleanup. + The result is similar to above. + The only difference is that the second additional replica is created on + the new hardware. + +\end_layout + +\begin_layout Standard +This variant is selected by parameters +\family typewriter +migrate_two_phase=1 +\family default + and +\family typewriter +migrate_always_all=0 +\family default + and +\family typewriter +migrate_early_cleanup=1 +\family default +. + Again, this variant is less safe and therefore less recommended. +\end_layout + +\begin_layout Standard +However, it keeps at least 2 (backup) replicas all the time and thus could + be an alternative when decommissioning of old hardware is time-critical. 
+\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "14col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +2-Step +\begin_inset Newline newline +\end_inset + +migrate+shrink +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout 
Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +old_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +new_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + 
+\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +3 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate 2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Early Cleanup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + 
+\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\color inherit ++ (1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Working +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(2) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Finished +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(2) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset 
Text + +\begin_layout Plain Layout +After Cleanup +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Section +FAST +\family typewriter +migrate+shrink +\end_layout + +\begin_layout Standard +This variant tries to strike a balance by not creating too many unnecessary + replicas and by reducing network traffic. +\end_layout + +\begin_layout Standard +This variant is selected by parameters +\family typewriter +migrate_two_phase=0 +\family default + and +\family typewriter +migrate_always_all=0 +\family default + and +\family typewriter +migrate_early_cleanup=0 +\family default +: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "14col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +FAST +\begin_inset Newline newline +\end_inset + +migrate+shrink +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness 
"0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +SRC +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Primary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "10col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +DST +\begin_inset Newline newline +\end_inset + +Secondary +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special 
"totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +old_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +use_makebox 0 +width "12col%" +special "none" +height "1in" +height_special "totalheight" +thickness "0.4pt" +separation "3pt" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +# Replicas +\begin_inset Newline newline +\end_inset + +new_size +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Migrate x 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +3 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + 
+\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Start +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\color inherit ++ (1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +3 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Working +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red + 1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(3) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +Shrink Finished +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(1) + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +(3) +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +After Cleanup +\end_layout + +\end_inset + + 
+\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout + +\color red +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +As before, this could +\emph on +theoretically +\emph default + be combined with early cleanup. + Such a combination is, however, not recommended, because there is one intermediate + step where all existing replicas reside on the DST primary, making this + one machine a single point of failure. +\end_layout + +\begin_layout Chapter +GNU Free Documentation License +\begin_inset CommandInset label +LatexCommand label +name "chap:GNU-FDL" + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent + +\family typewriter +\size footnotesize +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +lstinputlisting{fdl.txt} +\end_layout + +\end_inset + + +\end_layout + +\end_body +\end_document