mirror of
https://github.com/schoebel/mars
synced 2025-01-21 14:53:49 +00:00
17579 lines
367 KiB
Plaintext
17579 lines
367 KiB
Plaintext
#LyX 2.3 created this file. For more info see http://www.lyx.org/
|
||
\lyxformat 544
|
||
\begin_document
|
||
\begin_header
|
||
\save_transient_properties true
|
||
\origin unavailable
|
||
\textclass scrreprt
|
||
\begin_preamble
|
||
\usepackage{listings}
|
||
\end_preamble
|
||
\options abstracton,dvipsnames
|
||
\use_default_options true
|
||
\begin_modules
|
||
customHeadersFooters
|
||
enumitem
|
||
fixltx2e
|
||
\end_modules
|
||
\maintain_unincluded_children false
|
||
\language english
|
||
\language_package default
|
||
\inputencoding auto
|
||
\fontencoding global
|
||
\font_roman "default" "default"
|
||
\font_sans "default" "default"
|
||
\font_typewriter "default" "default"
|
||
\font_math "auto" "auto"
|
||
\font_default_family rmdefault
|
||
\use_non_tex_fonts false
|
||
\font_sc false
|
||
\font_osf false
|
||
\font_sf_scale 100 100
|
||
\font_tt_scale 100 100
|
||
\use_microtype false
|
||
\use_dash_ligatures false
|
||
\graphics default
|
||
\default_output_format default
|
||
\output_sync 0
|
||
\bibtex_command default
|
||
\index_command default
|
||
\paperfontsize 10
|
||
\spacing single
|
||
\use_hyperref true
|
||
\pdf_title "MARS Architecture Guide"
|
||
\pdf_author "Thomas Schöbel-Theuer"
|
||
\pdf_bookmarks true
|
||
\pdf_bookmarksnumbered false
|
||
\pdf_bookmarksopen true
|
||
\pdf_bookmarksopenlevel 2
|
||
\pdf_breaklinks true
|
||
\pdf_pdfborder true
|
||
\pdf_colorlinks true
|
||
\pdf_backref section
|
||
\pdf_pdfusetitle true
|
||
\papersize a4paper
|
||
\use_geometry true
|
||
\use_package amsmath 1
|
||
\use_package amssymb 1
|
||
\use_package cancel 1
|
||
\use_package esint 1
|
||
\use_package mathdots 1
|
||
\use_package mathtools 1
|
||
\use_package mhchem 1
|
||
\use_package stackrel 1
|
||
\use_package stmaryrd 1
|
||
\use_package undertilde 1
|
||
\cite_engine basic
|
||
\cite_engine_type default
|
||
\biblio_style plain
|
||
\use_bibtopic false
|
||
\use_indices false
|
||
\paperorientation portrait
|
||
\suppress_date false
|
||
\justification true
|
||
\use_refstyle 1
|
||
\use_minted 0
|
||
\index Index
|
||
\shortcut idx
|
||
\color #008000
|
||
\end_index
|
||
\leftmargin 3.7cm
|
||
\topmargin 2.7cm
|
||
\rightmargin 2.8cm
|
||
\bottommargin 2.3cm
|
||
\secnumdepth 3
|
||
\tocdepth 3
|
||
\paragraph_separation indent
|
||
\paragraph_indentation default
|
||
\is_math_indent 0
|
||
\math_numbering_side default
|
||
\quotes_style english
|
||
\dynamic_quotes 0
|
||
\papercolumns 1
|
||
\papersides 2
|
||
\paperpagestyle headings
|
||
\tracking_changes false
|
||
\output_changes false
|
||
\html_math_output 0
|
||
\html_css_as_file 0
|
||
\html_be_strict false
|
||
\end_header
|
||
|
||
\begin_body
|
||
|
||
\begin_layout Standard
|
||
\begin_inset ERT
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
|
||
\backslash
|
||
title{MARS Architecture Guide}
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset CommandInset include
|
||
LatexCommand input
|
||
preview true
|
||
filename "common-front-matter.lyx"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Chapter
|
||
Architectures of Cloud Storage / Software Defined Storage / Big Data
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "chap:Cloud-Storage"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Datacenter architects have no easy job.
|
||
Building up some petabytes of data in the wrong way can easily endanger
|
||
a company, as will be shown later.
|
||
There are some architectural laws to know and some rules to follow.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
First, we need to take a look at the most general possibilities how storage
|
||
can be architecturally designed:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/storage-classification.fig
|
||
width 80col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The topmost question is: do we always need to access bigger masses of (typically
|
||
unstructured) data over a network?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There is a common belief that both reliability and scalability could be
|
||
only achieved this way.
|
||
In the past, local storage has often been viewed as
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
too simple
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
to provide both enterprise grade reliability, and scalability.
|
||
In the past, this was sometimes true.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
However, this picture has changed with the advent of a new
|
||
\series bold
|
||
load balancing
|
||
\series default
|
||
method called
|
||
\series bold
|
||
LV Football
|
||
\series default
|
||
, see chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "chap:LV-Football"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
We will later review what level of reliability and scalability can be achieved
|
||
with each of the fundamental models mentioned here.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
What is Architecture
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:What-is-Architecture"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
From
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/Software_architecture
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
:
|
||
\end_layout
|
||
|
||
\begin_layout Quote
|
||
Software architecture refers to the
|
||
\series bold
|
||
high level structures
|
||
\series default
|
||
of a software system and the
|
||
\series bold
|
||
discipline
|
||
\series default
|
||
of creating such structures and systems.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Throughout this paper, the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
architecture
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
is strictly separated from
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
implementations
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
Any of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
architecture
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
implementation
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
can relate to both hard- and software in general.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Confusion of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
architecture
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
with
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
implementation
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
is a major source of ill-designs, which then often cause major product
|
||
flaws and/or operational problems.
|
||
Be sure to understand the difference.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Another source of costly ill-designs is starting with a particular implementatio
|
||
n in mind, and not sufficiently reasoning about its fundamental architecture.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Recommended best practice is to (1) look at the
|
||
\series bold
|
||
problem space
|
||
\series default
|
||
, then (2) consider a
|
||
\emph on
|
||
set
|
||
\emph default
|
||
of
|
||
\series bold
|
||
architectural solution classes
|
||
\series default
|
||
, and (3) look at the
|
||
\series bold
|
||
mappings
|
||
\series default
|
||
between them.
|
||
This means: start with
|
||
\series bold
|
||
architectural requirements
|
||
\series default
|
||
for a particular
|
||
\series bold
|
||
application area
|
||
\series default
|
||
(typically covering
|
||
\emph on
|
||
multiple
|
||
\emph default
|
||
use cases), then look at
|
||
\series bold
|
||
multiple solution architectures
|
||
\series default
|
||
, and finally go down to a
|
||
\series bold
|
||
\emph on
|
||
set
|
||
\series default
|
||
\emph default
|
||
of potential implementations, but only
|
||
\emph on
|
||
after
|
||
\emph default
|
||
the former has been understood.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Starting with a particular single solution in mind is almost a
|
||
\emph on
|
||
guarantee
|
||
\emph default
|
||
for a non-optimum solution, or even a failed project, or even a disaster
|
||
at company level when
|
||
\series bold
|
||
enterprise-critical mass data
|
||
\series default
|
||
is involved.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Nevertheless, don't think in waterfall models.
|
||
Always work
|
||
\series bold
|
||
iteratively
|
||
\series default
|
||
and
|
||
\series bold
|
||
evolutionarily
|
||
\series default
|
||
, but nevertheless obey the principle that any bug in an architectural ill-desig
|
||
n cannot be fixed by the best implementation in the world.
|
||
Be sure to understand the fundamental difference between architecture and
|
||
its (multiple / alternative) implementations by their respective
|
||
\series bold
|
||
reach
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
What is
|
||
\emph on
|
||
Cloud Storage
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Requirements-for-Cloud"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
According to a popular definition from
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/Cloud_storage
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
(retrieved June 2018), cloud storage is
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
(1) Made up of many
|
||
\series bold
|
||
distributed resources
|
||
\series default
|
||
, but still
|
||
\series bold
|
||
acts as one
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
(2) Highly
|
||
\series bold
|
||
fault tolerant
|
||
\series default
|
||
through redundancy and distribution of data.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
(3) Highly
|
||
\series bold
|
||
durable
|
||
\series default
|
||
through the creation of versioned copies.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
(4) Typically
|
||
\series bold
|
||
eventually consistent
|
||
\series default
|
||
with regard to data replicas.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice that the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
network
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
does not occur in this definition.
|
||
However, the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
distributed resources
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
is implying
|
||
\emph on
|
||
some(!)
|
||
\emph default
|
||
kind of network.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Important! The definition does
|
||
\emph on
|
||
not
|
||
\emph default
|
||
imply some
|
||
\emph on
|
||
specific
|
||
\emph default
|
||
type of network, such as a
|
||
\series bold
|
||
storage network
|
||
\series default
|
||
which must be capable of transporting masses of IO operations in
|
||
\series bold
|
||
realtime
|
||
\series default
|
||
.
|
||
We are free to use other types of networks, such as
|
||
\series bold
|
||
replication networks
|
||
\series default
|
||
, which need not be dimensioned for realtime IO traffic, but are usable
|
||
for
|
||
\series bold
|
||
background data migration
|
||
\series default
|
||
, and even over long distances, where the network typically has some bottlenecks.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice that the definition says nothing about the
|
||
\series bold
|
||
time scale
|
||
\series default
|
||
of operations
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice: go down to a time scale of microseconds.
|
||
You will then notice that typical IO operations will require several hundreds
|
||
of machine instructions between IO request
|
||
\emph on
|
||
submission
|
||
\emph default
|
||
and the corresponding IO request
|
||
\emph on
|
||
completion
|
||
\emph default
|
||
.
|
||
This is not only true for local IO.
|
||
In network clusters like Ceph, it will even involve creation of network
|
||
packets, and lead to additional IO latencies implied by the network packet
|
||
transfer latencies.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
We are free to implement certain operations, such as background data migrations
|
||
, in a rather long timescale (from a human point of view).
|
||
Example: increasing the number of replicas in an operational Ceph cluster,
|
||
already containing a few hundreds of terabytes of data, will not only require
|
||
additional storage hardware, but also take a rather long time, implied
|
||
by the very nature of such reorganisational tasks.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
The famous CAP theorem is one of the motivations behind requirement (4)
|
||
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
eventually consistent
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
This is not an accident.
|
||
There is a
|
||
\emph on
|
||
reason
|
||
\emph default
|
||
for it, although it is not a
|
||
\emph on
|
||
hard
|
||
\emph default
|
||
requirement.
|
||
Strict consistency is not needed for many applications running on top of
|
||
cloud storage.
|
||
In addition, the CAP theorem and some other theorems cited at
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/CAP_theorem
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
are telling us that Strict Consistency would be
|
||
\series bold
|
||
difficult and expensive
|
||
\series default
|
||
to achieve at global level in a bigger Distributed System, and at the cost
|
||
of other properties.
|
||
More detailed explanations are in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Explanation-via-CAP"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There are some consequences from this definition of Cloud Storage, for each
|
||
of our high-level storage architectures:
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Distributed
|
||
\begin_inset space ~
|
||
\end_inset
|
||
|
||
Storage, in particular
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
architectures (see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Distributed-vs-Local:"
|
||
|
||
\end_inset
|
||
|
||
): many of them (with few exceptions) are conforming to all of these requirement
|
||
s.
|
||
Typical granularities are objects, or chunks, or other relatively small units
|
||
of data.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Centralized
|
||
\begin_inset space ~
|
||
\end_inset
|
||
|
||
Storage: does not conform to (1) and to (4) by definition
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice that sharding on top of CentralStorage is no longer a CentralStorage
|
||
model by definition, but a RemoteSharding model according to section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Variants-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
By introduction of synchronous or asynchronous replication, it can be made
|
||
to
|
||
\emph on
|
||
almost
|
||
\emph default
|
||
conform, except for (1) where some concept mismatches remain (probably
|
||
resolvable by going to a RemoteSharding model on top of CentralStorage,
|
||
where CentralStorage is only a
|
||
\emph on
|
||
sub-component
|
||
\emph default
|
||
).
|
||
Typical granularity is replication of whole internal storage pools, or
|
||
of LVs, or of filesystem instances.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
LocalStorage, and some further models like
|
||
\family typewriter
|
||
RemoteSharding
|
||
\family default
|
||
(see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Variants-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
):
|
||
\end_layout
|
||
|
||
\begin_deeper
|
||
\begin_layout Description
|
||
(1) can be achieved at LV granularity with Football (see chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "chap:LV-Football"
|
||
|
||
\end_inset
|
||
|
||
), which creates a
|
||
\series bold
|
||
Big Virtual LVM Pool
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
(2) can be achieved at disk granularity with local RAID, and at LV granularity
|
||
with DRBD or MARS.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
(3) can be achieved at LV granularity with LVM snapshots, and/or ZFS (or
|
||
other filesystem) snapshots, and/or above filesystem layer by addition
|
||
of classical backup.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
(4) at least
|
||
\family typewriter
|
||
Eventually Consistent
|
||
\family default
|
||
or better can be alternatively achieved by
|
||
\end_layout
|
||
|
||
\begin_deeper
|
||
\begin_layout Description
|
||
(4a)
|
||
\series bold
|
||
DRBD
|
||
\series default
|
||
, which provides
|
||
\family typewriter
|
||
Strict Consistency
|
||
\family default
|
||
during
|
||
\family typewriter
|
||
connected
|
||
\family default
|
||
state, but works reliably only with passive crossover cables over
|
||
\series bold
|
||
short distances
|
||
\series default
|
||
(see CAP theorem in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Explanation-via-CAP"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Notice: DRBD violates any type of consistency within your
|
||
\emph on
|
||
replicas
|
||
\emph default
|
||
during (automatic) re-sync, and thus does not
|
||
\emph on
|
||
fully
|
||
\emph default
|
||
comply with the above definition of cloud storage in a
|
||
\emph on
|
||
strong
|
||
\emph default
|
||
sense.
|
||
But you can argue at a coarse time granularity level in order to fix this.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
(4b)
|
||
\series bold
|
||
MARS
|
||
\series default
|
||
, which works over
|
||
\series bold
|
||
long distances
|
||
\series default
|
||
and provides two different consistency guarantees at different levels,
|
||
|
||
\emph on
|
||
both at the same time
|
||
\emph default
|
||
:
|
||
\end_layout
|
||
|
||
\begin_deeper
|
||
\begin_layout Description
|
||
locally:
|
||
\family typewriter
|
||
Strict Consistency
|
||
\family default
|
||
at local LV granularity, also
|
||
\emph on
|
||
within
|
||
\emph default
|
||
each of the LV replicas.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
globally:
|
||
\family typewriter
|
||
Eventually Consistent
|
||
\family default
|
||
|
||
\emph on
|
||
between
|
||
\emph default
|
||
different LV replicas (global level).
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
The CAP theorem (see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Explanation-via-CAP"
|
||
|
||
\end_inset
|
||
|
||
) says that
|
||
\family typewriter
|
||
Strict Consistency
|
||
\family default
|
||
is
|
||
\series bold
|
||
not possible
|
||
\series default
|
||
in general at
|
||
\emph on
|
||
unplanned failover
|
||
\emph default
|
||
during long-distance network outages (P = Partitioning Tolerance), when
|
||
A = Availability is also a requirement.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
However, in case of a
|
||
\emph on
|
||
planned handover
|
||
\emph default
|
||
, MARS is also
|
||
\family typewriter
|
||
Strictly Consistent
|
||
\family default
|
||
at a global level, but may need some extra time for catching up.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Notice: global
|
||
\family typewriter
|
||
Strict Consistency
|
||
\family default
|
||
is also possible at a
|
||
\emph on
|
||
coarse timescale
|
||
\emph default
|
||
, in accordance with the CAP theorem, if you decide to sacrifice A = Availabilit
|
||
y during such a network incident by simply
|
||
\emph on
|
||
not
|
||
\emph default
|
||
doing a failover action.
|
||
Just wait until the network outage is gone, and MARS will automatically
|
||
resume
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
This automatic MARS behaviour is similar to the behaviour of DRBD in such
|
||
situations, when DRBD can automatically go to
|
||
\family typewriter
|
||
disconnected
|
||
\family default
|
||
-like state, and you are later manually or automatically resuming the DRBD
|
||
connection for an incremental re-sync.
|
||
MARS does everything automatically because it has no firmly built-in assumption
|
||
s about the actual duration of any network communication.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
everything ASAP, and thus you are using MARS
|
||
\emph on
|
||
only
|
||
\emph default
|
||
as a protection against
|
||
\series bold
|
||
fatal
|
||
\series default
|
||
storage failures / unplanned
|
||
\series bold
|
||
disasters
|
||
\series default
|
||
.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Notice: A = Availability is
|
||
\emph on
|
||
not generally
|
||
\emph default
|
||
required by the above definition of cloud storage, because from a user's
|
||
perspective it would not generally make sense in the global internet where
|
||
connection loss may anyway occur at any time.
|
||
Thus it is a valid operational strategy to
|
||
\emph on
|
||
not
|
||
\emph default
|
||
fail-over your LVs during certain major network outages.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Notice: long-term
|
||
\series bold
|
||
disaster tolerance
|
||
\series default
|
||
(e.g.
|
||
perpetual loss of some storage nodes during an earthquake) is
|
||
\emph on
|
||
not
|
||
\emph default
|
||
modeled by the CAP theorem, but is more or less required by (2) and (3)
|
||
from the above definition of cloud storage.
|
||
\end_layout
|
||
|
||
\end_deeper
|
||
\end_deeper
|
||
\end_deeper
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice:
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
architectures are creating
|
||
\emph on
|
||
virtual
|
||
\emph default
|
||
storage pools out of physically distributed storage servers.
|
||
For fairness reasons, creation of a big virtual LVM pool must be considered
|
||
as
|
||
\emph on
|
||
another
|
||
\emph default
|
||
valid Cloud Storage
|
||
\emph on
|
||
model
|
||
\emph default
|
||
, matching the above definition of Cloud Storage.
|
||
The main architectural difference is granularity, as explained in section
|
||
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Granularity-at-Architecture"
|
||
|
||
\end_inset
|
||
|
||
, and the stacking order of sub-components.
|
||
Notice that Football is creating
|
||
\series bold
|
||
location transparency
|
||
\series default
|
||
inside of the distributed virtual LVM pool.
|
||
This is an important (though not always required) basic property of any
|
||
type of clusters and/or grids.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Granularity at Architecture
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Granularity-at-Architecture"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Here are the most important architectural differences between object-based
|
||
storages and LV-based (Logical Volume) storages:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="13" columns="3">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="left" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Objects
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
LVs
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Granularity
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
small (typically KiB)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
huge (several TiB)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Number of instances
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
very high
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
low to medium
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Typical access
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
random keys
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
named
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Update in place
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Resize during operation
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Object support
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
native
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
on top of
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
LV support
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
on top of
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
native
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Filesystem support
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
on top of
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
on top of
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Scalable
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
at cluster
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
both cluster and grid
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Location distances
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
per datacenter / on campus
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
long distances possible
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Centralized pool management
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
per cluster
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Football uniting clusters
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Easy sharding support
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
cumbersome
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Replication vs Backup
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Replication-vs-Backup"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Intuitively, data backup and data replication are two different solution
|
||
classes, addressing different problems.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
However, there exist descriptions where both solution classes are overlapping,
|
||
as well as their corresponding problem classes.
|
||
For example, backup as explained in
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/Backup
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
could be seen as also encompassing some types of storage replications explained
|
||
in
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/Replication_(computing)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
For a rough comparison of
|
||
\emph on
|
||
typical
|
||
\emph default
|
||
implementations, see the following
|
||
\emph on
|
||
typical
|
||
\emph default
|
||
differences:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="6" columns="3">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="left" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Backup
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Replication
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Fast handover (planned)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no, or cumbersome
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Fast failover (unplanned)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no, or cumbersome
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Protect against physical failures
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Protect against logical data corruption
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes (partly)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
typically no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Disaster Recovery Time (MTTR)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
typically (very) slow
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
fast
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Because of these typical differences, enterprise-critical data typically
|
||
deserves
|
||
\emph on
|
||
both
|
||
\emph default
|
||
solution classes.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Confusion of solution classes and/or their corresponding problem classes
|
||
/ properties can be harmful to enterprises and to careers of responsible
|
||
persons.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Example: Point-in-time Replication via ZFS Snapshots
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Example:-ZFS-Replication"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some ZFS advocates believe that ZFS snapshots, which were originally designed
|
||
for backup-like use cases, are also appropriate solutions for achieving
|
||
geo-redundancy.
|
||
The basic idea is to run incremental ZFS snapshots in an endless loop,
|
||
e.g.
|
||
via some simple scripts, and expediting them to another host where the snapshots
|
||
are then applied to another ZFS instance.
|
||
When there is less data to be expedited, loop cycle times can go down to
|
||
a few seconds.
|
||
When much data is written at the primary site, loop cycle times will rise
|
||
up.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The following table tries to explain why geo-redundancy is not as simple
|
||
to achieve as believed, at least without sophisticated additional
|
||
means
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
ZFS advocates often argue with many features which aren't present at other
|
||
filesystem types.
|
||
The above table shows some dimensions not dealing with properties of local
|
||
filesystems, but with
|
||
\emph on
|
||
problems / tasks
|
||
\emph default
|
||
arising in long-distance distributed systems involving masses of enterprise-cri
|
||
tical storage.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="15" columns="4">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="left" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
OpenSource Component
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
DRBD
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
MARS
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
ZFS
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Synchronicity (on average)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
delay
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
delay * 1.5
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Generic solution
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
FS-specific
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Granularity
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
LVs
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
LVs
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
subvolumes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Built-in snapshots
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Long distances
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Replication parallelism (per gran.)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $1$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\geq2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $1$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Built-in primary/secondary roles
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Built-in handover (planned)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
mostly
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Built-in failover (unplanned)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Built-in data overflow handling
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
unnecessary
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no, missing
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Unnoticed data loss due to overflow
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
possible
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Split-brain awareness
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Execute split-brain resolution
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Protect against illegal data modification
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
no
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The last item means that ZFS by itself does not protect against amok-running
|
||
applications modifying the secondary (backup) side in parallel to the
|
||
replication process (at least not by default).
|
||
Workarounds may be possible, but are not easy to create and to test for
|
||
enterprise-critical applications.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice that zfs snapshots can be combined with DRBD or MARS, because zfs
|
||
snapshots are residing at
|
||
\emph on
|
||
filesystem
|
||
\emph default
|
||
layer, while DRBD / MARS replicas are located at
|
||
\emph on
|
||
block
|
||
\emph default
|
||
layer.
|
||
Just create your zpools at the
|
||
\emph on
|
||
top
|
||
\emph default
|
||
of DRBD or MARS virtual devices, and import / export them
|
||
\emph on
|
||
individually
|
||
\emph default
|
||
upon handover / failover of each LV.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
There is a
|
||
\series bold
|
||
\emph on
|
||
fundamental
|
||
\series default
|
||
\emph default
|
||
difference between zpools and classical RAID / LVM stacked architectures.
|
||
Some zfs advocates are promoting zpools as a replacement for both RAID
|
||
and LVM.
|
||
However, there is a
|
||
\series bold
|
||
massive difference
|
||
\series default
|
||
in architecture, as explained in the following example (10 logical resources
|
||
over 48 physical spindles), achieving practically the
|
||
\series bold
|
||
\emph on
|
||
same
|
||
\series default
|
||
zfs snapshot functionality
|
||
\emph default
|
||
from a user's perspective, but in a different way:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/raid-lvm-architecture.fig
|
||
height 6cm
|
||
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/zpool-architecture.fig
|
||
height 6cm
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
When RAID functionality is executed by zfs, it will be located at the
|
||
\emph on
|
||
top
|
||
\emph default
|
||
of the hierarchy.
|
||
On one hand, this easily allows for different RAID levels for each of the
|
||
10 different logical resources.
|
||
On the other hand, this
|
||
\emph on
|
||
exposes
|
||
\emph default
|
||
the
|
||
\series bold
|
||
physical spindle configuration
|
||
\series default
|
||
to the topmost filesystem layer (48 spindles in this example).
|
||
There is no easy way for replication of these
|
||
\emph on
|
||
physical properties
|
||
\emph default
|
||
in a larger / heterogeneous distributed system, e.g.
|
||
when some hardware components are replaced over a longer period of time
|
||
(hardware lifecycle, or LV Football as explained in chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "chap:LV-Football"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
Essentially, replication of
|
||
\emph on
|
||
logical
|
||
\emph default
|
||
structures like snapshots remains as the only reasonable option, with its
|
||
drawbacks as explained above.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
There is another argument: zfs tries to
|
||
\emph on
|
||
hide
|
||
\emph default
|
||
its internal structures and interfaces from the sysadmins, forming a more
|
||
or less
|
||
\series bold
|
||
monolithic
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Some sysadmins acting as zfs advocates are claiming this as an advantage,
|
||
because they need to understand only a single tool for managing
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
everything
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
However, this is a short-sighted argument when it comes to
|
||
\emph on
|
||
true
|
||
\emph default
|
||
flexibility as offered by a component-based system, where multiple types
|
||
of hardware / software RAID, multiple types of LVM functionality, and much
|
||
more can be almost orthogonally combined in a very flexible way.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
architecture
|
||
\series default
|
||
as seen from outside.
|
||
This violates the classical
|
||
\emph on
|
||
layering rules
|
||
\emph default
|
||
from Dijkstra.
|
||
In contrast, classical LVM-based configurations are
|
||
\series bold
|
||
component oriented
|
||
\series default
|
||
, according to the
|
||
\series bold
|
||
Unix philosophy
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Local vs Centralized Storage
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Local-vs-Centralized"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There is some old-fashioned belief that only centralized storage systems,
|
||
as typically sold by commercial storage vendors, could achieve a high degree
|
||
of reliability, while local storage were inferior by far.
|
||
In the following, we will see that this is only true for an
|
||
\series bold
|
||
\emph on
|
||
unfair
|
||
\series default
|
||
\emph default
|
||
comparison involving different classes of storage systems.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Internal Redundancy Degree
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Centralized commercial storage systems are typically built up from highly
|
||
redundant
|
||
\emph on
|
||
internal
|
||
\emph default
|
||
components:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundant power supplies with UPS.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundancy at the storage HDDs / SSDs.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundancy at internal transport busses.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundant RAM / SSD caches.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundant network interfaces.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundant compute heads.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundancy at control heads / management interfaces.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
What about local hardware RAID controllers? Many people think that these
|
||
relatively cheap units were massively inferior at practically each of these
|
||
points.
|
||
However, please take a
|
||
\emph on
|
||
really deep
|
||
\emph default
|
||
look at what classical RAID chip manufacturers like LSI / Avago / Broadcom
|
||
and their competitors are offering as configuration variants of their top
|
||
notch models.
|
||
The following enumeration is in the same order as above (item by item):
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundant hardware RAID cards with BBU caches, each with local goldcaps
|
||
surviving power outages, their BBU caches cross-coupled via high-speed
|
||
interconnects.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
HDD / SSD redundancy: almost any RAID level you can think of.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Redundant SAS cross-cabling: any head can access any device.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
BBU caches are redundant and cross-coupled, similarly to RDMA.
|
||
When SSD caches are added to both cards, you also get redundancy there.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
When using cross-coupled redundant cards, you automatically get redundant
|
||
host bus interfaces (HBAs).
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The same story: you also get two independent RAID controller instances which
|
||
can do RAID computations independently from each other.
|
||
Some implementations do this even in hardware (ASICs).
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Ditto: both cards may be plugged into two different servers, thereby creating
|
||
redundancy at control level.
|
||
As a side effect, you may also get functionality similar to DRBD.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
If you compare typical prices for both competing systems, you will notice
|
||
a huge difference.
|
||
See also section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Cost-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Capacity Differences
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There is another die-hard myth: commercial storage would provide higher
|
||
capacity.
|
||
Please read the data sheets.
|
||
It is
|
||
\emph on
|
||
possible
|
||
\emph default
|
||
(but not generally recommended) to put several hundreds of spindles into
|
||
several external HDD enclosures, and then connect them to a redundant cross-cou
|
||
pled pair of RAID controllers via several types of SAS busses.
|
||
By filling a rack this way, you can easily reach similar, if not higher
|
||
capacities than commercial storage boxes, for a
|
||
\emph on
|
||
fraction
|
||
\emph default
|
||
of the price.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
However, this is not the recommended way for general use cases (but could
|
||
be an option for low demands like archiving).
|
||
The big advantage of RAID-based local storage is
|
||
\series bold
|
||
massive scale-out by sharding,
|
||
\series default
|
||
as explained in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Distributed-vs-Local:"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Caching Differences
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
A frequent argument is that centralized storage systems had bigger caches
|
||
than local RAID systems.
|
||
While this argument is often true, it neglects an important point:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Local RAID systems often
|
||
\emph on
|
||
don't need
|
||
\emph default
|
||
bigger caches, because they are typically located at the
|
||
\emph on
|
||
bottom
|
||
\emph default
|
||
of a cache hierarchy, playing only a
|
||
\emph on
|
||
particular
|
||
\emph default
|
||
role in that hierarchy.
|
||
There exist
|
||
\emph on
|
||
further
|
||
\emph default
|
||
caches which are
|
||
\series bold
|
||
erroneously not considered
|
||
\series default
|
||
by such an argument!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Example, see also section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Performance-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
for more details: At 1&1 Shared Hosting Linux (ShaHoLin), a typical LXC
|
||
container containing several thousands to tens of thousands of customer home
|
||
directories, creates a long-term
|
||
\emph on
|
||
average(!)
|
||
\emph default
|
||
IOPS load at block layer of about 70 IOPS.
|
||
No, this isn't a typo.
|
||
It is not 70,000 IOPS.
|
||
It is only 70 IOPS.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Linux kernel experts know why I am not kidding.
|
||
The standard Linux kernel has two main caches, the Page Cache for file
|
||
content, and the Dentry Cache (plus Inode slave cache) for metadata.
|
||
Both caches are residing in
|
||
\series bold
|
||
RAM
|
||
\series default
|
||
, which is the
|
||
\emph on
|
||
fastest
|
||
\emph default
|
||
type of cache you can get.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Nowadays, typical servers have several hundreds of gigabytes of RAM, sometimes
|
||
even up to terabytes, resulting in an incredible caching behaviour which
|
||
can be measured by those people who know how to do it (caution: it can
|
||
be easily done wrongly).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Many people are neglecting these caches, sometimes not knowing of their
|
||
existence, and are falsely assuming that 1 application 
|
||
\family typewriter
|
||
read()
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
write()
|
||
\family default
|
||
operation will also lead to 1 IOPS at block layer.
|
||
As a consequence, they are demanding 50,000 IOPS or 100,000 or even 1,000,000
|
||
IOPS.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some (but not all) commercial storage systems can deliver similar IOPS rates,
|
||
because they have internal RAM caches in the same order of magnitude.
|
||
People who are buying such systems are typically falling into some of the
|
||
following classes (list is probably incomplete):
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
some people know this, but price does not matter - the more caches, the
|
||
better.
|
||
Wasted money for doubled caches does not count for them, or is even viewed
|
||
as an advantage to them (personally).
|
||
Original citation of an anonymous person:
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
only the best and the most expensive storage is good enough for us
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
using NFS, which has extremely poor filesystem caching behaviour because
|
||
the Linux nfs client implementation does not take full advantage of the
|
||
dentry cache.
|
||
Sometimes people know this, sometimes not.
|
||
It seems that few people have read an important paper on the Linux implementati
|
||
on of nfs.
|
||
Please search the internet for
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Why nfs sucks
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
from Olaf Kirch (who is one of the original Linux nfs implementors), and
|
||
|
||
\emph on
|
||
read
|
||
\emph default
|
||
it.
|
||
Your opinion about nfs might change.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
have transactional databases, where high IOPS may be
|
||
\emph on
|
||
really
|
||
\emph default
|
||
needed, but
|
||
\series bold
|
||
\emph on
|
||
exceptionally
|
||
\series default
|
||
\emph default
|
||
(!) for this class of application.
|
||
For very big enterprise databases like big SAP installations, there may
|
||
be a very valid justification for big RAM caches at storage layers.
|
||
However: smaller transactional loads, as in webhosting, are
|
||
\emph on
|
||
often
|
||
\emph default
|
||
(not always) hammering a
|
||
\emph on
|
||
low
|
||
\emph default
|
||
number of
|
||
\series bold
|
||
hot spots
|
||
\series default
|
||
, where
|
||
\emph on
|
||
big
|
||
\emph default
|
||
caches are not really needed.
|
||
Relatively small BBU caches of RAID cards will also suffice.
|
||
Often people don't notice this because they don't measure the
|
||
\series bold
|
||
working set behaviour
|
||
\series default
|
||
of their application, as could be done for example with
|
||
\family typewriter
|
||
blkreplay
|
||
\family default
|
||
(see
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://blkreplay.org
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
do not notice that
|
||
\emph on
|
||
well-tuned
|
||
\emph default
|
||
filesystem caches over iSCSI are typically demanding far fewer IOPS, sometimes
|
||
by several orders of magnitude, and are wasting money with caches at commercial
|
||
boxes they don't need (classical
|
||
\series bold
|
||
over-engineering
|
||
\series default
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Anyway, local storage can be augmented with various types of local caches
|
||
with various dimensioning.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
However, there is no point in accessing the fastest possible type of RAM
|
||
cache remotely over a network.
|
||
Even expensive hardware-based RDMA (e.g.
|
||
over Infiniband) cannot deliver the same performance as
|
||
\series bold
|
||
directly caching
|
||
\series default
|
||
your data in the
|
||
\series bold
|
||
\emph on
|
||
same
|
||
\emph default
|
||
RAM
|
||
\series default
|
||
where your application is running.
|
||
The Dentry Cache in the Linux kernel provides highly optimized
|
||
\series bold
|
||
shared metadata
|
||
\series default
|
||
in SMP and NUMA systems (nowadays scaling to more than 100 processor cores),
|
||
while the Page Cache provides
|
||
\series bold
|
||
shared memory
|
||
\series default
|
||
via hardware MMU.
|
||
This is crucial for the performance of classical local filesystems.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The physical laws of Einstein and others are telling us that neither this
|
||
type of caching, nor its shared memory behaviour, can be transported over
|
||
whatever type of network without causing performance degradation.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Latencies and Throughput
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Latencies-and-Throughput"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
First of all: today there exist only a small number of HDD manufacturers
|
||
in the world.
|
||
The number of SSD manufacturers will likely decline in the long run.
|
||
Essentially, commercial storage vendors are more or less selling you the
|
||
same HDDs or SSDs as you could buy and deploy yourself.
|
||
If at all, there are only some minor technical differences.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In the meantime, many people agree with a Google paper that the
|
||
\emph on
|
||
ratio
|
||
\emph default
|
||
of market prices (price per terabyte) between HDDs and SSDs is unlikely
|
||
to change in a fundamental
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
In folklore, there exists a
|
||
\series bold
|
||
fundamental empirical law
|
||
\series default
|
||
, fuzzily called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Storage Pyramid
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Memory Hierarchy Law
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or similar, which is well-known at least in German OS academic circles.
|
||
The empirical law (extrapolated from
|
||
\series bold
|
||
observations
|
||
\series default
|
||
, similarly to Moore's law) tells us that faster storage technology is always
|
||
|
||
\series bold
|
||
more expensive
|
||
\series default
|
||
than slower storage technology, and that capacities of faster storage are
|
||
typically smaller than capacities of slower storage.
|
||
This observation has been roughly valid for more than 50 years now.
|
||
You can find it in several German lecture scripts.
|
||
Unfortunately, the Wikipedia article
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/Memory_hierarchy
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
(retrieved in June 2018) does not cite this very important fundamental
|
||
law about
|
||
\series bold
|
||
costs
|
||
\series default
|
||
.
|
||
In contrast, the German article
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://de.wikipedia.org/wiki/Speicherhierarchie
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
about roughly the same subject is mentioning
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Kosten
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
which means
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
cost
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, and
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
teuer
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
which means
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
expensive
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
way during the next 10 years.
|
||
Thus, most large-capacity enterprise storage systems are built on top of
|
||
HDDs.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Typically, HDDs and their mechanics are forming the overall bottleneck.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
by construction, a
|
||
\emph on
|
||
local
|
||
\emph default
|
||
HDD attached via HBAs or a hardware RAID controller will show the least
|
||
|
||
\emph on
|
||
additional
|
||
\emph default
|
||
overhead in terms of
|
||
\emph on
|
||
additional
|
||
\emph default
|
||
latencies and throughput degradation caused by the attachment.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When the
|
||
\emph on
|
||
same
|
||
\emph default
|
||
HDD is
|
||
\emph on
|
||
indirectly
|
||
\emph default
|
||
attached via Ethernet or Infiniband or another rack-to-rack transport,
|
||
both latencies and throughput will become worse.
|
||
Depending on further factors and influences, the overall bottleneck may
|
||
shift to the network.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The laws of information transfer are telling us: with increasing distance,
|
||
both latencies (laws of Einstein) and throughput (laws of energy needed
|
||
for compensation of SNR = signal to noise ratio) are becoming worse.
|
||
Distance matters.
|
||
And the number of intermediate components, like routers / switches and
|
||
their
|
||
\series bold
|
||
queuing
|
||
\series default
|
||
, matters too.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This means that local storage has
|
||
\emph on
|
||
always
|
||
\emph default
|
||
an advantage over any attachment via network.
|
||
Centralized storages are bound to some network, and thus suffer from disadvanta
|
||
ges in terms of latencies and throughput.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
What is the expected long-term future? Will additional latencies and throughput
|
||
of centralized storages become better over time?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
It is difficult to predict the future.
|
||
Let us first look at the past evolution.
|
||
The following graphic takes its numbers from Wikipedia articles
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/List_of_device_bit_rates
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/History_of_hard_disk_drives
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
, showing that HDD capacities have grown
|
||
\series bold
|
||
over-proportionally
|
||
\series default
|
||
by about 2 orders of magnitude over about 30 years, when compared to the
|
||
relative growth of network bandwidth.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In the following graphic, effects caused by decreasing form factors have
|
||
been neglected, which would even
|
||
\emph on
|
||
amplify
|
||
\emph default
|
||
the trend.
|
||
For fairness, bundling of parallel disks or parallel communication channels
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
It is easy to see that the slopes of
|
||
\family typewriter
|
||
HDD.capacity
|
||
\family default
|
||
vs
|
||
\family typewriter
|
||
Infiniband.rates
|
||
\family default
|
||
are different.
|
||
Parallelizing by bundling of Infiniband wires will only lift the line a
|
||
little upwards, but will not alter its slope in logarithmic scale.
|
||
For extrapolated time
|
||
\begin_inset Formula $t\rightarrow\infty$
|
||
\end_inset
|
||
|
||
, the extrapolated empirical long-term behaviour is rather striking.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
have been ignored.
|
||
All comparisons are in logarithmic y axis scale:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename BitRates/Capacity-BitRate-Comparison.pdf
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
What does this mean when extrapolated into the future?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
It means that concentrating more and more capacity into a single rack due
|
||
to increasing data density will likely lead to more problems in future.
|
||
Accessing more and more data over the network will become increasingly
|
||
more difficult when concentrating high-capacity HDDs or SSDs
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
It is difficult to compare the space density of contemporary SSDs in a fair
|
||
way.
|
||
There are too many different form factors.
|
||
For example, M.2 cards are typically consuming even less
|
||
\begin_inset Formula $cm^{3}/TB$
|
||
\end_inset
|
||
|
||
than classical 2.5 inch form factors.
|
||
This trend is likely to continue in future.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
into the same space volume as before.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In other words: centralized storages are not a good idea even today, and will likely
|
||
become an even worse idea in the future.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Example: there was a major incident at a German web hosting company at the
|
||
beginning of the 2000s.
|
||
Their entire webhosting main business was running on a single proprietary
|
||
highly redundant CentralStorage solution, which failed.
|
||
Restore from backup took way too long from the viewpoint of a huge number
|
||
of customers, leading to major press attention.
|
||
Before this incident, they were the #1 webhoster in Germany.
|
||
A few years later, 1&1 was the #1 instead.
|
||
You can speculate whether this has to do with the incident.
|
||
But anyway, the later geo-redundancy strategy of 1&1 based on a sharding
|
||
model (originally using DRBD, later MARS) was motivated by conclusions
|
||
drawn from this incident.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Another example: in the 1980s, a CentralStorage
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
dinosaur
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
With the advent of NVMe, SSDs are almost directly driven by DMA.
|
||
Accessing any high-speed DMA devices by default via network is a foolish
|
||
idea, as foolish as playing games via an expensive high-end gamer
|
||
graphics card which is then
|
||
\emph on
|
||
indirectly
|
||
\emph default
|
||
attached via RDMA, or even via Ethernet.
|
||
Probably no serious gamer would ever
|
||
\emph on
|
||
try
|
||
\emph default
|
||
to do that.
|
||
But some storage vendors do, for strategic reasons.
|
||
Probably for their own survival, their customers are to be misguided to
|
||
overlook the blinking red indicators that centralized SSD storage is likely
|
||
nothing but an expensive dead end in the history of dinosaur architectures.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
architecture called SLED = Single Large Expensive Disk was propagated with
|
||
huge marketing noise and effort, but its historic fate was predictable
|
||
for real experts not bound to particular interests: SLED finally lost against
|
||
its contemporary RAID competition.
|
||
Nowadays, many people don't even remember the term SLED.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Today's future is likely dominated by
|
||
\series bold
|
||
scaling-out architectures
|
||
\series default
|
||
like sharding, as explained in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Distributed-vs-Local:"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Reliability Differences CentralStorage vs Sharding
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Reliability-Differences-CentralStorage"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In this section, we look at
|
||
\emph on
|
||
fatal
|
||
\emph default
|
||
failures only, ignoring temporary failures.
|
||
A fatal failure of a storage is an incident which needs to be corrected
|
||
by
|
||
\series bold
|
||
restore from backup
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
By definition, even a
|
||
\emph on
|
||
highly redundant
|
||
\emph default
|
||
CentralStorage is
|
||
\emph on
|
||
nevertheless
|
||
\emph default
|
||
a SPOF = Single Point of Failure.
|
||
This also applies to fatal failures.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some people are incorrectly arguing with redundancy.
|
||
However, the problem is that
|
||
\emph on
|
||
any
|
||
\emph default
|
||
system, even a highly redundant one, can fail fatally.
|
||
There exists no perfect system on earth.
|
||
One of the biggest known sources of fatal failure is
|
||
\series bold
|
||
human error
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In contrast, sharded storage (for example the LocalSharding model, see also
|
||
section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Variants-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
) has MPOF = Multiple Points Of Failure.
|
||
It is unlikely that many shards are failing fatally at the same time, because
|
||
shards are
|
||
\emph on
|
||
independent
|
||
\emph default
|
||
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
When all shards are residing in the same datacenter, there exists a SPOF
|
||
by power loss or other impacts onto the whole datacenter.
|
||
However, this applies to both the CentralStorage and to the LocalSharding
|
||
model.
|
||
In contrast to CentralStorage, LocalSharding can be more easily distributed
|
||
over multiple datacenters.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
from each other by definition (cf paragraph
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "par:Definition-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
for disambiguation of terms
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
shared-nothing
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
What is the difference from the viewpoint of customers of the services?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
When a CentralStorage fails fatally, a
|
||
\emph on
|
||
huge
|
||
\emph default
|
||
number of customers will be affected for a
|
||
\emph on
|
||
long
|
||
\emph default
|
||
time (see the example German webhoster mentioned in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Latencies-and-Throughput"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
Reason: restore from backup will take extremely long because huge masses
|
||
of data have to be restored.
|
||
MTBF = Mean Time Between Failures is (hopefully) longer thanks to redundancy,
|
||
but MTTR = Mean Time To Repair is also very long.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
With (Local)Sharding, the risk of
|
||
\emph on
|
||
some
|
||
\emph default
|
||
fatal incident
|
||
\emph on
|
||
somewhere
|
||
\emph default
|
||
in the sharding pool is higher, but the
|
||
\series bold
|
||
\emph on
|
||
size
|
||
\series default
|
||
\emph default
|
||
of such an incident is smaller in three dimensions at the same time:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
There are much
|
||
\series bold
|
||
fewer customers affected
|
||
\series default
|
||
(typically only
|
||
\begin_inset Formula $1$
|
||
\end_inset
|
||
|
||
shard out of
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
shards).
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
MTTR
|
||
\series default
|
||
= Mean Time To Repair is typically much better because there is much less
|
||
data to be restored.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Residual risk
|
||
\series default
|
||
plus resulting fatal damage by
|
||
\series bold
|
||
un-repairable problems
|
||
\series default
|
||
is thus lower.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
What does this mean from the viewpoint of an investor of a big
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
global player
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
company?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As is promised by the vendors, let us assume that failure of CentralStorage
|
||
might be occurring less frequently.
|
||
But
|
||
\emph on
|
||
when
|
||
\emph default
|
||
it happens on
|
||
\series bold
|
||
enterprise-critical mass data
|
||
\series default
|
||
, the stock exchange value of the affected company will be exposed to a
|
||
|
||
\series bold
|
||
hazard
|
||
\series default
|
||
.
|
||
This is not bearable from the viewpoint of an investor.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In contrast, the (Local)Sharding model is
|
||
\emph on
|
||
distributing
|
||
\emph default
|
||
the
|
||
\series bold
|
||
inevitable incidents
|
||
\series default
|
||
(because
|
||
\series bold
|
||
perfect systems do not exist
|
||
\series default
|
||
, and
|
||
\series bold
|
||
perfect humans do not exist
|
||
\series default
|
||
) to a lower number of customers with higher frequency, such that the
|
||
\series bold
|
||
total impact onto the business
|
||
\series default
|
||
becomes bearable.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Risk analysis of enterprise-critical use cases is summarized in the following
|
||
table:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="8" columns="3">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top" width="0pt">
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
CentralStorage
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
(Local)Sharding
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Probability of
|
||
\emph on
|
||
some
|
||
\emph default
|
||
fatal incident
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
lower
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
higher
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
# Customers affected
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
very high
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
very low
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
MTBF per storage
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
higher
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
lower
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
MTTR per storage
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
higher
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
lower
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Unrepairable residual risk
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
higher
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
lower
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Total impact
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
higher
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
lower
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Investor's risk
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\series bold
|
||
unbearable
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
stock exchange compatible
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Summary: CentralStorage is something for
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
\noindent
|
||
Small to medium-sized companies which don't have the
|
||
\series bold
|
||
manpower
|
||
\series default
|
||
and the
|
||
\series bold
|
||
skills
|
||
\series default
|
||
for professionally building and operating a (Local)Sharding (or similar)
|
||
system for their enterprise-critical mass data their business is relying
|
||
upon.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
|
||
\series bold
|
||
\emph on
|
||
Monolithic
|
||
\emph default
|
||
enterprise applications
|
||
\series default
|
||
like classical SAP which are anyway bound to a specific vendor, where you
|
||
cannot select a different solution (so-called
|
||
\series bold
|
||
Vendor Lock-In
|
||
\series default
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When your application
|
||
\series bold
|
||
is neither shardable
|
||
\series default
|
||
by construction (c.f.
|
||
section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Distributed-vs-Local:"
|
||
|
||
\end_inset
|
||
|
||
), or when doing so would require too much effort,
|
||
\series bold
|
||
nor going to BigCluster
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Theoretically, BigCluster can be used to create 1 single huge remote LV
|
||
(or 1 single huge remote FS instance) out of a pool of storage machines.
|
||
Double-check, better triple-check that such a
|
||
\series bold
|
||
big
|
||
\emph on
|
||
logical
|
||
\emph default
|
||
SPOF
|
||
\series default
|
||
is
|
||
\emph on
|
||
really
|
||
\emph default
|
||
needed, and cannot be circumvented by any means.
|
||
Only in such a case, the current version of MARS cannot help (yet), because
|
||
its
|
||
\emph on
|
||
current
|
||
\emph default
|
||
|
||
\emph on
|
||
focus
|
||
\emph default
|
||
is on a big number of machines each having relatively small LVs.
|
||
At 1&1 ShaHoLin, the biggest LVs are 40TiB at the moment, running for years
|
||
now, and bigger ones are certainly possible.
|
||
Only when current local RAID technology with external enclosures cannot
|
||
easily create a single LV in the petabyte scale, BigCluster is probably
|
||
the better solution (c.f.
|
||
section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\series default
|
||
(e.g.
|
||
Ceph / Swift / etc, see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
) is an option.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
If you have an
|
||
\emph on
|
||
already sharded
|
||
\emph default
|
||
system, e.g.
|
||
in webhosting, don't convert it to a non-shardable one, and don't introduce
|
||
SPOFs needlessly.
|
||
You will introduce
|
||
\series bold
|
||
technical debts
|
||
\series default
|
||
which are likely to come back to hurt you sometime in the future!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As a real big
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
global player
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, or as a company being part of such a structure, you should be careful
|
||
when listening to
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
marketing drones
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
of proprietary CentralStorage vendors.
|
||
Always check your
|
||
\emph on
|
||
concrete
|
||
\emph default
|
||
use case.
|
||
Never believe in wrongly generalized claims, which are only valid in some
|
||
specific context, but do not really apply to your use case.
|
||
It could be about your
|
||
\emph on
|
||
life
|
||
\emph default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Proprietary vs OpenSource
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Proprietary-vs-OpenSource"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In theory, the following dimensions are orthogonal to each other:
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Architecture: LocalStorage vs CentralStorage vs DistributedStorage
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Licensing: Proprietary vs OpenSource
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In practice, however, many vendors of proprietary storage systems are selecting
|
||
the CentralStorage model.
|
||
This way, they can avoid inter-operability with their competitors.
|
||
This opens the door for the so-called
|
||
\series bold
|
||
Vendor Lock-In
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In contrast, the OpenSource community is based on
|
||
\emph on
|
||
cooperation
|
||
\emph default
|
||
.
|
||
Opting for OpenSource means that you can
|
||
\series bold
|
||
combine and exchange
|
||
\series default
|
||
numerous
|
||
\series bold
|
||
components
|
||
\series default
|
||
with each other.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Key OpenSource players are
|
||
\emph on
|
||
basing
|
||
\emph default
|
||
their business on the
|
||
\series bold
|
||
usefulness
|
||
\series default
|
||
of their software components for you, their customer.
|
||
Please search the internet for further explanations from Eric S.
|
||
Raymond.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Therefore
|
||
\series bold
|
||
interoperability
|
||
\series default
|
||
is a
|
||
\emph on
|
||
must
|
||
\emph default
|
||
in the OpenSource business.
|
||
For example, you can relatively easily migrate between DRBD and MARS, forth
|
||
and backwards, see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Setup-Primary-and"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
The
|
||
\emph on
|
||
generic
|
||
\emph default
|
||
block devices provided by both DRBD and MARS (and by the kernel LVM2 implementa
|
||
tion, and many others
|
||
\begin_inset Formula $\ldots$
|
||
\end_inset
|
||
|
||
) can interact with zillions of filesystems, VMs, applications, and so forth.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Summary:
|
||
\series bold
|
||
genericity
|
||
\series default
|
||
is a highly desired property in OpenSource communities, while proprietary
|
||
products often try to control their usage by limiting either technical
|
||
interoperability at certain layers, and/or legally by contracts.
|
||
Trying to do so with OpenSource would make no sense, because
|
||
\emph on
|
||
you
|
||
\emph default
|
||
, the customer, are the
|
||
\emph on
|
||
real
|
||
\emph default
|
||
king who can
|
||
\emph on
|
||
really
|
||
\emph default
|
||
select and combine components.
|
||
You can form a
|
||
\series bold
|
||
really customized system
|
||
\series default
|
||
to your
|
||
\series bold
|
||
\emph on
|
||
real needs
|
||
\series default
|
||
\emph default
|
||
, not as just promised but not always actually delivered by so-called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
marketing drones
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
from commercial vendors who are actually preferring the needs of their employer
|
||
over yours.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There is another fundamental difference between proprietary software and
|
||
OpenSource: the former is bound to some company, which may
|
||
\emph on
|
||
vanish
|
||
\emph default
|
||
from the market.
|
||
Commercial storage systems may be
|
||
\series bold
|
||
discontinued
|
||
\series default
|
||
.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This can be a serious threat to your business relying on the value of your
|
||
data.
|
||
In particular, buying storage systems from
|
||
\emph on
|
||
small
|
||
\emph default
|
||
vendors may increase this risk
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
There is a risk of a
|
||
\emph on
|
||
domino effect
|
||
\emph default
|
||
: once there is a critical incident on highly redundant CentralStorage boxes
|
||
from a particular (smaller) vendor, this may lead to major public media
|
||
attention.
|
||
This may form the
|
||
\emph on
|
||
root cause
|
||
\emph default
|
||
for such a vendor to vanish from the market.
|
||
Thus you may be left alone with a buggy system, even if you aren't the
|
||
victim of the concrete incident.
|
||
\end_layout
|
||
|
||
\begin_layout Plain Layout
|
||
In contrast, bugs in an OpenSource component can be fixed by a larger community
|
||
of interested people, or by yourself if you hire somebody for this.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
OpenSource is different: it cannot die, even if the individual, or the (small)
|
||
company which produced it, no longer exists.
|
||
The source code is in the
|
||
\series bold
|
||
public
|
||
\series default
|
||
.
|
||
It just could get
|
||
\emph on
|
||
outdated
|
||
\emph default
|
||
over time.
|
||
However, as long as there is enough public interest, you will always find
|
||
somebody who is willing to adapt and to
|
||
\emph on
|
||
maintain
|
||
\emph default
|
||
it.
|
||
Even if you would be the only one having such an interest, you can
|
||
\emph on
|
||
hire
|
||
\emph default
|
||
a maintainer for it, specifically for your needs.
|
||
You aren't
|
||
\series bold
|
||
helpless
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Distributed vs Local: Scalability Arguments from Architecture
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Distributed-vs-Local:"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Datacenters aren't usually operated for fun or for hobby.
|
||
Scalability of an
|
||
\emph on
|
||
architecture
|
||
\emph default
|
||
(cf section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:What-is-Architecture"
|
||
|
||
\end_inset
|
||
|
||
) is very important, because it can seriously limit your business.
|
||
Overcoming architectural ill-designs can grow extremely cumbersome and
|
||
costly.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Many enterprise system architects are starting with a particular architecture
|
||
in mind, called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Big Cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
There is a common belief that otherwise
|
||
\series bold
|
||
scalability
|
||
\series default
|
||
could not be achieved:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/Architecure_Big_Cluster.pdf
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The crucial point is the
|
||
\series bold
|
||
storage network
|
||
\series default
|
||
here:
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
storage servers are interconnected with
|
||
\begin_inset Formula $m=O(n)$
|
||
\end_inset
|
||
|
||
frontend servers, in order to achieve properties like scalability, failure
|
||
tolerance, etc.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Since
|
||
\emph on
|
||
any
|
||
\emph default
|
||
of the
|
||
\begin_inset Formula $m$
|
||
\end_inset
|
||
|
||
frontends must be able to access
|
||
\emph on
|
||
any
|
||
\emph default
|
||
of the
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
storages in realtime, the storage network must be dimensioned for
|
||
\begin_inset Formula $O(n\cdot m)=O(n^{2})$
|
||
\end_inset
|
||
|
||
network connections running in parallel.
|
||
Even if the total network throughput is scaling only with
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
, nevertheless
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
network connections have to be maintained at connection-oriented protocols
|
||
and at various layers of the operating software.
|
||
The network has to
|
||
\emph on
|
||
switch
|
||
\emph default
|
||
the packets from
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
sources to
|
||
\begin_inset Formula $m$
|
||
\end_inset
|
||
|
||
destinations (and their opposite way back) in
|
||
\series bold
|
||
realtime
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This
|
||
\series bold
|
||
cross-bar functionality
|
||
\series default
|
||
in realtime makes the storage network complicated and expensive.
|
||
Some further factors are increasing the costs of storage networks:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
In order to limit error propagation from other networks, the storage network
|
||
is often built as a
|
||
\emph on
|
||
physically separate
|
||
\emph default
|
||
=
|
||
\emph on
|
||
dedicated
|
||
\emph default
|
||
network.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Because storage networks are heavily reacting to high latencies and packet
|
||
loss, they often need to be dimensioned for the
|
||
\series bold
|
||
worst case
|
||
\series default
|
||
(load peaks, packet storms, etc), needing one of the best = typically most
|
||
expensive components for reducing latency and increasing throughput.
|
||
Dimensioning to the worst case instead of an average case plus some safety
|
||
margins is nothing but an expensive
|
||
\series bold
|
||
overdimensioning
|
||
\series default
|
||
/
|
||
\series bold
|
||
over-engineering
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When
|
||
\series bold
|
||
multipathing
|
||
\series default
|
||
is required for improving fault tolerance of the storage network itself,
|
||
these efforts will even
|
||
\emph on
|
||
double
|
||
\emph default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When geo-redundancy is required, the total effort may easily more than double
|
||
another time because in cases of disasters like terrorist attacks the backup
|
||
datacenter must be prepared for taking over for multiple days or weeks.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Fortunately, there is an alternative called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
|
||
\series bold
|
||
Sharding Architecture
|
||
\series default
|
||
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
|
||
\series bold
|
||
Shared-nothing Architecture
|
||
\series default
|
||
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Definition of Sharding
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "par:Definition-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice that the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
originates from database architecture
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/Shard_(database_architecture)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
where it has a slightly different meaning than used here.
|
||
Our usage of the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
reflects slightly different situations in some webhosting companies
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
According to
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/Shared-nothing_architecture
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
, Google also uses the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
for a particular
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
shared-nothing architecture
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
Although our above definition of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
does not fully comply with its original meaning, a similar usage by Google
|
||
probably means that our usage of the term is not completely uncommon.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
, and can be certainly transferred to some more application areas.
|
||
Our more specific use of the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
has the following properties,
|
||
\emph on
|
||
all at the same time:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
User / customer data is
|
||
\series bold
|
||
partitioned
|
||
\series default
|
||
.
|
||
This is very similar to database sharding.
|
||
However, the original database term also allows
|
||
\emph on
|
||
some
|
||
\emph default
|
||
data to remain unpartitioned.
|
||
In webhosting, suchalike may also exist, but typically only for
|
||
\emph on
|
||
system data,
|
||
\emph default
|
||
like OS images, including large parts of their configuration data.
|
||
Suchalike system data is typically
|
||
\emph on
|
||
replicated
|
||
\emph default
|
||
from a central
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
golden image
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
in an
|
||
\emph on
|
||
offline
|
||
\emph default
|
||
fashion, e.g.
|
||
via regular
|
||
\family typewriter
|
||
rsync
|
||
\family default
|
||
cron jobs, etc.
|
||
Typically, it comprises only a few gigabytes per instance and is mostly
|
||
read-only with a slow change rate, while total customer data is typically
|
||
in the range of some petabytes with a higher total change rate.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Servers have
|
||
\series bold
|
||
no single point of contention
|
||
\series default
|
||
, and thus are
|
||
\series bold
|
||
completely independent
|
||
\series default
|
||
from each other, like in
|
||
\series bold
|
||
shared-nothing
|
||
\series default
|
||
architectures
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/Shared-nothing_architecture
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
However, the original term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
shared-nothing
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
has also been used for describing
|
||
\emph on
|
||
replicas
|
||
\emph default
|
||
, e.g.
|
||
DRBD mirrors.
|
||
In our context of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, the shared-nothing principle
|
||
\emph on
|
||
only
|
||
\emph default
|
||
refers to the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
|
||
\series bold
|
||
no single point of contention
|
||
\series default
|
||
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
principle at
|
||
\emph on
|
||
partitioning
|
||
\emph default
|
||
level, which means it
|
||
\emph on
|
||
only
|
||
\emph default
|
||
refers to the
|
||
\emph on
|
||
partitioning
|
||
\emph default
|
||
of the user data, but
|
||
\emph on
|
||
not
|
||
\emph default
|
||
to their replicas.
|
||
Shared-nothing replicas in the sense of DRBD may be also present (and in
|
||
fact they are at 1&1 Shared Hosting Linux), but these replicas are
|
||
\emph on
|
||
not
|
||
\emph default
|
||
meant by our usage of the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
Customer data replicas form an
|
||
\emph on
|
||
independent
|
||
\emph default
|
||
dimension called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
replication layer
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
The replication layer also obeys the shared-nothing principle in its original
|
||
sense, but it is
|
||
\emph on
|
||
not
|
||
\emph default
|
||
meant by our term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
sharding
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
in order to avoid confusion
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice that typically
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
architectures are also abstracting away their replicas when talking about
|
||
their architecture.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
between these two independent dimensions.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Our sharding model does not need a dedicated storage network at all, at
|
||
least when built and dimensioned properly.
|
||
Instead, it
|
||
\emph on
|
||
should have
|
||
\emph default
|
||
(but not always needs) a so-called
|
||
\series bold
|
||
replication network
|
||
\series default
|
||
which can, when present, be dimensioned much smaller because it does neither
|
||
need realtime operations nor scalability to
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/Architecure_Sharding.pdf
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Sharding architectures are extremely well suited when both the input traffic
|
||
and the data is
|
||
\series bold
|
||
already partitioned
|
||
\series default
|
||
.
|
||
For example, when several thousands or even millions of customers are operating
|
||
on disjoint data sets, like in web hosting where each webspace is residing
|
||
in its own home directory, or when each of millions of mySQL database instances
|
||
has to be isolated from its neighbour.
|
||
Masses of customers are also appearing at cloud storage applications like
|
||
Cloud Filesystems (e.g.
|
||
Dropbox or similar).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Even in cases when any customer may potentially access any of the data items
|
||
residing in the whole storage pool (e.g.
|
||
like in a search engine), sharding can be often applied.
|
||
The trick is to create some relatively simple content-based dynamic switching
|
||
or redirect mechanism in the input network traffic, similar to HTTP load
|
||
balancers or redirectors.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Only when partitioning of input traffic plus data is not possible in a reasonabl
|
||
e way, big cluster architectures as implemented for example in Ceph or Swift
|
||
(and partly even possible with MARS when restricted to the block layer)
|
||
have a very clear use case.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In the following sections, we will see: when sharding is possible, it is
|
||
the preferred model due to reliability and cost and performance reasons.
|
||
Another good explanation can be found at
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architectur
|
||
e/
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Variants of Sharding
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Variants-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
LocalSharding The simplest possible sharding architecture is simply putting
|
||
both the storage and the compute CPU power onto the same iron.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Example: at 1&1 Shared Hosting Linux (ShaHoLin), we have dimensioned several
|
||
variants of this.
|
||
(a) we are using 1U pizza boxes with local hardware RAID controllers with
|
||
fast hardware BBU cache and up to 10 local disks for the majority of LXC container
|
||
instances where the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
small-sized
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
customers (up to ~100 GB webspace per customer) are residing.
|
||
Since most customers have very small home directories with extremely many
|
||
but small files, this is a very cost-efficient model.
|
||
(b) less than 1 permille of all customers have > 250 GB (up to 2TB) per
|
||
home directory.
|
||
For these few customers we are using another dimensioning variant of the
|
||
same architecture: 4U servers with 48 high-capacity spindles on 3 RAID
|
||
sets, delivering a total PV capacity of ~300 TB, which are then cut down
|
||
to ~10 LXC containers of ~30 TB each.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
In order to operate this model at a bigger scale, you should consider the
|
||
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
container football
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
method as described in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Principle-of-Background"
|
||
|
||
\end_inset
|
||
|
||
and in chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "chap:LV-Football"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
RemoteSharding This variant needs a (possibly dedicated) storage network,
|
||
which is however only
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
in total.
|
||
Each storage server exports a block device over iSCSI (or over another
|
||
transport) to at most
|
||
\begin_inset Formula $O(k)$
|
||
\end_inset
|
||
|
||
dedicated compute nodes where
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
is some
|
||
\series bold
|
||
constant
|
||
\series default
|
||
.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Hint 1: it is advisable to build this type of storage network with
|
||
\series bold
|
||
local switches
|
||
\series default
|
||
and no routers in between, in order to avoid
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
-style network architectures and traffic.
|
||
This reduces error propagation upon network failures.
|
||
Keep the storage and the compute nodes locally close to each other, e.g.
|
||
in the same datacenter room, or even in the same rack.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Hint 2: additionally, you can provide some (low-dimensioned) backbone for
|
||
|
||
\series bold
|
||
exceptional(!)
|
||
\series default
|
||
cross-traffic between the local storage switches.
|
||
Don't plan to use any realtime cross-traffic
|
||
\emph on
|
||
regularly
|
||
\emph default
|
||
, but only in clear cases of emergency!
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Notice: in this model, a shard typically consists of one storage node plus
|
||
|
||
\begin_inset Formula $k+1$
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Formula $k+2$
|
||
\end_inset
|
||
|
||
compute servers, introducing some additional failure redundancy
|
||
\emph on
|
||
within
|
||
\emph default
|
||
such a shard, while retaining the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
no single point of contention
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
property
|
||
\emph on
|
||
between
|
||
\emph default
|
||
the shards (according to the definition
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "par:Definition-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
FlexibleSharding This is a dynamic combination of LocalSharding and RemoteShardi
|
||
ng, dynamically re-configurable, as explained below.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
BigClusterSharding The sharding model can also be placed
|
||
\series bold
|
||
on top of
|
||
\series default
|
||
a BigCluster model, or possibly
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
internally
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
in such a model, leading to a similar effect.
|
||
Whether this makes sense needs some discussion.
|
||
It can be used to reduce the
|
||
\emph on
|
||
logical
|
||
\emph default
|
||
BigCluster size from
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
to some
|
||
\begin_inset Formula $O(k)$
|
||
\end_inset
|
||
|
||
, such that it is no longer a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
big cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
but a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
small cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, and thus reducing the serious problems described in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
to some degree.
|
||
This could make sense in the following use cases:
|
||
\end_layout
|
||
|
||
\begin_deeper
|
||
\begin_layout Itemize
|
||
When you
|
||
\series bold
|
||
already have
|
||
\series default
|
||
invested in a big cluster, e.g.
|
||
Ceph or Swift, which does not really scale and/or does not really deliver
|
||
the expected reliability.
|
||
Some possible reasons for this are explained in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When you really need a
|
||
\emph on
|
||
single
|
||
\emph default
|
||
LV which is necessarily
|
||
\series bold
|
||
bigger
|
||
\series default
|
||
than can be reasonably built on top of local LVM.
|
||
This means, you are likely claiming that you really need
|
||
\series bold
|
||
strict consistency
|
||
\series default
|
||
as provided by a block device on more than 1 PB with current technology
|
||
(2018).
|
||
Examples are very
|
||
\series bold
|
||
big enterprise databases
|
||
\series default
|
||
like classical SAP (c.f.
|
||
section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Local-vs-Centralized"
|
||
|
||
\end_inset
|
||
|
||
), or if you really need
|
||
\series bold
|
||
POSIX-compliance
|
||
\series default
|
||
on a single big filesystem instance.
|
||
Be cautious when you think this is the only solution to your problem.
|
||
Double-check or triple-check whether there is
|
||
\emph on
|
||
really
|
||
\emph default
|
||
no other solution than creating such a huge block device and/or such a
|
||
huge filesystem instance.
|
||
Such huge SPOFs tend to create similar problems as described in
|
||
section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
for similar reasons.
|
||
\end_layout
|
||
|
||
\end_deeper
|
||
\begin_layout Standard
|
||
When building a
|
||
\series bold
|
||
new
|
||
\series default
|
||
storage system, be sure to check the following use cases.
|
||
You should seriously consider a LocalSharding / RemoteSharding / FlexibleShardi
|
||
ng model in favor of BigClusterSharding when ...
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
...
|
||
when more than 1 LV instance would be placed onto your
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
small cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
shards.
|
||
Then a
|
||
\series bold
|
||
{Local,Remote,Flexible}Sharding
|
||
\series default
|
||
model could likely be used instead.
|
||
Then the total overhead (
|
||
\series bold
|
||
total cost of ownership
|
||
\series default
|
||
) introduced by a BigCluster
|
||
\emph on
|
||
model
|
||
\emph default
|
||
but actually stripped down to a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
SmallCluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
|
||
\emph on
|
||
implementation / configuration
|
||
\emph default
|
||
should be examined separately.
|
||
Does it really pay off?
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
...
|
||
when there are
|
||
\series bold
|
||
legal requirements
|
||
\series default
|
||
that you can tell at any time where your data is.
|
||
Typically, this is anything but easy on a BigCluster model, even when stripped
|
||
down to SmallCluster size.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
FlexibleSharding
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:FlexibleSharding"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice that MARS' new remote device feature from the 0.2 branch series (which
|
||
is a kind of replacement for iSCSI)
|
||
\emph on
|
||
could
|
||
\emph default
|
||
be used for implementing some sort of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
big cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
model at block layer.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Nevertheless, such models re-introducing some kind of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
big dedicated storage network
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
into MARS operations are not the preferred model.
|
||
Following is a super-model which combines both the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
big cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
and sharding model at block layer in a very flexible way.
|
||
The following example shows only two servers from a pool consisting of
|
||
hundreds or thousands of servers:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/MARS_Cluster_on_Demand.pdf
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The idea is to use iSCSI or the MARS remote device
|
||
\emph on
|
||
only where necessary
|
||
\emph default
|
||
.
|
||
Preferably, local storage is divided into multiple Logical Volumes (LVs)
|
||
via LVM, which are
|
||
\emph on
|
||
directly
|
||
\emph default
|
||
used
|
||
\emph on
|
||
locally
|
||
\emph default
|
||
by Virtual Machines (VMs), such as KVM or filesystem-based variants like
|
||
LXC containers.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In the above example, the left machine has relatively less CPU power or
|
||
RAM than storage capacity.
|
||
Therefore, not
|
||
\emph on
|
||
all
|
||
\emph default
|
||
LVs could be instantiated locally at the same time without causing operational
|
||
problems, but
|
||
\emph on
|
||
some
|
||
\emph default
|
||
of them can be run locally.
|
||
The example solution is to
|
||
\emph on
|
||
exceptionally(!)
|
||
\emph default
|
||
export LV3 to the right server, which has some otherwise unused CPU and
|
||
RAM capacity.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice that local operations of VMs don't produce any storage network
|
||
traffic at all.
|
||
Therefore, this is the preferred runtime configuration.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Only in cases of resource imbalance, such as (transient) CPU or RAM peaks
|
||
(e.g.
|
||
caused by DDOS attacks),
|
||
\emph on
|
||
some
|
||
\emph default
|
||
VMs or containers may be run somewhere else over the network.
|
||
In a well-balanced and well-dimensioned system, this will be the
|
||
\series bold
|
||
vast minority
|
||
\series default
|
||
, and should only be used for dealing with temporary load peaks etc.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Running VMs directly on the same servers as their storage is a
|
||
\series bold
|
||
major cost reducer.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
You simply don't need to buy and operate
|
||
\begin_inset Formula $n+m$
|
||
\end_inset
|
||
|
||
servers, but only about
|
||
\begin_inset Formula $\max(n,m)+m\cdot\epsilon$
|
||
\end_inset
|
||
|
||
servers, where
|
||
\begin_inset Formula $\epsilon$
|
||
\end_inset
|
||
|
||
corresponds to some relatively small extra resources needed by MARS.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
In addition to this and to reduced networking costs, there are further cost
|
||
savings at power consumption, air conditioning, Height Units (HUs), number
|
||
of HDDs, operating costs, etc., as explained below in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Cost-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Principle of Background Migration
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Principle-of-Background"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The sharding model needs a different approach to load balancing of storage
|
||
space than the big cluster model.
|
||
There are several possibilities at different layers, each addressing different
|
||
|
||
\series bold
|
||
granularities
|
||
\series default
|
||
:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Moving customer data at filesystem or database level via
|
||
\family typewriter
|
||
rsync
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
mysqldump
|
||
\family default
|
||
or similar.
|
||
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Example: at 1&1 Shared Hosting Linux, we have about 9 million customer
|
||
home directories.
|
||
We also have a script
|
||
\family typewriter
|
||
movespace.pl
|
||
\family default
|
||
using incremental
|
||
\family typewriter
|
||
tar
|
||
\family default
|
||
for their moves.
|
||
Now, if we would try to move around
|
||
\emph on
|
||
all
|
||
\emph default
|
||
of them this way, it could easily take years or even decades for millions
|
||
of extremely small home directories, due to overhead like DNS updates etc.
|
||
However, there exists a small handful of large customer home directories
|
||
in the terabyte range.
|
||
For these, and only for these, it is a clever idea to use
|
||
\family typewriter
|
||
movespace.pl
|
||
\family default
|
||
because thereby the size of a LV can be regulated at a finer granularity than
|
||
at LV level.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Dynamically growing the sizes of LVs during operations:
|
||
\family typewriter
|
||
lvresize
|
||
\family default
|
||
followed by
|
||
\family typewriter
|
||
marsadm resize
|
||
\family default
|
||
followed by
|
||
\family typewriter
|
||
xfs_growfs
|
||
\family default
|
||
or similar operations.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Moving whole LVs via MARS, as shown in the following example:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/MARS_Background_Migration.pdf
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The idea is to dynamically create
|
||
\emph on
|
||
additional
|
||
\emph default
|
||
LV replicas for the sake of
|
||
\series bold
|
||
background migration
|
||
\series default
|
||
.
|
||
Examples:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
In case you had no redundancy at LV level before, you have
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
replicas during ordinary operation.
|
||
If not yet done, you should transparently introduce MARS into your LVM-based
|
||
stack by using the so-called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
standalone mode
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
of MARS.
|
||
When necessary, create the first MARS replica with
|
||
\family typewriter
|
||
marsadm create-resource
|
||
\family default
|
||
on your already-existing LV data, which is retained unmodified, and restart
|
||
your application again.
|
||
Now, for the sake of migration, you just create an additional replica at
|
||
another server via
|
||
\family typewriter
|
||
marsadm join-resource
|
||
\family default
|
||
there and wait until the second mirror has been fully
|
||
\series bold
|
||
synced
|
||
\series default
|
||
in background, while your application is running and while the contents
|
||
of the LV are modified
|
||
\emph on
|
||
in parallel
|
||
\emph default
|
||
by your ordinary applications.
|
||
Then you do a primary
|
||
\series bold
|
||
handover
|
||
\series default
|
||
to your mirror.
|
||
This is usually a matter of minutes, or even seconds.
|
||
Once the application runs again at the new location, you can delete the
|
||
old replica via
|
||
\family typewriter
|
||
marsadm leave-resource
|
||
\family default
|
||
and
|
||
\family typewriter
|
||
lvremove
|
||
\family default
|
||
.
|
||
Finally, you may re-use the freed-up space for something else (e.g.
|
||
|
||
\family typewriter
|
||
lvresize
|
||
\family default
|
||
of
|
||
\emph on
|
||
another
|
||
\emph default
|
||
LV followed by
|
||
\family typewriter
|
||
marsadm resize
|
||
\family default
|
||
followed by
|
||
\family typewriter
|
||
xfs_growfs
|
||
\family default
|
||
or similar).
|
||
For the sake of some hardware lifecycle, you may run a different strategy:
|
||
evacuate the original source server completely via the above MARS migration
|
||
method, and eventually decommission it.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
In case you already have a redundant LV copy somewhere, you should run a
|
||
similar procedure, but starting with
|
||
\begin_inset Formula $k=2$
|
||
\end_inset
|
||
|
||
replicas, and temporarily increasing the number of replicas to either
|
||
\begin_inset Formula $k'=3$
|
||
\end_inset
|
||
|
||
when moving each replica step-by-step, or you may even directly go up to
|
||
|
||
\begin_inset Formula $k'=4$
|
||
\end_inset
|
||
|
||
when moving pairs at once.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Example: see
|
||
\family typewriter
|
||
football.sh
|
||
\family default
|
||
in the
|
||
\family typewriter
|
||
football/
|
||
\family default
|
||
directory of MARS, which is a checkout of the Football sub-project (see
|
||
chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "chap:LV-Football"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When already starting with
|
||
\begin_inset Formula $k>2$
|
||
\end_inset
|
||
|
||
LV replicas in the starting position, you can do the same analogously,
|
||
or you may then use a lesser variant.
|
||
For example, we have some mission-critical servers at 1&1 which are running
|
||
|
||
\begin_inset Formula $k=4$
|
||
\end_inset
|
||
|
||
replicas all the time on relatively small but important LVs for extremely
|
||
increased safety.
|
||
Only in such a case, you may have the freedom to temporarily decrease from
|
||
|
||
\begin_inset Formula $k=4$
|
||
\end_inset
|
||
|
||
to
|
||
\begin_inset Formula $k'=3$
|
||
\end_inset
|
||
|
||
and then go up to
|
||
\begin_inset Formula $k''=4$
|
||
\end_inset
|
||
|
||
again.
|
||
This has the advantage of requiring less temporary storage space for
|
||
\emph on
|
||
swapping
|
||
\emph default
|
||
some LVs.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Cost Arguments
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Cost-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
A common pre-judgement is that
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
big cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
is the cheapest scaling storage technology when built on so-called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
commodity hardware
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
While this is very often true for the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
commodity hardware
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
part, it is often not true for the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
big cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
part.
|
||
But let us first look at the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
commodity
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
part.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Cost Arguments from Technology
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Here are some rough market prices for basic storage as determined around
|
||
end of 2016 / start of 2017:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="6" columns="3">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
Technology
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
Enterprise-Grade
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
Price in € / TB
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
Consumer SATA disks via on-board SATA controllers
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
no (small-scale)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
< 30 possible
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
SAS disks via SAS HBAs (e.g.
|
||
in external 14
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
shelves)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
halfways
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
< 80
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
SAS disks via hardware RAID + LVM (+DRBD/MARS)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
80 to 150
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
Commercial storage appliances via iSCSI
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
around 1000
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
Cloud storage, S3 over 5 years lifetime
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
yes
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\size small
|
||
3000 to 8000
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
You can see that any self-built and self-administered storage (whose price
|
||
varies with slower high-capacity disks versus faster low-capacity disks)
|
||
is much cheaper than any commercial offering by about a factor of 10 or
|
||
even more.
|
||
If you need to operate several petabytes of data, self-built storage is
|
||
always cheaper than a commercial one, even if additional manpower is needed
|
||
for commissioning and operating.
|
||
You don't have to pay the shareholders of the storage provider.
|
||
Here we just assume that the storage is needed permanently for at least
|
||
5 years, as is the case in web hosting, databases, backup / archival systems,
|
||
and many other application areas.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Commercial offerings of cloud storage are way too much hyped.
|
||
Some people apparently don't know that the generic term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Cloud Storage
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
refers to a
|
||
\emph on
|
||
storage class
|
||
\emph default
|
||
, not to a particular
|
||
\emph on
|
||
instance
|
||
\emph default
|
||
like original Amazon S3, and that it is possible to build and operate almost
|
||
any instance of any storage class yourself.
|
||
From a commercial perspective,
|
||
\series bold
|
||
outsourcing
|
||
\series default
|
||
of
|
||
\emph on
|
||
huge masses
|
||
\emph default
|
||
of enterprise-critical storage (to whatever class of storage) usually pays
|
||
off
|
||
\series bold
|
||
only when
|
||
\series default
|
||
your storage demands are either
|
||
\emph on
|
||
relatively low
|
||
\emph default
|
||
, or are
|
||
\emph on
|
||
extremely
|
||
\emph default
|
||
varying over time, and/or when you need some
|
||
\emph on
|
||
extra
|
||
\emph default
|
||
capacity only
|
||
\emph on
|
||
temporarily
|
||
\emph default
|
||
for a
|
||
\emph on
|
||
very
|
||
\emph default
|
||
short time.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Cost Arguments from Architecture
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In addition to basic storage prices, many further factors come into play
|
||
when roughly comparing big cluster architectures versus sharding.
|
||
The following table bears the
|
||
\emph on
|
||
unrealistic assumption
|
||
\emph default
|
||
that BigCluster can be reliably operated with 2 replicas (
|
||
\family roman
|
||
\series medium
|
||
\shape up
|
||
\size normal
|
||
\emph off
|
||
\bar no
|
||
\strikeout off
|
||
\uuline off
|
||
\uwave off
|
||
\noun off
|
||
\color none
|
||
the suffix
|
||
\begin_inset Formula $\times2$
|
||
\end_inset
|
||
|
||
|
||
\family default
|
||
\series default
|
||
\shape default
|
||
\size default
|
||
\emph default
|
||
\bar default
|
||
\strikeout default
|
||
\uuline default
|
||
\uwave default
|
||
\noun default
|
||
\color inherit
|
||
means with additional geo-redundancy):
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="5" columns="5">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
BC
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
SHA
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
BC
|
||
\begin_inset Formula $\times2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
SHA
|
||
\begin_inset Formula $\times2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
# of Disks
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
>200%
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
<120%
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
>400%
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
<240%
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
# of Servers
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times1.1$
|
||
\end_inset
|
||
|
||
possible
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times4$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2.2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Power Consumption
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times1.1$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times4$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2.2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
HU Consumption
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times1.1$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times4$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2.2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
As shown in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
, two replicas are typically not sufficient for BigCluster.
|
||
Even addicts of BigCluster typically recommend 3 replicas in some
|
||
so-called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
best practices
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, leading to the following more realistic table:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="5" columns="5">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
BC
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
SHA
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
BC
|
||
\begin_inset Formula $\times2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
SHA
|
||
\begin_inset Formula $\times2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
# of Disks
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
>300%
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
<120%
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
>600%
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
<240%
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
# of Servers
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times3$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times1.1$
|
||
\end_inset
|
||
|
||
possible
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times6$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2.2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Power Consumption
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times3$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times1.1$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times6$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2.2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
HU Consumption
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times3$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times1.1$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times6$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
\begin_inset Formula $\approx\times2.2$
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The crucial point is not only the number of extra servers needed for dedicated
|
||
storage boxes, but also the total number of HDDs.
|
||
While big cluster implementations like Ceph or Swift can
|
||
\emph on
|
||
theoretically
|
||
\emph default
|
||
use some erasure encoding for avoiding full object replicas, their
|
||
\emph on
|
||
practice
|
||
\emph default
|
||
as seen in internal 1&1 Ceph clusters is similar to RAID-10, but just on
|
||
objects instead of block-based sectors.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Therefore a big cluster typically needs >300% disks to reach the same net
|
||
capacity as a simple sharded cluster.
|
||
The latter can typically take advantage of hardware RAID-60 with a significantl
|
||
y smaller disk overhead, while providing sufficient failure tolerance at
|
||
disk level.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There is a surprising consequence from this: geo-redundancy is not as expensive
|
||
as many people believe.
|
||
It just needs to be built with the proper architecture.
|
||
A sharded geo-redundant pool based on hardware RAID-60 (last column
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
SHA
|
||
\begin_inset Formula $\times2$
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
) costs typically
|
||
\emph on
|
||
less
|
||
\emph default
|
||
than a non-geo-redundant big cluster with the typically needed / recommended
|
||
number of replicas (column
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
BC
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
).
|
||
A geo-redundant sharded pool provides even better failure compensation
|
||
(see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice that geo-redundancy implies by definition that an unforeseeable
|
||
\series bold
|
||
full datacenter loss
|
||
\series default
|
||
(e.g.
|
||
caused by
|
||
\series bold
|
||
disasters
|
||
\series default
|
||
like a terrorist attack or an earthquake) must be compensated for
|
||
\series bold
|
||
several days or weeks
|
||
\series default
|
||
.
|
||
Therefore it is
|
||
\emph on
|
||
not
|
||
\emph default
|
||
sufficient to take a big cluster and just spread it to two different locations.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In any case, a MARS-based geo-redundant sharding pool is cheaper than using
|
||
commercial storage appliances which are much more expensive by their nature.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Reliability Arguments from Architecture
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
A contemporary common belief is that big clusters and their random replication
|
||
methods would provide better reliability than anything else.
|
||
There are some practical observations at 1&1 and its subsidiaries
|
||
which cannot confirm this.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Similar experiences are part of a USENIX paper about copysets, see
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://www.usenix.org/system/files/conference/atc13/atc13-cidon.pdf
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Their proposed solution is different from the solution proposed here, but
|
||
interestingly their
|
||
\emph on
|
||
problem analysis
|
||
\emph default
|
||
part contains not only similar observations, but also comes to similar
|
||
conclusions about random replication.
|
||
Citation from the abstract:
|
||
\end_layout
|
||
|
||
\begin_layout Quote
|
||
However, random replication is
|
||
\series bold
|
||
almost guaranteed
|
||
\series default
|
||
to lose data in the common scenario of simultaneous node failures due to
|
||
cluster-wide power outages.
|
||
|
||
\size footnotesize
|
||
[emphasis added by me]
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Stimulated by our practical experiences even in truly less disastrous scenarios
|
||
than mass power outage, theoretical explanations were sought.
|
||
Surprisingly, they show that LocalSharding is superior to true big clusters
|
||
under practically important preconditions.
|
||
Here is an intuitive explanation.
|
||
A detailed mathematical description of the model can be found in appendix
|
||
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "chap:Mathematical-Model-of"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Storage Server Node Failures
|
||
\end_layout
|
||
|
||
\begin_layout Subsubsection
|
||
Simple intuitive explanation
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Block-level replication systems like DRBD are constructed for failover in
|
||
local redundancy scenarios.
|
||
Or, when using MARS, even for geo-redundant failover scenarios.
|
||
They are traditionally dealing with
|
||
\series bold
|
||
pairs
|
||
\series default
|
||
of servers, or with triples, etc.
|
||
In order to get a storage incident with them,
|
||
\emph on
|
||
both
|
||
\emph default
|
||
sides of a DRBD or MARS small-cluster (also called
|
||
\series bold
|
||
shard
|
||
\series default
|
||
in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "par:Definition-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
) must have an incident
|
||
\emph on
|
||
at the same time
|
||
\emph default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In contrast, big clusters are conceptually spreading their objects over
|
||
a huge number of nodes
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
, with some redundancy degree
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
denoting the number of replicas.
|
||
As a consequence,
|
||
\emph on
|
||
any
|
||
\emph default
|
||
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
node failures out of
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
will produce an incident.
|
||
For example, when
|
||
\begin_inset Formula $k=2$
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
is equal for both models, then
|
||
\emph on
|
||
any
|
||
\emph default
|
||
combination of two node failures occurring at the same time will lead to
|
||
an incident:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/Incident_Probabilities.pdf
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Intuitively, it is easy to see that hitting both members of the same pair
|
||
at the same time is less likely than hitting
|
||
\emph on
|
||
any
|
||
\emph default
|
||
two nodes of a big cluster.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
If you are curious about some concrete numbers, read on.
|
||
\end_layout
|
||
|
||
\begin_layout Subsubsection
|
||
Detailed explanation
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sub:Detailed-explanation"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
For the sake of simplicity, the following more detailed explanation is based
|
||
on the following assumptions:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
We are looking at
|
||
\series bold
|
||
storage node
|
||
\series default
|
||
failures only.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Disk failures are regarded as already solved (e.g.
|
||
by local RAID-6 or by the well-known compensation mechanisms of big clusters).
|
||
Only in case they don't work, they are mapped to node failures, and are
|
||
already included in the probability of storage node failures.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
We only look at
|
||
\series bold
|
||
data replication
|
||
\series default
|
||
with a redundancy degree of a relatively small
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
.
|
||
CRC methods are not used across storage nodes, but may be present
|
||
\emph on
|
||
internally
|
||
\emph default
|
||
at some storage nodes, e.g.
|
||
RAID-5 or RAID-6 or similar methods.
|
||
Notice that CRC methods generally involve very high overhead, and even
|
||
won't work in realtime across long distances (geo-redundancy).
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
We restrict ourselves to temporary /
|
||
\series bold
|
||
transient
|
||
\series default
|
||
failures, without regarding permanent data loss.
|
||
Otherwise, the differences between local-storage sharding architectures
|
||
and big clusters would become even worse.
|
||
When losing some physical storage nodes forever in a big cluster, it is
|
||
typically anything but easy to determine which data of which application
|
||
instances / customers have been affected, and which will need a restore
|
||
from backup.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Storage network failures (as a whole) are ignored.
|
||
Otherwise a fair comparison between the architectures would become difficult.
|
||
If they were taken into account, the advantages of LocalSharding would
|
||
become even bigger.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
We assume that the storage network (when present) forms no bottleneck.
|
||
Network implementations like TCP/IP versus Infiniband or similar are thus
|
||
ignored.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Software failures / bugs are also ignored.
|
||
We only compare
|
||
\emph on
|
||
architectures
|
||
\emph default
|
||
here, not their various implementations.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
The x axis shows the number of basic storage units
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
from an
|
||
\emph on
|
||
application
|
||
\emph default
|
||
perspective, meaning
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
usable storage
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
net amount of storage
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
For simplicity of the model, one basic application storage unit is equal
|
||
to the total disk space provided by one physical storage node in the special
|
||
case of
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
replicas.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Attention! when increasing the number of replicas
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
, the total number of storage nodes needs to be
|
||
\series bold
|
||
increased accordingly
|
||
\series default
|
||
.
|
||
Typically, you will need to deploy
|
||
\begin_inset Formula $k\cdot n$
|
||
\end_inset
|
||
|
||
physical storage nodes in order to get
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
net storage units from a user's perspective.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Attention!
|
||
\begin_inset space ~
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
has a strong influence on the
|
||
\series bold
|
||
price tag
|
||
\series default
|
||
of any of the competing architectures.
|
||
You cannot assume an
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
infinite amount of money
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
Therefore, only relatively small
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
are bearable for business cases.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
We assume that the number of application instances is linearly scaling with
|
||
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
.
|
||
For simplicity, we assume that the number of applications running on the
|
||
whole pool is exactly
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
We assume that the storage nodes are (almost completely) filled with data
|
||
(sectors with RAID, and/or objects with BigCluster).
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
We assume that the number of sectors / objects per storage node is
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
very large
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
Some examples: a logical volume of 4 TB has 1,000,000,000 sectors or objects,
|
||
each 4 KB in size.
|
||
A physical storage node providing 40 TB of storage will then provide 10
|
||
billions of sectors / objects.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
For the BigCluster architecture, we assume that all objects are always distribut
|
||
ed to
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
nodes.
|
||
For simplicity of the model, we assume a distribution via a
|
||
\emph on
|
||
uniform
|
||
\emph default
|
||
hash function.
|
||
When other hash functions were used (e.g.
|
||
distributing only to a constant number of nodes), it would no longer be
|
||
a big cluster architecture in our sense.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
In the following example, we assume a uniform object distribution to exactly
|
||
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
nodes.
|
||
Notice that any other
|
||
\begin_inset Formula $n'=O(n)$
|
||
\end_inset
|
||
|
||
with
|
||
\begin_inset Formula $n'<n$
|
||
\end_inset
|
||
|
||
will produce similar results for
|
||
\begin_inset Formula $n'\rightarrow\infty$
|
||
\end_inset
|
||
|
||
, but may be better in detail for smaller
|
||
\begin_inset Formula $n'$
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When random distribution / random replication methods are used at BigCluster
|
||
object stores, we assume that for any pair (or
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
-tuple) of storage nodes, the total number of objects is so high that there
|
||
always
|
||
\emph on
|
||
exist
|
||
\emph default
|
||
some objects which are present at
|
||
\emph on
|
||
all
|
||
\emph default
|
||
of the nodes of any pair /
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
-tuple for any reasonable (small)
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
.
|
||
This means, we assume not only uniformity in random replication, but also
|
||
that the total number of objects is practically
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
infinite
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
compared to relatively small practical values of
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
For mathematically interested readers: be careful when trying to argue
|
||
with the probability to hit some object intersection for some given
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
-tuple of storage nodes while
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
is a growing parameter.
|
||
Even when such a
|
||
\emph on
|
||
single
|
||
\emph default
|
||
probability is declining with growing both
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
, and even when the
|
||
\emph on
|
||
single
|
||
\emph default
|
||
probability for the existence of an intersection eventually gets lower than
|
||
|
||
\begin_inset Formula $1$
|
||
\end_inset
|
||
|
||
, this has an impact on the
|
||
\emph on
|
||
total
|
||
\emph default
|
||
incident probability of the
|
||
\emph on
|
||
whole
|
||
\emph default
|
||
BigCluster.
|
||
In
|
||
\emph on
|
||
general
|
||
\emph default
|
||
, the
|
||
\emph on
|
||
number
|
||
\emph default
|
||
of such tuples is growing with
|
||
\begin_inset Formula $O(\binom{k\cdot n}{k})=O((k\cdot n)!)$
|
||
\end_inset
|
||
|
||
.
|
||
So, don't forget to sum up
|
||
\emph on
|
||
all
|
||
\emph default
|
||
probabilities even if a single one appears to be
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
negligible
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
For the LocalSharding (DRBDorMARS) architecture, we assume that only local
|
||
storage is used.
|
||
For higher replication degrees
|
||
\begin_inset Formula $k=2,\ldots$
|
||
\end_inset
|
||
|
||
, the only occurring communication is
|
||
\emph on
|
||
among
|
||
\emph default
|
||
the pairs / triples / and so on (shards), but no communication to other
|
||
shards is necessary.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
For simplicity of the example, we assume that any single storage server
|
||
node used in either architecture, including all of its local disks, has
|
||
a reliability of 99.99% (four nines).
|
||
This means, the probability of a storage node failure is uniformly assumed
|
||
as
|
||
\begin_inset Formula $p=0.0001$
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
This means, during an observation period of
|
||
\begin_inset Formula $T=10,000$
|
||
\end_inset
|
||
|
||
operation hours, we will have a total downtime of 1 hour per server in
|
||
statistical average.
|
||
For simplicity, we assume that the failure probability of a single server
|
||
depends neither on previous failures nor on the operating conditions
|
||
of any other server.
|
||
It is known that this is not true in general, but otherwise our model would
|
||
become extremely complex.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
More intuitively, our observation period of
|
||
\begin_inset Formula $T=10,000$
|
||
\end_inset
|
||
|
||
operation hours corresponds to about 13 months, or slightly more than a
|
||
year.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Consequence: when operating a pool of 10,000 storage servers, then in statistica
|
||
l
|
||
\emph on
|
||
average
|
||
\emph default
|
||
there will be
|
||
\emph on
|
||
almost always
|
||
\emph default
|
||
one node which has failed.
|
||
This is like a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
permanent incident
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
which has to be solved by the competing storage architectures.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Hint: the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
statistical average
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
is somewhat vague here, in order to not confuse readers
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
The problem is that sometimes more servers than average can be down, and
|
||
sometimes less.
|
||
Average values should not be used in the mathematical model, but exact
|
||
ones.
|
||
However, humans can often imagine things better when provided with
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
average behaviour
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, so we use it here just for the sake of ease of understanding.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
A more elaborate statistical model can be found in appendix
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "chap:Mathematical-Model-of"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Let us start the comparison with a simple corner case: plain old servers
|
||
with no further redundancy, other than their local RAIDs.
|
||
This naturally corresponds to
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
replicas when using the DRBDorMARS architecture.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Now we apply the corner case of
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
replicas to both architectures, i.e.
|
||
also to BigCluster, in order to shed some light on the fundamental
|
||
properties of the architectures.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Under the precondition of
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
replicas, an incident of each one of the
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
servers has two possible ways to influence the downtime from an application's
|
||
perspective:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Downtime of 1 storage node only influences 1 application unit depending
|
||
on 1 basic storage unit.
|
||
This is the case with the DRBDorMARS model, because there is no communication
|
||
between shards, and we assumed that 1 storage server unit also carries
|
||
exactly 1 application unit.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Downtime of 1 storage node will
|
||
\series bold
|
||
tear down more
|
||
\series default
|
||
than 1 application unit, because any of the application units have spread
|
||
their storage to more than 1 storage node via uniform hashing, as is the
|
||
case at BigCluster.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
For ease of understanding, let us zoom into the special case
|
||
\begin_inset Formula $n=2$
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
for a moment.
|
||
These are the smallest numbers where you already can see the effect.
|
||
In the following table, we denote 4 possible status combinations out of
|
||
2 servers A and B, where the cells are showing the number of application
|
||
units influenced:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset ERT
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
|
||
\backslash
|
||
hfill
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="3" columns="3">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="right" valignment="top" width="0pt">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
LocalSharding
|
||
\size tiny
|
||
(DRBDorMARS)
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
A up
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
A down
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
B up
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
0
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
1
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
B down
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
1
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
2
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\begin_inset ERT
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
|
||
\backslash
|
||
hfill
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="3" columns="3">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="right" valignment="top" width="0pt">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
BigCluster
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
A up
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
A down
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
B up
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
0
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
2
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
B down
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
2
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
2
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\begin_inset ERT
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
|
||
\backslash
|
||
hfill
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\begin_inset space ~
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
What is the heart of the difference? While a node failure at LocalSharding
|
||
(DRBDorMARS) will tear down only the local application, the teardown produced
|
||
by BigCluster will spread to
|
||
\emph on
|
||
all
|
||
\emph default
|
||
of the
|
||
\begin_inset Formula $n=2$
|
||
\end_inset
|
||
|
||
application units, because of the uniform hashing and because we have only
|
||
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
replica.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Would it help to increase both
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
to larger values?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In the following graphics, the thick red line shows the behaviour for
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
PlainServers (which is the same as
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
DRBDorMARS) with increasing number of storage units
|
||
\begin_inset Formula $n,$
|
||
\end_inset
|
||
|
||
ranging from 1 to 10,000 storage units = number of servers for
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
.
|
||
Higher values of
|
||
\begin_inset Formula $k\in[1,4]$
|
||
\end_inset
|
||
|
||
are also displayed.
|
||
All lines corresponding to the same
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
are drawn in the same color.
|
||
Notice that both the x and y axis are logscale:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/SERVICE_Comparison_of_Reversible_StorageNode_Failures.pdf
|
||
lyxscale 200
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
When you look at the thin solid BigCluster lines for
|
||
\begin_inset Formula $k=2,\ldots$
|
||
\end_inset
|
||
|
||
drawn in different colors, you may wonder why they are altogether converging
|
||
to the thin red BigCluster line, which corresponds to
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
BigCluster.
|
||
And they also converge towards the grey dotted topmost line indicating
|
||
the total possible uptime of all applications (depending on x).
|
||
It can be explained as follows:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The x axis shows the number of basic storage units.
|
||
When you have to create 10,000 storage units with a replication degree
|
||
of
|
||
\begin_inset Formula $k=2$
|
||
\end_inset
|
||
|
||
replicas, then you will have to deploy
|
||
\begin_inset Formula $k\cdot10,000=20,000$
|
||
\end_inset
|
||
|
||
servers in total.
|
||
When operating a pool of 20,000 servers, on statistical average 2
|
||
of them will be down at any given point in time.
|
||
However, 2 is the same number as the replication degree
|
||
\begin_inset Formula $k.$
|
||
\end_inset
|
||
|
||
Because our BigCluster model as defined above will distribute
|
||
\emph on
|
||
all
|
||
\emph default
|
||
objects to
|
||
\emph on
|
||
all
|
||
\emph default
|
||
servers uniformly, there will almost always
|
||
\emph on
|
||
exist
|
||
\emph default
|
||
some objects for which no replica is available at any given point in time.
|
||
This means, you will almost always have a
|
||
\series bold
|
||
permanent incident
|
||
\series default
|
||
involving the same number of nodes as your replication degree
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
, and in turn
|
||
\emph on
|
||
some
|
||
\emph default
|
||
of your objects will not be accessible at all.
|
||
This means, at
|
||
\begin_inset Formula $x=10,000$
|
||
\end_inset
|
||
|
||
storage units you will lose almost any advantage from increasing the number
|
||
of replicas.
|
||
Adding more replicas will no longer help at
|
||
\begin_inset Formula $x\geq10,000$
|
||
\end_inset
|
||
|
||
storage units.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice that the
|
||
\emph on
|
||
solid
|
||
\emph default
|
||
lines are showing the probability of
|
||
\emph on
|
||
some
|
||
\emph default
|
||
incident, disregarding the
|
||
\series bold
|
||
size of the incident
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
What about the
|
||
\emph on
|
||
dashed
|
||
\emph default
|
||
lines showing much better behaviour for BigCluster?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Under some further preconditions, it would be possible to argue with the
|
||
|
||
\emph on
|
||
size
|
||
\emph default
|
||
of incidents.
|
||
However, now a big fat warning.
|
||
When you are
|
||
\series bold
|
||
responsible
|
||
\series default
|
||
for operations of thousands of servers, you should be very conscious about
|
||
these preconditions.
|
||
Otherwise you could risk your career.
|
||
In short:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When your application, e.g.
|
||
a smartphone app, consists of accessing only 1 object at all during a reasonabl
|
||
y long timeframe, you can safely
|
||
\series bold
|
||
assume that there is no interdependency
|
||
\series default
|
||
between all of your objects.
|
||
In addition, you have to assume (and you should check) that your cluster
|
||
operating software as a whole does not introduce any further
|
||
\series bold
|
||
hidden / internal interdependencies
|
||
\series default
|
||
.
|
||
Only in this case, and only then, you can take the dashed lines arguing
|
||
with the number of inaccessible objects instead of with the number of basic
|
||
storage units.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Whenever your application uses
|
||
\series bold
|
||
bigger structured logical objects
|
||
\series default
|
||
, such as filesystems or block devices or whole VMs / containers, then you
|
||
likely will get
|
||
\series bold
|
||
interdependent objects
|
||
\series default
|
||
at your big cluster storage layer.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Practical example: experienced sysadmins will confirm that even a data loss
|
||
rate of only 1/1,000,000 of blocks in a classical Linux filesystem like
|
||
|
||
\family typewriter
|
||
xfs
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
ext4
|
||
\family default
|
||
will likely imply the need for an offline filesystem check (
|
||
\family typewriter
|
||
fsck
|
||
\family default
|
||
), which is a major incident for the affected filesystem instances.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Theoretical explanation: servers are running for a very long time, and filesyste
|
||
ms are typically also mounted for a long time.
|
||
Notice that the probability of hitting any vital filesystem data roughly
|
||
equals the probability of hitting any other data.
|
||
Sooner or later, any defective sector in the metadata structures or in
|
||
freespace management etc will stop your whole filesystem, and in turn will
|
||
stop your application instance(s) running on top of it.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Similar arguments hold for transient failures: most classical filesystems
|
||
are not constructed for compensation of hanging IO, typically leading to
|
||
|
||
\series bold
|
||
system hangs
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Blindly taking the dashed lines will expose you to a high risk of error.
|
||
Practical experience shows that there are often
|
||
\series bold
|
||
hidden dependencies
|
||
\series default
|
||
in many applications, often also at application level.
|
||
You cannot necessarily see them when inspecting their data structures!
|
||
You will only notice some of them by analyzing their
|
||
\series bold
|
||
runtime behaviour
|
||
\series default
|
||
, e.g.
|
||
with tools like
|
||
\family typewriter
|
||
strace
|
||
\family default
|
||
.
|
||
Notice that in general the runtime behaviour of an arbitrary program is
|
||
|
||
\series bold
|
||
undecidable
|
||
\series default
|
||
.
|
||
Be cautious when drawing assumptions out of thin air!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Conversely, the assumption that
|
||
\emph on
|
||
any
|
||
\emph default
|
||
inaccessible object may halt your application, might be too strong for
|
||
|
||
\emph on
|
||
some
|
||
\emph default
|
||
use cases.
|
||
Therefore, some practical behaviour may be in between the solid thin lines
|
||
and the dashed lines of some given color.
|
||
Be extremely careful when constructing such an intermediate case.
|
||
The above example of a loss rate of 1/1,000,000 of sectors in a classical
|
||
filesystem should not be extended to lower values like 1/1,000,000,000
|
||
without knowing exactly how the filesystem works, and how it will react
|
||
|
||
\emph on
|
||
in detail
|
||
\emph default
|
||
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
In general, it is insufficient to analyze the logical dependencies inside
|
||
of a filesystem instance, such as which inode contains some pointers to
|
||
which other filesystem objects, etc.
|
||
There exist further
|
||
\series bold
|
||
runtime dependencies
|
||
\series default
|
||
, such as
|
||
\family typewriter
|
||
nr_requests
|
||
\family default
|
||
block-layer restrictions on IO queue depths, and/or capabilities / limitations
|
||
of the hardware, and so on.
|
||
Trying to model all of these influences in a reasonable way could be a
|
||
|
||
\emph on
|
||
major
|
||
\emph default
|
||
research undertaking outside the scope of this MARS manual.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
The grey zone between the extreme cases thin solid vs dashed is a
|
||
\series bold
|
||
dangerous zone
|
||
\series default
|
||
!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
If you want to stay on the
|
||
\series bold
|
||
safe side
|
||
\series default
|
||
, simply obey the fundamental law as explained in the next section:
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Optimum Reliability from Architecture
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Optimum-Reliability-from"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Another argument could be: don't distribute the BigCluster objects to exactly
|
||
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
nodes, but to fewer nodes.
|
||
Would the result be better than DRBDorMARS LocalSharding?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
When distributing to
|
||
\begin_inset Formula $O(k')$
|
||
\end_inset
|
||
|
||
nodes with some constant
|
||
\begin_inset Formula $k'$
|
||
\end_inset
|
||
|
||
, we have no longer a BigCluster architecture, but a mixed BigClusterSharding
|
||
form.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As can be generalized from the above tables, the reliability of
|
||
\series bold
|
||
any
|
||
\series default
|
||
BigCluster on
|
||
\begin_inset Formula $k'>k$
|
||
\end_inset
|
||
|
||
nodes is
|
||
\series bold
|
||
always
|
||
\series default
|
||
worse than that of LocalSharding on exactly
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
nodes, where
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
is also the redundancy degree.
|
||
In general:
|
||
\end_layout
|
||
|
||
\begin_layout Quote
|
||
|
||
\series bold
|
||
\size large
|
||
The LocalSharding model is the optimum model for reliability of operation,
|
||
compared to any other model truly distributing its data and operations
|
||
over truly more nodes, like RemoteSharding or BigClusterSharding or BigCluster
|
||
does.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There exists no better model because shards consisting of exactly
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
nodes where
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
is the redundancy degree are already the
|
||
\emph on
|
||
smallest possible shards
|
||
\emph default
|
||
under the assumptions of section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sub:Detailed-explanation"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Any other model truly involving
|
||
\begin_inset Formula $k'>k$
|
||
\end_inset
|
||
|
||
nodes for distribution of objects at any shard is
|
||
\series bold
|
||
always
|
||
\series default
|
||
worse in the dimension of reliability.
|
||
Thus the above sentence follows by induction.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The above sentence is formulating a
|
||
\series bold
|
||
fundamental law of storage systems
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Error Propagation to Client Mountpoints
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Error-Propagation-to"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The following is only applicable when filesystems (or their objectstore
|
||
counterparts) are exported over a storage network, in order to be mounted
|
||
in parallel at
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
mountpoints each.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In such a scenario, any problem / incident inside of your storage pool for
|
||
the filesystem instances will be spread to
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
clients, leading to an increase of the incident size by a factor of
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
when measured in number of affected mountpoints:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/MOUNTPOINTS_Comparison_of_Reversible_StorageNode_Failures.pdf
|
||
lyxscale 200
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
As a result, we now have a total of
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
mountpoints = our new basic application units.
|
||
Such
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
architectures are quickly becoming even worse than before.
|
||
Thus a clear warning: don't try to build systems in such a way.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice: DRBD or MARS are traditionally used for running the application
|
||
on the same box as the storage.
|
||
Thus they are not vulnerable to these kinds of failure propagation over
|
||
network.
|
||
Even with traditional iSCSI exports over DRBD or MARS, you won't have such
|
||
problems.
|
||
Your only chance to increase the error propagation is
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
NFS or
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
exports to
|
||
\begin_inset Formula $O(n)$
|
||
\end_inset
|
||
|
||
clients leading to a total number of
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
mountpoints, or similar setups.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Clear advice: don't do that.
|
||
It's a bad idea.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Similarities and Differences to Copysets
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Similarities-and-differences"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This section is mostly of academic interest.
|
||
You can skip it when looking for practical advice.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The USENIX paper about copysets (see
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://www.usenix.org/system/files/conference/atc13/atc13-cidon.pdf
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
) relates to the Sharding model in the following way:
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Similarities
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The concept of Random Replication of the storage data to a large number of
|
||
machines will reduce reliability.
|
||
When choosing too big sets of storage machines, then the storage system
|
||
as a whole will become practically unusable.
|
||
This is common ground between the USENIX paper and the Sharding Approach
|
||
as advocated here.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Differences
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The USENIX paper and many other Cloud Storage approaches are
|
||
\emph on
|
||
presuming
|
||
\emph default
|
||
that there exists a storage network, allowing real-time distribution of
|
||
replicas over this kind of network.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In contrast, the Sharding Approach to Cloud Storage tries to
|
||
\emph on
|
||
avoid
|
||
\emph default
|
||
real-time storage networks
|
||
\emph on
|
||
as much as possible
|
||
\emph default
|
||
.
|
||
Notice that RemoteSharding and further variants (including future improvements)
|
||
do
|
||
\emph on
|
||
not
|
||
\emph default
|
||
preclude it, but are trying to
|
||
\emph on
|
||
avoid
|
||
\emph default
|
||
real-time storage network traffic.
|
||
Instead, the load-balancing problem is addressed via
|
||
\series bold
|
||
background data migration
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This changes the
|
||
\emph on
|
||
temporal granularity
|
||
\emph default
|
||
of data access: many real-time accesses are
|
||
\emph on
|
||
shifted over
|
||
\emph default
|
||
to migration processes, which in turn are relaxing the requirements on
|
||
the network.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In detail, there are some more differences to the USENIX paper.
|
||
Some examples:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Terminology: the scatter width
|
||
\begin_inset Formula $S$
|
||
\end_inset
|
||
|
||
is defined (see page 39 of the paper) as: each node's data is split
|
||
\emph on
|
||
uniformly
|
||
\emph default
|
||
across a group of
|
||
\begin_inset Formula $S$
|
||
\end_inset
|
||
|
||
|
||
\emph on
|
||
other
|
||
\emph default
|
||
nodes.
|
||
In contrast, we neither assume uniformity, nor do we require the data
|
||
to be distributed to
|
||
\emph on
|
||
other
|
||
\emph default
|
||
nodes.
|
||
By using the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
other
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, the USENIX paper (as well as many other BigCluster approaches) are probably
|
||
presuming something like a distinction between
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
client
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
server
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
machines: while data processing is done on a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
client
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, data storage is on a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
server
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
We don't disallow this in variants like RemoteSharding or FlexibleSharding
|
||
and so on, but we gave some arguments why we are trying to
|
||
\emph on
|
||
avoid
|
||
\emph default
|
||
this.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
It seems that some definitions in the USENIX paper may implicitly relate
|
||
to
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
each chunk
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
In contrast, the Sharding Approach typically relates to LVs (logical volumes),
|
||
which could however be viewed as a special case of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
chunk
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, e.g.
|
||
by minimizing the number of chunks in a system.
|
||
However notice: there exist definitions of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
chunk
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
where it is the basic transfer unit.
|
||
An LV has the fundamental property that small-granularity
|
||
\series bold
|
||
update in place
|
||
\series default
|
||
(at any offset inside the LV) can be executed.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Notice: we do not preclude further fine-grained distribution of LV data,
|
||
but this is something which should be
|
||
\emph on
|
||
avoided
|
||
\emph default
|
||
if not absolutely necessary.
|
||
Preferred method in typical practical use cases: some storage servers may
|
||
have some spare RAID slots to be populated later, by resizing the PVs =
|
||
Physical Volumes before resizing LVs.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Notice that a typical local RAID system
|
||
\emph on
|
||
is also
|
||
\emph default
|
||
a Distributed System, according to some reasonable definition.
|
||
Typical RAID implementations just involve SAS cables instead of Ethernet
|
||
cables or Infiniband cables.
|
||
Notice that this also applies to many
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Commodity Hardware
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
approaches, like Ceph storage nodes driving dozens of local HDDs connected
|
||
over SAS or SATA.
|
||
The main difference is just that instead of a hardware RAID controller,
|
||
a hardware HBA = Host Bus Adapter is used.
|
||
Instead of Ethernet switches, SAS multiplexers in backplanes are used.
|
||
Anyway, this forms a locally distributed sub-system.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Future variants of the Sharding Approach might extend this already present
|
||
locally Distributed System to a somewhat wider one.
|
||
For example, creation of a local LV (called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
disk
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
in MARS terminology) could be implemented by a subordinate DRBD instance
|
||
implementing a future RAID-10 mode over local Infiniband or crossover Ethernet
|
||
cables, avoiding local switches.
|
||
While DRBD would essentially create the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
local
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
LV, the higher-level MARS instance would then be responsible for its wide-dista
|
||
nce replication.
|
||
See chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "chap:Use-Cases-for"
|
||
|
||
\end_inset
|
||
|
||
about use cases of MARS vs DRBD.
|
||
Potential future use cases could be
|
||
\emph on
|
||
extremely huge
|
||
\emph default
|
||
LVs where external SAS disk shelves are no longer sufficient to get the
|
||
desired capacity.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
The USENIX paper needs to treat the following parameters as more or less
|
||
fixed (or only slowly changeable)
|
||
\series bold
|
||
constants
|
||
\series default
|
||
, given by the system designer: the replication degree
|
||
\begin_inset Formula $R$
|
||
\end_inset
|
||
|
||
, and the scatter width
|
||
\begin_inset Formula $S$
|
||
\end_inset
|
||
|
||
.
|
||
In contrast, the replication degree
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
of our Sharding Approach is not necessarily firmly given by the system,
|
||
but can be
|
||
\series bold
|
||
dynamically changed
|
||
\series default
|
||
at runtime on a per-LV basis.
|
||
For example, during background migration via MARS the command
|
||
\family typewriter
|
||
marsadm join-resource
|
||
\family default
|
||
is used for creating additional per-LV replicas.
|
||
However notice: this freedom is limited by the total number of deployed
|
||
hardware nodes.
|
||
If you want
|
||
\begin_inset Formula $k=3$
|
||
\end_inset
|
||
|
||
replicas at the
|
||
\emph on
|
||
whole
|
||
\emph default
|
||
pool, then you will need to (dynamically) deploy at least about
|
||
\begin_inset Formula $k\cdot x$
|
||
\end_inset
|
||
|
||
nodes in general.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
The USENIX paper defines its copysets on a per-chunk basis.
|
||
Similarly to before, we can transfer this definition to a Sharding Approach
|
||
by relating it to a per-LV basis.
|
||
As a side effect, a copyset can then trivially become identical to
|
||
\begin_inset Formula $S$
|
||
\end_inset
|
||
|
||
when the definition of
|
||
\begin_inset Formula $S$
|
||
\end_inset
|
||
|
||
is also changed to a per-LV basis, analogously.
|
||
In the Sharding Approach, a distinction is not absolutely necessary, while
|
||
the USENIX paper has to invest some effort into clarifying the relationship
|
||
between
|
||
\begin_inset Formula $S$
|
||
\end_inset
|
||
|
||
and copysets as defined on a BigCluster model.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Neglecting the mentioned differences, we see our typical use case (LocalSharding
|
||
) as roughly equivalent to
|
||
\begin_inset Formula $S=R$
|
||
\end_inset
|
||
|
||
in the terminology of the USENIX paper, or to
|
||
\begin_inset Formula $S=k$
|
||
\end_inset
|
||
|
||
(our number of replicas) in our terminology.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
This means: we try to minimize the
|
||
\emph on
|
||
size
|
||
\emph default
|
||
of
|
||
\begin_inset Formula $S$
|
||
\end_inset
|
||
|
||
for any given per-LV
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
, which will lead to the best possible reliability (under the conditions
|
||
described in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sub:Detailed-explanation"
|
||
|
||
\end_inset
|
||
|
||
) as has been shown in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Optimum-Reliability-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Performance Arguments from Architecture
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Performance-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some people think that replication is easily done at filesystem layer.
|
||
There exist lots of cluster filesystems and other filesystem-layer solutions
|
||
which claim to be able to replicate your data, sometimes even over long
|
||
distances.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Trying to replicate several petabytes of data, or some billions of inodes,
|
||
is however a much bigger challenge than many people can imagine.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Choosing the wrong layer for
|
||
\series bold
|
||
mass data replication
|
||
\series default
|
||
may get you into trouble.
|
||
Here is an architectural-level (cf section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:What-is-Architecture"
|
||
|
||
\end_inset
|
||
|
||
) explanation why replication at the block layer is easier and less error
|
||
prone:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/Layers.pdf
|
||
width 100col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The picture shows the main components of a standalone Unix / Linux system.
|
||
In the late 1970s / early 1980s, a so-called
|
||
\emph on
|
||
Buffer Cache
|
||
\emph default
|
||
had been introduced into the architecture of Unix.
|
||
Today's Linux has refined the concept to various internal caches such as
|
||
the
|
||
\series bold
|
||
Page Cache
|
||
\series default
|
||
(for data) and the
|
||
\series bold
|
||
Dentry Cache
|
||
\series default
|
||
(for metadata).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
All these caches serve one main purpose
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Another important purpose is
|
||
\series bold
|
||
providing shared memory
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
: they are reducing the load onto the storage by exploitation of fast RAM.
|
||
A well-tuned cache can yield high cache hit ratios, typically 99%.
|
||
In some cases (as observed in practice) even more than 99.9%.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Now start distributing the system over long distances.
|
||
There are potential cut points A and B and C
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
In theory, there is another cut point D by implementing a generically distribute
|
||
d cache.
|
||
There exists some academic research on this, but practically usable enterprise-
|
||
grade systems are rare and not wide-spread.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Cut point A is application specific, and can have advantages because it
|
||
has knowledge of the application.
|
||
For example, replication of mail queues can be controlled much more fine-graine
|
||
d than at filesystem or block layer.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Cut points B and C are
|
||
\emph on
|
||
generic
|
||
\emph default
|
||
, supporting a wide variety of applications, without altering them.
|
||
Cutting at B means replication at filesystem level.
|
||
C means replication at block level.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
When replicating at B, you will notice that the caches are
|
||
\emph on
|
||
below
|
||
\emph default
|
||
your cut point.
|
||
Thus you will have to re-implement
|
||
\series bold
|
||
distributed caches
|
||
\series default
|
||
, and you will have to
|
||
\series bold
|
||
maintain cache coherence
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
When replicating at C, the Linux caches are
|
||
\emph on
|
||
above
|
||
\emph default
|
||
your cut point.
|
||
Thus you will receive much less traffic, typically already reduced by a
|
||
factor of 100, or even more.
|
||
This is much easier to cope with.
|
||
You will also profit from
|
||
\series bold
|
||
journalling filesystems
|
||
\series default
|
||
like
|
||
\family typewriter
|
||
ext4
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
xfs
|
||
\family default
|
||
.
|
||
In contrast,
|
||
\emph on
|
||
truly distributed
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
In this context,
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
truly
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
means that the POSIX semantics would be always guaranteed cluster-wide,
|
||
and even in case of partial failures.
|
||
In practice, some distributed filesystems like NFS don't even obey the
|
||
POSIX standard
|
||
\emph on
|
||
locally
|
||
\emph default
|
||
on 1 standalone client.
|
||
We know of projects which have
|
||
\emph on
|
||
failed
|
||
\emph default
|
||
precisely because of this.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\emph default
|
||
journalling is typically not available with distributed cluster filesystems.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
A
|
||
\emph on
|
||
potential
|
||
\emph default
|
||
drawback of block layer replication is that you are typically limited to
|
||
active-passive replication.
|
||
An active-active operation is not impossible at block layer (see combinations
|
||
of DRBD with
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
), but less common, and less safe to operate.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This limitation isn't necessarily caused by the choice of layer.
|
||
It is simply caused by the
|
||
\series bold
|
||
laws of physics
|
||
\series default
|
||
: communication is always limited by the speed of light.
|
||
A distributed filesystem is nothing else but a logically
|
||
\series bold
|
||
distributed shared memory
|
||
\series default
|
||
(DSM).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some decades of research on DSM have shown that there exist applications
|
||
/ workloads where the DSM model is
|
||
\emph on
|
||
inferior
|
||
\emph default
|
||
to the direct communication paradigm.
|
||
Even in short-distance / cluster scenarios.
|
||
Long-distance DSM is extremely cumbersome.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Therefore: you simply shouldn't try to solve long-distance communication
|
||
needs via communication over filesystems.
|
||
Even simple producer-consumer scenarios (one-way communication) are less
|
||
performant (e.g.
|
||
when compared to plain TCP/IP) when it comes to distributed POSIX semantics.
|
||
There is simply too much
|
||
\series bold
|
||
synchronisation overhead at metadata level
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
If you have a need for mixed operations at different locations in parallel:
|
||
just split your data set into disjoint filesystem instances (or database
|
||
/ VM instances, etc).
|
||
All you need is careful thought about the
|
||
\emph on
|
||
appropriate
|
||
\emph default
|
||
|
||
\emph on
|
||
granularity
|
||
\emph default
|
||
of your data sets (such as well-chosen
|
||
\emph on
|
||
sets
|
||
\emph default
|
||
of user homedirectory subtrees, or database sets logically belonging together,
|
||
etc).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Replication at filesystem level is often at single-file granularity.
|
||
If you have several millions or even billions of inodes, you may easily
|
||
find yourself in a snakepit.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Conclusion: active-passive operation over long distances (such as between
|
||
continents) is even an advantage.
|
||
It keeps you from trying bad / almost impossible things.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Scalability Arguments from Architecture
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Scalability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some people are talking about scalability by (1) looking at a relatively
|
||
small example cluster
|
||
\emph on
|
||
implementation
|
||
\emph default
|
||
of their respective (pre-)chosen
|
||
\emph on
|
||
architecture
|
||
\emph default
|
||
having
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
machines or
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
network components or running
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
application instances, and then (2) extrapolating its behaviour to bigger
|
||
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
.
|
||
They think if it runs with small
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
, it will also run for bigger
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This way of thinking and acting is completely broken, and can endanger both
|
||
companies and careers.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This is not only because of confusion of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
architecture
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
with
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
implementation
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, cf section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:What-is-Architecture"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
It is also fundamentally broken because it assumes some
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
linearity
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
in a field which is non-linear
|
||
\emph on
|
||
by definition
|
||
\emph default
|
||
.
|
||
If scalability were linear, the term would not be useful at all, because
|
||
there would be
|
||
\emph on
|
||
no limit
|
||
\emph default
|
||
.
|
||
However, limits exist in practice, and the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
scalability
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
is the
|
||
\emph on
|
||
means
|
||
\emph default
|
||
for describing the behaviour at or around the limit.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Another
|
||
\emph on
|
||
incorrect
|
||
\emph default
|
||
way of ill-defining the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
scalability
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
is looking at some relatively big
|
||
\emph on
|
||
example
|
||
\emph default
|
||
cluster, which is working in practice.
|
||
Arguing with an example of a working system is wrong by construction.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
|
||
\emph on
|
||
Every
|
||
\emph default
|
||
storage system on this globe has
|
||
\emph on
|
||
always
|
||
\emph default
|
||
some scalability limit, somewhere.
|
||
Scalability is
|
||
\emph on
|
||
always
|
||
\emph default
|
||
a
|
||
\series bold
|
||
non-linear
|
||
\series default
|
||
behaviour.
|
||
In order to find the practical limit, you must
|
||
\emph on
|
||
reach
|
||
\emph default
|
||
it.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Therefore, examples are fundamentally insufficient for proving scalability,
|
||
as well as for comparing the scalability of architectures and/or of certain
|
||
implementations.
|
||
Examples can only be used for
|
||
\emph on
|
||
disproving
|
||
\emph default
|
||
scalability.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Example Failures of Scalability
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Example-Failures-of"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The following description is a
|
||
\series bold
|
||
must read
|
||
\series default
|
||
for sysadmins and system architects, and also for managers who are
|
||
\series bold
|
||
responsible
|
||
\series default
|
||
.
|
||
The numbers and some details are from my memory, thus they need not be 100%
|
||
accurate in all places.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
It is about an operation environment for a
|
||
\emph on
|
||
new
|
||
\emph default
|
||
product, which was a proprietary web page editor running under a complicated
|
||
variant of a LAMP stack.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The setup started with a
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
|
||
\emph on
|
||
architecture
|
||
\emph default
|
||
, but actually sized as a
|
||
\family typewriter
|
||
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
SmallCluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
|
||
\family default
|
||
implementation.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Setup 1 (NFS)
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The first setup consisted of
|
||
\begin_inset Formula $n=6$
|
||
\end_inset
|
||
|
||
storage servers, each replicated to another datacenter via DRBD.
|
||
Each were exporting their filesystems via NFS to about the same number
|
||
of client servers, where Apache/PHP was supposed to serve the HTTP requests
|
||
from the customers, which were entering the client cluster via an HTTP load
|
||
balancer.
|
||
The load balancer was supposed to spread the HTTP load to the client servers
|
||
in a
|
||
\series bold
|
||
round-robin
|
||
\series default
|
||
fashion.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
|
||
\color lightgray
|
||
At this point, eager readers may notice some similarity with the error propagati
|
||
on problem treated in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Error-Propagation-to"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Notice that this is about
|
||
\emph on
|
||
scalability
|
||
\emph default
|
||
instead, but you should compare with that, to find some similarities.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
After the complicated system was built up and was working well enough, the
|
||
new product was launched via a marketing campaign with free trial accounts,
|
||
limited to some time.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
So the number of customers was ramping up from 0 to about 20,000 within
|
||
a few weeks.
|
||
When about 20,000 customers were running on the client machines, system
|
||
hangs were noticed, and also from a customer's perspective.
|
||
When too many customers were pressing the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
save
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
button in parallel on reasonably large web page projects, a big number
|
||
of small files, including a huge bunch of small image files, was generated
|
||
over a short period of time.
|
||
A few customers were pressing the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
save
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
button several times a minute, each time re-creating all of these files
|
||
again and again from the proprietary web page generator.
|
||
Result: the system appeared to hang.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
However, all of the servers, including the storage servers, were almost
|
||
|
||
\emph on
|
||
idle
|
||
\emph default
|
||
with respect to CPU consumption.
|
||
RAM sizes were also no problem.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
After investigating the problem for a while, it was noticed that the
|
||
\series bold
|
||
\emph on
|
||
network
|
||
\series default
|
||
\emph default
|
||
was the bottleneck, but not in terms of throughput.
|
||
The internal sockets were forming some
|
||
\series bold
|
||
queues
|
||
\series default
|
||
which were
|
||
\emph on
|
||
delaying
|
||
\emph default
|
||
the NFS requests in some
|
||
\series bold
|
||
ping-pong
|
||
\series default
|
||
like fashion, almost resulting in a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
deadlock
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
from a customer's perspective (a better term would be
|
||
\series bold
|
||
distributed livelock
|
||
\series default
|
||
or
|
||
\series bold
|
||
distributed thrashing
|
||
\series default
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Setup 2 (
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
)
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Due to some external investigations and recommendations, the system was
|
||
converted from NFS to
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
.
|
||
Now DRBD was operated in active-active mode.
|
||
Only one system software component was replaced with another one, without
|
||
altering the
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
architecture, and without changing the number of servers, which remained
|
||
a stripped-down
|
||
\family typewriter
|
||
SmallCluster
|
||
\family default
|
||
implementation.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Result: the problem with the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
hangs
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
disappeared.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
However, after the number of customers had exceeded the
|
||
\series bold
|
||
next scalability limit
|
||
\series default
|
||
of about 30,000 customers, the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
hang
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
problem appeared once again, in a similar way.
|
||
The system showed systematic incidents again.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Setup 3 (
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
as a substitute for NFS)
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
After investigating the network queueing behaviour and the lock contention
|
||
problems of
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
, the next solution was
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
However, when the number of customers exceeded the
|
||
\series bold
|
||
\emph on
|
||
next
|
||
\emph default
|
||
scalability limit
|
||
\series default
|
||
, which was about 50,000 customers, some of them hammering the cluster with
|
||
their
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
save
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
button, the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
hangs
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
appeared again.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Setup 4 (
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
replication as a substitute for DRBD)
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
After analyzing the problem once again, it was discovered by accident that
|
||
|
||
\family typewriter
|
||
drbdadm disconnect
|
||
\family default
|
||
|
||
\emph on
|
||
appeared
|
||
\emph default
|
||
to
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
solve
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
the problem.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Therefore DRBD was replaced with
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
replication.
|
||
There exists a
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
feature allowing replication of files at filesystem level.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This attempt was
|
||
\emph on
|
||
immediately
|
||
\emph default
|
||
resulting in an almost fatal disaster, and thus was stopped immediately:
|
||
the cluster completely broke down.
|
||
Almost nothing was working anymore.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The problem was even worse: switching off the
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
replication and rollback to DRBD did not work.
|
||
The system remained unusable.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As a temporary workaround,
|
||
\family typewriter
|
||
drbdadm disconnect
|
||
\family default
|
||
was improving the situation enough for some limping operation.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Retrospective explanation: some of the reasons can be found in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Behaviour-of-DRBD"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
replication does not scale at all because it stores its replication information
|
||
at
|
||
\series bold
|
||
per-inode granularity
|
||
\series default
|
||
in EAs (extended attributes), which must
|
||
\emph on
|
||
necessarily
|
||
\emph default
|
||
be worse than DRBD, because there were some hundreds of millions of them
|
||
in total as reported by
|
||
\family typewriter
|
||
df -i
|
||
\family default
|
||
(see the cut point discussion in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Performance-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
Overnight in some cron jobs, these EAs had to be deleted in reasonably
|
||
sized batches in order to become more or less
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
operable
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
again.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Setup 5 (Sharding on top of DRBD)
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
After the almost fatal incident had been resolved to a less critical one,
|
||
the responsibility for setup was taken over by another person.
|
||
After the
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
behaviour from section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Distributed-vs-Local:"
|
||
|
||
\end_inset
|
||
|
||
had been understood, and after it was clear that sharding is only
|
||
\begin_inset Formula $O(k)$
|
||
\end_inset
|
||
|
||
from a customer's perspective, it was the final solution.
|
||
Now the problem was resolved at
|
||
\series bold
|
||
\emph on
|
||
architectural level
|
||
\series default
|
||
\emph default
|
||
, no longer by just replacing some components with some others.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The system was converted to a variant of a
|
||
\family typewriter
|
||
RemoteSharding
|
||
\family default
|
||
model (see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Variants-of-Sharding"
|
||
|
||
\end_inset
|
||
|
||
), and some
|
||
\family typewriter
|
||
migrate
|
||
\family default
|
||
scripts were introduced for load balancing of customer homedirectories
|
||
and databases between shards.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As a side effect, the load balancer got a new role: instead of spreading
|
||
|
||
\emph on
|
||
all
|
||
\emph default
|
||
of the HTTP requests to
|
||
\emph on
|
||
all
|
||
\emph default
|
||
of the client servers in a round-robin fashion, it now acted as a redirection
|
||
mechanism at
|
||
\emph on
|
||
shard granularity
|
||
\emph default
|
||
, e.g.
|
||
when one of the client servers was handed over to another one for maintenance.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Retrospective explanation: DRBD was definitely
|
||
\emph on
|
||
not
|
||
\emph default
|
||
the real reason for the critical incident.
|
||
The replication traffic per shard is so low on average that until today,
|
||
no replacement by MARS was absolutely necessary
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Many sysadmins are running a conservative strategy: never touch a running
|
||
system...
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
, although the distance is over 50 km.
|
||
If you wonder why such low write traffic demands can cause such a big incident:
|
||
look at the
|
||
\series bold
|
||
cache reduction
|
||
\series default
|
||
graphics in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Performance-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Today, the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
save
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
buttons of the customers are just triggering some
|
||
\emph on
|
||
extra
|
||
\emph default
|
||
|
||
\series bold
|
||
writebacks
|
||
\series default
|
||
from the Page Cache of the kernel into the block layer, after some
|
||
\emph on
|
||
delay
|
||
\emph default
|
||
.
|
||
These writebacks are not performance critical in reality, because the Page
|
||
Cache is running them
|
||
\series bold
|
||
\emph on
|
||
asynchronously in background
|
||
\series default
|
||
\emph default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
In contrast, distributed filesystems like
|
||
\family typewriter
|
||
NFS
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
are not working asynchronously in many places, but will often schedule
|
||
their requests
|
||
\emph on
|
||
synchronously
|
||
\emph default
|
||
into ordinary network queues, which form a
|
||
\series bold
|
||
sequential bottleneck
|
||
\series default
|
||
, competing with other high-frequency filesystem operations.
|
||
In addition, the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
save
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
button triggers masses of metadata / inode updates in a short time, often
|
||
residing in the same directory.
|
||
Such a directory may thus form a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
global
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
bottleneck.
|
||
When suchalike competing
|
||
\series bold
|
||
metadata updates
|
||
\series default
|
||
are distributed via a round-robin load balancer, the problem can easily
|
||
become critical by the
|
||
\series bold
|
||
cache coherence problem
|
||
\series default
|
||
.
|
||
While local filesystems can smoothen such application behaviour via the
|
||
Dentry Cache plus Inode Cache, which also show some asynchronous writeback
|
||
behaviour, network filesystems are often unable to deal with this performantly.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Although DRBD has a similar sequential bottleneck at the low-frequency
|
||
block layer by its write-through strategy into its replica, this does not
|
||
really matter: all other writebacks from the Page Cache are
|
||
\emph on
|
||
also
|
||
\emph default
|
||
started asynchronously, and triggered low-frequently, and are occurring
|
||
after some
|
||
\emph on
|
||
delay
|
||
\emph default
|
||
(which in turn will smoothen the
|
||
\series bold
|
||
spikes
|
||
\series default
|
||
caused by
|
||
\series bold
|
||
mass dirtification
|
||
\series default
|
||
of many small files and inodes in a short time as caused by the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
save
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
button), and thus are not really performance critical for this particular
|
||
use case.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
This is a striking example why careful
|
||
\series bold
|
||
selection of granularity level
|
||
\series default
|
||
(filesystem vs block layer) is essential.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
This is also a striking example why asynchronous operations can form a
|
||
huge advantage in certain use cases.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The sharding setup is working until today, scaling up to the current number
|
||
of customers, which is more than an order of magnitude higher, in the range of
|
||
about a million customers.
|
||
Of course, the number of shards had to be increased, but this is just what
|
||
sharding is about.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Properties of Storage Scalability
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Properties-Scalability"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Subsubsection
|
||
Influence Factors at Scalability
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Influence-Factors-Scalability"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In general, scalability of storage systems depends on the following factors
|
||
(list may be incomplete):
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The
|
||
\series bold
|
||
application class
|
||
\series default
|
||
, in particular its principal
|
||
\series bold
|
||
workingset behaviour
|
||
\series default
|
||
(in both dimensions: timely and locality).
|
||
More explanations about workingsets can be found at
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
http://blkreplay.org
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The
|
||
\series bold
|
||
size
|
||
\series default
|
||
|
||
\begin_inset Formula $x$
|
||
\end_inset
|
||
|
||
of the application data and/or the
|
||
\series bold
|
||
number of application instances
|
||
\series default
|
||
(possibly also denoted by
|
||
\begin_inset Formula $x$
|
||
\end_inset
|
||
|
||
), and the amount of storage needed for it (could be also termed
|
||
\begin_inset Formula $x$
|
||
\end_inset
|
||
|
||
).
|
||
Besides the data itself, the corresponding
|
||
\series bold
|
||
metadata
|
||
\series default
|
||
(inodes, indexes, etc) can form an important factor, or can even
|
||
\emph on
|
||
dominate
|
||
\emph default
|
||
the whole story.
|
||
Typically, critical datacenter application data is tremendously differently
|
||
sized from workstation data.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresToxiques.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Caution! Many people erroneously think that scalability would be
|
||
\emph on
|
||
linearly
|
||
\emph default
|
||
depending on
|
||
\begin_inset Formula $x$
|
||
\end_inset
|
||
|
||
.
|
||
However, as is known at least since the 1960s (read some ancient papers
|
||
from Saltzer and/or from Denning), scalability is
|
||
\series bold
|
||
never linear
|
||
\series default
|
||
, but sometimes even
|
||
\series bold
|
||
\emph on
|
||
disruptive
|
||
\series default
|
||
\emph default
|
||
, in particular when RAM size is the bottleneck.
|
||
IO queues and/or networking queues are often also reacting to overload
|
||
in a disruptive fashion.
|
||
This means: after exceeding the
|
||
\series bold
|
||
scalability limit
|
||
\series default
|
||
of a particular system for its particular class of applications, the system
|
||
will always
|
||
\series bold
|
||
break down
|
||
\series default
|
||
from a customer's perspective, sometimes almost completely, and sometimes
|
||
even
|
||
\series bold
|
||
\emph on
|
||
fatally
|
||
\series default
|
||
\emph default
|
||
.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
On the other hand, some other systems are reacting with
|
||
\series bold
|
||
graceful degradation
|
||
\series default
|
||
.
|
||
Whether a particular system reacts to a particular type of (over)load,
|
||
either with graceful degradation, or with fatal disruption, or with some
|
||
intermediate behaviour, is some sort of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
quality property
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
of the system and/or of the application.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
EVERY SYSTEM, even sharded systems, and even the internet as a whole, has
|
||
|
||
\emph on
|
||
always
|
||
\emph default
|
||
some scalability limit
|
||
\emph on
|
||
somewhere
|
||
\emph default
|
||
.
|
||
There exists
|
||
\series bold
|
||
no
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
infinitely scaling
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
system
|
||
\series default
|
||
on earth!
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The
|
||
\series bold
|
||
\emph on
|
||
distribution
|
||
\series default
|
||
\emph default
|
||
of the application behaviour in both
|
||
\series bold
|
||
temporal
|
||
\series default
|
||
and
|
||
\series bold
|
||
locality
|
||
\series default
|
||
dimensions.
|
||
Depending on the application class, this is often an
|
||
\emph on
|
||
exponential
|
||
\emph default
|
||
distribution according to Zipf's law.
|
||
By falsely
|
||
\emph on
|
||
assuming
|
||
\emph default
|
||
an equal distribution (or a Gaussian distribution) instead of actually
|
||
measuring the distribution in both dimensions, you can easily induce zillions
|
||
of costly problems for big
|
||
\begin_inset Formula $x$
|
||
\end_inset
|
||
|
||
, or even fatal failure of the whole system / project.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The
|
||
\series bold
|
||
transformation
|
||
\series default
|
||
of the application workingset behaviour at architectural level, sometimes
|
||
caused by certain components resp their specific implementation or parameteriza
|
||
tion.
|
||
Examples are intermediate virtualization layers, e.g.
|
||
vmware
|
||
\family typewriter
|
||
*.vmdk
|
||
\family default
|
||
or KVM
|
||
\family typewriter
|
||
*.qcow2
|
||
\family default
|
||
container formats which can completely change the game, not only in extreme
|
||
cases.
|
||
Another example is
|
||
\series bold
|
||
random distribution
|
||
\series default
|
||
to object stores, which can turn some uncomplicated sequential workloads
|
||
into highly problematic
|
||
\emph on
|
||
random IO
|
||
\emph default
|
||
workloads.
|
||
Don't overlook such potential pitfalls!
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The storage
|
||
\series bold
|
||
architecture
|
||
\series default
|
||
to be chosen, such as
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
vs
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
vs
|
||
\family typewriter
|
||
*Sharding
|
||
\family default
|
||
.
|
||
Choice of the wrong architecture can be fatal for big
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
and/or for certain temporal / spatial application behaviour.
|
||
Changing an architecture during operations on some petabytes of data and/or
|
||
some billions of inodes can be almost impossible, and/or can consume a
|
||
lot of time and money.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The
|
||
\series bold
|
||
number
|
||
\series default
|
||
of storage
|
||
\series bold
|
||
nodes
|
||
\series default
|
||
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
.
|
||
In some architectures, addition of more nodes can make the system
|
||
\emph on
|
||
worse
|
||
\emph default
|
||
instead of better, c.f.
|
||
section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
In case of architectures relying on a storage network: choice of
|
||
\series bold
|
||
layer
|
||
\series default
|
||
for cut point, e.g.
|
||
filesystem layer vs block layer, see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Performance-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
, and/or introduction of an additional intermediate object storage layer
|
||
(which can result in major degradation from an architectural view).
|
||
Due to fundamental differences in distributed vs local
|
||
\series bold
|
||
cache coherence
|
||
\series default
|
||
, suchalike can have a
|
||
\emph on
|
||
tremendous
|
||
\emph default
|
||
effect on scalability.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The
|
||
\series bold
|
||
implementation
|
||
\series default
|
||
of the architecture.
|
||
Be sure to understand the difference between an
|
||
\emph on
|
||
architecture
|
||
\emph default
|
||
and an
|
||
\emph on
|
||
implementation
|
||
\emph default
|
||
of that architecture.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The size and types / properties of various
|
||
\series bold
|
||
caches
|
||
\series default
|
||
at various layers.
|
||
You need to know the general properties of
|
||
\series bold
|
||
inclusive
|
||
\series default
|
||
vs
|
||
\series bold
|
||
exclusive
|
||
\series default
|
||
cache architecture.
|
||
You absolutely need to know what
|
||
\series bold
|
||
thrashing
|
||
\series default
|
||
is, and under which conditions it can occur.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
It is advantageous for system architects to know
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Reading a few Wikipedia articles does not count as
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
knowledge
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
You need to be able to
|
||
\emph on
|
||
apply
|
||
\emph default
|
||
your knowledge to enterprise level systems (as opposed to workstation-sized
|
||
systems),
|
||
\emph on
|
||
sustainable
|
||
\emph default
|
||
and
|
||
\emph on
|
||
reproducible
|
||
\emph default
|
||
.
|
||
Therefore you need to have
|
||
\emph on
|
||
actually worked
|
||
\emph default
|
||
in the matter and gained some extraordinary experiences, on top of deep
|
||
understanding of the matter.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
pre-loading strategies, as well as replacement strategies.
|
||
It is advantageous to know what
|
||
\family typewriter
|
||
LRU
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
MFU
|
||
\family default
|
||
means, what their induced
|
||
\emph on
|
||
overhead
|
||
\emph default
|
||
is, and how they
|
||
\emph on
|
||
really
|
||
\emph default
|
||
work on
|
||
\emph on
|
||
actual
|
||
\emph default
|
||
data, not just on some artificial lab data.
|
||
You also should know what an
|
||
\series bold
|
||
anomaly
|
||
\series default
|
||
is, and how it can be produced not only by
|
||
\family typewriter
|
||
FIFO
|
||
\family default
|
||
strategies, but also by certain types of ill-designed multi-layer caching.
|
||
Beware: there are places where
|
||
\family typewriter
|
||
FIFO
|
||
\family default
|
||
-like behaviour is almost impossible to avoid, such as networks.
|
||
All of this is outside the scope of this MARS manual.
|
||
You should
|
||
\emph on
|
||
measure
|
||
\emph default
|
||
, when possible, the
|
||
\series bold
|
||
overhead
|
||
\series default
|
||
of cache implementations.
|
||
I know of
|
||
\emph on
|
||
examples
|
||
\emph default
|
||
where caching is
|
||
\emph on
|
||
counter-productive
|
||
\emph default
|
||
.
|
||
For example, certain types and implementations of SSD caches are over-hyped.
|
||
Removing a certain cache will then
|
||
\emph on
|
||
improve
|
||
\emph default
|
||
the situation.
|
||
Notice: caches are conceptually based on some type of
|
||
\series bold
|
||
associative memory
|
||
\series default
|
||
, which is either very costly when directly implemented in hardware, or
|
||
can suffer from tremendous performance penalties when implemented inappropriate
|
||
ly in software.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Hardware dimensioning
|
||
\series default
|
||
of the implementation: choice of storage hardware, for each storage node.
|
||
This includes SSDs vs HDDs, their attachment (e.g.
|
||
SAS multiplexing bottlenecks), RAID level, and controller limitations,
|
||
etc.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Only for architectures relying on a storage network: network
|
||
\series bold
|
||
throughput
|
||
\series default
|
||
and network
|
||
\series bold
|
||
latencies
|
||
\series default
|
||
, and network
|
||
\series bold
|
||
bottlenecks
|
||
\series default
|
||
, including the
|
||
\series bold
|
||
queueing
|
||
\series default
|
||
behaviour / congestion control /
|
||
\series bold
|
||
packet loss
|
||
\series default
|
||
behaviour upon overload.
|
||
The latter is often neglected, leading to unexpected behaviour at load
|
||
peaks, and/or leading to costly over-engineering (examples see section
|
||
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Example-Failures-of"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
\emph on
|
||
Hidden
|
||
\emph default
|
||
bottlenecks
|
||
\series default
|
||
of various types.
|
||
A complete enumeration is almost impossible, because there are too many
|
||
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
opportunities
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
To reduce the latter, my general advice is to try to build bigger systems
|
||
as
|
||
\emph on
|
||
simple
|
||
\emph default
|
||
as possible.
|
||
This is why you should involve some
|
||
\emph on
|
||
real
|
||
\emph default
|
||
experts in storage systems, at least on critical enterprise data.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
|
||
\emph on
|
||
Any
|
||
\emph default
|
||
of these factors can be dangerous when not carefully thought about and
|
||
treated, depending on your use case.
|
||
\end_layout
|
||
|
||
\begin_layout Subsubsection
|
||
Example Scalability Scenario
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Example-Scalability-Scenario"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
To get an impression what
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
enterprise critical data
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
can mean in a concrete example, here are some characteristic numbers on
|
||
1&1 ShaHoLin (Shared Hosting Linux) around spring 2018, which would be
|
||
the
|
||
\emph on
|
||
input parameters
|
||
\emph default
|
||
for
|
||
\emph on
|
||
any
|
||
\emph default
|
||
potential solution architecture
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
vs
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
vs
|
||
\family typewriter
|
||
Sharding
|
||
\family default
|
||
:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
About 9 million customer home directories.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
About 10 billion inodes, with daily incremental backup.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
More than 4 petabytes of
|
||
\emph on
|
||
net
|
||
\emph default
|
||
data (total
|
||
\family typewriter
|
||
df
|
||
\family default
|
||
filling level) in spring 2018, with a growth rate of 21% per year.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
All of this permanently replicated into a second datacenter.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Webhosting very close to 24/7/365.
|
||
For maintenance, any resource must be switchable to the other datacenter
|
||
at any time, independently from other resources; while in catastrophic failure
|
||
scenarios
|
||
\emph on
|
||
all
|
||
\emph default
|
||
resources must be switchable within a short time.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
For simplicity of our sandbox game, we assume that all of this is in one
|
||
campus.
|
||
In reality, about 30% is residing in another continent.
|
||
Introducing this as an additional input parameter would not fundamentally
|
||
change the game.
|
||
Many other factors, like dependencies on existing infrastructure, are
|
||
also neglected.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Theoretical Solution:
|
||
\family typewriter
|
||
CentralStorage
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Let us assume somebody would try to operate this on classical
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
, and let us assume that migration of this amount of data including billions
|
||
of inodes would be no technical problem.
|
||
What would be the outcome?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
With current technology, finding a single
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
appliance would be anything but easy.
|
||
Dimensioning would be needed for the
|
||
\emph on
|
||
lifetime
|
||
\emph default
|
||
of such a solution, which is at least 5 years.
|
||
In five years, the data would grow by a factor of about
|
||
\begin_inset Formula $1.21^{5}=2.6$
|
||
\end_inset
|
||
|
||
, which is then about
|
||
\begin_inset Formula $10.5$
|
||
\end_inset
|
||
|
||
petabytes.
|
||
This is only the
|
||
\emph on
|
||
net
|
||
\emph default
|
||
capacity; at hardware layer much more is needed for spare space and for
|
||
local redundancy.
|
||
The single
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
instance will need to scale up to at least this number, in one datacenter
|
||
(under the simplified game assumptions).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The current number of client LXC containers is about
|
||
\begin_inset Formula $2600$
|
||
\end_inset
|
||
|
||
, independent from location.
|
||
You will have to support growth in number of them.
|
||
For maintenance, any of these need to be switchable to a different location
|
||
at any time.
|
||
The number of bare metal servers running them can vary with hardware architectu
|
||
re / hardware lifecycle, and with growth.
|
||
You will need to dimension a dedicated storage network for all of this.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
If you find a solution which can do this with current
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
technology for the next 5 years, then you will have to ensure that restore
|
||
from backup
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Local snapshots, whether LVM or via some COW filesystem, do not count as
|
||
backups! You need a
|
||
\emph on
|
||
logical
|
||
\emph default
|
||
copy, not a
|
||
\emph on
|
||
physical
|
||
\emph default
|
||
one, in case your production filesystem instance gets damaged.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
can be done in less than 1 day in case of a fatal disaster, see also treatment
|
||
of
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
reliability in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Reliability-Differences-CentralStorage"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Notice that the current self-built backup solution for a total of 15 billions
|
||
of inodes is based on a sharding model; converting this to some more or
|
||
less centralized solution would turn out as another challenge.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Attention! Buying 10 or 50 or 100 CentralStorage instances does not count
|
||
as a
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
architecture.
|
||
By definition, suchalike would be
|
||
\family typewriter
|
||
RemoteSharding
|
||
\family default
|
||
instead.
|
||
Notice that the current 1&1 solution is already a mixture of
|
||
\family typewriter
|
||
LocalSharding
|
||
\family default
|
||
and
|
||
\family typewriter
|
||
RemoteSharding
|
||
\family default
|
||
, so you would win
|
||
\emph on
|
||
nothing
|
||
\emph default
|
||
at architectural level.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In your business case, you would need to justify the price difference between
|
||
the current component-based hardware solution (horizontally extensible
|
||
by
|
||
\emph on
|
||
scale-out
|
||
\emph default
|
||
) and
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
, which is about a factor of 10 per terabyte according to the table in section
|
||
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Cost-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Even if you manage to find a vendor who is willing to subsidize to a factor
|
||
of only 3, this is not all you need.
|
||
You need to add the costs for the dedicated storage network.
|
||
On top of this, you need to account for the
|
||
\emph on
|
||
migration costs
|
||
\emph default
|
||
after the lifetime of 5 years has passed, where the full data set needs
|
||
to be migrated to a successor storage system.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice that classical argumentations with
|
||
\series bold
|
||
\emph on
|
||
manpower
|
||
\series default
|
||
\emph default
|
||
will not work.
|
||
The current operating team is about 10 persons, with no dedicated storage
|
||
admin.
|
||
This relatively small team is not only operating a total of more than 6,000
|
||
shared boxes in all datacenters, but also some tens of thousands of managed
|
||
dedicated servers, running essentially the same software stack, with practicall
|
||
y fully automated mass deployment.
|
||
Most of their tasks are related to central software installation, which
|
||
is then automatically distributed, and to operation / monitoring / troubleshoot
|
||
ing of masses of client servers.
|
||
Storage administration tasks in isolation are costing only a
|
||
\emph on
|
||
fraction
|
||
\emph default
|
||
of this.
|
||
Typical claims that
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
would require less manpower will not work here.
|
||
Almost everything which is needed for
|
||
\emph on
|
||
mass automation
|
||
\emph default
|
||
is already automated.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Neglecting the tens of thousands of managed dedicated servers would be a catastrophi
|
||
c ill-design.
|
||
Their hardware is already given, by existing customer contracts, some of
|
||
them decades old.
|
||
You simply cannot fundamentally change the hardware of these customers
|
||
including their
|
||
\emph on
|
||
dedicated
|
||
\emph default
|
||
local disks, which is their
|
||
\emph on
|
||
main selling point
|
||
\emph default
|
||
.
|
||
You cannot simply convert them to a shared
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
, even if it would be technically possible, and if it would deliver similar
|
||
IOPS rates as tens of thousands of local spindles (and if you could reach
|
||
the bundled performance of local SSDs from newer contracts), and even if
|
||
you would introduce some interesting
|
||
\series bold
|
||
storage classes
|
||
\series default
|
||
for all of this.
|
||
A dedicated server on top of a shared storage is no longer a dedicated
|
||
one.
|
||
You would have to migrate these customers to another product, with all
|
||
of its consequences.
|
||
For these machines alone,
|
||
\emph on
|
||
most
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Only a few out of >1000 self-built or customized Debian packages are dealing
|
||
with MARS and/or with the clustermanager
|
||
\family typewriter
|
||
cm3
|
||
\family default
|
||
.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\emph default
|
||
of the current automation of
|
||
\family typewriter
|
||
LocalStorage
|
||
\family default
|
||
is needed
|
||
\emph on
|
||
anyway
|
||
\emph default
|
||
, although they are not geo-redundant at current stage.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Conclusion:
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
is simply
|
||
\emph on
|
||
unrealistic
|
||
\emph default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Theoretical Solution:
|
||
\family typewriter
|
||
BigCluster
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The main problem of
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
is
|
||
\series bold
|
||
reliability
|
||
\series default
|
||
, as explained intuitively in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
and mathematically in appendix
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "chap:Mathematical-Model-of"
|
||
|
||
\end_inset
|
||
|
||
, and as observed in numerous installations not working as expected.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Let us assume that all of these massive technical problems were solved,
|
||
somehow.
|
||
Then the business case would have to deal with the following:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The total number of servers would need to be roughly
|
||
\emph on
|
||
doubled
|
||
\emph default
|
||
.
|
||
Not only their CAPEX, but also the corresponding OPEX (electrical power,
|
||
rackspace, manpower) would increase.
|
||
Their current electrical power cost alone, including cooling, is more than
|
||
the current sysadmin manpower cost.
|
||
Datacenter operations would also increase.
|
||
On top, a dedicated storage network and its administration would also be
|
||
needed.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
With respect to the tens of thousands of managed dedicated servers and their
|
||
customer contracts, a similar argument as above holds.
|
||
You simply cannot convert them to
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Conclusion:
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
is also
|
||
\emph on
|
||
unrealistic
|
||
\emph default
|
||
.
|
||
There is nothing to win, but a lot to lose.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
Current Solution:
|
||
\family typewriter
|
||
LocalSharding
|
||
\family default
|
||
, sometimes
|
||
\family typewriter
|
||
RemoteSharding
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Short story: it has worked for decades, and has been both cheap and robust since
|
||
geo-redundancy had been added around 2010.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
With the advent of Football (see chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "chap:LV-Football"
|
||
|
||
\end_inset
|
||
|
||
), the
|
||
\family typewriter
|
||
LocalSharding
|
||
\family default
|
||
architecture is rising to be on par with the most important management abilities
|
||
of
|
||
\family typewriter
|
||
CentralStorage
|
||
\family default
|
||
and
|
||
\family typewriter
|
||
BigCluster
|
||
\family default
|
||
/ Software Defined Storage.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The story with the tens of thousands of managed dedicated servers is arguing
|
||
vice versa: without the traditional ShaHoLin sharding architecture and
|
||
all of its automation, including the newest addition called Football, the
|
||
product
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
managed dedicated servers
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
would not be possible in this scale.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Summary: the sharded
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
shared
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
product enables another
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
dedicated
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
product, which is sharded by definition, and it actually is known to scale
|
||
up by at least another order of magnitude (in terms of number of servers).
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Scalability of Filesystem Layer vs Block Layer
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Filesystem-Layer-vs"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The following factors are responsible for better architectural (cf section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:What-is-Architecture"
|
||
|
||
\end_inset
|
||
|
||
) scalability of the block layer vs the filesystem layer, at least in many
|
||
cases, with a few exceptions (list may be incomplete):
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Granularity
|
||
\series default
|
||
of access:
|
||
\series bold
|
||
metadata
|
||
\series default
|
||
is often smaller than the content data it refers to, but access to data
|
||
is typically not possible without accessing corresponding metadata
|
||
\emph on
|
||
first
|
||
\emph default
|
||
.
|
||
When masses of metadata are present (e.g.
|
||
some billions of inodes as in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Example-Scalability-Scenario"
|
||
|
||
\end_inset
|
||
|
||
), and when it is accessed
|
||
\series bold
|
||
more frequently
|
||
\series default
|
||
than the corresponding data (e.g.
|
||
in stateless designs like Apache), it is likely to become the bottleneck.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresToxiques.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Neglecting metadata and its access patterns is a major source of ill-designs.
|
||
I know of projects which have failed (in their original setup) because
|
||
of this.
|
||
Repair will typically involve some non-trivial architectural changes.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
By default, the block layer itself has almost no metadata at all (or only
|
||
tiny ones, such as describing a whole block device).
|
||
Therefore it has an
|
||
\emph on
|
||
inherent advantage
|
||
\emph default
|
||
over the filesystem layer in such use cases.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Caching
|
||
\series default
|
||
: shared memory caches in kernelspace (page cache + dentry cache) vs distributed
|
||
caches over network.
|
||
See the picture in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Performance-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresToxiques.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
There exist
|
||
\emph on
|
||
examples
|
||
\emph default
|
||
where shared distributed caches do not work at all.
|
||
I know of
|
||
\emph on
|
||
several
|
||
\emph default
|
||
projects which have failed.
|
||
A project other than the one mentioned in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Example-Failures-of"
|
||
|
||
\end_inset
|
||
|
||
has failed because of violations of POSIX filesystem semantics.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Only in distributed systems: the
|
||
\series bold
|
||
cache coherence problem
|
||
\series default
|
||
, both on metadata and on data.
|
||
Depending on load patterns, this can lead to tremendous performance degradation
|
||
, see example in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Example-Failures-of"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Dimensioning of the
|
||
\series bold
|
||
network
|
||
\series default
|
||
: throughput, latencies, queueing behaviour.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There exist a few known exceptions (list may be incomplete, please report
|
||
further examples if you know some):
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Databases: these are typically operating on specific container formats,
|
||
where no frequent
|
||
\emph on
|
||
external
|
||
\emph default
|
||
metadata access is necessary, and where no sharing of the
|
||
\emph on
|
||
container as such
|
||
\emph default
|
||
is necessary.
|
||
Typically, there is no big difference between storing them in block devices
|
||
vs local filesystems.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Exception from the exception: MyISAM is an old design from the 1980s, originall
|
||
y based on DBASE data structures.
|
||
Don't try to access them over NFS or similar.
|
||
Or, better, try to avoid them at all if possible.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
VM images: these are logical BLOBS, so there is typically no big difference
|
||
whether you have an intermediate
|
||
\emph on
|
||
true
|
||
\emph default
|
||
filesystem layer, or not.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Filesystems on top of object stores are no true intermediate filesystems.
|
||
They are violating Dijkstra's important layering rules, as stated in his
|
||
famous articles on THE.
|
||
A similar argument holds for block devices on top of object stores.
|
||
Intermediate container formats like
|
||
\family typewriter
|
||
*.vmdk
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
*.qcow2
|
||
\family default
|
||
can also act as game changers.
|
||
This does not mean that you have to avoid them at all.
|
||
However, be sure to
|
||
\series bold
|
||
check their influence
|
||
\series default
|
||
, and don't forget their
|
||
\emph on
|
||
workingset
|
||
\emph default
|
||
and their
|
||
\emph on
|
||
caching behaviour
|
||
\emph default
|
||
(which can go both into positive and into negative direction), in order
|
||
to really
|
||
\emph on
|
||
know what you are doing!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There exist a few cases where a distributed filesystem, sometimes even actually
|
||
with
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
behaviour,
|
||
\emph on
|
||
must
|
||
\emph default
|
||
be used, because there exists a
|
||
\emph on
|
||
requirement
|
||
\emph default
|
||
for it.
|
||
Some examples (list is certainly incomplete):
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
HPC =
|
||
\series bold
|
||
High Performance Computing
|
||
\series default
|
||
on modern supercomputers, consisting of a high number of
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
compute nodes, are often requiring access to a shared persistent data pool,
|
||
where each of the
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
nodes must be sometimes able to access the same persistent data, sometimes
|
||
both for reading and writing.
|
||
Therefore, several supercomputers are using cluster filesystems like Lustre.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Care must be taken that high-frequency / fine granularity communication
|
||
over the distributed filesystem and its dedicated storage network does
|
||
not take place, but instead occurs over the ordinary low-latency communication
|
||
fabrics each modern supercomputer is relying on.
|
||
True
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
storage access behaviour should be avoided as far as possible (given by
|
||
the problem to be solved).
|
||
When absolutely necessary, location transparency (as possible with cluster
|
||
filesystems like Lustre) as well as its DSM = Distributed Shared Memory
|
||
model must be given up, and an
|
||
\series bold
|
||
explicit communication model
|
||
\series default
|
||
must be used instead, which allows explicit control over replicas and their
|
||
communication paths (e.g.
|
||
propagation in a binary tree fashion), although it results in much more
|
||
work for the programmers.
|
||
Only low frequency / coarse granularity transfers of
|
||
\emph on
|
||
bulk data
|
||
\emph default
|
||
with
|
||
\emph on
|
||
high locality
|
||
\emph default
|
||
should run over distributed filesystems, preferably in streaming mode.
|
||
The total frequency of metadata access should be low, because metadata
|
||
consistency may form a bottleneck when updated too frequently.
|
||
The programmers of the distributed application software need to take care
|
||
for this.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice that certain supercomputer workloads may be crying for a RemoteSharding
|
||
or FlexibleSharding storage architecture in place of a BigCluster architecture.
|
||
However, this is very application specific.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Student pools at universities, or location-independent workplaces at companies.
|
||
This is exactly the usecase NFS was originally constructed for.
|
||
Typically,
|
||
\series bold
|
||
workstation workloads
|
||
\series default
|
||
are neither performance critical, nor prone to actual
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
behaviour (although the network infrastructure would
|
||
\emph on
|
||
allow
|
||
\emph default
|
||
for it), because each user has her own home directory which is typically
|
||
|
||
\emph on
|
||
not shared
|
||
\emph default
|
||
with others, and she cannot split herself and sit in front of multiple
|
||
workstations at the same time.
|
||
Thus the
|
||
\emph on
|
||
local per-workstation
|
||
\emph default
|
||
NFS caching strategies have a good chance to hide much of the network latencies
|
||
, and thus the actual total network workload is typically only
|
||
\begin_inset Formula $O(n).$
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
This can lead to a dangerous misinterpretation: because it apparently works
|
||
even for a few thousands of workstations, people conclude
|
||
\emph on
|
||
wrongly
|
||
\emph default
|
||
that the network filesystem
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
must be scalable
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
Some people are then applying their experience to completely different
|
||
usecases, where much higher metadata traffic by several orders of magnitude
|
||
is occurring (such as in webhosting), or even where true
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
runtime behaviour is occurring (see example of a failed scalability scenario
|
||
in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Example-Failures-of"
|
||
plural "false"
|
||
caps "false"
|
||
noprefix "false"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Graphics
|
||
filename images/MatieresToxiques.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
In general: when something works for usecase A, this
|
||
\series bold
|
||
does
|
||
\emph on
|
||
not
|
||
\emph default
|
||
prove
|
||
\series default
|
||
that it will also work for another usecase B.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Recommendations for Design and Operation of Storage Systems
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Recommendations-for-Designing"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Recommendations for Managers
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Recommendations-for-Managers"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
When you are responsible for
|
||
\series bold
|
||
masses of enterprise-critical data
|
||
\series default
|
||
, the most important point is to get people with
|
||
\series bold
|
||
the right skills
|
||
\series default
|
||
, in
|
||
\emph on
|
||
addition(!) to
|
||
\emph default
|
||
the
|
||
\emph on
|
||
right mindset
|
||
\emph default
|
||
, and to assign the right roles to them.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Practical observation from many groups in many companies: which storage
|
||
systems / architectures are in use, and how much they are
|
||
\emph on
|
||
really
|
||
\emph default
|
||
failure resistant and reliable, and how much they are
|
||
\emph on
|
||
really
|
||
\emph default
|
||
scalable for their workload, and what is their TCO (Total Cost of Ownership),
|
||
does often
|
||
\emph on
|
||
not
|
||
\emph default
|
||
depend on real knowledge and facts.
|
||
It often depends on
|
||
\series bold
|
||
personal habits
|
||
\series default
|
||
and
|
||
\series bold
|
||
pre-judgement
|
||
\series default
|
||
of staff
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
\noindent
|
||
This can be seen in a bigger company (e.g.
|
||
after mergers etc) when very different architectures have been built by
|
||
different teams for very similar usecases, although they are sometimes
|
||
even roughly comparable in size and workload.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
In essence, this results in a gambling game how safe / cost-effective etc
|
||
your critical data
|
||
\emph on
|
||
really
|
||
\emph default
|
||
is.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As just explained in the previous section, there are so many pitfalls, and
|
||
there are only a few people who know them, because more people are working
|
||
in small-scale systems than in large-scale enterprise ones.
|
||
There are lots of people on the market who
|
||
\emph on
|
||
claim
|
||
\emph default
|
||
to have some experience, but in reality they don't know what they don't
|
||
know (
|
||
\series bold
|
||
second-order ignorance
|
||
\series default
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Second-order ignorance is very dangerous, even for affected people themselves,
|
||
because they are in good faith about their own skills, and that they would
|
||
be able to control everything (sometimes they really want to control literally
|
||
|
||
\emph on
|
||
everything
|
||
\emph default
|
||
, even other people who have more real experience and knowledge).
|
||
See for example wrong assumptions and
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
false proofs
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
about scalability, derived from different usecases (or in extreme cases
|
||
even from workstation workloads), or the failed scalability scenario in
|
||
section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Example-Failures-of"
|
||
plural "false"
|
||
caps "false"
|
||
noprefix "false"
|
||
|
||
\end_inset
|
||
|
||
where some freelancers were consulted as
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
external experts
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Quotation
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Check your information sources! There is a
|
||
\emph on
|
||
systematic reason
|
||
\emph default
|
||
for ill-informed
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
experts
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
On the internet, you can find a lot of so-called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
best practices
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
Many of them are propagating badly scaling storage architectures for enterprise
|
||
workloads, sometimes even
|
||
\emph on
|
||
generally
|
||
\emph default
|
||
claiming they would
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
scale very well
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, which is however often based on
|
||
\emph on
|
||
assumptions
|
||
\emph default
|
||
instead of knowledge (and almost never based on
|
||
\emph on
|
||
measurements
|
||
\emph default
|
||
at the right measurement points for deriving substantial knowledge about
|
||
your real application behaviour).
|
||
Literally
|
||
\emph on
|
||
anyone
|
||
\emph default
|
||
can post falsely generalized
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
best practices
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
to the internet.
|
||
Together with second-order ignorance about the non-transferability of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
success stories
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
from usecase A to usecase B (resulting in
|
||
\emph on
|
||
false
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
proofs
|
||
\emph default
|
||
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
), the internet is creating
|
||
\series bold
|
||
information bubbles
|
||
\series default
|
||
.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Quotation
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Real knowledge originates from evaluated sources, such as
|
||
\series bold
|
||
scientific publications
|
||
\series default
|
||
which have undergone at least some minimum
|
||
\emph on
|
||
quality check
|
||
\emph default
|
||
, and which are trying to describe their preconditions and operating environment
|
||
s as precisely
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
\noindent
|
||
Therefore, chances are better to get a real expert when he has some (higher)
|
||
academic degrees, and was working in the area for a longer time.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
as possible.
|
||
\end_layout
|
||
|
||
\begin_layout Quotation
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Real experts will tell you when they don't know something.
|
||
In addition, they will tell you
|
||
\emph on
|
||
multiple
|
||
\emph default
|
||
ways for obtaining such information, such as measurements, simulation,
|
||
etc.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
If you don't have anyone in your teams who knows how
|
||
\series bold
|
||
caching
|
||
\series default
|
||
|
||
\emph on
|
||
really
|
||
\emph default
|
||
works, or if it is a single guy who cannot withstand the pressure from
|
||
a whole group of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
alpha animals
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, you are running an
|
||
\series bold
|
||
increased risk
|
||
\series default
|
||
of unnecessary expenses
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
I know of cases which have produced unnecessary
|
||
\emph on
|
||
direct
|
||
\emph default
|
||
cost of at least € 20 million.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
, worse services (indirect costs), failed projects, and sometimes even resulting
|
||
in loss of market share and/or of stock exchange value.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The problem is that it
|
||
\emph on
|
||
looks so easy
|
||
\emph default
|
||
, as if everyone could build a larger storage system, with ease.
|
||
For example, just
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
spend some more money
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, that's all you would need.
|
||
Unfortunately, both
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
marketing drones
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
from commercial storage vendors, and even a few OpenSource advocates, are
|
||
propagating this
|
||
\series bold
|
||
dangerous mindset
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As a responsible manager, how can you detect dangerous partial knowledge?
|
||
Good indicators are wrong usage of the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
architecture
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
(see definition in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:What-is-Architecture"
|
||
plural "false"
|
||
caps "false"
|
||
noprefix "false"
|
||
|
||
\end_inset
|
||
|
||
), and/or
|
||
\series bold
|
||
confusion of architecture with implementation
|
||
\series default
|
||
.
|
||
When somebody confuses
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice that there exist people who use the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
architecture
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
inadvertently.
|
||
They don't even know that they are confusing architecture with implementat
|
||
ion.
|
||
Pure usage of a certain term is no clear indicator that somebody is really
|
||
an expert.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
this, he does not really have an overview of different architectural solution
|
||
classes.
|
||
Instead, such people are tending to propagate their random
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
favourite product
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
For a responsible manager, this increases the risk of getting a non-optimum or
|
||
even bad / dangerous solution.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Not everything which works in a garage, or in a student pool, or in the
|
||
testlab (whether it's yours or from a commercial storage vendor), or in
|
||
a PoC with some
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
friendly customers
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, is well-suited for large enterprises and their critical data (measured
|
||
in petabytes / billions of files / etc), or is the optimum solution for
|
||
TCO.
|
||
Some rules of thumb, out of experience and observation:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
For each 1 or 2 orders of magnitude of the
|
||
\series bold
|
||
size
|
||
\series default
|
||
of your data, you need better methods for safe construction and operation.
|
||
At least for each 3 to 4 orders of magnitude (sometimes even for less),
|
||
you need
|
||
\series bold
|
||
better architectures
|
||
\series default
|
||
, and people who can deal with them.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
For each 1 or 2 orders of magnitude of
|
||
\series bold
|
||
criticality
|
||
\series default
|
||
of your data (measured by
|
||
\emph on
|
||
losses
|
||
\emph default
|
||
in case of certain incidents), you will also need better architecture,
|
||
not just better components.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Recommendations for Architects and Sysadmins
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Recommendations-for-Architects"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In order of precedence, do the following:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Fix and/or limit and/or tune the
|
||
\emph on
|
||
application
|
||
\series default
|
||
\emph default
|
||
.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Some extreme examples:
|
||
\end_layout
|
||
|
||
\begin_deeper
|
||
\begin_layout Itemize
|
||
When you encounter a classical Unix
|
||
\series bold
|
||
fork bomb
|
||
\series default
|
||
, you have no chance against it.
|
||
Even the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
best and the most expensive hardware
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
is unable to successfully run a fork bomb.
|
||
The only countermeasure is
|
||
\emph on
|
||
limitation of resources
|
||
\emph default
|
||
.
|
||
Reason: unlimited resources do not exist on earth.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
If you think that this were only of academic interest: several types of
|
||
internet
|
||
\series bold
|
||
DDOS attacks
|
||
\series default
|
||
are acting like a fork bomb, and
|
||
\series bold
|
||
Apache
|
||
\series default
|
||
is also acting similarly to a fork bomb when not configured properly.
|
||
This is not about academics, it is about
|
||
\emph on
|
||
your survival
|
||
\emph default
|
||
(in the sense of Darwin).
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
If you think it cannot hurt you because you are running
|
||
\family typewriter
|
||
fast-cgi
|
||
\family default
|
||
or another application scheme where forks are not part of the game (e.g.
|
||
databases and many others): please notice that
|
||
\series bold
|
||
network queues
|
||
\series default
|
||
are often acting as a replacement for processes.
|
||
Overflow of queues can have a similar effect to fork bombs from the viewpoint
|
||
of customers: they simply don't get the service they are expecting.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Real-life example: some percentage of
|
||
\family typewriter
|
||
WordPress
|
||
\family default
|
||
customers are typically and
|
||
\emph on
|
||
systematically
|
||
\emph default
|
||
|
||
\series bold
|
||
misconfiguring
|
||
\series default
|
||
their
|
||
\family typewriter
|
||
wp-cron
|
||
\family default
|
||
cron jobs.
|
||
They create backups of their website, which
|
||
\emph on
|
||
include
|
||
\emph default
|
||
their old backups.
|
||
Result: in each generation of the backups, the needed disk space will roughly
|
||
|
||
\emph on
|
||
double
|
||
\emph default
|
||
.
|
||
Even if you had
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
unlimited storage
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
on top of the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
best and the most expensive storage system
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, and even if you would like to give
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
unlimited storage
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
to your customers, it simply cannot work at all.
|
||
Exponential growth is exponential growth.
|
||
After a few months of this kind of daily backup, you would need more storage
|
||
than atoms exist in the whole universe.
|
||
You
|
||
\emph on
|
||
must
|
||
\emph default
|
||
introduce some quota limits somewhere.
|
||
And you
|
||
\emph on
|
||
must
|
||
\emph default
|
||
ensure that the
|
||
\family typewriter
|
||
wp-cron
|
||
\family default
|
||
misconfiguration is fixed, whoever is responsible for fixing it.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Another
|
||
\family typewriter
|
||
WordPress
|
||
\family default
|
||
example: the
|
||
\family typewriter
|
||
wp-cron
|
||
\family default
|
||
configuration syntax is not easily understandable by laymen.
|
||
It is easy to
|
||
\series bold
|
||
misconfigure
|
||
\series default
|
||
such that a backup is created
|
||
\emph on
|
||
once per minute
|
||
\emph default
|
||
.
|
||
As long as the website is very small, this will not even be noticed by
|
||
sysadmins.
|
||
However, for bigger websites (and they are typically growing over time),
|
||
the IO load may increase to a point until even asynchronous replication
|
||
over 10Gig interfaces cannot catch up.
|
||
Even worse: the next run of
|
||
\family typewriter
|
||
wp-cron
|
||
\family default
|
||
may start before the old one has finished within a minute.
|
||
Again, there is no chance except fixing the
|
||
\emph on
|
||
root cause
|
||
\emph default
|
||
at application level.
|
||
\end_layout
|
||
|
||
\end_deeper
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Choose the right
|
||
\emph on
|
||
overall
|
||
\emph default
|
||
architecture
|
||
\series default
|
||
(not limited to storage).
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
An impressive example for architectural (cf section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:What-is-Architecture"
|
||
|
||
\end_inset
|
||
|
||
) ill-design can be found in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Example-Failures-of"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Important explanations are in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Properties-Scalability"
|
||
|
||
\end_inset
|
||
|
||
, in particular subsection
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Influence-Factors-Scalability"
|
||
|
||
\end_inset
|
||
|
||
, and section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Filesystem-Layer-vs"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
A strategic example is in subsection
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Example-Scalability-Scenario"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
It is absolutely necessary to know the standard cache hierarchy of Unix
|
||
(similarly also found in Windows) from section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Performance-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
More explanations are in this manual at many places.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
In general, major ill-designs of overall architectures (end-to-end) cannot
|
||
be fixed at component level.
|
||
Even the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
best tuning of the world
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
executed by the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
best tuning expert
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
on top of the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
best and most expensive storage
|
||
\emph on
|
||
components
|
||
\emph default
|
||
and the best storage
|
||
\emph on
|
||
network
|
||
\emph default
|
||
of the world
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
cannot compensate major ill-designs, such as
|
||
\begin_inset Formula $O(n^{2})$
|
||
\end_inset
|
||
|
||
behaviour.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Similarly for reliability: if you have problems with too many and/or too
|
||
large incidents affecting too many customers, read sections
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "subsec:Reliability-Differences-CentralStorage"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Choice and tuning of components
|
||
\series default
|
||
.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
No further explanations necessary, because most people already know this.
|
||
In case you think this is the only way: no, it is typically the
|
||
\emph on
|
||
worst
|
||
\emph default
|
||
and typically only the
|
||
\emph on
|
||
last resort
|
||
\emph default
|
||
when compared to the previous enumeration items.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
Exception: choice of wrong components with insufficient properties for your
|
||
particular application / use case.
|
||
But this is an
|
||
\emph on
|
||
architectural
|
||
\emph default
|
||
problem in reality.
|
||
\end_layout
|
||
|
||
\begin_layout Chapter
|
||
Use Cases for MARS vs DRBD
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "chap:Use-Cases-for"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
DRBD has a long history of successfully providing HA features to many users
|
||
of Linux.
|
||
With the advent of MARS, many people are wondering what the difference
|
||
is.
|
||
They ask for recommendations.
|
||
In which use cases should DRBD be recommended, and in which other cases
|
||
is MARS the better choice?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The following table is a short guide to the most important cases where the
|
||
decision is rather clear:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Tabular
|
||
<lyxtabular version="3" rows="6" columns="2">
|
||
<features tabularvalignment="middle">
|
||
<column alignment="center" valignment="top">
|
||
<column alignment="center" valignment="top">
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Use Case
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
Recommendation
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
server pairs, each directly connected via
|
||
\series bold
|
||
crossover cables
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
DRBD
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\series bold
|
||
active-active
|
||
\series default
|
||
/ dual-primary, e.g.
|
||
|
||
\family typewriter
|
||
\series bold
|
||
gfs2
|
||
\family default
|
||
\series default
|
||
,
|
||
\family typewriter
|
||
\series bold
|
||
ocfs2
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
DRBD
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
distance
|
||
\series bold
|
||
> 50km
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
MARS
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
\series bold
|
||
> 100 server pairs
|
||
\series default
|
||
over a short-distance
|
||
\series bold
|
||
shared
|
||
\series default
|
||
line
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
MARS
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
<row>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
all else / intermediate cases
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
||
\begin_inset Text
|
||
|
||
\begin_layout Plain Layout
|
||
read the following details
|
||
\end_layout
|
||
|
||
\end_inset
|
||
</cell>
|
||
</row>
|
||
</lyxtabular>
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
There exist some use cases where DRBD is clearly better than MARS.
|
||
1&1 has a long history of experiences with DRBD where it works very well,
|
||
in particular coupling Linux devices rack-to-rack via crossover cables.
|
||
DRBD is just
|
||
\emph on
|
||
constructed
|
||
\emph default
|
||
for that use case (RAID-1 over network).
|
||
In such a scenario, DRBD is better than MARS because it uses up less disk
|
||
space resources.
|
||
In addition, newer DRBD versions can run over high-speed but short-distance
|
||
interconnects like Infiniband (via the SDP protocol).
|
||
Another use case for DRBD is active-active / dual-primary mode, e.g.
|
||
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice that
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
is apparently not constructed for long distances.
|
||
1&1 has some experiences on a specific short distance cluster where the
|
||
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
/
|
||
\family typewriter
|
||
DRBD
|
||
\family default
|
||
combination scaled a little bit better than
|
||
\family typewriter
|
||
NFS
|
||
\family default
|
||
, but worse than
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
(using 2 clients in both cases – notice that
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
showed extremely bad performance when trying to enable active-active
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
replication between 2 server instances, therefore we ended up using active-pass
|
||
ive DRBD replication below a single
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
server).
|
||
Conclusion:
|
||
\family typewriter
|
||
NFS
|
||
\family default
|
||
<
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
<
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
< sharding.
|
||
We found that
|
||
\family typewriter
|
||
glusterfs
|
||
\family default
|
||
on top of active-passive DRBD scalability was about 2 times better than
|
||
|
||
\family typewriter
|
||
NFS
|
||
\family default
|
||
on top of active-passive DRBD, while
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
on top of
|
||
\family typewriter
|
||
DRBD
|
||
\family default
|
||
in active-active mode was somewhere in between.
|
||
All cluster comparisons with an increasing workload over time (measured
|
||
as number of customers which could be safely operated).
|
||
Each system was replaced by the next one when the respective scalability
|
||
was at its respective end, each time leading to operational problems.
|
||
The ultimate solution was to replace all of these clustering concepts by
|
||
the general concept of
|
||
\series bold
|
||
sharding
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
over short
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Active-active won't work over long distances at all because of high network
|
||
latencies (cf chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "chap:Cloud-Storage"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
Probably, for replication of whole clusters over long distances DRBD and
|
||
MARS could be stacked: using DRBD on top of MARS for active-active clustering
|
||
of
|
||
\family typewriter
|
||
gfs2
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
, and a MARS instance
|
||
\emph on
|
||
below
|
||
\emph default
|
||
for failover of
|
||
\emph on
|
||
one
|
||
\emph default
|
||
of the DRBD replicas over long distances.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
distances.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
On the other hand, there exist other use cases where DRBD did not work as
|
||
expected, leading to incidents and other operational problems.
|
||
We analyzed them for our specific use cases.
|
||
The later author of MARS came to the conclusion that they could only be
|
||
resolved by fundamental changes in the overall architecture of DRBD.
|
||
The development of MARS started at the personal initiative of the author,
|
||
first in form of a personal project during holidays, but later picked up
|
||
by 1&1 as an official project.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
MARS and DRBD simply have
|
||
\series bold
|
||
different application areas
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In the following, we will discuss the pros and cons of each system in particular
|
||
situations and contexts, and we shed some light on their conceptual and
|
||
operational differences.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Network Bottlenecks
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Network-Bottlenecks"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Behaviour of DRBD
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Behaviour-of-DRBD"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In order to describe the most important problem we found when DRBD was used
|
||
to couple whole datacenters (each encompassing thousands of servers) over
|
||
metro distances, we strip down that complicated real-life scenario to a
|
||
simplified laboratory scenario in order to demonstrate the effect with
|
||
minimal means.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice that the following DRBD effect does not appear at crossover cables.
|
||
The following scenario covers a non-standard case of DRBD.
|
||
DRBD works fine when no network bottleneck appears!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The following picture illustrates an effect which has been observed in 1&1
|
||
datacenters when running masses of DRBD instances through a single network
|
||
bottleneck.
|
||
In addition, the effect is also reproducible with an older version of the
|
||
MARS test suite
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
The effect has been demonstrated some years ago with DRBD version 8.3.13.
|
||
By construction, it is independent from any of the DRBD series 8.3.x, 8.4.x,
|
||
or 9.0.x.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/network-bottleneck-drbd.fig
|
||
width 80col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The simplified scenario is the following:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
DRBD is loaded with a low to medium, but constant rate of write operations
|
||
for the sake of simplicity of the scenario.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
The network has some throughput bottleneck, depicted as a red line.
|
||
For the sake of simplicity, we just linearly decrease it over time, starting
|
||
from full throughput, down to zero.
|
||
The decrease is very slow over time (some minutes, or even hours).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
What will happen in this scenario?
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As long as the actual DRBD write throughput is lower than the network bandwidth
|
||
(left part of the horizontal blue line), DRBD works as expected.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Once the maximum network throughput (red line) starts to fall short of the
|
||
required application throughput (first blue dotted line), we get into trouble.
|
||
By its very nature, DRBD works
|
||
\series bold
|
||
synchronously
|
||
\series default
|
||
.
|
||
Therefore, it
|
||
\emph on
|
||
must
|
||
\emph default
|
||
transfer all your application writes through the bottleneck, but now it
|
||
is impossible
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
This is independent from the DRBD protocols A through C, because it just
|
||
depends on an information-theoretic argument independently from any protocol.
|
||
We have a fundamental conflict between network capabilities and application
|
||
demands here, which cannot be circumvented due to the
|
||
\series bold
|
||
synchronous
|
||
\series default
|
||
nature of DRBD.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
due to the bottleneck.
|
||
As a consequence, the application running on top of DRBD will see increasingly
|
||
higher IO latencies and/or stalls / hangs.
|
||
We found practical cases (at least with former versions of DRBD) where
|
||
IO latencies exceeded practical monitoring limits such as
|
||
\begin_inset Formula $5$
|
||
\end_inset
|
||
|
||
s by far, up to the range of
|
||
\emph on
|
||
minutes
|
||
\emph default
|
||
.
|
||
As an experienced sysadmin, you know what happens next: your application
|
||
will run into an incident, and your customers will be dissatisfied.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In order to deal with such situations, DRBD has lots of tuning parameters.
|
||
In particular, the
|
||
\family typewriter
|
||
timeout
|
||
\family default
|
||
parameter and/or the
|
||
\family typewriter
|
||
ping-timeout
|
||
\family default
|
||
parameter will determine when DRBD will give up in such a situation and
|
||
simply drop the network connection as an emergency measure.
|
||
Dropping the network connection is roughly equivalent to an automatic
|
||
\family typewriter
|
||
disconnect
|
||
\family default
|
||
, followed by an automatic re-connect attempt after
|
||
\family typewriter
|
||
connect-int
|
||
\family default
|
||
seconds.
|
||
During the dropped connection, the incident will appear as being resolved,
|
||
but at some hidden cost
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
By appropriately tuning various DRBD parameters, such as
|
||
\family typewriter
|
||
timeout
|
||
\family default
|
||
and/or
|
||
\family typewriter
|
||
ping-timeout
|
||
\family default
|
||
, you can keep the impact of the incident below some viable limit.
|
||
However, the automatic disconnect will then happen earlier and more often
|
||
in practice.
|
||
Flaky or overloaded networks may easily lead to an enormous number of automatic
|
||
disconnects.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
What happens next in our scenario? During the
|
||
\family typewriter
|
||
disconnect
|
||
\family default
|
||
, DRBD will record all positions of writes in its bitmap and/or in its activity
|
||
log.
|
||
As soon as the automatic re-connect succeeds after
|
||
\family typewriter
|
||
connect-int
|
||
\family default
|
||
seconds, DRBD has to do a partial re-sync of those blocks which were marked
|
||
dirty in the meantime.
|
||
This leads to an
|
||
\emph on
|
||
additional
|
||
\emph default
|
||
bandwidth demand
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
DRBD parameters
|
||
\family typewriter
|
||
sync-rate
|
||
\family default
|
||
resp
|
||
\family typewriter
|
||
resync-rate
|
||
\family default
|
||
may be used to tune the height of the additional demand.
|
||
In addition, the newer parameters
|
||
\family typewriter
|
||
c-plan-ahead
|
||
\family default
|
||
,
|
||
\family typewriter
|
||
c-fill-target
|
||
\family default
|
||
,
|
||
\family typewriter
|
||
c-delay-target
|
||
\family default
|
||
,
|
||
\family typewriter
|
||
c-min-rate
|
||
\family default
|
||
,
|
||
\family typewriter
|
||
c-max-rate
|
||
\family default
|
||
and friends may be used to dynamically adapt to
|
||
\emph on
|
||
some
|
||
\emph default
|
||
situations where the application throughput
|
||
\emph on
|
||
could
|
||
\emph default
|
||
fit through the bottleneck.
|
||
These newer parameters were developed in a cooperation between 1&1 and
|
||
Linbit, the maker of DRBD.
|
||
\end_layout
|
||
|
||
\begin_layout Plain Layout
|
||
Please note that lowering / dynamically adapting the resync rates may help
|
||
in lowering the
|
||
\emph on
|
||
probability
|
||
\emph default
|
||
of occurrences of the above problems in practical scenarios where the bottlenec
|
||
k would recover to viable limits after some time.
|
||
However, lowering the rates will also increase the
|
||
\emph on
|
||
duration
|
||
\emph default
|
||
of re-sync operations accordingly.
|
||
The
|
||
\emph on
|
||
total amount of re-sync data
|
||
\emph default
|
||
simply does not decrease when lowering
|
||
\family typewriter
|
||
resync-rate
|
||
\family default
|
||
; it even tends to increase over time when new requests arrive.
|
||
Therefore, the
|
||
\emph on
|
||
expectancy value
|
||
\emph default
|
||
of problems caused by
|
||
\emph on
|
||
strong
|
||
\emph default
|
||
network bottlenecks (i.e.
|
||
when not even the ordinary application rate is fitting through) is
|
||
\emph on
|
||
not
|
||
\emph default
|
||
improved by lowering or adapting
|
||
\family typewriter
|
||
resync-rate
|
||
\family default
|
||
, but rather the expectancy value mostly depends on the
|
||
\emph on
|
||
relation
|
||
\emph default
|
||
between the amount of holdback data versus the amount of application write
|
||
data, both measured for the duration of some given strong bottleneck.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
as indicated by the upper dotted blue box.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Of course, there is
|
||
\emph on
|
||
absolutely no chance
|
||
\emph default
|
||
to get the increased amount of data through our bottleneck, since not even
|
||
the ordinary application load (lower dotted lines) could be transferred.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Therefore, you run at a
|
||
\series bold
|
||
very high risk
|
||
\series default
|
||
that the re-sync cannot finish before the next
|
||
\family typewriter
|
||
timeout
|
||
\family default
|
||
/
|
||
\family typewriter
|
||
ping-timeout
|
||
\family default
|
||
cycle will drop the network connection again.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
What will be the final result when that risk becomes true? Simply, your
|
||
secondary site will be
|
||
\emph on
|
||
permanently
|
||
\emph default
|
||
in state
|
||
\family typewriter
|
||
inconsistent
|
||
\family default
|
||
.
|
||
This means, you have lost your redundancy.
|
||
In our scenario, there is no chance at all to become consistent again,
|
||
because the network bottleneck declines more and more, slowly.
|
||
It is simply
|
||
\emph on
|
||
hopeless
|
||
\emph default
|
||
, by construction.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In case you lose your primary site now, you are completely lost.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some people may argue that the probability for a similar scenario is low.
|
||
We don't agree with such an argument.
|
||
Not only because it really happens in practice, and it may even last some
|
||
days until problems are fixed.
|
||
In case of
|
||
\series bold
|
||
rolling disasters
|
||
\series default
|
||
, the network is very likely to become flaky and/or overloaded shortly before
|
||
the final damage.
|
||
Even in other cases, you can easily end up with inconsistent secondaries.
|
||
It occurs not only in the lab, but also in practice if you operate some
|
||
hundreds or even thousands of DRBD instances.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The point is that you can produce an ill behaviour
|
||
\emph on
|
||
systematically
|
||
\emph default
|
||
just by overloading the network a bit for some sufficient duration.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
When coupling whole datacenters via some thousands of DRBD connections,
|
||
any (short) network loss will almost certainly increase the re-sync network
|
||
load each time the outage appears to be over.
|
||
As a consequence, overload may be
|
||
\emph on
|
||
provoked
|
||
\emph default
|
||
by the re-sync repair attempts.
|
||
This may easily lead to self-amplifying
|
||
\series bold
|
||
throughput storms
|
||
\series default
|
||
in some resonance frequency (similar to self-destruction of a bridge when
|
||
an army is marching over it in lockstep).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The only way for reliable prevention of loss of secondaries is to start
|
||
any re-connect
|
||
\emph on
|
||
only
|
||
\emph default
|
||
in such situations where you can
|
||
\emph on
|
||
predict in advance
|
||
\emph default
|
||
that the re-sync is
|
||
\emph on
|
||
guaranteed
|
||
\emph default
|
||
to finish before any network bottleneck / loss will cause an automatic
|
||
disconnect again.
|
||
We don't know of any method which can reliably predict the future behaviour
|
||
of a complex network.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresToxiques.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Conclusion: in the presence of network bottlenecks, you run a considerable
|
||
risk that your DRBD mirrors get destroyed just in that moment when you
|
||
desperately need them.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice that crossover cables usually never show a behaviour like depicted
|
||
by the red line.
|
||
Crossover cables are
|
||
\emph on
|
||
passive components
|
||
\emph default
|
||
which normally
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Exceptions might be mechanical jiggling of plugs, or electromagnetic
|
||
interferences.
|
||
We never noticed any of them.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
either work, or not.
|
||
The binary connect / disconnect behaviour of DRBD has no problems coping
|
||
with that.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Linbit recommends a
|
||
\series bold
|
||
workaround
|
||
\series default
|
||
for the inconsistencies during re-sync: LVM snapshots.
|
||
We tried it, but found a
|
||
\emph on
|
||
performance penalty
|
||
\emph default
|
||
which made it prohibitive for our concrete application.
|
||
A problem seems to be the cost of destroying snapshots.
|
||
LVM uses by default a BOW strategy (Backup On Write, which is the counterpart
|
||
of COW = Copy On Write).
|
||
BOW increases IO latencies during ordinary operation.
|
||
Retaining snapshots is cheap, but reverting them may be very costly, depending
|
||
on workload.
|
||
We didn't fully investigate that effect, and our experience is a few years
|
||
old.
|
||
You might come to a different conclusion for a different workload, for
|
||
newer versions of system software, or for a different strategy if you carefully
|
||
investigate the field.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
DRBD problems usually arise
|
||
\emph on
|
||
only
|
||
\emph default
|
||
when the network throughput shows some
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
awkward
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
analog behaviour, such as overload, or as occasionally produced by various
|
||
switches / routers / transmitters, or other potential sources of packet
|
||
loss.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Behaviour of MARS
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Behaviour-of-MARS"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The behaviour of MARS in the above scenario:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/network-bottleneck-mars.fig
|
||
width 80col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
When the network is restrained, an asynchronous system like MARS will continue
|
||
to serve the user IO requests (dotted green line) without any impact /
|
||
incident while the actual network throughput (solid green line) follows
|
||
the red line.
|
||
In the meantime, all changes to the block device are recorded at the transactio
|
||
n logfiles.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Here is one point in favour of DRBD: MARS stores its transaction logs on
|
||
the filesystem
|
||
\family typewriter
|
||
/mars/
|
||
\family default
|
||
.
|
||
When the network bottleneck is lasting very long (some days or even some
|
||
weeks), the filesystem will eventually run out of space some day.
|
||
Section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Defending-Overflow"
|
||
|
||
\end_inset
|
||
|
||
discusses countermeasures against that in detail.
|
||
In contrast to MARS, DRBD allocates its bitmap
|
||
\emph on
|
||
statically
|
||
\emph default
|
||
at resource creation time.
|
||
It uses up less space, and you don't have to monitor it for (potential)
|
||
overflows.
|
||
The space for transaction logs is the price you have to pay if you want
|
||
or need anytime consistency, or asynchronous replication in general.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In order to really grasp the
|
||
\emph on
|
||
heart
|
||
\emph default
|
||
of the difference between synchronous and asynchronous replication, we
|
||
look at the following modified scenario:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/network-flaky-mars.fig
|
||
width 80col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
This time, the network throughput (red line) is varying
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
In real life, many long-distance lines or even some heavily used metro lines
|
||
usually show fluctuations of their network bandwidth by an order of magnitude,
|
||
or even higher.
|
||
We have measured them.
|
||
The overall behaviour can be characterized as
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
|
||
\series bold
|
||
chaotic
|
||
\series default
|
||
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
in some unpredictable way.
|
||
As before, the application throughput served by MARS is assumed to be constant
|
||
(dotted green line, often superseded by the solid green line).
|
||
The actual replication network throughput is depicted by the solid green
|
||
line.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As you can see, a network dropdown undershooting the application demand
|
||
has no impact on the application throughput, but only on the replication
|
||
network throughput.
|
||
Whenever the network throughput is held back due to the flaky network,
|
||
it simply catches up as soon as possible by overshooting the application
|
||
throughput.
|
||
The amount of lag-behind is visualized as shaded area: downward shading
|
||
(below the application throughput) means an increase of the lag-behind,
|
||
while the upwards shaded areas (beyond the application throughput) indicate
|
||
a decrease of the lag-behind (catch-up).
|
||
Once the lag-behind has been fully caught up, the network throughput suddenly
|
||
jumps back to the application throughput (here visible in two cases).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Note that the existence of lag-behind areas is roughly corresponding to
|
||
DRBD disconnect states, and in turn to DRBD inconsistent states of the
|
||
secondary as long as the lag-behind has not been fully caught up.
|
||
The very rough
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Of course, this visualization is not exact.
|
||
On one hand, the DRBD inconsistency phase may start later than depicted here,
|
||
because it only starts
|
||
\emph on
|
||
after
|
||
\emph default
|
||
the first automatic disconnect, upon the first automatic re-connect.
|
||
In addition, the amount of resync data may be smaller than the amount of
|
||
corresponding MARS transaction logfile data, because the DRBD bitmap will
|
||
coalesce multiple writes to the same block into one single transfer.
|
||
On the other hand, DRBD will transfer no data at all during its disconnected
|
||
state, while MARS continues doing its best.
|
||
This leads to a prolongation of the DRBD inconsistent phase.
|
||
Depending on properties of the workload and of the network, the real duration
|
||
of the inconsistency phase may be either shorter or longer.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
duration of the corresponding DRBD inconsistency phase is visualized as
|
||
magenta line at the time scale.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
MARS utilizes the existing network bandwidth as best as possible in order
|
||
to pipe through as much data as possible, provided that there exists some
|
||
data requiring expedition.
|
||
Conceptually, there exists no better way due to information theoretic limits
|
||
(besides data compression).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Note that
|
||
\emph on
|
||
on average
|
||
\emph default
|
||
during a longer period of time, the network must have enough capacity for
|
||
transporting all of your data.
|
||
MARS cannot magically break through information-theoretic limits.
|
||
It cannot magically transport gigabytes of data over modem lines.
|
||
Only
|
||
\emph on
|
||
relatively short
|
||
\emph default
|
||
network problems / packet loss can be compensated.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
In case of lag-behind, the version of the data replicated to the secondary
|
||
site corresponds to some time in the past.
|
||
Since the data is always transferred in the same order as originally submitted
|
||
at the primary site, the secondary never gets inconsistent.
|
||
Your mirror always remains usable.
|
||
Your only potential problem could be the outdated state, corresponding
|
||
to some state in the past.
|
||
However, the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
as-best-as-possible
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
approach to the network transfer ensures that your version is always
|
||
\emph on
|
||
as up-to-date as possible
|
||
\emph default
|
||
even under ill-behaving network bottlenecks.
|
||
|
||
\series bold
|
||
There is simply no better way to do it.
|
||
|
||
\series default
|
||
In presence of temporary network bottlenecks such as network congestion,
|
||
there exists no better method than prescribed by the information theoretic
|
||
limit (red line, neglecting data compression).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
In order to get all of your data through the line, at some point the network
|
||
must be healthy again.
|
||
Otherwise, data will be recorded until the capacity of the
|
||
\family typewriter
|
||
/mars/
|
||
\family default
|
||
filesystem is exhausted, leading to an emergency mode (see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Resolution-of-Emergency"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
MARS' property of never sacrificing local data consistency (at the possible
|
||
cost of actuality, as long as you have enough capacity in
|
||
\family typewriter
|
||
/mars/
|
||
\family default
|
||
) is called
|
||
\series bold
|
||
Anytime Consistency
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Even when the capacity of
|
||
\family typewriter
|
||
/mars/
|
||
\family default
|
||
is exhausted and when emergency mode is entered, the replicas will not
|
||
become inconsistent by themselves.
|
||
However, when the emergency mode is later
|
||
\emph on
|
||
cleaned up
|
||
\emph default
|
||
for a replica, it will become temporarily inconsistent during the fast
|
||
full sync.
|
||
Details are in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Resolution-of-Emergency"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Conclusion: you can even use
|
||
\series bold
|
||
traffic shaping
|
||
\series default
|
||
on MARS' TCP connections in order to globally balance your network throughput
|
||
(of course at the cost of actuality, but without sacrificing local data
|
||
consistency).
|
||
If you tried to do the same with DRBD, you could easily provoke a disaster.
|
||
MARS simply tolerates any network problems, provided that there is enough
|
||
disk space for transaction logfiles.
|
||
Even in case of completely filling up your disk with transaction logfiles
|
||
after some days or weeks, you will not lose local consistency anywhere
|
||
(see section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Defending-Overflow"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Finally, here is yet another scenario where MARS can cope with the situation:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/network-constant-mars.fig
|
||
width 80col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
This time, the network throughput limit (solid red line) is assumed to be
|
||
constant.
|
||
However, the application workload (dotted green line) shows some heavy
|
||
peaks.
|
||
We know from our 1&1 datacenters that such an application behaviour is
|
||
very common (e.g.
|
||
in case of certain kinds of DDOS attacks etc).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
When the peaks are exceeding the network capacities for some short time,
|
||
the replication network throughput (solid green line) will be limited for
|
||
a short time, stay a little bit longer at the limit, and finally drop down
|
||
again to the normal workload.
|
||
In other words, you get a flexible buffering behaviour, coping with the
|
||
peaks.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Similar scenarios (where both the application workload has peaks and the
|
||
network is flaky to some degree) are rather common.
|
||
If you used DRBD there, you would be likely to run into regular application
|
||
performance problems and/or frequent automatic disconnect cycles, depending
|
||
on the height and on the duration of the peaks, and on network resources.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Long Distances / High Latencies
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In general and in some theories, latencies are conceptually independent
|
||
from throughput, at least to some degree.
|
||
There exist all 4 possible combinations:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
There exist communication lines with high latencies but also high throughput.
|
||
Examples are raw fibre cables at the bottom of the Atlantic.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
High latencies on low-throughput lines are very easy to achieve.
|
||
If you never saw it, you never ran interactive
|
||
\family typewriter
|
||
vi
|
||
\family default
|
||
over
|
||
\family typewriter
|
||
ssh
|
||
\family default
|
||
in parallel to downloads on your old-fashioned modem line.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Low latencies need not be incompatible with high throughput.
|
||
See Myrinet, InfiniBand or high-speed point-to-point interconnects, such
|
||
as modern RAM busses.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Low latency combined with low throughput is also possible: in an ATM system
|
||
(or another pre-reservation system for bandwidth), just increase the multiplex
|
||
factor on low-capacity but short lines, which is only possible at the cost
|
||
of assigned bandwidth.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In the
|
||
\emph on
|
||
internet
|
||
\emph default
|
||
practice, however, it is very likely that high latencies will also lead
|
||
to worse throughput, because of the
|
||
\emph on
|
||
congestion control algorithms
|
||
\emph default
|
||
running all over the world.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
We have experimented with extremely large TCP send/receive buffers plus
|
||
various window sizes and congestion control algorithms over long-distance
|
||
lines between the USA and Europe.
|
||
Yes, it is possible to improve the behaviour to some degree.
|
||
But magic does not happen.
|
||
Natural laws will always hold.
|
||
You simply cannot travel faster than the speed of light.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Our experience leads to the following rule of thumb, not formally proven
|
||
by anything, but just observed in practice:
|
||
\end_layout
|
||
|
||
\begin_layout Quotation
|
||
In general
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
We have heard of cases where even less than 50 km were not working with
|
||
DRBD.
|
||
It depends on application workload, on properties of the line, and on congestio
|
||
n caused by other traffic.
|
||
Some other people told us that according to
|
||
\emph on
|
||
their
|
||
\emph default
|
||
experience, much shorter distances should be considered operable, only in
|
||
the range of a few single kilometers.
|
||
However, they agree that DRBD is rock stable when used on crossover cables.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
, synchronous data replication (not limited to applications of DRBD) works
|
||
reliably only over distances
|
||
\begin_inset Formula $<50$
|
||
\end_inset
|
||
|
||
km, or sometimes even less.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
There may be some exceptions, e.g.
|
||
when dealing with low-end workstation loads.
|
||
But when you are responsible for a whole datacenter and/or some centralized
|
||
storage units, don't waste your time by trying (almost) impossible things.
|
||
We recommend to use MARS in such use cases.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Explanation via CAP Theorem
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Explanation-via-CAP"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/cap-theorem.fig
|
||
width 60col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The famous CAP theorem, also called Brewer's theorem, is important for a
|
||
deeper understanding of the differences between DRBD and MARS.
|
||
A good explanation can be found at
|
||
\begin_inset Flex URL
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
|
||
https://en.wikipedia.org/wiki/CAP_theorem
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
(retrieved July 2018).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The CAP theorem states that only 2 out of 3 properties can be achieved at
|
||
the same time, when a Distributed System is under pressure: C = Consistency
|
||
means
|
||
\series bold
|
||
\emph on
|
||
Strict
|
||
\series default
|
||
\emph default
|
||
Consistency at the level of the
|
||
\emph on
|
||
distributed
|
||
\emph default
|
||
system (which is
|
||
\emph on
|
||
not
|
||
\emph default
|
||
the same as strict consistency
|
||
\emph on
|
||
inside
|
||
\emph default
|
||
of one of the
|
||
\emph on
|
||
local
|
||
\emph default
|
||
systems), A = Availability = intuitively clear from a user's perspective,
|
||
and P = Partitioning Tolerance = the network may have its own outages at
|
||
any time (which is a negative criterion).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As explained in the Wikipedia article, the P = Partitioning Tolerance is
|
||
a property which is important at least in
|
||
\emph on
|
||
wide-distance
|
||
\emph default
|
||
data replication scenarios, and possibly in some other scenarios.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
CAP Differences between DRBD and MARS
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:CAP-Differences"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
If you are considering only short distances like passive crossover cables
|
||
between racks,
|
||
\emph on
|
||
then
|
||
\emph default
|
||
(and
|
||
\emph on
|
||
only then
|
||
\emph default
|
||
) you may
|
||
\emph on
|
||
assume(!)
|
||
\emph default
|
||
that P is not required.
|
||
Then, and only then, you can get both A and C at the same time, without
|
||
sacrificing P, because P is already for free by assumption.
|
||
In such a crossover cable scenario, getting all three C and A and P is
|
||
possible, similarly to an explanation in the Wikipedia article.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This is the classical use case for DRBD: when both DRBD replicas are always
|
||
staying physically connected via a passive crossover cable (which is
|
||
\emph on
|
||
assumed
|
||
\emph default
|
||
to never break down), you can get both strict global consistency and availabili
|
||
ty, even in cases where one of the DRBD nodes is failing
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
In addition, you will need some further components like Pacemaker, iSCSI
|
||
failover, etc.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Both C and A are provided by DRBD during
|
||
\family typewriter
|
||
connected
|
||
\family default
|
||
state, while P is assumed to be provided by a passive component.
|
||
By addition of iSCSI failover, A can be achieved even in case of single
|
||
storage node failures, while retaining C from the viewpoint
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice: the CAP theorem does not deal with node failures, only with
|
||
\emph on
|
||
network
|
||
\emph default
|
||
failures.
|
||
Node failures would always violate C by some
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
strong
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
definition.
|
||
By some
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
weaker
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
definition, the downtime plus recovery time (e.g.
|
||
DRBD re-sync) can be taken out of the game.
|
||
Notice: while a node can always
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
know
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
whether it has failed (at least after reboot), network failures cannot
|
||
be distinguished from failures of remote nodes in general.
|
||
Therefore node failures and network failures are fundamentally different
|
||
by their nature.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
of the application.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This is explained by the thick line in the following variant of the graphics,
|
||
which is only valid for crossover cables where P need not be guaranteed
|
||
by the replication because it is already assumed for free:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/cap-drbd-operational.fig
|
||
width 60col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Now look at the case of a truly Distributed System, where P cannot be assumed
|
||
as for free.
|
||
For example, try to use DRBD in a long-distance replication scenario.
|
||
There we cannot assume P as already given.
|
||
We
|
||
\series bold
|
||
must
|
||
\emph on
|
||
tolerate
|
||
\series default
|
||
\emph default
|
||
replication network outages.
|
||
DRBD is reacting to this differently in two different modes.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
First we look at the (short) time interval
|
||
\emph on
|
||
before
|
||
\emph default
|
||
DRBD recognizes the replication network incident, and before it leaves
|
||
the
|
||
\family typewriter
|
||
connected
|
||
\family default
|
||
state.
|
||
During this phase, the application IO will
|
||
\series bold
|
||
hang
|
||
\series default
|
||
for some time, indicating the (temporary) sacrifice (from a user's perspective)
|
||
by a red X:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/cap-drbd-connected.fig
|
||
width 60col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Because Availability is one of the highest goods of enterprise-critical
|
||
IT operations, you will typically configure DRBD such that it automatically
|
||
switches to some variant of a
|
||
\family typewriter
|
||
disconnected
|
||
\family default
|
||
state after some timeout, thereby giving up consistency between both replicas.
|
||
The red X indicates not only loss of global strict consistency in the sense
|
||
of the CAP theorem, but also that your replica will become
|
||
\family typewriter
|
||
Inconsistent
|
||
\family default
|
||
during the following re-sync:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/cap-drbd-disconnected.fig
|
||
width 60col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
You may wonder what the difference to MARS is.
|
||
As explained in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Requirements-for-Cloud"
|
||
|
||
\end_inset
|
||
|
||
, MARS is not only intended for wide distances, but also for
|
||
\series bold
|
||
Cloud Storage
|
||
\series default
|
||
where no strict consistency is required at global level by definition,
|
||
but instead
|
||
\series bold
|
||
Eventually Consistent
|
||
\series default
|
||
is the preferred model for the Distributed System.
|
||
Therefore,
|
||
\emph on
|
||
strict
|
||
\emph default
|
||
consistency (in the sense of the CAP theorem) is
|
||
\emph on
|
||
not required by definition
|
||
\emph default
|
||
.
|
||
Therefore, the red X is not present in the following graphics, showing
|
||
the state where MARS is remaining
|
||
\emph on
|
||
locally consistent
|
||
\emph default
|
||
all the time
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice that the
|
||
\emph on
|
||
initial
|
||
\emph default
|
||
full sync is not considered here, neither for DRBD, nor for MARS.
|
||
|
||
\emph on
|
||
Setup
|
||
\emph default
|
||
of the Distributed System is its own scenario, not considered here.
|
||
|
||
\emph on
|
||
Repair
|
||
\emph default
|
||
of a
|
||
\emph on
|
||
damaged
|
||
\emph default
|
||
system is also a different scenario, also not considered here.
|
||
Notice that MARS' emergency mode also belongs to the class of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
damages
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, as well as DRBD's disk failure modes, where it has some additional functionalit
|
||
y compared to the current version of MARS.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
, even when a network outage occurs:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/cap-mars.fig
|
||
width 60col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice: MARS does not guarantee strict consistency
|
||
\emph on
|
||
between
|
||
\emph default
|
||
LV replicas at the level of the Distributed System, but only Eventually
|
||
Consistent.
|
||
However,
|
||
\emph on
|
||
at the same time
|
||
\emph default
|
||
it
|
||
\emph on
|
||
also
|
||
\emph default
|
||
guarantees strict consistency
|
||
\emph on
|
||
locally
|
||
\emph default
|
||
, and even at
|
||
\emph on
|
||
each
|
||
\emph default
|
||
of the passive replicas, each by each.
|
||
Don't confuse these different levels.
|
||
There are different consistency guarantees at different levels, at the
|
||
same time.
|
||
This might be confusing if you are not looking at the system at different
|
||
levels: (1) overall Distributed System versus (2) each of the local system
|
||
instances.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Why does MARS do this? Because a better way is not possible at all.
|
||
The CAP theorem tells us that there exists no better way when both A has
|
||
to be guaranteed (as almost everywhere in enterprise-critical IT operations),
|
||
and P has to be ensured in datacenter disaster scenarios or some other
|
||
scenarios.
|
||
Similarly to natural laws like Einstein's laws of the speed of light, there
|
||
|
||
\emph on
|
||
does not exist
|
||
\emph default
|
||
a better way!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Conclusion from the CAP theorem: when P is a
|
||
\emph on
|
||
hard
|
||
\emph default
|
||
|
||
\emph on
|
||
requirement
|
||
\emph default
|
||
, don't use DRBD (or other
|
||
\emph on
|
||
synchronous
|
||
\emph default
|
||
replication implementations) for long-distance and/or Cloud Storage scenarios.
|
||
The red X is in particular problematic during re-sync, after the network
|
||
has become healthy again (cf section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Behaviour-of-DRBD"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
MARS has no red X at C because of its
|
||
\series bold
|
||
Anytime Consistency
|
||
\series default
|
||
, which refers to
|
||
\emph on
|
||
local
|
||
\emph default
|
||
consistency, and which is violated by DRBD during certain important phases
|
||
of its regular operation.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Another conclusion from the CAP theorem: when A+C is a
|
||
\emph on
|
||
hard requirement
|
||
\emph default
|
||
, and when P can be faithfully assumed as already given by passive crossover
|
||
cables, then don't use the current version of MARS.
|
||
Use DRBD instead.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresToxiques.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
If you think that you require all three properties C+A+P, but you don't
|
||
have passive crossover cables over short distances, you are requiring something
|
||
which is
|
||
\series bold
|
||
impossible
|
||
\series default
|
||
.
|
||
There exists no solution, with whatever component, or from whatever commercial
|
||
storage vendor.
|
||
The CAP theorem is as hard as Einstein's natural laws are.
|
||
Rethink your complete concept, from end to end.
|
||
Something is wrong, somewhere.
|
||
Ignoring this on enterprise-critical use cases can endanger a company and/or
|
||
your career.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
CAP Commonalities between DRBD and MARS
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:CAP-Commonalities"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In this subsection, we look at the case that P is not for free, but has
|
||
to be ensured by the Distributed Storage system.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
You may have noticed that MARS' ordinary CAP behaviour is similar to DRBD's
|
||
CAP picture in
|
||
\family typewriter
|
||
disconnected
|
||
\family default
|
||
state, or during similar states when the replication network is interrupted.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Replication network interruption is also known as
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
Network Partitioning
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
This is where property P = Partitioning Tolerance comes into play.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
When a network partition has
|
||
\emph on
|
||
actually occurred
|
||
\emph default
|
||
, both DRBD and MARS allow you to do the same: you may
|
||
\series bold
|
||
forcefully switch
|
||
\series default
|
||
the
|
||
\family typewriter
|
||
primary
|
||
\family default
|
||
role, which means activation of a former
|
||
\family typewriter
|
||
secondary
|
||
\family default
|
||
node.
|
||
In such a situation, you can issue commands like
|
||
\family typewriter
|
||
drbdadm primary --force
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
marsadm primary --force
|
||
\family default
|
||
.
|
||
It is no accident that both commands are looking similar to each other.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The outcome will be the same: you will most likely get a
|
||
\family typewriter
|
||
\series bold
|
||
SplitBrain
|
||
\family default
|
||
\series default
|
||
situation.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The possibility of getting a split brain is a specific property of neither
|
||
DRBD nor MARS.
|
||
It will also happen with any other replication system, whether synchronous
|
||
or asynchronous.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
It is one of the consequences from the CAP theorem when (1a) P has to be
|
||
assured, and (1b) a network partition has
|
||
\emph on
|
||
actually occurred
|
||
\emph default
|
||
, and (2) when A = Availability is enforced at both sides of the network
|
||
partition.
|
||
The result is that C = global Consistency is violated, by creation of two
|
||
or more versions of the data.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Careful: at least for some application classes, it is a bad idea to systematica
|
||
lly create split brain via automatic cluster managers, e.g.
|
||
Pacemaker or similar.
|
||
As explained in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Inappropriate-Clustermanger"
|
||
|
||
\end_inset
|
||
|
||
, some cluster managers were originally constructed for truly shared disk
|
||
scenarios, where no split brain can occur by construction.
|
||
Using them in masses on versioned data in truly distributed systems can
|
||
result in existential surprises, once a bigger network partition and/or
|
||
a flaky replication network triggers them in masses, and at some moments
|
||
where you didn't really want to do what they now are doing automatically,
|
||
and in masses.
|
||
Split brain should not be provoked when not
|
||
\emph on
|
||
absolutely
|
||
\emph default
|
||
necessary.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Split brain resolution is anything but easy in general.
|
||
When the data is in a generic block device, you typically will have no
|
||
general means for merging both versions.
|
||
This means, split brain resolution is typically only possible by
|
||
\series bold
|
||
throwing away
|
||
\series default
|
||
some of the versions.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This kind of split brain resolution problem is no specific property of DRBD
|
||
or of MARS.
|
||
It is a fundamental property of generic block devices.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
DRBD and MARS have some commands like
|
||
\family typewriter
|
||
drbdadm invalidate
|
||
\family default
|
||
or
|
||
\family typewriter
|
||
marsadm invalidate
|
||
\family default
|
||
for this.
|
||
Again, the similarity is no accident.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice that classical filesystems aren't typically better than raw block
|
||
devices.
|
||
There are even more possibilities for tricky types of
|
||
\series bold
|
||
conflicts
|
||
\series default
|
||
(e.g.
|
||
on path names in addition to file content).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Similarly, BigCluster object stores are often suffering from similar (or
|
||
even worse) problems, because higher application layers may have some hidden
|
||
internal dependencies between object versions, while the object store itself
|
||
is agnostic of version dependencies in general
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
There exist lots of types of potential dependencies between objects.
|
||
Timely ones are easy to capture, but this is not sufficient in general
|
||
for everything.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresToxiques.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
When stacking block devices or filesystems (or something else) on top of
|
||
some BigCluster object store, the latter will not magically resolve any
|
||
split brain for you.
|
||
Check whether your favorite object store implementation has some kind of
|
||
equivalent of a
|
||
\family typewriter
|
||
primary --force
|
||
\family default
|
||
command, and some equivalent
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice: BigCluster architectures are typically discriminating between
|
||
client servers and storage servers.
|
||
This will typically introduce some more possibilities into the game, such
|
||
as forced client failover, independently from forced storage failover.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
of an
|
||
\family typewriter
|
||
invalidate
|
||
\family default
|
||
command.
|
||
If it doesn't have one, or only a restricted one, you should be
|
||
\emph on
|
||
alerted
|
||
\emph default
|
||
.
|
||
In case of a long-lasting storage network partition, you might need suchalike
|
||
|
||
\emph on
|
||
desperately
|
||
\emph default
|
||
for ensuring A, even at the cost of C.
|
||
Check: whether you need this is heavily depending on the
|
||
\series bold
|
||
\emph on
|
||
application class
|
||
\series default
|
||
\emph default
|
||
(see also the Cloud Storage definition in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Requirements-for-Cloud"
|
||
|
||
\end_inset
|
||
|
||
, or look at webhosting, etc).
|
||
When you
|
||
\emph on
|
||
would
|
||
\emph default
|
||
need it, but you are
|
||
\series bold
|
||
not prepared for suchalike scenarios at your enterprise-critical data
|
||
\series default
|
||
, it could cost you a lot of money and/or reputation and/or even your existence.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Notice: the
|
||
\emph on
|
||
concept
|
||
\emph default
|
||
of
|
||
\family typewriter
|
||
SplitBrain
|
||
\family default
|
||
is occurring almost everywhere in truly Distributed Systems when C can
|
||
be violated in favour of A+P.
|
||
It is a very general consequence
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
There exist only few opportunities for generic conflict resolution, even
|
||
in classical databases where
|
||
\emph on
|
||
some
|
||
\emph default
|
||
knowledge about the structure of the data is available.
|
||
Typically, there are some more hidden dependencies.
|
||
Lossless
|
||
\family typewriter
|
||
SplitBrain
|
||
\family default
|
||
resolution will thus need to be implemented at application layer, if it
|
||
is possible at all.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
of the CAP theorem.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The only reliable way for avoiding split brain in truly distributed systems
|
||
would be: don't insist on A = Availability.
|
||
Notice that there exist some application classes, like certain types of
|
||
banking, where C is typically a higher good than A.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice that both DRBD and MARS are supporting this also: just don't add
|
||
the option
|
||
\family typewriter
|
||
--force
|
||
\family default
|
||
to the
|
||
\family typewriter
|
||
primary
|
||
\family default
|
||
switch command.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
However: even in banking, some
|
||
\emph on
|
||
extremely extraordinary
|
||
\emph default
|
||
scenarios might occur, where sacrifice of C in favour of A could be necessary
|
||
(e.g.
|
||
when
|
||
\emph on
|
||
manual cleanup
|
||
\emph default
|
||
of C is cheaper than long-lasting violations of A).
|
||
Good to know that both DRBD and MARS have some emergency measure for killing
|
||
C in favour of A!
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Higher Consistency Guarantees vs Actuality
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
We already saw in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "sec:Network-Bottlenecks"
|
||
|
||
\end_inset
|
||
|
||
that certain types of network bottlenecks can easily (and reproducibly)
|
||
destroy the consistency of your DRBD secondary, while MARS will preserve
|
||
local consistency at the cost of actuality (
|
||
\series bold
|
||
anytime consistency
|
||
\series default
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some people, often located at database operations, are obtrusively arguing
|
||
that actuality is such a high good that it must not be sacrificed under
|
||
any circumstances.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Anyone arguing this way has at least the following choices (list may be
|
||
incomplete):
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
None of the above use cases for MARS apply.
|
||
For instance, short distance replication over crossover cables is sufficient
|
||
(which occurs very often), or the network is reliable enough such that
|
||
bottlenecks can never occur (e.g.
|
||
because the total load is extremely low, or conversely the network is extremely
|
||
overengineered / expensive), or the occurrence of bottlenecks can
|
||
\emph on
|
||
provably
|
||
\emph default
|
||
be taken into account.
|
||
In such cases, DRBD is clearly the better solution than MARS, because it
|
||
provides better actuality than the current version of MARS, and it uses
|
||
up less disk resources.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
In the presence of network bottlenecks, people didn't notice and/or didn't
|
||
understand and/or did under-estimate the risk of accidental invalidation
|
||
of their DRBD secondaries.
|
||
They should carefully check that risk.
|
||
They should convince themselves that the risk is
|
||
\emph on
|
||
really
|
||
\emph default
|
||
bearable.
|
||
Once they are hit by a systematic chain of events which
|
||
\emph on
|
||
reproducibly
|
||
\emph default
|
||
provoke the bad effect, it is too late
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Some people seem to need a bad experience before they get the difference
|
||
between risk caused by reproducible effects and inverted luck.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
In the presence of network bottlenecks, people found a solution such that
|
||
DRBD does not automatically re-connect after the connection has been dropped
|
||
due to network problems (c.f.
|
||
|
||
\family typewriter
|
||
ko-count
|
||
\family default
|
||
parameter).
|
||
So the risk of inconsistency
|
||
\emph on
|
||
appears
|
||
\emph default
|
||
to have vanished.
|
||
In some cases, people did not notice that the risk has
|
||
\emph on
|
||
not completely
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Hint: what's the
|
||
\emph on
|
||
conceptual
|
||
\emph default
|
||
difference between an automatic and a manual re-connect? Yes, you can try
|
||
to
|
||
\emph on
|
||
lower
|
||
\emph default
|
||
the risk in some cases by transferring risks to human analysis and human
|
||
decisions, but did you take into account the possibility of human errors?
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\emph default
|
||
vanished, and/or they did not notice that now the actuality produced by
|
||
DRBD is even drastically worse than that of MARS (in the same situation).
|
||
It is true that DRBD provides better actuality in
|
||
\family typewriter
|
||
connected
|
||
\family default
|
||
state, but for a full picture the actuality in
|
||
\family typewriter
|
||
disconnected
|
||
\family default
|
||
state should not be neglected
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Hint: a potential hurdle may be the fact that the current format of
|
||
\family typewriter
|
||
/proc/drbd
|
||
\family default
|
||
does neither display the timestamp of the first
|
||
\emph on
|
||
relevant
|
||
\emph default
|
||
network drop nor the total amount of lag-behind user data (which is
|
||
\emph on
|
||
not
|
||
\emph default
|
||
the same as the number of dirty bits in the bitmap), while
|
||
\family typewriter
|
||
marsadm view
|
||
\family default
|
||
can display it.
|
||
So it is difficult to judge the risks.
|
||
Possibly a chance is inspection of DRBD messages in the syslog, but quantificat
|
||
ion could remain hard.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
So they didn't notice that their argumentation on the importance of actuality
|
||
may be fundamentally wrong.
|
||
A possible way to overcome that may be re-reading section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Behaviour-of-MARS"
|
||
|
||
\end_inset
|
||
|
||
and comparing its outcome with the corresponding outcome of DRBD in the
|
||
same situation.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
People are stuck in contradictory requirements because the current version
|
||
of MARS does not yet support synchronous or pseudo-synchronous operation
|
||
modes.
|
||
This should be resolved some day.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
A common misunderstanding is about the actuality guarantees provided by
|
||
filesystems.
|
||
The buffer cache / page cache uses by default a
|
||
\series bold
|
||
writeback strategy
|
||
\series default
|
||
for performance reasons.
|
||
Even modern journalling filesystems will (by default) provide only consistency
|
||
guarantees, but no strong actuality guarantee.
|
||
In case of power loss, some transactions may be even
|
||
\emph on
|
||
rolled back
|
||
\emph default
|
||
in order to restore consistency.
|
||
According to POSIX
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
The above argumentation also applies to Windows filesystems in an analogous
|
||
way.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
and other standards, the only
|
||
\emph on
|
||
reliable
|
||
\emph default
|
||
way to achieve actuality is usage of system calls like
|
||
\family typewriter
|
||
sync()
|
||
\family default
|
||
,
|
||
\family typewriter
|
||
fsync()
|
||
\family default
|
||
,
|
||
\family typewriter
|
||
fdatasync()
|
||
\family default
|
||
, flags like
|
||
\family typewriter
|
||
O_DIRECT
|
||
\family default
|
||
, or similar.
|
||
For performance reasons, the
|
||
\emph on
|
||
vast majority of applications
|
||
\emph default
|
||
don't use them at all, or use them only sparingly!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
It makes no sense to require strong actuality guarantees from any block
|
||
layer replication (whether DRBD or future versions of MARS) while higher
|
||
layers such as filesystems or even applications are already sacrificing
|
||
them!
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
In summary, the
|
||
\series bold
|
||
anytime consistency
|
||
\series default
|
||
provided by MARS is an argument you should consider, even if you need an
|
||
extra hard disk for transaction logfiles.
|
||
\end_layout
|
||
|
||
\begin_layout Chapter
|
||
Requirements of Long-Distance Replication
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Avoiding Inappropriate Clustermanager Types for Medium and Long-Distance
|
||
Replication
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "sec:Inappropriate-Clustermanger"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This section addresses some wide-spread misconceptions.
|
||
Its main target audience is developers, but sysadmins will profit from
|
||
|
||
\series bold
|
||
detailed explanations of problems and pitfalls
|
||
\series default
|
||
.
|
||
When the problems described in this section are solved sometime in the future,
|
||
this section will be shortened and some relevant parts moved to the appendix.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Doing
|
||
\series bold
|
||
High Availability (HA)
|
||
\series default
|
||
wrong at
|
||
\emph on
|
||
concept level
|
||
\emph default
|
||
may easily get you into trouble, and may cost you several millions of €
|
||
or $ in larger installations, or even knock you out of business when disasters
|
||
are badly dealt with at higher levels such as clustermanagers.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
General Cluster Models
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The most commonly known cluster model is called
|
||
\series bold
|
||
shared-disk
|
||
\series default
|
||
, and typically controlled by clustermanagers like
|
||
\family typewriter
|
||
PaceMaker
|
||
\family default
|
||
:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/shared-disk-model.fig
|
||
width 50col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The most important property of shared-disk is that there exists only a single
|
||
disk instance.
|
||
Nowadays, this disk often has some
|
||
\emph on
|
||
internal
|
||
\emph default
|
||
redundancy such as RAID.
|
||
At
|
||
\emph on
|
||
system
|
||
\emph default
|
||
architecture layer / network level, there exists no redundant disk at all.
|
||
Only the application cluster is built redundant.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
It should be immediately clear that shared-disk clusters are only suitable
|
||
for short-distance operations in the same datacenter.
|
||
Although running one of the data access lines over short distances between
|
||
very near-by datacenters (e.g.
|
||
1 km) would be theoretically possible, there would be no sufficient protection
|
||
against failure of a whole datacenter.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Both DRBD and MARS belong to a different architectural model called
|
||
\series bold
|
||
shared-nothing
|
||
\series default
|
||
:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/shared-nothing-model.fig
|
||
width 50col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
The characteristic feature of a shared-nothing model is (additional)
|
||
\series bold
|
||
redundancy at network level
|
||
\series default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Shared-nothing
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
clusters
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice that the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
cluster computing
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
usually refers to short-distance only.
|
||
Long-distance coupling should be called
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
grid computing
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
in preference.
|
||
As known from the scientific literature, grid computing requires different
|
||
concepts and methods in general.
|
||
Only for the sake of simplicity, we use
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
cluster
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
grid
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
interchangeably.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
could theoretically be built for
|
||
\emph on
|
||
any
|
||
\emph default
|
||
distances, from short to medium to long distances.
|
||
However, concrete technologies of disk coupling such as synchronous operation
|
||
may pose practical limits on the distances (see chapter
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "chap:Use-Cases-for"
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In general, clustermanagers must fit to the model.
|
||
Some clustermanagers can be configured to fit to multiple models.
|
||
If so, this must be done properly, or you may get into serious trouble.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some people don't know, or they don't believe, that different architectural
|
||
models like shared-disk or shared-nothing will
|
||
\emph on
|
||
require
|
||
\emph default
|
||
an
|
||
\emph on
|
||
appropriate
|
||
\emph default
|
||
type of clustermanager and/or a different configuration.
|
||
Failing to do so, by selection of an inappropriate clustermanager type
|
||
and/or an inappropriate configuration may be hazardous.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
Selection of the right model alone is not sufficient.
|
||
Some, if not many, clustermanagers have not been designed for long distances.
|
||
As explained in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Special-Requirements-for"
|
||
|
||
\end_inset
|
||
|
||
, long distances have further
|
||
\series bold
|
||
hard requirements
|
||
\series default
|
||
.
|
||
Disregarding them may be also hazardous!
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Handover / Failover Reasons and Scenarios
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
From a sysadmin perspective, there exist a number of different
|
||
\series bold
|
||
reasons
|
||
\series default
|
||
why the application workload must be switched from the currently active
|
||
side A to the currently passive side B:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Some
|
||
\series bold
|
||
defect
|
||
\series default
|
||
has occurred at cluster side A or at some corresponding part of the network.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Some
|
||
\series bold
|
||
maintenance
|
||
\series default
|
||
has to be done at side A which would cause a longer downtime (e.g.
|
||
security kernel update or replacement of core network equipment or maintenance
|
||
of UPS or of the BBU cache etc - hardware isn't 24/7/365 in practice, although
|
||
some vendors
|
||
\emph on
|
||
claim
|
||
\emph default
|
||
it - it is either not really true, or it becomes
|
||
\emph on
|
||
extremely
|
||
\emph default
|
||
expensive).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Both reasons are valid and must be automatically handled in larger installations.
|
||
In order to deal with all of these reasons, the following basic mechanisms
|
||
can be used in either model:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Failover
|
||
\series default
|
||
(triggered either manually or automatically)
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
|
||
\series bold
|
||
Handover
|
||
\series default
|
||
(triggered manually
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Automatic triggering could be feasible for prophylactic treatments.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
)
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
It is important to not confuse handover with failover at concept level.
|
||
Not only the reasons / preconditions are very different, but also the
|
||
\emph on
|
||
requirements
|
||
\emph default
|
||
.
|
||
Example: precondition for handover is that
|
||
\emph on
|
||
both
|
||
\emph default
|
||
cluster sides are healthy, while precondition for failover is that
|
||
\emph on
|
||
some relevant(!)
|
||
\emph default
|
||
failure has been
|
||
\emph on
|
||
detected
|
||
\emph default
|
||
somewhere (whether this is
|
||
\emph on
|
||
really
|
||
\emph default
|
||
true is another matter).
|
||
Typically, failover must be able to run in masses, while planned handover
|
||
often has lower scaling requirements.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Not all existing clustermanagers are dealing with all of these cases (or
|
||
their variants) equally well, and some are not even dealing with some of
|
||
these cases / variants
|
||
\emph on
|
||
at all
|
||
\emph default
|
||
.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some clustermanagers cannot easily express the concept of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
automatic triggering
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
versus
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
manual triggering
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
of an action.
|
||
There exists simply no cluster-global switch which selects either
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
manual mode
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
automatic mode
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
(except when you start to hack the code and/or write new plugins; then
|
||
you might notice that there is almost no architectural layering / sufficient
|
||
separation between mechanism and strategy).
|
||
Being forced to permanently use an automatic mode for several hundreds
|
||
or even thousands of clusters is not only boring, but bears a considerable
|
||
risk when automatics make a wrong decision at hundreds of instances in parallel.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Granularity and Layering Hierarchy for Long Distances
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Granularity-and-Layering"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Many existing clustermanager solutions are dealing with a single cluster
|
||
instance, as the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
|
||
\emph on
|
||
cluster
|
||
\emph default
|
||
manager
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
suggests.
|
||
However, when running several hundreds or thousands of cluster instances,
|
||
you likely will not want to manage each of them individually.
|
||
In addition, failover should
|
||
\emph on
|
||
not only
|
||
\emph default
|
||
be
|
||
\emph on
|
||
triggered
|
||
\emph default
|
||
(not to be confused with
|
||
\emph on
|
||
executed
|
||
\emph default
|
||
) individually at cluster level, but likely
|
||
\emph on
|
||
also
|
||
\emph default
|
||
at a higher granularity such as a room, or a whole datacenter.
|
||
Otherwise, some chaos is likely to happen.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Here is what you probably will
|
||
\series bold
|
||
need
|
||
\series default
|
||
, possibly in contrast to what you may find on the market (whether OpenSource
|
||
or not).
|
||
For simplicity, the following diagram shows only two levels of granularity,
|
||
but can be easily extended to multiple layers of granularity, or to some
|
||
concept of various
|
||
\emph on
|
||
subsets of clusters
|
||
\emph default
|
||
:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/clustermanager-hierarchy.fig
|
||
width 70col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Notice that many existing clustermanager solutions are not addressing the
|
||
datacenter granularity at all.
|
||
Typically, they use concepts like
|
||
\series bold
|
||
quorums
|
||
\series default
|
||
for determining failures
|
||
\emph on
|
||
at cluster level
|
||
\emph default
|
||
solely, and then immediately executing failover of the cluster, sometimes
|
||
without clean architectural distinction between trigger and execution (similar
|
||
to the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
separation of concerns
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
between
|
||
\series bold
|
||
mechanism
|
||
\series default
|
||
and
|
||
\series bold
|
||
strategy
|
||
\series default
|
||
in Operating Systems).
|
||
Sometimes there is even no internal software layering / modularization
|
||
according to this separation of concerns at all.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
When there is no distinction between different levels of granularity, you
|
||
are hopelessly bound to a non-extensible and thus non-adaptable system
|
||
when you need to operate masses of clusters.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/MatieresCorrosives.png
|
||
lyxscale 50
|
||
scale 17
|
||
|
||
\end_inset
|
||
|
||
A lacking distinction between automatic mode and manual mode, and/or lack
|
||
of corresponding
|
||
\series bold
|
||
architectural software layers
|
||
\series default
|
||
is not only a blatant disregard of well-established best practices of
|
||
|
||
\series bold
|
||
software engineering
|
||
\series default
|
||
, but will bind you even more firmly to an inflexible system.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
Terminology: for practical reasons, we use the general term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
clustermanager
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
also for speaking about layers dealing with higher granularity, such as
|
||
datacenter layers, and also for long-distance replication scenarios, although
|
||
some terminology from grid computing would be more appropriate in a scientific
|
||
background.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Please consider the following: when it comes to long-distance HA, the above
|
||
layering architecture is also motivated by vastly different numbers of
|
||
instances for each layer.
|
||
Ideally, the topmost automatics layer should be able to overview several
|
||
datacenters in parallel, in order to cope with (almost) global network
|
||
problems such as network partitions.
|
||
Additionally, it should also detect single cluster failures, or intermediate
|
||
problems like
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
rack failure
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
room failure
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, as well as various types of (partial / intermediate) (replication) network
|
||
failures.
|
||
Incompatible decisions at each of the different granularities would be
|
||
a no-go in practice.
|
||
Somewhere and somehow, you need one single
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
If you have
|
||
\emph on
|
||
logical pairs of datacenters
|
||
\emph default
|
||
which are firmly bound together, you could also have several topmost automatics
|
||
instances, e.g.
|
||
for each
|
||
\emph on
|
||
pair
|
||
\emph default
|
||
of datacenters.
|
||
However, that would be very
|
||
\series bold
|
||
inflexible
|
||
\series default
|
||
, because then you cannot easily mix locations or migrate your servers between
|
||
datacenters.
|
||
Using
|
||
\begin_inset Formula $k>2$
|
||
\end_inset
|
||
|
||
replicas with MARS would also become a nightmare.
|
||
In your own interest, please don't create any concepts where masses of
|
||
hardware are firmly bound to fixed constants at some software layers.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
top-most
|
||
\emph on
|
||
logical
|
||
\emph default
|
||
problem detection / ranking instance, which should be
|
||
\emph on
|
||
internally distributed
|
||
\emph default
|
||
of course, typically using some
|
||
\series bold
|
||
distributed consensus protocol
|
||
\series default
|
||
; but in contrast to many published distributed consensus algorithms it
|
||
should be able to work with multiple granularities at the same time.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Methods and their Appropriateness
|
||
\end_layout
|
||
|
||
\begin_layout Subsubsection
|
||
Failover Methods
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Failover-Methods"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Failover methods are only needed in case of an incident.
|
||
They should not be used for regular handover.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
STONITH-like Methods
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
STONITH = Shoot The Other Node In The Head
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
These methods are widely known, although they have several serious drawbacks.
|
||
Some people even believe that
|
||
\emph on
|
||
any
|
||
\emph default
|
||
clustermanager must
|
||
\emph on
|
||
always
|
||
\emph default
|
||
have some STONITH-like functionality.
|
||
This is wrong.
|
||
There
|
||
\emph on
|
||
exist
|
||
\emph default
|
||
alternatives, as shown in the next paragraph.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The most obvious drawback is that STONITH will always create a
|
||
\series bold
|
||
damage
|
||
\series default
|
||
, by definition.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Example: a typical contemporary STONITH implementation uses IPMI for automatical
|
||
ly powering off your servers, or at least pushes the (virtual) reset button.
|
||
This will
|
||
\emph on
|
||
always
|
||
\emph default
|
||
create a certain type of damage: the affected systems will definitely not
|
||
be available, at least for some time until they have been (manually) rebooted.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This is a conceptual contradiction: the reason for starting failover is
|
||
that you want to restore availability as soon as possible, but in order
|
||
to do so you will first
|
||
\emph on
|
||
destroy
|
||
\emph default
|
||
the availability of a particular
|
||
\emph on
|
||
component
|
||
\emph default
|
||
.
|
||
This may be counter-productive.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Example: when your hot standby node B does not work as expected, or if it
|
||
works even
|
||
\emph on
|
||
worse
|
||
\emph default
|
||
than A before, you will lose some time until you
|
||
\emph on
|
||
can
|
||
\emph default
|
||
become operational again at the old side A.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Here is an example method for handling a failure scenario.
|
||
The old active side A is assumed to be no longer healthy.
|
||
The method uses a sequential state transition chain with a STONITH-like
|
||
step:
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Phase1 Check whether the hot standby B is currently usable.
|
||
If this is violated (which may happen during certain types of disasters),
|
||
abort the failover for any affected resources.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Phase2
|
||
\emph on
|
||
Try
|
||
\emph default
|
||
to shutdown the damaged side A (in the
|
||
\emph on
|
||
hope
|
||
\emph default
|
||
that there is no
|
||
\emph on
|
||
serious
|
||
\emph default
|
||
damage).
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Phase3 In case phase2 did not work during a grace period / after a timeout,
|
||
assume that A is badly damaged and therefore STONITH it.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Phase4 Start the application at the hot standby B.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Notice: any cleanup actions, such as
|
||
\series bold
|
||
repair
|
||
\series default
|
||
of defective hard- or software etc, are outside the scope of failover processes.
|
||
Typically, they are executed much later when restoring redundancy.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Also notice: this method is a
|
||
\emph on
|
||
heavily
|
||
\emph default
|
||
distributed one, in the sense that sequential actions are alternated multiple
|
||
times on different hosts.
|
||
This is known to be cumbersome in distributed systems, in particular in
|
||
presence of network problems.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "Phase4-in-more"
|
||
|
||
\end_inset
|
||
|
||
Phase4 in more detail for DRBD, augmented with some pseudo code for application
|
||
control:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
drbdadm disconnect all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
drbdadm primary --force all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
applicationmanager start all
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The same phase4 using MARS:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
marsadm pause-fetch all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
marsadm primary --force all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
applicationmanager start all
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This sequential 4-phase method is far from optimal, for the following reasons:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
The method tries to handle both failover and handover scenarios with one
|
||
single sequential recipe.
|
||
In case of a true failover scenario where it is
|
||
\emph on
|
||
already known for sure
|
||
\emph default
|
||
that side A is badly damaged, this method will unnecessarily waste time
|
||
for phase 2.
|
||
This could be fixed by introduction of a conceptual distinction between
|
||
handover and failover, but it would not fix the following problems.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Before phase4 is started (which will re-establish the service from a user's
|
||
perspective), a lot of time is wasted by
|
||
\emph on
|
||
both
|
||
\emph default
|
||
phases 2
|
||
\emph on
|
||
and
|
||
\emph default
|
||
3.
|
||
Even if phase 2 would be skipped, phase 3 would unnecessarily cost some
|
||
time.
|
||
In the next paragraph, an alternative method is explained which eliminates
|
||
any unnecessary waiting time at all.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
The above method is adapted to the shared-disk model.
|
||
It does not take advantage of the shared-nothing model, where further possibili
|
||
ties for better solutions exist.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
In case of long-distance network partitions and/or sysadmin / system management
|
||
subnetwork outages, you may not even be able to (remotely) start STONITH
|
||
at all.
|
||
Thus the above method misses an important failure scenario.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Some people seem to have a
|
||
\emph on
|
||
binary
|
||
\emph default
|
||
view of the healthiness of a system: in their view, a system is either
|
||
operational, or it is damaged.
|
||
This kind of view is ignoring the fact that some systems may be half-alive,
|
||
showing only
|
||
\emph on
|
||
minor
|
||
\emph default
|
||
problems, or problems occurring only from time to time.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
It is obvious that damaging a healthy system is a bad idea by itself.
|
||
Even
|
||
\emph on
|
||
generally
|
||
\emph default
|
||
damaging a half-alive system in order to
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
fix
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
problems is not generally a good idea, because it may increase the damage
|
||
when you don't know the
|
||
\emph on
|
||
real
|
||
\emph default
|
||
reason
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Example, occurring in masses: an incorrectly installed bootloader, or a
|
||
wrong BIOS boot priority order which unexpectedly leads to hangs or infinite
|
||
reboot cycles once the DHCP or BOOTP servers are no longer available /
|
||
reachable.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Even worse: in a distributed system
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice: the STONITH concept is more or less associated with short-distance
|
||
scenarios where
|
||
\series bold
|
||
crossover cables
|
||
\series default
|
||
or similar equipment are used.
|
||
The assumption is that crossover cables can't go defective, or at least
|
||
it would be an extremely unlikely scenario.
|
||
For long-distance replication, this assumption is simply not true.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
you sometimes
|
||
\emph on
|
||
cannot(!)
|
||
\emph default
|
||
know whether a system is healthy, or to what degree it is healthy.
|
||
Typical STONITH methods as used in some contemporary clustermanagers are
|
||
|
||
\series bold
|
||
assuming a worst case
|
||
\series default
|
||
, even if that worst case is currently not for real.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Therefore, avoid the following
|
||
\series bold
|
||
fundamental flaws
|
||
\series default
|
||
in failover concepts and healthiness models, which apply to implementors
|
||
/ configurators of clustermanagers:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Don't mix up knowledge with conclusions about a (sub)system, and also don't
|
||
mix this up with the real state of that (sub)system.
|
||
In reality, you don't have any knowledge about a complex distributed system.
|
||
You only may have
|
||
\emph on
|
||
some
|
||
\emph default
|
||
knowledge about
|
||
\emph on
|
||
some
|
||
\emph default
|
||
parts of the system, but you cannot
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
see
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
a complex distributed system as a whole.
|
||
What you think is your knowledge, isn't knowledge in reality: in many cases,
|
||
it is
|
||
\emph on
|
||
conclusion
|
||
\emph default
|
||
, not knowledge.
|
||
Don't mix this up!
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Some systems are more complex than your model of them.
|
||
Don't neglect important parts (such as networks, routers, switches, cables,
|
||
plugs) which may lead you to wrong conclusions!
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Don't restrict your mind to boolean models of healthiness.
|
||
Doing so can easily create unnecessary damage by construction, and even
|
||
at concept level.
|
||
You should know from software engineering that defects in concepts or models
|
||
are much more serious than simple bugs in implementations.
|
||
Choosing the wrong model cannot be fixed as easily as a typical bug or
|
||
a typo.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Try to deduce the state of a system as
|
||
\series bold
|
||
reliably
|
||
\series default
|
||
as possible.
|
||
If you don't know something for sure, don't generally assume that it has
|
||
gone wrong.
|
||
Don't confuse missing knowledge with the conclusion that something is bad.
|
||
Boolean algebra restricts your mind to either
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
good
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
bad
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
Use at least
|
||
\series bold
|
||
tri-state algebra
|
||
\series default
|
||
which has a means for expressing
|
||
\series bold
|
||
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
unknown
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
|
||
\series default
|
||
.
|
||
Even better: attach a probability to anything you (believe to) know.
|
||
Errare humanum est: nothing is absolutely sure.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Oversimplification: don't report an
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
unknown
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or even a
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
broken
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
state for a complex system whenever a smaller subsystem exists for which
|
||
you have some knowledge (or you can conclude something about it with reasonable
|
||
evidence).
|
||
Otherwise, your users / sysadmins may draw wrong conclusions, and assume
|
||
that the whole system is broken, while in reality only some minor part
|
||
has some minor problem.
|
||
Users could then likely make wrong decisions, which may then easily lead
|
||
to bigger damages.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Murphy's law:
|
||
\series bold
|
||
never assume that something can't go wrong!
|
||
\series default
|
||
Doing so is a blatant misconception at topmost level: the
|
||
\emph on
|
||
purpose
|
||
\emph default
|
||
of a clustermanager is creating High Availability (HA) out of more or less
|
||
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
unreliable
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
components.
|
||
It is the damn duty of both a clustermanager and its configurator to try
|
||
to compensate
|
||
\emph on
|
||
any
|
||
\emph default
|
||
failures,
|
||
\emph on
|
||
regardless of their probability
|
||
\emph default
|
||
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Never claim that something has only low probability (and that it is therefore
|
||
not relevant).
|
||
In the HA area, you simply
|
||
\series bold
|
||
cannot know
|
||
\series default
|
||
that, because you typically have
|
||
\emph on
|
||
sporadic
|
||
\emph default
|
||
incidents.
|
||
In extreme cases, the
|
||
\emph on
|
||
purpose
|
||
\emph default
|
||
of your HA solution is protection against 1 failure per 10 years.
|
||
You simply don't have the time to wait for creating incident statistics
|
||
about that!
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
, as best as possible.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Never confuse
|
||
\series bold
|
||
probability
|
||
\series default
|
||
with
|
||
\series bold
|
||
expectancy value!
|
||
\series default
|
||
If you don't know the mathematical term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
expectancy value
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, or if you don't know what this means
|
||
\emph on
|
||
in practice
|
||
\emph default
|
||
, don't take responsibility for millions of € or $.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When operating masses of hard- and software: never assume that a particular
|
||
failure can occur only at a low number of instances.
|
||
There are
|
||
\series bold
|
||
\emph on
|
||
unknown(!)
|
||
\emph default
|
||
systematic errors
|
||
\series default
|
||
which may pop up at the wrong time and in huge masses when you don't expect
|
||
them.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Multiple layers of fallback:
|
||
\emph on
|
||
any
|
||
\emph default
|
||
action can fail.
|
||
Be prepared to have a plan B, and even a plan C, and even better a plan
|
||
D, wherever possible.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Never increase any damage anywhere, unnecessarily! Always try to
|
||
\emph on
|
||
minimize
|
||
\emph default
|
||
any damage! It can be mathematically proven that in deterministic probabilistic
|
||
systems having finite state, increases of a damage level
|
||
\emph on
|
||
at the wrong place
|
||
\emph default
|
||
will
|
||
\emph on
|
||
introduce
|
||
\emph default
|
||
an
|
||
\emph on
|
||
additional
|
||
\emph default
|
||
|
||
\emph on
|
||
risk
|
||
\emph default
|
||
of getting into an
|
||
\series bold
|
||
endless loop
|
||
\series default
|
||
.
|
||
This is also true for nondeterministic systems, as known from formal language
|
||
theory
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Finite automatons are known to be transformable to deterministic ones, usually
|
||
by an exponential increase in the number of states.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Use the
|
||
\series bold
|
||
best effort principle
|
||
\series default
|
||
.
|
||
You should be aware of the following fact: in general, it is impossible
|
||
to create an
|
||
\emph on
|
||
absolutely reliable system
|
||
\emph default
|
||
out of unreliable components.
|
||
You can
|
||
\emph on
|
||
lower
|
||
\emph default
|
||
the risk of failures to any
|
||
\begin_inset Formula $\epsilon>0$
|
||
\end_inset
|
||
|
||
by investing a lot of resources and of money, but whatever you do:
|
||
\begin_inset Formula $\epsilon=0$
|
||
\end_inset
|
||
|
||
is impossible.
|
||
Therefore, be careful with boolean algebra.
|
||
Prefer approximation methods / optimizing methods instead.
|
||
Always do
|
||
\emph on
|
||
your
|
||
\emph default
|
||
best, instead of trying to reach a
|
||
\emph on
|
||
global
|
||
\emph default
|
||
optimum which likely does not exist at all (because the
|
||
\begin_inset Formula $\epsilon$
|
||
\end_inset
|
||
|
||
can only
|
||
\emph on
|
||
converge
|
||
\emph default
|
||
to an optimum, but will never actually reach it).
|
||
The best effort principle means the following: if you discover a method
|
||
for improving your operating state by reduction of a (potential) damage
|
||
in a reasonable time and with reasonable effort, then
|
||
\series bold
|
||
simply do it
|
||
\series default
|
||
.
|
||
Don't argue that a particular step is no 100% solution for all of your
|
||
problems.
|
||
|
||
\emph on
|
||
Any
|
||
\emph default
|
||
|
||
\emph on
|
||
improvement
|
||
\emph default
|
||
is valuable.
|
||
|
||
\series bold
|
||
Don't miss any valuable step
|
||
\series default
|
||
having reasonable costs with respect to your budget.
|
||
Missing valuable measures which have low costs is certainly a violation
|
||
of the best effort principle, because you are not doing
|
||
\emph on
|
||
your
|
||
\emph default
|
||
best.
|
||
Keep that in mind.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
If you have
|
||
\emph on
|
||
understood
|
||
\emph default
|
||
this (e.g.
|
||
deeply think at least one day about it), you will no longer advocate STONITH
|
||
methods
|
||
\emph on
|
||
in general
|
||
\emph default
|
||
, when there are alternatives.
|
||
STONITH methods are only valuable when you
|
||
\emph on
|
||
know in advance
|
||
\emph default
|
||
that the final outcome (after reboot) will most likely be better, and that
|
||
waiting for reboot will most likely
|
||
\emph on
|
||
pay off
|
||
\emph default
|
||
.
|
||
In general, this condition is
|
||
\emph on
|
||
not true
|
||
\emph default
|
||
if you have a healthy hot standby system.
|
||
This should be easy to see.
|
||
But there exist well-known clustermanager solutions / configurations blatantly
|
||
ignoring
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
For some
|
||
\emph on
|
||
special(!)
|
||
\emph default
|
||
cases of the shared-disk model, there exist some justifications for doing
|
||
STONITH
|
||
\emph on
|
||
before
|
||
\emph default
|
||
starting the application at the hot standby.
|
||
Under certain circumstances, it can happen that system A running amok could
|
||
destroy the data on your single shared disk (example: a filesystem doubly
|
||
mounted
|
||
\emph on
|
||
in parallel
|
||
\emph default
|
||
, which will certainly destroy your data, unless you are using
|
||
\family typewriter
|
||
ocfs2
|
||
\family default
|
||
or suchalike).
|
||
This argument is only valid for
|
||
\emph on
|
||
passive
|
||
\emph default
|
||
disks which are
|
||
\emph on
|
||
directly
|
||
\emph default
|
||
attached to
|
||
\emph on
|
||
both
|
||
\emph default
|
||
systems A and B, such that there is no
|
||
\emph on
|
||
external
|
||
\emph default
|
||
means for fencing the disk.
|
||
In case of iSCSI running over ordinary network equipment such as routers
|
||
or switches, the argument
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
fencing the disk is otherwise not possible
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
does not apply.
|
||
You can interrupt the iSCSI connection at the network gear, or you can often
|
||
do it at cluster A or at the iSCSI target.
|
||
Even commercial storage appliances speaking iSCSI can be remotely controlled
|
||
for forcefully aborting iSCSI sessions.
|
||
In modern times, the STONITH method no longer has such a justification.
|
||
The justification stems from ancient times when a disk was a purely passive
|
||
mechanical device, and its disk controller was part of the server system.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
this.
|
||
Only when the former standby system does not work as expected (this means
|
||
that
|
||
\emph on
|
||
all
|
||
\emph default
|
||
of your redundant systems are not healthy enough for your application),
|
||
|
||
\emph on
|
||
only then
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Notice that STONITH may be needed for (manual or partially automatic)
|
||
\emph on
|
||
repair
|
||
\emph default
|
||
in some cases, e.g.
|
||
when you know that a system has a kernel crash.
|
||
Don't mix up the repair phase with failover or handover phases.
|
||
Typically, they are executed at different times.
|
||
The repair phase is outside the scope of this section.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
|
||
\emph default
|
||
STONITH is inevitable as a
|
||
\emph on
|
||
last resort
|
||
\emph default
|
||
option.
|
||
\begin_inset Newline newline
|
||
\end_inset
|
||
|
||
In short: blindly using STONITH without true need during failover is a violation
|
||
of the best effort principle.
|
||
You are simply not doing your best.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
When your budget is limited, carefully select those improvements which make
|
||
your system
|
||
\series bold
|
||
as reliable as possible
|
||
\series default
|
||
, given your fixed budget.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Create statistics on the duration of your actions.
|
||
Based on this, try to get a
|
||
\emph on
|
||
balanced
|
||
\emph default
|
||
optimum between time and costs.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Whatever actions you can
|
||
\series bold
|
||
start in parallel
|
||
\series default
|
||
for saving time, do it.
|
||
Otherwise you are disregarding the best effort principle, and your solution
|
||
will be sub-optimal.
|
||
You will require deep knowledge of parallel systems, as well as experience
|
||
with dealing with problems like (distributed) races.
|
||
Notice that
|
||
\emph on
|
||
any
|
||
\emph default
|
||
distributed system is
|
||
\emph on
|
||
inherently parallel
|
||
\emph default
|
||
.
|
||
Don't believe that sequential methods can deliver an optimum solution in
|
||
such a difficult area.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
If you don't have the
|
||
\series bold
|
||
necessary skills
|
||
\series default
|
||
for (a) recognizing already existing parallelism, (b) dealing with parallelism
|
||
at concept level, (c) programming and/or configuring parallelism race-free
|
||
and deadlock-free (or if you even don't know what a race condition is and
|
||
where it may occur in practice), then don't take responsibility for millions
|
||
of € or $.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Avoid hard timeouts wherever possible.
|
||
Use
|
||
\series bold
|
||
adaptive timeouts
|
||
\series default
|
||
instead.
|
||
Reason: depending on hardware or workload, the same action A may take a
|
||
very short time on cluster 1, but take a very long time on cluster 2.
|
||
If you need to guard action A from hanging (which is almost always the
|
||
case because of Murphy's law), don't configure any fixed timeout for it.
|
||
When having several hundreds of clusters, you would need to use the
|
||
\emph on
|
||
worst case value
|
||
\emph default
|
||
, which is the longest time occurring somewhere at the very slow clusters
|
||
/ slow parts of the network.
|
||
This wastes a lot of time in case one of the fast clusters is hanging.
|
||
Adaptive timeouts work differently: they use a kind of
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
progress bar
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
to monitor the
|
||
\emph on
|
||
progress
|
||
\emph default
|
||
of an action.
|
||
They will abort only if there is
|
||
\emph on
|
||
no progress
|
||
\emph default
|
||
for a certain amount of time.
|
||
Hint: among others,
|
||
\family typewriter
|
||
marsadm view-*-rest
|
||
\family default
|
||
commands or macros are your friend.
|
||
\end_layout
|
||
|
||
\begin_layout Paragraph
|
||
ITON = Ignore The Other Node
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This means
|
||
\series bold
|
||
fencing from application traffic
|
||
\series default
|
||
, and can be used as an alternative to STONITH when done properly.
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/fencing-hierarchy.fig
|
||
width 60col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Fencing from application traffic is best suited for the shared-nothing model,
|
||
but can also be adapted to the shared-disk model with some quirks.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The idea is simple: always route your application network traffic to the
|
||
current (logically) active side, whether it is currently A or B.
|
||
Just don't route any application requests to the current (logically) passive
|
||
side at all.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
For failover (and
|
||
\emph on
|
||
only
|
||
\emph default
|
||
for that), you
|
||
\emph on
|
||
should not care about
|
||
\emph default
|
||
any split brain occurring at the low-level generic block device:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/split-brain-history.fig
|
||
width 50col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Although having a split brain at the generic low-level block device, you
|
||
now define the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
logically active
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
and
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
logically passive
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
side by yourself by
|
||
\emph on
|
||
logically ignoring
|
||
\emph default
|
||
the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
wrong
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
side as defined by yourself:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\align center
|
||
\begin_inset Graphics
|
||
filename images/split-brain-resolved.fig
|
||
width 50col%
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
This is possible because the generic block devices provided by DRBD or MARS
|
||
are completely
|
||
\series bold
|
||
agnostic
|
||
\series default
|
||
of the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
meaning
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
of either version A or B.
|
||
Higher levels such as clustermanagers (or humans like sysadmins) can assign
|
||
them a meaning like
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
relevant
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
not relevant
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
logically active
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
or
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
logically passive
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
As a result of fencing from application traffic, the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
logically passive
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
side will
|
||
\emph on
|
||
logically
|
||
\emph default
|
||
cease any actions such as updating user data, even if it is
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
physically active
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
during split-brain (when two primaries exist in DRBD or MARS sense
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
Hint: some clustermanagers and/or some people seem to define the term
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
split-brain
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
differently from DRBD or MARS.
|
||
In the context of generic block devices, split brain means that the
|
||
\emph on
|
||
history
|
||
\emph default
|
||
of both versions has been split to a Y-like
|
||
\series bold
|
||
fork
|
||
\series default
|
||
(for whatever reason), such that re-joining them
|
||
\emph on
|
||
incrementally
|
||
\emph default
|
||
by ordinary write operations is no longer guaranteed to be possible.
|
||
As a slightly simplified definition, you might alternatively use the definition
|
||
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
two incompatible primaries are existing in parallel
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
, which means almost the same in practice.
|
||
Details of formal semantics are outside the scope of this treatment.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
If you already have some load balancing, or BGP, or another
|
||
\emph on
|
||
mechanism
|
||
\emph default
|
||
for dynamic routing, you already have an important part for the ITON method.
|
||
Additionally, ensure by an appropriate
|
||
\emph on
|
||
strategy
|
||
\emph default
|
||
that your balancer status / BGP announcement etc does always coincide with
|
||
the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
logically active
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
side (recall that even during split-brain
|
||
\emph on
|
||
you
|
||
\emph default
|
||
must define
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
logically active
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
|
||
\series bold
|
||
uniquely
|
||
\series default
|
||
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
A possible strategy is to use a Lamport clock for route changes: the change
|
||
with the most recent Lamport timestamp will always win over previous changes.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
by yourself).
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Example:
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Phase1 Check whether the hot standby B is currently usable.
|
||
If this is violated (which may happen during certain types of disasters),
|
||
abort the failover for any affected resources.
|
||
\end_layout
|
||
|
||
\begin_layout Description
|
||
Phase2 Do the following
|
||
\emph on
|
||
in parallel
|
||
\begin_inset Foot
|
||
status open
|
||
|
||
\begin_layout Plain Layout
|
||
For database applications where no transactions should get lost, you should
|
||
slightly modify the order of operations: first fence the old side A, then
|
||
start the application at standby side B.
|
||
However, be warned that even this cannot guarantee that no transaction
|
||
is lost.
|
||
When the network between A and B is interrupted
|
||
\emph on
|
||
before
|
||
\emph default
|
||
the incident happens, DRBD will automatically disconnect, and MARS will
|
||
show a lagbehind.
|
||
In order to fully eliminate this possibility, you can either use DRBD and
|
||
configure it to hang forever during network outages (such that users will
|
||
be unable to commit any transactions at all), or you can use the shared-disk
|
||
model instead.
|
||
But in the latter case, you are introducing a SPOF at the single shared
|
||
disk.
|
||
The former case is logically almost equivalent to shared-disk, but avoiding
|
||
some parts of the physical SPOF.
|
||
In a truly distributed system, the famous CAP theorem is limiting your
|
||
possibilities.
|
||
Therefore, no general solution exists fulfilling all requirements at the
|
||
same time.
|
||
\end_layout
|
||
|
||
\end_inset
|
||
|
||
:
|
||
\begin_inset Separator latexpar
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_deeper
|
||
\begin_layout Itemize
|
||
Start all affected applications at the hot standby B.
|
||
This can be done with the same DRBD or MARS procedure as described
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vpageref
|
||
reference "Phase4-in-more"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Fence A by fixedly routing all affected application traffic to B.
|
||
\end_layout
|
||
|
||
\end_deeper
|
||
\begin_layout Standard
|
||
That's all that has to be done for a shared-nothing model.
|
||
Of course, this will likely produce a split-brain (even when using DRBD
|
||
in place of MARS), but that will not matter from a user's perspective,
|
||
because the users will no longer
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
see
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
the
|
||
\begin_inset Quotes eld
|
||
\end_inset
|
||
|
||
logically passive
|
||
\begin_inset Quotes erd
|
||
\end_inset
|
||
|
||
side A through their network.
|
||
Only during the relatively small time period where application traffic
|
||
was going to the old side A while not replicated to B due to the incident,
|
||
a very small number of updates
|
||
\emph on
|
||
could
|
||
\emph default
|
||
have been lost.
|
||
In fields like webhosting, this is taken into account.
|
||
Users will usually not complain when some (smaller amount of) data is lost
|
||
due to split-brain.
|
||
They will complain when the service is unavailable.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This method is the fastest for restoring availability, because it doesn't
|
||
try to execute any (remote) action at side A.
|
||
Only from a sysadmin's perspective, there remain some cleanup tasks to
|
||
be done during the following repair phase, such as split-brain resolution,
|
||
which are outside the scope of this treatment.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
By running the application fencing step
|
||
\emph on
|
||
sequentially
|
||
\emph default
|
||
(including wait for its partial successfulness such that the old side A
|
||
can no longer be reached by any users) in front of the failover step, you
|
||
may minimize the amount of lost data, but at the cost of total duration.
|
||
Your service will take longer to be available again, while the amount of
|
||
lost data is typically somewhat smaller.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
\begin_inset Graphics
|
||
filename images/lightbulb_brightlit_benj_.png
|
||
lyxscale 12
|
||
scale 7
|
||
|
||
\end_inset
|
||
|
||
A few people might clamour when some data is lost.
|
||
In long-distance replication scenarios with high update traffic, there
|
||
is
|
||
\emph on
|
||
simply no way at all
|
||
\emph default
|
||
for guaranteeing that no data can be lost ever.
|
||
According to the laws of Einstein and the laws of Distributed Systems like
|
||
the famous CAP theorem, this isn't the fault of DRBD+proxy or MARS, but
|
||
simply the
|
||
\emph on
|
||
consequence
|
||
\emph default
|
||
of having long distances.
|
||
If you want to protect against data loss as best as possible, then don't
|
||
use
|
||
\begin_inset Formula $k=2$
|
||
\end_inset
|
||
|
||
replicas.
|
||
Use
|
||
\begin_inset Formula $k\geq4$
|
||
\end_inset
|
||
|
||
, and spread them over different distances, such as mixed small + medium
|
||
+ long distances.
|
||
Future versions of MARS will support adaptive pseudo-synchronous modes,
|
||
which will allow individual adaptation to network latencies / distances.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The ITON method can be adapted to shared-disk by additionally fencing the
|
||
common disk from the (presumably) failed cluster node A.
|
||
\end_layout
|
||
|
||
\begin_layout Subsubsection
|
||
Handover Methods
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Planned handover is conceptually simpler, because both sides must be (almost)
|
||
healthy as a
|
||
\emph on
|
||
precondition
|
||
\emph default
|
||
.
|
||
There are simply no pre-existing failures to deal with.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Here is an example using DRBD, some application commands denoted as pseudo
|
||
code:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side A:
|
||
\family typewriter
|
||
applicationmanager stop all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side A:
|
||
\family typewriter
|
||
drbdadm secondary all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
drbdadm primary all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
applicationmanager start all
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
MARS already has a conceptual distinction between handover and failover.
|
||
With MARS, it becomes even simpler, because a generic handover procedure
|
||
is already built in:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side A:
|
||
\family typewriter
|
||
applicationmanager stop all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
marsadm primary all
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
at side B:
|
||
\family typewriter
|
||
applicationmanager start all
|
||
\end_layout
|
||
|
||
\begin_layout Subsubsection
|
||
Hybrid Methods
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In general, a planned handover may fail at any stage.
|
||
Notice that such a failure is also a failure, but (partially) caused by
|
||
the planned handover.
|
||
You have the following alternatives for automatically dealing with such
|
||
cases:
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
In case of a failure, switch back to the old side A.
|
||
\end_layout
|
||
|
||
\begin_layout Enumerate
|
||
Instead, forcefully switch to the new side B, similar to the methods described
|
||
in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand ref
|
||
reference "subsec:Failover-Methods"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Similar options exist for a failed failover (at least in theory), but chances
|
||
are lower for actually recovering if you have only
|
||
\begin_inset Formula $k=2$
|
||
\end_inset
|
||
|
||
replicas in total.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Whatever you decide to do in what case in whatever priority order, whether
|
||
you decide it in advance or during the course of a failing action: it simply
|
||
means that according to the best effort principle, you should
|
||
\series bold
|
||
never leave your system in a broken state
|
||
\series default
|
||
when there exists a chance to recover availability with any method.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Therefore, you should
|
||
\emph on
|
||
implement
|
||
\emph default
|
||
neither handover nor failover in their pure forms.
|
||
Always implement hybrid forms following the best effort principle.
|
||
\end_layout
|
||
|
||
\begin_layout Subsection
|
||
Special Requirements for Long Distances
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "subsec:Special-Requirements-for"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
Most contemporary clustermanagers have been constructed for short distance
|
||
shared-nothing clusters, or even for
|
||
\emph on
|
||
local
|
||
\emph default
|
||
shared-nothing clusters (cf.
|
||
DRBD over crossover cables), or even for shared-disk clusters (
|
||
\emph on
|
||
originally
|
||
\emph default
|
||
, when their
|
||
\emph on
|
||
concepts
|
||
\emph default
|
||
were developed).
|
||
Blindly using them for long-distance replication without modification /
|
||
adaptation bears some additional risks.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Notice that long-distance replication always
|
||
\emph on
|
||
requires
|
||
\emph default
|
||
a
|
||
\series bold
|
||
shared-nothing
|
||
\series default
|
||
model.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
As a consequence,
|
||
\series bold
|
||
split brain
|
||
\series default
|
||
can appear
|
||
\emph on
|
||
regularly
|
||
\emph default
|
||
during failover.
|
||
There is no way for preventing it! This is an
|
||
\emph on
|
||
inherent property
|
||
\emph default
|
||
of distributed systems, not limited to MARS (e.g.
|
||
also occurring with DRBD if you try to use it over long distances).
|
||
Therefore, you
|
||
\emph on
|
||
must
|
||
\emph default
|
||
deal with occurrences of split-brain as a
|
||
\emph on
|
||
requirement
|
||
\emph default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
The probability of
|
||
\series bold
|
||
network partitions
|
||
\series default
|
||
is much higher: although you should have been required by Murphy's law
|
||
to deal with network partitions already in short-distance scenarios, it
|
||
now becomes
|
||
\emph on
|
||
mandatory
|
||
\emph default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
Be prepared that in case of certain types of (more or less global) internet
|
||
partitions, you may not be able to trigger STONITH actions
|
||
\emph on
|
||
at all
|
||
\emph default
|
||
.
|
||
Therefore,
|
||
\series bold
|
||
fencing of application traffic
|
||
\series default
|
||
is
|
||
\emph on
|
||
mandatory
|
||
\emph default
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Chapter
|
||
\start_of_appendix
|
||
Mathematical Model of Architectural Reliability
|
||
\begin_inset CommandInset label
|
||
LatexCommand label
|
||
name "chap:Mathematical-Model-of"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
The assumptions used in the model are explained in detail in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sub:Detailed-explanation"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
Here is a quick recap of the main parameters:
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
is the number of basic storage units.
|
||
It is also used for the number of application units, assumed to be the
|
||
same.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
is the replication degree, or number of replicas.
|
||
In general, you will have to deploy
|
||
\begin_inset Formula $N=k*n$
|
||
\end_inset
|
||
|
||
storage servers for getting
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
basic storage units.
|
||
This applies to any of the competing architectures.
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
\begin_inset Formula $s$
|
||
\end_inset
|
||
|
||
is the architecture-dependent spread exponent: it tells whether a storage
|
||
incident will spread to the application units.
|
||
Examples:
|
||
\begin_inset Formula $s=0$
|
||
\end_inset
|
||
|
||
means that there is no spread between storage unit failures and application
|
||
unit failures, other than a local 1:1 one.
|
||
|
||
\begin_inset Formula $s=1$
|
||
\end_inset
|
||
|
||
means that an uncompensated storage node incident will cause
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
application incidents.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
\begin_inset Formula $p$
|
||
\end_inset
|
||
|
||
is the probability of a storage server incident.
|
||
In the examples in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sec:Reliability-Arguments-from"
|
||
|
||
\end_inset
|
||
|
||
, a fixed
|
||
\begin_inset Formula $p=0.0001$
|
||
\end_inset
|
||
|
||
was used for easy understanding, but the following formulae should also
|
||
hold for any other
|
||
\begin_inset Formula $p\in(0,1)$
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Itemize
|
||
\begin_inset Formula $T$
|
||
\end_inset
|
||
|
||
is the observational period, introduced for convenience of understanding.
|
||
The following can also be computed independently from any
|
||
\begin_inset Formula $T$
|
||
\end_inset
|
||
|
||
, as long as the probability
|
||
\begin_inset Formula $p$
|
||
\end_inset
|
||
|
||
does not change over time, which is assumed.
|
||
Because
|
||
\begin_inset Formula $T$
|
||
\end_inset
|
||
|
||
is only here for convenience of understanding, we set it to
|
||
\begin_inset Formula $T=1/p$
|
||
\end_inset
|
||
|
||
.
|
||
In the examples from section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sub:Detailed-explanation"
|
||
|
||
\end_inset
|
||
|
||
, a fixed
|
||
\begin_inset Formula $T=10{,}000$
|
||
\end_inset
|
||
|
||
hours was used.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Formula for DRBD / MARS
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
We need not discriminate between a storage failure probability S and an applicati
|
||
on failure probability A because applications are run locally at the storage
|
||
servers 1:1.
|
||
The probability for failure of a single shard consisting of
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
nodes is
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
A_{p}(k)=p^{k}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
because all
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
shard members have to be down all at the same time.
|
||
In section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sub:Detailed-explanation"
|
||
|
||
\end_inset
|
||
|
||
we assumed that there is no cross-communication between shards.
|
||
Therefore they are completely independent from each other, and the total
|
||
downtime of
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
shards during the observational period
|
||
\begin_inset Formula $T$
|
||
\end_inset
|
||
|
||
is
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
A_{p,T}(k,n)=T*n*p^{k}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
When introducing the spread exponent
|
||
\begin_inset Formula $s$
|
||
\end_inset
|
||
|
||
, the formula turns into
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
A_{s,p,T}(k,n)=T*n^{s+1}*p^{k}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Formula for Unweighted BigCluster
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
This is based on the Bernoulli formula.
|
||
The probability that exactly
|
||
\begin_inset Formula $\bar{k}$
|
||
\end_inset
|
||
|
||
storage nodes out of
|
||
\begin_inset Formula $N=k*n$
|
||
\end_inset
|
||
|
||
total storage nodes are down is
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
\bar{S}_{p}(\bar{k},N)=\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Similarly, the probability for getting
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
or more storage node failures (up to
|
||
\begin_inset Formula $N$
|
||
\end_inset
|
||
|
||
) at the same time is
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
S_{p}(k,N)=\sum_{\bar{k}=k}^{N}\bar{S}_{p}(\bar{k},N)=\sum_{\bar{k}=k}^{N}\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
By replacing
|
||
\begin_inset Formula $N$
|
||
\end_inset
|
||
|
||
with
|
||
\begin_inset Formula $k*n$
|
||
\end_inset
|
||
|
||
(for conversion of the x axis into basic storage units) and by introducing
|
||
|
||
\begin_inset Formula $T$
|
||
\end_inset
|
||
|
||
we get
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
S_{p,T}(k,n)=T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
For comparability with DRBDorMARS, we have to compute the application downtime
|
||
A instead of the storage downtime S, which depends on the spread exponent
|
||
|
||
\begin_inset Formula $s$
|
||
\end_inset
|
||
|
||
as follows:
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
A_{s,p,T}(k,n)=n^{s+1}*S_{p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Notice that at
|
||
\begin_inset Formula $s=0$
|
||
\end_inset
|
||
|
||
we have introduced a factor of
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
, which corresponds to the hashing effect (teardown of
|
||
\begin_inset Formula $n$
|
||
\end_inset
|
||
|
||
application instances by a single uncompensated storage incident) as described
|
||
in section
|
||
\begin_inset CommandInset ref
|
||
LatexCommand vref
|
||
reference "sub:Detailed-explanation"
|
||
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Section
|
||
Formula for SizeWeighted BigCluster
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
In contrast to the above, we need to introduce a correction factor by the
|
||
fraction of affected objects, relative to basic storage units.
|
||
Otherwise the y axis would not stay comparable due to different units.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
For the special case of
|
||
\begin_inset Formula $k=1$
|
||
\end_inset
|
||
|
||
, there is no difference to above.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
For the special case of
|
||
\begin_inset Formula $k=2$
|
||
\end_inset
|
||
|
||
replicas, the correction factor is
|
||
\begin_inset Formula $1/(N-1)$
|
||
\end_inset
|
||
|
||
, because we assume that all the replicas of the affected first node are
|
||
uniformly spread to all other nodes, of which there are
|
||
\begin_inset Formula $N-1$
|
||
\end_inset
|
||
|
||
.
|
||
The probability for hitting the intersection of the first node with the
|
||
second node is thus
|
||
\begin_inset Formula $1/(N-1)$
|
||
\end_inset
|
||
|
||
.
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
For higher values of
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
, and with a similar argument (never put another replica of the same object
|
||
onto the same storage node) we get the correction factor as
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
C(k,N)=\prod_{l=1}^{k-1}\frac{1}{N-l}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
Hint: there are at most
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
physical replicas on the disks.
|
||
For higher values of
|
||
\begin_inset Formula $\bar{k}\geq k$
|
||
\end_inset
|
||
|
||
, there are
|
||
\begin_inset Formula $\binom{\bar{k}}{k}$
|
||
\end_inset
|
||
|
||
combinations of object intersections (when assuming that the number of
|
||
objects on a node is very large, such that no further object repetition can
|
||
occur except for the
|
||
\begin_inset Formula $k$
|
||
\end_inset
|
||
|
||
-fold replica placement).
|
||
Thus the generalization to
|
||
\begin_inset Formula $\bar{k}\geq k$
|
||
\end_inset
|
||
|
||
is
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
C(k,\bar{k},N)=\binom{\bar{k}}{k}\prod_{l=1}^{k-1}\frac{1}{N-l}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\noindent
|
||
By inserting this into the above formula, we get
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset Formula
|
||
\[
|
||
A_{s,p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}C(k,\bar{k},k*n)*\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
|
||
\]
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\begin_layout Standard
|
||
\begin_inset CommandInset include
|
||
LatexCommand input
|
||
preview true
|
||
filename "common-back-matter.lyx"
|
||
|
||
\end_inset
|
||
|
||
|
||
\end_layout
|
||
|
||
\end_body
|
||
\end_document
|