mars/docu/architecture-guide-geo-redu...

52542 lines
975 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#LyX 2.3 created this file. For more info see http://www.lyx.org/
\lyxformat 544
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass scrreprt
\begin_preamble
\usepackage{listings}
\end_preamble
\options abstracton,most,usenames,dvipsnames
\use_default_options true
\begin_modules
customHeadersFooters
enumitem
fixltx2e
tcolorbox
\end_modules
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman "default" "default"
\font_sans "default" "default"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family rmdefault
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\use_microtype false
\use_dash_ligatures false
\graphics default
\default_output_format default
\output_sync 1
\bibtex_command default
\index_command default
\paperfontsize 10
\spacing single
\use_hyperref true
\pdf_title "Architecture Guide Geo-Redundancy"
\pdf_author "Thomas Schöbel-Theuer"
\pdf_bookmarks true
\pdf_bookmarksnumbered false
\pdf_bookmarksopen true
\pdf_bookmarksopenlevel 2
\pdf_breaklinks true
\pdf_pdfborder true
\pdf_colorlinks true
\pdf_backref section
\pdf_pdfusetitle true
\papersize a4paper
\use_geometry true
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 1
\use_minted 0
\index Index
\shortcut idx
\color #008000
\end_index
\leftmargin 3.7cm
\topmargin 2.7cm
\rightmargin 2.8cm
\bottommargin 2.3cm
\secnumdepth 3
\tocdepth 4
\paragraph_separation indent
\paragraph_indentation default
\is_math_indent 0
\math_numbering_side default
\quotes_style english
\dynamic_quotes 0
\papercolumns 1
\papersides 2
\paperpagestyle headings
\tracking_changes false
\output_changes true
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Title
Architecture Guide for Geo-Redundancy
\end_layout
\begin_layout Subtitle
Multiversion Asynchronous Replicated Storage
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/earth-mars-transfer.fig
width 70col%
\end_inset
\end_layout
\begin_layout Author
Thomas Schöbel-Theuer (
\family typewriter
tst@1und1.de
\family default
)
\end_layout
\begin_layout Date
Version 0.1a-143
\end_layout
\begin_layout Lowertitleback
\noindent
Copyright (C) 2013-16 Thomas Schöbel-Theuer
\begin_inset Newline newline
\end_inset
Copyright (C) 2013-16 1&1 Internet AG (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
http://www.1und1.de
\end_layout
\end_inset
shortly called 1&1 in the following).
\begin_inset Newline newline
\end_inset
\size footnotesize
Permission is granted to copy, distribute and/or modify this document under
the terms of the GNU Free Documentation License, Version 1.3 or any later
version published by the Free Software Foundation; with no Invariant Sections,
no Front-Cover Texts, and no Back-Cover Texts.
A copy of the license is included in the section entitled
\begin_inset Quotes eld
\end_inset
\begin_inset CommandInset ref
LatexCommand nameref
reference "chap:GNU-FDL"
\end_inset
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout New Color Box Type
\begin_inset Argument 2
status open
\begin_layout Plain Layout
cBoxA
\end_layout
\end_inset
\begin_inset Argument 3
status open
\begin_layout Plain Layout
1
\end_layout
\end_inset
\begin_inset Argument 4
status open
\begin_layout Plain Layout
\end_layout
\end_inset
\begin_inset Argument 1
status open
\begin_layout Plain Layout
auto counter, number within=chapter
\end_layout
\end_inset
enhanced,breakable,colback=green!5!white,title=Example~
\backslash
thetcbcounter : #1
\end_layout
\begin_layout New Color Box Type
\begin_inset Argument 2
status open
\begin_layout Plain Layout
cBoxB
\end_layout
\end_inset
\begin_inset Argument 3
status open
\begin_layout Plain Layout
1
\end_layout
\end_inset
\begin_inset Argument 4
status open
\begin_layout Plain Layout
\end_layout
\end_inset
\begin_inset Argument 1
status open
\begin_layout Plain Layout
auto counter, number within=chapter
\end_layout
\end_inset
enhanced,breakable,title=Details~
\backslash
thetcbcounter : #1
\end_layout
\begin_layout New Color Box Type
\begin_inset Argument 2
status open
\begin_layout Plain Layout
cBoxC
\end_layout
\end_inset
\begin_inset Argument 3
status open
\begin_layout Plain Layout
1
\end_layout
\end_inset
\begin_inset Argument 4
status open
\begin_layout Plain Layout
\end_layout
\end_inset
\begin_inset Argument 1
status open
\begin_layout Plain Layout
auto counter, number within=chapter
\end_layout
\end_inset
enhanced,breakable,colback=yellow!25!white,title=Manager~Hint~
\backslash
thetcbcounter : #1
\end_layout
\begin_layout New Color Box Type
\begin_inset Argument 2
status open
\begin_layout Plain Layout
cBoxD
\end_layout
\end_inset
\begin_inset Argument 3
status open
\begin_layout Plain Layout
1
\end_layout
\end_inset
\begin_inset Argument 4
status open
\begin_layout Plain Layout
\end_layout
\end_inset
\begin_inset Argument 1
status open
\begin_layout Plain Layout
auto counter, number within=chapter
\end_layout
\end_inset
enhanced,breakable,colback=blue!5!white,title=Hint~for~research~
\backslash
thetcbcounter : #1
\end_layout
\begin_layout Abstract
\family typewriter
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
sloppy
\end_layout
\end_inset
\family default
\begin_inset CommandInset include
LatexCommand input
preview true
filename "architecture-guide-introduction.lyx"
\end_inset
\end_layout
\begin_layout Abstract
\paragraph_spacing double
\noindent
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Box Frameless
position "c"
hor_pos "c"
has_inner_box 1
inner_pos "c"
use_parbox 0
use_makebox 1
width "100col%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\begin_inset Graphics
filename images/earth-mars-transfer.fig
width 70col%
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Many thanks for constructive feedback which helped to improve this document
series and related material like presentation slides:
\end_layout
\begin_layout Itemize
Philipp Reisner from Linbit
\end_layout
\begin_layout Itemize
Ewen McNeill and Simon Lyall from the Australian / New Zealand Linux community
\end_layout
\begin_layout Itemize
Jens Clever and Jörg Mann, external freelancers working at 1&1
\end_layout
\begin_layout Itemize
Anders Henke and Christian Albert and Kai Müller and David Meder-Marouelli
from 1&1 Ionos
\end_layout
\begin_layout Itemize
Olof Sandström-Herrera from Arsys
\end_layout
\begin_layout Standard
Please report any omissions in case I forgot somebody.
\end_layout
\begin_layout Addchap
Preface
\begin_inset CommandInset label
LatexCommand label
name "chap:Preface"
\end_inset
\end_layout
\begin_layout Section*
Introduction
\end_layout
\begin_layout Standard
\begin_inset CommandInset include
LatexCommand input
preview true
filename "architecture-guide-introduction.lyx"
\end_inset
\end_layout
\begin_layout Addsec
Purpose
\end_layout
\begin_layout Standard
This document explains and discusses how to select the right storage architectur
e for typical use cases in big enterprises.
Besides general storage architectures, pitfalls of geo-redundancy and long-dist
ance replication are highlighted.
\end_layout
\begin_layout Standard
In addition to technical discussion,
\series bold
cost and risks
\series default
are treated as well, addressing some
\series bold
management needs
\series default
up to CTO level.
\end_layout
\begin_layout Standard
In contrast to several other publications, it is
\emph on
not
\emph default
an enumeration of seemingly endless possibilities and components on the market.
It provides
\series bold
guidance
\series default
about the
\series bold
structures and ideas
\series default
\emph on
behind
\emph default
storage architectures and their connection to application processing.
Particular attention is on
\series bold
avoidance of pitfalls
\series default
.
\end_layout
\begin_layout Standard
It provides both
\emph on
technical
\emph default
and
\emph on
management
\emph default
guidance about selection of architectures as well as their implementation
\emph on
classes
\emph default
, and also about selection of suitable component
\emph on
classes
\emph default
.
\end_layout
\begin_layout Standard
Finally, it helps checking for use cases where MARS will be a good solution,
and where other solutions will be better suited.
It also addresses some unexpected problems when inappropriate types of
cluster managers are selected for long-distance replication.
\end_layout
\begin_layout Addsec
Scope
\end_layout
\begin_layout Standard
The following topics are covered within this document:
\end_layout
\begin_layout Itemize
Management Summary
\end_layout
\begin_layout Itemize
Architectures of Cloud Storage, and
\end_layout
\begin_deeper
\begin_layout Itemize
their application area
\end_layout
\begin_layout Itemize
their reliability / risks / pitfalls
\end_layout
\begin_layout Itemize
their cost
\end_layout
\begin_layout Itemize
scalability and performance of architectures
\end_layout
\begin_layout Itemize
recommendations for managers and architects
\end_layout
\end_deeper
\begin_layout Itemize
Selection of components
\end_layout
\begin_deeper
\begin_layout Itemize
MARS vs DRBD
\end_layout
\end_deeper
\begin_layout Itemize
Architecture and pitfalls of Cluster Managers
\end_layout
\begin_layout Addsec
Audience
\end_layout
\begin_layout Standard
This document is mainly written for system architects.
Technical decision makers / managers with technical background, up to CTO
level, should also benefit from
\series bold
risk reduction
\series default
and
\series bold
cost saving
\series default
, when making clever investment and consolidation decisions.
\end_layout
\begin_layout Standard
Researchers in the field of storage systems are also addressed in the section
about
\series bold
reliability
\series default
and the appendix, by providing mathematical models of reliability.
\end_layout
\begin_layout Addsec
How to use this document
\end_layout
\begin_layout Standard
Managers should start with chapter
\begin_inset CommandInset ref
LatexCommand nameref
reference "chap:Management-Summary"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Then read the short chapter
\begin_inset CommandInset ref
LatexCommand nameref
reference "chap:Important-Concepts"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
For details, just follow the internal links within this document.
In any case, the last chapter
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Recommendations-for-Managers"
plural "false"
caps "false"
noprefix "false"
\end_inset
is highly recommended.
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
These boxes are something you definitely should read as a manager.
They explain
\series bold
important key items
\series default
in a nutshell.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
All others should read chapters 1 and 2 sequentially, and proceed to the
other chapters when interested.
\end_layout
\begin_layout Standard
When MARS is already in use (or planned to be used), reading all of the
chapters may pay off for
\series bold
avoidance of pitfalls
\series default
.
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
Examples are marked with boxes like this.
They can be skipped if you don't have much time.
Examples will however help for understanding of complex material.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
Detailed explanations are marked like this.
They are recommended for system architects for more elaborate methodology,
and for deeper understanding of fundamentals.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 4
status open
\begin_layout Plain Layout
This document is no scientific work in the strict sense.
However, it is based on scientific background.
In a few places, hints like this could be fruitful for spawning research
activity.
\end_layout
\end_inset
\end_layout
\begin_layout Addsec
Related documents
\end_layout
\begin_layout Itemize
\family typewriter
mars-user-manual.pdf
\family default
: for sysadmins who want to install and run MARS.
\end_layout
\begin_layout Itemize
\family typewriter
football-user-manual.pdf
\family default
: for sysadmins and userspace developers who want to use Football.
\end_layout
\begin_layout Itemize
\family typewriter
mars-for-kernel-developers.pdf
\family default
: some information for kernel developers.
\end_layout
\begin_layout Addsec
Table of Contents
\end_layout
\begin_layout Standard
\begin_inset CommandInset toc
LatexCommand tableofcontents
\end_inset
\end_layout
\begin_layout Part
Geo-Redundancy for Managers and Consultants
\begin_inset Newline newline
\end_inset
\size small
Plus Background for Responsibles / Architects / Project Members / Sysadmins
/ etc
\end_layout
\begin_layout Chapter
Management Summary
\begin_inset CommandInset label
LatexCommand label
name "chap:Management-Summary"
\end_inset
\end_layout
\begin_layout Standard
This guide is about
\series bold
investments and long-term follow-up cost
\series default
in the range of
\series bold
millions
\series default
of € or $.
It tries to guide you through the jungle of storage solutions and their
features, by focusing on
\series bold
fundamental principles
\series default
and high-level structures, called
\series bold
architecture
\series default
.
\end_layout
\begin_layout Standard
For
\series bold
HA enterprise-critical data
\series default
in the range of
\series bold
petabytes
\series default
, different storage architectures are leading to very different properties
in the
\series bold
cost and risk dimensions
\series default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Provably best HA / Cloud Storage architecture
\end_layout
\end_inset
By intuitive explanations as well as mathematical arguments, this guide
shows that
\end_layout
\begin_layout Itemize
Permanent
\series bold
minimization of the distances
\series default
between storage and the compute nodes will both
\series bold
increase reliability and reduce cost at the same time
\series default
.
\end_layout
\begin_layout Itemize
When applicable for a certain use case, the best architectural model is
shown to be
\series bold
sharding
\series default
on top of
\series bold
local storage
\series default
.
It can easily save a cost factor of about 2, while increasing
\series bold
architectural reliability
\series default
at the same time.
\end_layout
\begin_layout Itemize
When the so-called
\series bold
FlexibleSharding
\series default
variant of the sharding model is possible, and when combined with a novel
load balancing method called
\series bold
Football
\series default
, it can deliver a very similar level of
\series bold
flexibility
\series default
to that promised by network-centric BigCluster architectures.
\end_layout
\begin_layout Itemize
By both intuitive and mathematical explanations, and contrary to some contempora
ry belief, you will learn
\series bold
\emph on
why(!)
\emph default
BigCluster architectures are generally worse
\series default
in practically any dimension, with only
\emph on
few exceptions
\emph default
.
\series bold
Hints
\series default
are provided at certain use cases where BigCluster cannot be explicitly
recommended, and other hints at some of the few exceptions.
\end_layout
\begin_layout Itemize
When built and dimensioned properly,
\series bold
cross-datacenter replication
\series default
and/or
\series bold
geo-redundancy
\series default
will
\emph on
not
\emph default
double TCO = Total Cost of Ownership, but can cost roughly about the same
as local redundancy in the same datacenter.
The key is a certain class of
\series bold
wide-area distribution of resources
\series default
\emph on
in place of
\emph default
local replication.
\end_layout
\begin_layout Itemize
When cross-datacenter replication and/or geo-redundancy is required, the
so-called
\series bold
ability for butterfly
\series default
leads to further HA = High Availability improvements during ordinary operations.
\end_layout
\begin_layout Itemize
Object-based
\series bold
Cloud Storage
\series default
can also be built on top of a sharding model, avoiding cost and reliability
/ risk pitfalls caused by BigCluster.
\end_layout
\begin_layout Itemize
\series bold
Distributed Systems
\series default
(aka
\emph on
loosely coupled systems
\emph default
) are
\series bold
much more complicated to program and operate
\series default
than
\emph on
tightly coupled systems
\emph default
(aka SMP or NUMA).
You will unnecessarily lose TCO = Total Cost of Ownership and TTM =
\series bold
Time To Market
\series default
by
\emph on
inappropriate selection of coupling architectures
\emph default
for a certain use case class.
This guide will explain.
\end_layout
\begin_layout Itemize
You will learn
\series bold
why OpenSource component-based storage systems
\series default
are much cheaper than commercial storage appliances (up to
\emph on
factors
\emph default
), at least when you need a few petabytes of storage.
Merely by relinquishing
\series bold
Vendor-Lock-In
\series default
and going to RAID-based Linux storage, investment will typically decrease by
factors between 3 and 10.
By going to a
\family typewriter
LocalSharding
\family default
or
\family typewriter
FlexibleSharding
\family default
model, where possible,
\emph on
another
\emph default
decrease factor of about 2 is typically possible.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
In addition, this guide explains the ideas behind the OpenSource components
Football on top of MARS.
It can be used for cost-reduced load balancing of non-(geo)redundant setups,
as well as for geo-replication + migration over short to very long distances.
Load balancing and hardware lifecycle / datacenter defragmentation works
via background data migration while your services keep running.
\end_layout
\begin_layout Chapter
Important Concepts
\begin_inset CommandInset label
LatexCommand label
name "chap:Important-Concepts"
\end_inset
\end_layout
\begin_layout Standard
Recommended reading for
\emph on
everyone
\emph default
is
\emph on
each
\emph default
of the definitions in
\emph on
each
\emph default
section, even if you think that you already know what each concept means.
\end_layout
\begin_layout Standard
In case you
\series bold
notice a difference
\series default
between your former opinion about a concept and what you are reading here,
then
\series bold
don't skip the rest
\series default
of the corresponding section.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Skipping anything in this chapter exposes you to serious risks:
\end_layout
\begin_layout Itemize
\series bold
Misunderstanding
\series default
of following important parts.
This may become
\series bold
expensive
\series default
.
This guide is about investments and
\emph on
follow-up cost
\emph default
in the range of
\series bold
millions
\series default
of €.
\end_layout
\begin_layout Itemize
\series bold
Second-order ignorance
\series default
: you probably don't know what you don't know.
This is not only risky in
\series bold
enterprise-critical
\series default
areas.
You may also put your
\series bold
career
\series default
at risk.
\end_layout
\begin_layout Section
What is Architecture
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-Architecture"
\end_inset
\end_layout
\begin_layout Standard
Architecture is important for achievement of
\series bold
management goals
\series default
in companies when they want to control their own long-term IT strategy.
Bad architectures can cause you to miss your management goals, typically in
the long term, and/or can increase several
\series bold
fundamental risks
\series default
for you and your company.
Therefore, you need
\emph on
good architecture
\emph default
, and you need some sort of
\series bold
quality assurance for architecture
\series default
.
\end_layout
\begin_layout Standard
There exist multiple definitions of architecture, and there are several
related terms like
\begin_inset Quotes eld
\end_inset
computer architecture
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
hardware architecture
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
software architecture
\begin_inset Quotes erd
\end_inset
etc etc.
Some versions are attributed to unrelated fields like
\begin_inset Quotes eld
\end_inset
landscape architecture
\begin_inset Quotes erd
\end_inset
and many others.
We need to hit your real needs as well as possible.
\end_layout
\begin_layout Standard
We start with
\begin_inset Quotes eld
\end_inset
architecture of anything
\begin_inset Foot
status open
\begin_layout Plain Layout
When narrowing down to Software Architecture, we
\emph on
may
\emph default
take a more specific version from
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Software_architecture
\end_layout
\end_inset
, but this might foster more misunderstanding among less-trained staff:
\end_layout
\begin_layout Quote
Software architecture refers to the
\series bold
high level structures
\series default
of a software system and the
\series bold
discipline
\series default
of creating such structures and systems.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Caution: when certain software architectures (even
\emph on
theoretically good
\emph default
ones) are
\series bold
\emph on
too abstract
\series default
\emph default
such that they are not fully understood by other people, and/or when they are
missing important points for achieving your goals and/or for detection
of
\emph on
hidden problems
\emph default
, and/or when introducing
\series bold
unnecessary overhead
\series default
, they may work even
\series bold
counter-productive
\series default
.
\end_layout
\end_inset
\begin_inset Quotes erd
\end_inset
.
Important IT-related variants can be found in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
http://www.iso-architecture.org/42010/defining-architecture.html
\end_layout
\end_inset
.
We select a short comprehensive definition, focusing on three main bullet
points:
\end_layout
\begin_layout Itemize
its
\series bold
elements
\series default
: the constituents that make up the system;
\end_layout
\begin_layout Itemize
the
\series bold
relationships
\series default
: both internal and external to the system; and
\end_layout
\begin_layout Itemize
the
\series bold
principles
\series default
of its design and evolution.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The order of these bullet points is important: (1) Make clear (with
\emph on
as little ambiguity as possible
\emph default
) about
\emph on
which elements
\emph default
you want to talk.
(2) Only after having a clear notion of your elements can you start talking
about
\emph on
relationships
\emph default
.
(3) Once the relationships of your elements are clear, both internal and
external ones, only then can you start talking about
\emph on
architectural principles
\emph default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Consequence: you should
\series bold
\emph on
define
\emph default
the terms
\series default
you are using for (1) elements, and (2) relationships, and (3) principles.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Avoid talking about unclear ideas.
Invest some effort into making things clear.
Otherwise, the
\emph on
quality of your architecture
\emph default
may lead to confusion or other bad effects, up to millions of € or $.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
It is practically impossible to denote and to define
\emph on
all existing
\emph default
elements / relationships / principles.
Tell only the
\series bold
relevant
\series default
ones.
Conversely, do
\series bold
not forget
\series default
important ones, at least for your IT environment and for your
\emph on
company culture
\emph default
.
\series bold
Omissions are
\series default
potentially
\series bold
dangerous
\series default
!
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Important:
\series bold
soundness
\series default
and
\series bold
freedom from contradictions
\series default
are similarly crucial as in
\begin_inset Quotes eld
\end_inset
wrong mathematical theories
\begin_inset Quotes erd
\end_inset
.
But this isn't enough.
Otherwise this section would be unnecessary.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Example: Distributed System
\end_layout
\end_inset
Typical pictures of Distributed Systems are depicted in the following way:
\end_layout
\begin_layout Enumerate
Elements are
\series bold
boxes
\series default
, typically depicting
\emph on
server classes
\emph default
or a
\emph on
class of servers
\emph default
(as deployed into a datacenter), sometimes also
\emph on
singletons
\emph default
(each by each), augmented with their (class) names.
\end_layout
\begin_layout Enumerate
Relationships are lines, typically depicting (bundles of)
\series bold
network cables
\series default
, or other (multiplexed)
\emph on
communication channels
\emph default
.
When necessary, they may also get (class) names, but typically such detail
is not always needed for understanding.
\end_layout
\begin_layout Enumerate
Principles are communicated as subsequent
\series bold
explanation text
\series default
, typically in English.
Do not skip important explanations!
\end_layout
\begin_layout Plain Layout
Here is a simple example graphics as used in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
, depicting the architectural bullet points (1) and (2), while the explanation
text (3) can be found in the referenced section:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/sharded-object-store-or-filesystem.fig
width 70col%
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
In this example architecture of a certain class of Distributed System, the
environment of the system is depicted as
\begin_inset Quotes eld
\end_inset
Internet
\begin_inset Quotes erd
\end_inset
without a box.
The system elements consist of boxes, denoted by their inside text, and
potentially also named by footers.
The relationships are simple lines, denoting the
\emph on
regular mass communication
\emph default
.
Less relevant details like
\family typewriter
ssh
\family default
access for sysadmins are omitted.
The dashed separators are used for optional internal grouping.
When important for a certain use case, some
\series bold
numbers depicting quantities
\series default
(see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://de.wikipedia.org/wiki/Mengenger%C3%BCst
\end_layout
\end_inset
) need to be
\emph on
added
\emph default
(which is unfortunately not possible in this paper because we don't talk
about
\emph on
concrete instances
\emph default
here).
The only abstract number here is
\begin_inset Formula $n$
\end_inset
, while others in units like Petabytes or GiB/s or similar should be added
for quality assurance.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Intuitively, the reader can imagine that this communication structure features
the main property of Sharding as explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
: there is no communication / cross traffic
\emph on
between
\emph default
different shards.
There is
\series bold
no SPOC = Single Point Of Contention
\series default
.
The shards cannot influence each other.
Thus the
\series bold
risk of catastrophic incidents
\series default
is
\emph on
reduced
\emph default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
\series bold
Unnecessary structural complexity
\series default
would be an indicator for
\emph on
low quality
\emph default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Do not
\emph on
miss the chance
\emph default
for
\series bold
comparing
\series default
multiple
\series bold
architectural alternatives
\series default
for
\series bold
quality
\series default
!
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Pitfall: only compare
\emph on
pictures
\emph default
of architectures when they are
\emph on
really comparable
\emph default
.
The boxes need to refer to
\emph on
comparable items
\emph default
.
You cannot check this in 10 seconds! You will need to invest more time
and/or
\emph on
more
\emph default
capable people.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Important for creators of diagrams: do not unnecessarily mix boxes or lines
depicted in the same graphics style, when they are
\emph on
not easily comparable
\emph default
to each other.
For example, do not
\emph on
mix up
\emph default
less distinguishable items from
\emph on
different
\emph default
architecture
\emph on
topics
\emph default
like
\emph on
Distributed Systems
\emph default
(involving relationships at network level) with ones from OO = Object Orientati
on (involving completely incomparable relationships like inheritance lines).
Another example: mixup of hardware boxes with software boxes (except when
you can
\emph on
clearly
\emph default
express the
\emph on
types
\emph default
of boxes as well as the types of relationships but beware of the
\emph on
complexity
\emph default
of suchalike diagrams).
Suchalike mix-ups are indicators for
\emph on
low quality
\emph default
, and can easily provoke various
\series bold
misunderstandings
\series default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Hypothetical Example of a Misunderstanding
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
Here is a hypothetical example of a misunderstandable variant of the first
graphics.
For illustration, the bad parts are coloured in
\color brown
dark red
\color inherit
:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/example-bad-architecture.fig
width 50col%
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
What is bad here?
\end_layout
\begin_layout Plain Layout
This is not only violating the
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
because the bottom box is a SPOC = Single Point of Contention, since multiple
shards
\emph on
must not share
\emph default
an unnecessary
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice: some sort of
\begin_inset Quotes eld
\end_inset
Shard Gateway
\begin_inset Quotes erd
\end_inset
is unavoidable
\emph on
in general
\emph default
, because it actually belongs to the networking infrastructure, which is
always needed in order to communicate with the
\begin_inset Quotes eld
\end_inset
Internet
\begin_inset Quotes erd
\end_inset
.
However, a good
\emph on
implementation
\emph default
of a Sharding Architecture needs to
\emph on
invest
\emph default
some
\series bold
brain power
\series default
how to make this actual SPOC as good as possible for the actual use case.
\end_layout
\end_inset
common SPOC [
\size footnotesize
notice that the Internet can also have SPOCs but this is outside your system,
and we cannot influence much of it
\size default
].
\end_layout
\begin_layout Plain Layout
Another problem is that the
\emph on
whole picture
\emph default
wants to explain a
\emph on
Distributed System
\emph default
.
However, the dark red box contains
\series bold
another
\series default
\emph on
very complex
\series bold
Distributed System
\series default
\emph default
which is much more
\series bold
structurally complicated
\series default
than the original sharding model, but the reader
\emph on
cannot see this
\emph default
.
There is a
\series bold
\emph on
confusion of the abstraction level
\series default
\emph default
.
\end_layout
\begin_layout Plain Layout
When characterizations like
\begin_inset Quotes eld
\end_inset
BigCluster
\begin_inset Quotes erd
\end_inset
are omitted, readers can easily become
\series bold
mis-guided
\series default
by believing that the Object Store would be a
\begin_inset Quotes eld
\end_inset
simple
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
easier box
\begin_inset Quotes erd
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Throughout this document, the term
\begin_inset Quotes eld
\end_inset
\series bold
architecture
\series default
\begin_inset Quotes erd
\end_inset
is strictly separated from
\begin_inset Quotes eld
\end_inset
\series bold
implementation
\series default
\begin_inset Quotes erd
\end_inset
.
Any of
\begin_inset Quotes eld
\end_inset
architecture
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
implementation
\begin_inset Quotes erd
\end_inset
can relate to Distributed Systems and/or to both hard- and software, and
may need inclusion of further infrastructure like networking.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A certain architecture may have
\emph on
multiple
\emph default
implementations.
Good architectures allow this.
\series bold
Ability for multiple implementations
\series default
may be a
\emph on
competitive advantage
\emph default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Omission of important implementation considerations, or single-binding
to a certain
\emph on
fixed environment
\emph default
, or even confusion of
\begin_inset Quotes eld
\end_inset
architecture
\begin_inset Quotes erd
\end_inset
with
\begin_inset Quotes eld
\end_inset
implementation
\begin_inset Quotes erd
\end_inset
(aka classes vs instances / confusion with
\emph on
singletons
\emph default
) is an indicator for
\emph on
low quality
\emph default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
An implementation is based on a
\emph on
set
\emph default
of
\series bold
technologies
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
Architectures are serving as aids for
\series bold
classification of solutions
\series default
.
An implementation is a solution which has
\emph on
materialized
\emph default
(in contrast to solutions which exist only on paper).
Complex solutions / implementations are typically matching only one architectur
e.
Thus the relationship between architectures and solutions / implementations
is typically
\begin_inset Formula $1:n$
\end_inset
, while the relationship between solutions / implementations and technologies
is
\begin_inset Formula $n:m$
\end_inset
in general.
In case of a very simple solution, it may
\emph on
exceptionally
\emph default
match multiple architectures, but this is not typical for classification
schemes.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Unfortunately, certain technologies are not suitable for certain architectures.
There may be
\series bold
restrictions
\series default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Because of
\series bold
hidden restrictions
\series default
which may show up later, you should not start with implementations or technolog
ies.
Always start top-down with architectural considerations, while trying to
identify potential restrictions
\emph on
as early as possible
\emph default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The biggest
\series bold
potential for good solutions
\series default
is at architectural level.
Exchanging a single component or a technology is typically much easier
than changing a whole architecture, once it has been implemented.
Often, changing an architecture is close to impossible.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Important for
\series bold
quality and usability
\series default
of
\series bold
software developments
\series default
: a certain implementation of software, even when intended as
\emph on
part
\emph default
of a certain architecture, should support
\series bold
multiple instances
\series default
.
Good implementations are
\emph on
constructed
\emph default
for this.
\series bold
Ability for multiple instances
\series default
may be a
\emph on
competitive advantage
\emph default
for development teams, provided they are professional enough for doing
this
\emph on
really
\emph default
and in proven good quality.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not plan software developments as
\series bold
\emph on
singletons
\series default
\emph default
.
Even worse: do not plan suchalike at architecture level.
This would be an indicator for
\emph on
worse quality
\emph default
of a certain architecture.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
\series bold
\emph on
Long-term
\emph default
success killers
\series default
: never start an architecture with a particular implementation and/or with
a particular technology in mind.
Even worse, never start with a particular product from a particular
\emph on
vendor
\emph default
(danger of so-called
\series bold
Vendor-Lock-In
\series default
).
Insufficient reasoning about fundamental architecture and
\series bold
fundamental laws
\series default
(e.g.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Optimum-Reliability-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and/or
\series bold
not seriously considering alternative architectures
\series default
, is a major source of
\series bold
costly ill-designs
\series default
.
Some failure examples may be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Needed skills: confusion of
\begin_inset Quotes eld
\end_inset
architecture
\begin_inset Quotes erd
\end_inset
with
\begin_inset Quotes eld
\end_inset
implementation
\begin_inset Quotes erd
\end_inset
and/or
\begin_inset Quotes eld
\end_inset
technology
\begin_inset Quotes erd
\end_inset
is another major source of ill-designs, which then often cause major product
flaws and/or operational problems.
Be sure to
\emph on
really
\emph default
understand the differences.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Recommended best practice is to (1) look at the
\series bold
problem space
\series default
, then (2) consider a
\emph on
set
\emph default
of
\series bold
architectural solution classes
\series default
, and (3) look at each of the
\series bold
mappings
\series default
between problem space and solution space.
The
\emph on
complexity
\emph default
of such a mapping is a first hint.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In somewhat more detail: start with
\series bold
architectural requirements
\series default
for a particular
\series bold
application area
\series default
(typically covering
\emph on
multiple
\emph default
use cases), then look at
\series bold
multiple solution architectures
\series default
, and finally go down to a
\series bold
\emph on
set
\series default
\emph default
of potential implementations, but only
\emph on
after
\emph default
the former has been understood.
Selection of components and technologies should be the
\emph on
last
\emph default
step during the first iteration of this method.
Then do a
\series bold
quality check
\series default
at
\emph on
concept
\emph default
level.
Often, such a review will reveal some problems / limitations etc, which
should be treated by further iterations, restarting top-down again.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
During this iterative concept work, you should
\series bold
validate
\series default
your solution(s) several times, e.g.
for
\series bold
compatibility
\series default
(no conflicts caused by restrictions, etc).
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Starting at the bottom with a particular single solution in mind, and/or
presuming a certain technology, is almost a
\emph on
guarantee
\emph default
for a non-optimum solution, or even a failed project, or even a disaster
at company level when
\series bold
enterprise-critical mass data
\series default
is involved.
Always consider a
\emph on
set of
\emph default
candidate architectures, and for each of them, a
\emph on
set of
\emph default
solutions / technologies.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Such a comparison needs to be
\series bold
fair
\series default
, and must have
\series bold
no biases
\series default
.
For new developments, this can easily result in killers.
Notice that typical
\emph on
software projects
\emph default
have a failure rate of about 70%, as can be read in textbooks on software
engineering.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not believe that other areas like Distributed Systems are generally
much better.
There exists a
\emph on
plethora
\emph default
of more or less
\emph on
failed
\emph default
academic projects from the
\emph on
viewpoint
\emph default
of industry, depending on important concepts and
\series bold
fundamental laws
\series default
like
\series bold
Consistency Models
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
A classical killer, but widely
\emph on
unknown
\emph default
, is the failed MIT project which tried to re-implement the semaphore at
Distributed System level.
Do
\emph on
not
\emph default
believe that other distributed re-implementations of other models working
well on SMP or NUMA boxes (e.g.
filesystems or object stores) have a much better chance.
Do not ignore the
\series bold
plethora of
\series default
(more or less)
\series bold
failed projects
\series default
, whether academic or industrial.
Distributed Systems can easily turn into
\series bold
unexpected snake pits
\series default
.
Otherwise the professors for Distributed Systems would be overpaid or useless!
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Another killer when
\emph on
updating
\emph default
or
\emph on
re-factoring
\emph default
an
\series bold
enterprise-critical productive system
\series default
: Never try this via too big and too few architectural changes.
Always do this
\series bold
incrementally
\series default
and via relatively
\series bold
small architectural changes
\series default
, whose impact is
\series bold
\emph on
revertable
\series default
\emph default
(analogously to Change Management best practices, e.g.
similar to ITIL).
It would be foolish to apply ITIL-like practices only to the
\emph on
rollout phase
\emph default
after the final implementation of a certain architecture had been completed.
Some parts of ITIL & co are very helpful for analogous transfer to
\emph on
incremental
\emph default
architecture work.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Do not think in waterfall models.
Always work
\series bold
iteratively
\series default
and
\series bold
evolutionary
\series default
by
\emph on
re-considering architecture
\emph default
whenever you find problems / contradictions induced by restrictions, similar
to the
\series bold
spiral model
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
See
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Spiral_model
\end_layout
\end_inset
.
\end_layout
\end_inset
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Be
\emph on
extremely cautious
\emph default
when transferring
\emph on
software
\emph default
development methods to storage architectures, where operations involve
masses of hardware.
You need to find a balance between extreme waterfall-like and agile
\begin_inset Foot
status open
\begin_layout Plain Layout
Purely agile methods are less suited for quality assurance of storage architectu
res, because they are tempting people to start with simple approaches before
the problem domain has been fully understood, increasing the
\series bold
risk of architectural ill-designs
\series default
.
Starting an implementation
\emph on
too early
\emph default
on basis of an ill-design can easily
\series bold
lead into a dead end
\series default
.
Agile methods are typically encouraging
\series bold
early deliverables
\series default
, which can be counter-productive.
Example: it is clearly a bad idea to plan for an early deliverable for
some petabytes of storage.
Thus architects and managers are tempted to
\emph on
start small
\emph default
, e.g.
a BigCluster architecture with only 3 storage servers.
This type of
\begin_inset Quotes eld
\end_inset
early deliverable
\begin_inset Quotes erd
\end_inset
cannot detect any
\series bold
scalability problems
\series default
early enough, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
So you are in a
\series bold
dilemma
\series default
, whether you like it or not.
Although you probably dislike it, the planning phase of big storage systems
is unfortunately more like a waterfall process, by its very nature.
Thus
\emph on
workarounds
\emph default
for the shortcomings of a pure waterfall model are needed.
German readers may also check the V-model XT, as described in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://de.wikipedia.org/wiki/V-Modell_(Entwicklungsstandard)
\end_layout
\end_inset
.
Unfortunately, the newer XT variant of the V-model is missing in the correspond
ing English Wikipedia article (retrieved autumn 2019), misleading readers
with unfortunate opinions like the V-model being too similar to a waterfall
model.
Notice that the newer XT variant of the V-model, as well as some other
variants (e.g.
lecture notes from Professor Jochen Ludewig / University of Stuttgart),
have adopted many ideas from the agile community, such as rework in loops
and cycles, and thus should not be classified as
\begin_inset Quotes eld
\end_inset
linear waterfall
\begin_inset Quotes erd
\end_inset
models.
In particular,
\series bold
early quality assurance of concepts and architectures
\series default
and
\series bold
rework of architecture as early as possible
\series default
is something you definitely should borrow from the V-model and its modern
variants, even if you dislike V-models otherwise.
\end_layout
\end_inset
methods.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Serious bugs in an
\emph on
architectural
\emph default
ill-design (examples see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
) are typically very hard by causing serious limitation and/or impact, and
cannot be fixed by the best implementation, or by the best technology of
the world.
Be sure to understand the fundamental difference between architecture and
its (multiple / alternative) implementations, as well as multiple technologies,
and their respective restrictions, as well as their
\series bold
reach
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Howto QA = Quality Assurance of Architecture Work
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
It is a bad idea to delegate quality checks onto
\emph on
big teams
\emph default
and/or to
\emph on
treat
\emph default
(unofficially) some people as
\emph on
responsible for quality
\emph default
, but in reality
\emph on
some
\emph default
of them having
\emph on
insufficient skills
\emph default
and/or
\emph on
insufficient experiences
\emph default
on certain classes / parts of architectural work (e.g.
\emph on
mixup
\emph default
of management experiences with technical or architectural experiences).
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In place of meetings with more than 3 participants, the following is recommende
d:
\end_layout
\begin_layout Enumerate
Identify the
\emph on
right
\emph default
people with the best possible skills and experiences, not only in the important
tasks, but more importantly at
\emph on
architectural work
\emph default
in the application area.
When necessary, include external
\emph on
technical
\emph default
and
\emph on
architectural
\emph default
consultants.
\end_layout
\begin_layout Enumerate
Instead of few
\begin_inset Quotes eld
\end_inset
quality assurance session(s)
\begin_inset Quotes erd
\end_inset
in a format where true discussion and
\series bold
deep dive
\series default
into risks etc is unlikely (e.g.
for career or opportunity or
\emph on
perceptional
\emph default
or
\emph on
political(!)
\emph default
reasons) or hard (e.g.
due to incompatible skill levels), notice that
\series bold
QA discussion formats
\begin_inset Foot
status open
\begin_layout Plain Layout
Discussions in
\emph on
big groups
\emph default
about
\emph on
truth of claims
\emph default
or even about
\emph on
facts
\emph default
are similar to
\emph on
public
\emph default
discussions about the truth of Einstein's law.
There is a German saying:
\end_layout
\begin_layout Plain Layout
\family sans
\size large
Shit is excellent, because
\series bold
\emph on
millions
\series default
\emph default
of flies cannot be wrong
\family default
\size default
.
\end_layout
\end_inset
\series default
are
\series bold
even harder than development
\series default
.
Thus discussions make
\emph on
only
\emph default
sense when
\emph on
at least
\end_layout
\begin_deeper
\begin_layout Enumerate
private
\series bold
preferences
\series default
or so-called
\series bold
hidden agendas
\series default
have been counter-acted
\emph on
in advance
\emph default
at HR level, and
\end_layout
\begin_layout Enumerate
multiple
\emph on
direct
\emph default
\series bold
personal discussions
\series default
between the
\emph on
right
\emph default
persons in a
\series bold
friendly culture
\series default
but on
\series bold
risks
\series default
/
\series bold
reliability
\series default
/
\series bold
scalability
\series default
/
\series bold
architectural flaws
\series default
/
\series bold
cost
\series default
/ etc are possible from at least
\emph on
company-level scope
\emph default
, and
\end_layout
\begin_layout Enumerate
there are
\emph on
enough
\emph default
but not
\emph on
too many
\emph default
and not
\emph on
too big
\emph default
(= well balanced)
\series bold
understandable documents
\series default
on sufficient
\emph on
architectural alternatives
\emph default
, which had been
\series bold
reviewed
\series default
as recommended in
\emph on
Software Engineering
\emph default
.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
A (single) slide, reducing a complex matter down to 1 page (or too few
pages) with
\emph on
unclear
\emph default
or
\emph on
non-checkable
\emph default
terms and claims
\begin_inset Formula $\Longrightarrow$
\end_inset
then you have a
\series bold
\emph on
serious risk
\series default
\emph default
you typically know as a manager.
\end_layout
\begin_layout Enumerate
for enterprise-critical work: QA of the QA, e.g.
involving
\emph on
external
\emph default
experts in architecture and important relatives like Software Engineering.
\end_layout
\end_deeper
\begin_layout Enumerate
\series bold
\emph on
Selection
\series default
\emph default
among
\emph on
multiple
\emph default
architectural and/or implementation alternatives should be (1) checked
for sufficient QA and (2)
\series bold
risk approved
\series default
, then (3)
\series bold
decided
\series default
, and (4)
\series bold
backed
\series default
by (upper) management.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
A leadership style like
\begin_inset Quotes eld
\end_inset
the
\emph on
team
\emph default
is responsible for the (single / best / etc) solution
\begin_inset Quotes erd
\end_inset
is a
\series bold
risk
\series default
(or almost a
\emph on
guarantee
\emph default
) for
\series bold
non-optimum
\series default
or even bad
\series bold
long-term decisions
\series default
(recall that architectures have typically a lifetime of
\emph on
decades
\emph default
and are often cumbersome to fix).
\end_layout
\end_inset
\end_layout
\begin_layout Section
What is Backup
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-Backup"
\end_inset
\end_layout
\begin_layout Standard
A Backup is a
\series bold
copy of your data
\series default
at a
\series bold
different location
\series default
.
Typically, the copy is
\emph on
intended
\emph default
for later
\emph on
copyback
\emph default
.
There are two distinct operations associated with backup:
\end_layout
\begin_layout Enumerate
\series bold
Creation
\series default
of backup.
This creates a
\series bold
copy
\series default
, or a new version of a copy.
It involves some network traffic over various distances, e.g. in the simplest
case over a USB cable, or from the application datacenter to a backup datacente
r.
Typically, this is done at
\series bold
regular time intervals
\series default
, e.g.
daily.
\end_layout
\begin_layout Enumerate
\series bold
Restore
\series default
from backup.
This does the
\emph on
opposite
\emph default
of backup creation.
It also involves network traffic, but typically in
\series bold
reverse direction
\series default
.
The
\series bold
roles
\series default
of application datacenter and backup datacenter
\series bold
do not change
\series default
.
Restore is typically
\series bold
triggered manually
\series default
, and
\emph on
typically
\emph default
only after some incident which led to (supposed)
\series bold
data loss
\series default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In practice (with
\emph on
few exceptions
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Theoretically, a restore operation could be designed more or less
\emph on
idempotent
\emph default
.
So only some
\emph on
missing
\emph default
data would be restored, all else would typically remain unchanged.
However, this
\emph on
may
\emph default
violate the semantics of several applications, e.g.
violate the referential integrity of certain databases, and/or may leave
behind some
\begin_inset Quotes eld
\end_inset
unknown
\begin_inset Quotes erd
\end_inset
and/or
\begin_inset Quotes eld
\end_inset
outdated
\begin_inset Quotes erd
\end_inset
data, potentially even in huge masses.
Example: in Shared Hosting Linux, some old PHP session cookies residing
in home directories might be restored, potentially even violating some
security rules.
When run regularly via cron jobs, the storage space may be flooded in the
long term with more or less
\begin_inset Quotes eld
\end_inset
useless
\begin_inset Quotes erd
\end_inset
data and/or inodes, e.g.
exceeding the
\emph on
user quota
\emph default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Notice: in
\emph on
strict
\emph default
storage semantics, the deletion of data is a
\emph on
valid
\emph default
operation, which should not be violated except when you are sure that it
will not cause any harm.
Example: in Shared Hosting Linux, some customers are creating such masses
\emph on
regularly
\emph default
via their own cron jobs.
Their
\emph on
unnecessary restore
\emph default
may become rather expensive over time.
\end_layout
\end_inset
), it is
\emph on
often
\emph default
a
\emph on
bad idea
\emph default
to restore a backup although there is
\emph on
no real
\emph default
data loss.
This would likely overwrite your newest application data with an older
version, likely leading to
\emph on
data loss
\emph default
.
Therefore, classical restore is a
\series bold
potentially dangerous operation
\series default
!
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Summary: structural properties of backup
\end_layout
\end_inset
Backup is
\series bold
asymmetrical
\series default
.
It involves two non-exchangeable roles / locations, application location
vs backup location.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Confusion of these roles, or triggering an unnecessary restore is a
\series bold
risk
\series default
for
\series bold
data integrity
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Conversely, having no reasonable backup at all is an even higher risk.
Backup is a
\series bold
best practice
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Section
What is Replication
\begin_inset CommandInset label
LatexCommand label
name "sec:Replication-vs-Backup"
\end_inset
\end_layout
\begin_layout Standard
Intuitively, data backup and data replication are two different solution
classes, addressing different problems.
\end_layout
\begin_layout Standard
However, there exist descriptions where both solution classes are overlapping,
as well as their corresponding problem classes.
For example, backup as explained in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Backup
\end_layout
\end_inset
could be seen as also encompassing some types of storage replications explained
in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Replication_(computing)
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
For this guide, we want a clearer discrimination, for better orientation
in the solution jungle.
As a rough comparison of
\emph on
typical
\emph default
implementations, see the following
\emph on
typical
\emph default
differences:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="7" columns="3">
<features tabularvalignment="middle">
<column alignment="left" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Backup
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Replication
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Temporal pattern
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
intervals
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
continuously
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Fast handover (planned)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no, or cumbersome
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Fast failover (unplanned)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no, or cumbersome
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Protection against physical failures
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Protection against logical data corruption
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
typically no
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Disaster Recovery Time (
\series bold
MTTR
\series default
)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
typically
\series bold
very slow
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\series bold
fast
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
\noindent
There are some solutions implementing a
\emph on
mixture
\emph default
, by different combinations of some of these typical properties.
Here we focus on fundamental principles.
\end_layout
\begin_layout Standard
Although
\series bold
replication
\series default
as defined here
\series bold
has much better properties
\series default
from a risk viewpoint on enterprise-critical data, there remains a gap
in favour of backup: backup is typically implemented as a
\emph on
logical copy
\emph default
, which lowers risks from certain types of
\series bold
data corruption
\series default
, such as filesystem corruption, for which only risky repair workarounds
like
\family typewriter
fsck
\family default
are the last resort when you don't have a backup
\emph on
in addition
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
An integrated solution for continuous replication via logical copies would
be difficult.
There is a
\emph on
concept mismatch
\emph default
between logical copies and strict consistency requirements posed by fast
handover, while at the same time compensation of logical data corruption
would require the
\emph on
opposite
\emph default
of strict consistency.
Notice that logical copies are residing at higher layers, e.g.
filesystems or database records, while pure replication is easier done
at block layer.
See also sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Notice that snapshots at block layer cannot
\emph on
reliably
\emph default
protect against long-lasting
\series bold
silent corruptions
\series default
.
Even higher-layer ZFS snapshots treated in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example:-ZFS-Replication"
plural "false"
caps "false"
noprefix "false"
\end_inset
cannot provide the same protection level as a classical per-inode multi-generat
ion backup onto a different filesystem type, thus lowering systematic
risks from software bugs in filesystem code.
In general, there always remains a
\series bold
residual risk
\series default
of data loss.
The classical solution is simple: just have two or more different counter-measu
res in parallel, and spread them over distinct datacenters.
\end_layout
\end_inset
to replication.
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
Typical Advantages
\end_layout
\end_inset
Typically, backup improves the
\emph on
recoverability
\emph default
, while replication improves the
\emph on
reliability
\emph default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Because of these typical differences, enterprise-critical data typically
deserves
\emph on
both
\emph default
solution classes at the same time.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Important requirements for replication
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A good replication solution is
\series bold
symmetrical
\series default
.
There are two (or more) copies at different locations.
They are either active at the same time (which works reliably only rack-to-rack
over crossover cables, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and/or they need to
\series bold
switch their roles quickly
\series default
.
Switching should have two different triggers:
\series bold
planned handover
\series default
, vs
\series bold
unplanned failover
\series default
in case of an incident.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Symmetry is an important precondition for
\series bold
fast reaction
\series default
to incidents.
For
\series bold
enterprise-critical data
\series default
, this is important for drastically
\series bold
lowering
\series default
the expected value of
\series bold
losses by incidents
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Confusion of solution classes replication vs backup and/or their corresponding
problem classes / properties can be harmful to enterprises and to careers
of responsible persons.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Hint: the
\emph on
granularity
\emph default
of replication handover / failover is important for maximum flexibility.
See section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Section
What is Location Transparency
\begin_inset CommandInset label
LatexCommand label
name "sec:Location-transparency"
\end_inset
\end_layout
\begin_layout Standard
Replication as defined in the previous section works reasonably fast
only when Location Transparency is implemented reasonably well, see also
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Where-implement-Location-Transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Here is a brief explanation of what it is.
\end_layout
\begin_layout Standard
Location Transparency is an extremely important and well-known
\series bold
fundamental principle
\series default
in Distributed Systems, and has attracted research for decades.
\end_layout
\begin_layout Standard
Simply stated, it means that the location of an object or of a service is
never (part of) a primary key, but any access is via a
\emph on
logical name
\emph default
not depending on the location.
Thus the location may (relatively easily) change at runtime.
\end_layout
\begin_layout Standard
There are numerous examples where this fundamental principle is obeyed.
Unfortunately, there are also many examples where it is violated.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\series bold
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Phone numbers
\end_layout
\end_inset
\series default
Phone numbers are
\emph on
not
\emph default
location transparent in general.
For stationary phones, they contain a location-dependent prefix.
In general, it is not possible to move to a different city while keeping
the old stationary phone number.
In case of mobile phones / cellphones, numbers are
\begin_inset Quotes eld
\end_inset
more location transparent
\begin_inset Quotes erd
\end_inset
, but even there they are
\emph on
not fully
\emph default
location transparent: for international calls, they contain prefixes referring
to the country, e.g. +1 for the US or +49 for Germany.
In practice, it is not easily possible to permanently move from Germany
to US, without giving up the old number after a while.
In addition, often the
\emph on
service provider
\emph default
and/or the network technology (D-net vs E-net etc) may be also encoded
in cellphone numbers, e.g.
somewhere as an infix.
Changing the provider may have some restrictions.
However, for
\emph on
most practical purposes
\emph default
, such as Europeans spending their holidays in US, mobile phone numbers
are
\emph on
sufficiently
\emph default
location transparent.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In practice, location transparency is not just a boolean property.
As explained by the cellphone example, it may have various
\series bold
degrees
\series default
.
In addition, it can refer to different sub-systems at different architectural
layers.
Some layers / some components may be (more or less) location transparent,
but others not at all.
Thus it is important to mention the
\series bold
layer or the component
\series default
when talking about location transparency.
\end_layout
\begin_layout Standard
Interestingly, the Wikipedia article
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Location_transparency
\end_layout
\end_inset
was an incomplete stub when this section was written (Autumn 2019).
It seems that people are actually paying less attention to it.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Major violations of location transparency are almost always carrying some
\series bold
technical debt
\series default
, likely causing future problems and impediments.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Therefore, establishment of reasonable location transparency needs to be
seen as
\series bold
best practice
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
It may happen that somebody thinks there would not be enough time and/or
resources for implementation of certain kinds of location transparency.
Although in many cases this is not really true, there might be some corner
cases where it sometimes is true, or close to true.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\series bold
Investments into location transparency
\series default
are often
\series bold
\emph on
long-term
\emph default
investments
\series default
.
Not doing it will likely
\series bold
decrease your business opportunities
\series default
and
\series bold
increase your risks
\series default
in the long term.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Location transparency is simply a certain type of
\series bold
redirection mechanism
\series default
, which
\series bold
\emph on
automatically
\emph default
follows the current location
\series default
of a service and/or its storage.
It makes you
\emph on
independent
\emph default
from various placement strategies.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Once you have established location transparency, a
\series bold
multitude of placement strategies
\series default
for your services and/or your storage locations is possible.
This opens up more
\series bold
opportunities
\series default
for
\series bold
higher efficiency
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
If anyone is arguing that location transparency were
\emph on
not needed
\emph default
as a major feature, you should check whether such a person is really an
expert.
There needs to be a clear and valid justification for such an opinion.
\end_layout
\begin_layout Standard
Hints for implementation of location transparency are in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Where-implement-Location-Transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Section
What is HA = High Availability
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-HA"
\end_inset
\end_layout
\begin_layout Standard
HA is defined by a single number, denoting the
\emph on
minimum percentage of uptime
\emph default
of a certain system from a user's perspective.
Some examples:
\end_layout
\begin_layout Itemize
99% availability: a total downtime of more than 87.6 hours per year is not
acceptable.
\end_layout
\begin_layout Itemize
99.9% availability: a total downtime of more than 8.76 hours per year is not
acceptable.
\end_layout
\begin_layout Itemize
99.99% availability: a total downtime of more than 52.56 minutes per year
is not acceptable.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not confuse uptime / downtime with service times or similar.
For example, internet services typically can require some seconds until
web pages are appearing on a smartphone.
Thus it does not make sense to define a
\begin_inset Quotes eld
\end_inset
downtime
\begin_inset Quotes erd
\end_inset
or an
\begin_inset Quotes eld
\end_inset
availability
\begin_inset Quotes erd
\end_inset
or even a
\begin_inset Quotes eld
\end_inset
reaction time
\begin_inset Quotes erd
\end_inset
in units of milliseconds.
\end_layout
\begin_layout Paragraph*
Requirements vs Solutions
\begin_inset CommandInset label
LatexCommand label
name "par:Requirements-vs-Solutions"
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
HA is a
\series bold
requirement
\series default
.
Requirements are characterizations of the
\series bold
problem space
\series default
.
In software engineering, requirements are
\emph on
strictly separated
\emph default
from any measures, how a requirement can be met (solution space).
In general, there may be
\emph on
several
\emph default
solutions for achieving a certain HA percentage.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Some of the potential solutions for the same HA percentage may be much
more
\series bold
expensive
\series default
than others, sometimes by
\emph on
factors
\emph default
.
We will see some examples later.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Expensive Over-Engineering Pitfalls
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Some people are arguing
\emph on
incorrectly
\emph default
, by claiming that
\emph on
any
\emph default
HA solution would
\emph on
need
\emph default
to be built by
\emph on
hardware redundancy.
\emph default
Some people even believe that redundancy would be needed at
\emph on
each and every single hardware component
\emph default
, otherwise it would not be HA.
This confuses requirements with solutions.
It is wrong in general, because even a certain degree of hardware redundancy
cannot guarantee a certain overall hard+software HA percentage in general,
for example when certain components such as failover software are not reliable
enough.
See section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sub:Detailed-explanation"
plural "false"
caps "false"
noprefix "false"
\end_inset
for another counter-example, where addition of more hardware redundancy
\begin_inset Formula $>k$
\end_inset
does not help.
Of course, higher degrees of HA are
\emph on
typically(!)
\emph default
built using certain types and degrees of hardware redundancy, including
variants like geo-redundancy.
In general, however, there might be other means for achieving HA, like
extremely quick automatic repair methods, self-healing
\begin_inset Foot
status open
\begin_layout Plain Layout
This is no joke.
For example, certain spacecraft need to run for years or even for decades,
without any maintenance.
Thus it helps enormously when some of their components are self-healing,
for example certain surfaces or shields after a hit by micrometeorites.
\end_layout
\end_inset
systems, etc.
\end_layout
\end_inset
\end_layout
\begin_layout Section
What is Geo-Redundancy
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-Geo-Redundancy"
\end_inset
\end_layout
\begin_layout Standard
From the technical viewpoint of HA, geo-redundancy belongs to the
\emph on
solution space
\emph default
.
From the viewpoint of
\series bold
government authorities
\series default
, and/or from
\series bold
owners
\series default
of a company / rating agencies determining the
\series bold
business risk
\series default
and the
\series bold
stock exchange value
\series default
of a company, it is also a
\emph on
requirement
\emph default
.
\end_layout
\begin_layout Standard
Geo-redundancy means that the
\series bold
risk
\series default
of certain types of geo-localized
\series bold
physical impacts
\series default
, such as earthquakes, floods, terrorist attacks, cascading mass power blackouts
, etc, must be
\series bold
compensated
\series default
by being able to run at least the
\series bold
core business
\series default
from another geo-location within some reasonable timeframe.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Loss of Datacenters
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
Here is the basic idea of geo-redundancy between two datacenters A and B
during
\emph on
ordinary operations
\emph default
:
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename images/example-geo-datacenters.fig
width 70line%
\end_inset
\end_layout
\begin_layout Plain Layout
Here are
\series bold
two possible geo-disaster scenarios
\series default
which can be compensated by well-implemented geo-redundancy:
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename images/example-geo-disaster-1.fig
width 70line%
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\series bold
\size largest
OR
\emph on
not predictable
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename images/example-geo-disaster-2.fig
width 70line%
\end_inset
\end_layout
\begin_layout Plain Layout
On top of these example geo-disasters, which may last several days / weeks
/ months / or
\series bold
forever
\series default
,
\emph on
ordinary incidents
\emph default
may
\emph on
additionally
\emph default
occur at the surviving datacenter.
\end_layout
\begin_layout Plain Layout
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Plain Layout
There is at least one more possible scenario, which also belongs to the
realm of geo-redundancy, although some people may not count the following
as a
\begin_inset Quotes eld
\end_inset
geo
\begin_inset Quotes erd
\end_inset
disaster but instead may talk about a
\emph on
(long-lasting) network outage
\emph default
or a long-lasting
\emph on
cable incident
\emph default
or
\series bold
connection loss by earthquake
\begin_inset Foot
status open
\begin_layout Plain Layout
Example: please ask serious geologists about earthquake risks from the San
Andreas Fault (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/San_Andreas_Fault
\end_layout
\end_inset
), or any other risky
\emph on
places
\emph default
over the world.
\end_layout
\end_inset
\series default
or similar:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\series bold
\size largest
OR
\emph on
not predictable
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename images/example-geo-disaster-3.fig
width 70line%
\end_inset
\end_layout
\begin_layout Plain Layout
Serious geo disasters / loss of datacenters have happened several times
in history.
For example, the 2021 flood in the German Ahrtal (Ahr valley) has destroyed
several local datacenters, even governmental ones e.g.
from local tax authorities.
Some of these datacenters have been
\series bold
physically lost forever
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Scientists are predicting that climate change will
\series bold
increase the risk
\series default
of big floods.
The so-called
\emph on
jet streams
\emph default
are weakening, such that low-pressure zones are typically moving only slowly,
or even can stay resident at the same geo-position for
\emph on
weeks
\emph default
.
This has happened in the Ahrtal.
If a similar weather scenario had rained down its
\emph on
hundreds
\emph default
of liters per square meter south of Stuttgart, around the Neckar valley,
just a few hundred kilometers away from the Ahr valley disaster, relevant
parts of the German industry could have been lost, including relevant parts
of international companies like Daimler-Chrysler.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
This risk increase is not limited to Europe, but a
\series bold
global risk
\series default
.
Around 2021, many US citizens have noticed that they are also affected
by increased weather risks.
President Biden reacts differently from president Trump.
Large parts of the industry, as well as stock exchange markets are also
adapting strategies.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Other geo-disasters like devastating high-magnitude earth quakes can for
example happen in the Rhine valley, which is known for a history of suchalike
events around once per century.
The last one in Basel is now ~100 years ago.
The next bigger one looks
\emph on
overdue
\emph default
in statistical average (depending on expected magnitude), and may potentially
happen anywhere in the long rift valley, even near Cologne, or in the Netherlan
ds.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
There is no doubt that Japan and many other areas in the Pacific are active
earth quake zones.
Even the US has geo disaster zones (e.g.
around Yellowstone).
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Implementation of
\emph on
real
\begin_inset Foot
status open
\begin_layout Plain Layout
Some advocates are trying to sell their so-called
\begin_inset Quotes eld
\end_inset
geo-redundant
\begin_inset Quotes erd
\end_inset
solutions or strategies although they cannot actually cope with true geo
disasters.
\end_layout
\end_inset
\emph default
geo-redundancy has
\series bold
pitfalls
\series default
and needs high-grade methods and skills.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
This guide gives you hints on technical
\series bold
prevention strategies
\series default
against (partial or full) datacenter losses, provided you are investing
the time to read it
\emph on
carefully
\emph default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
There are some ongoing political discussions about detail requirements for
geo-redundancy.
The minimum distance requirement between suitable geo-locations is seen
differently by different interest groups, and even differently in different
countries.
Some backgrounds:
\end_layout
\begin_layout Itemize
\noindent
The
\series bold
enormous cost
\series default
for setup of a new datacenter.
\end_layout
\begin_layout Itemize
\noindent
Several
\emph on
commercial
\emph default
geo-redundancy solution
\emph on
components
\emph default
are
\series bold
more expensive
\series default
with increasing geo-distance, sometimes by
\emph on
factors
\emph default
, or even
\emph on
inapplicable
\emph default
at all for longer distances (e.g.
bad historical experiences with
\begin_inset Formula $>50$
\end_inset
km), while another OpenSource component like MARS does not functionally
depend on
\emph on
arbitrary
\emph default
geo distances
\emph on
by construction
\emph default
.
\end_layout
\begin_layout Standard
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
sloppy
\end_layout
\end_inset
While some NGOs = Non-Governmental Organizations are fighting for a minimum
distance of only 5 km, the German government authority BSI recommends a
minimum distance of 200 km between datacenters for
\series bold
critical infrastructures.
\series default
See
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Sicherheitsberatung/Standort-Kr
iterien_HV-RZ/Standort-Kriterien_HV-RZ.pdf?__blob=publicationFile&v=5
\end_layout
\end_inset
.
Although this is only a
\begin_inset Quotes eld
\end_inset
recommendation
\begin_inset Quotes erd
\end_inset
officially, certain sectors like
\series bold
banking
\series default
are actually forced to treat this more or less like a requirement.
\end_layout
\begin_layout Standard
For an observer, it could be interesting how
\emph on
international requirements
\emph default
will evolve, and how rating agencies will change their rules during the
course of the next decades.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Opportunities
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice that not only the same family of requirements can be solved
\emph on
very
\emph default
differently, but also some
\emph on
combinations of different requirements
\emph default
.
This guide explains ways for both
\series bold
cost reduction
\series default
and
\series bold
risk reduction
\series default
at the same time, by
\emph on
combining
\emph default
HA requirements with geo-redundancy requirements in a clever way, such
that the combined solution will meet both at the same time.
\end_layout
\begin_layout Plain Layout
\noindent
Example: a resulting combined solution is called
\series bold
Football on top of MARS
\series default
.
It provides additional operational value, such as load balancing via the
\series bold
ability for butterfly
\series default
, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Section
What is a Trilemma?
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-a-Trilemma"
\end_inset
\end_layout
\begin_layout Standard
While
\begin_inset Quotes eld
\end_inset
dilemma
\begin_inset Quotes erd
\end_inset
is a common term,
\begin_inset Quotes eld
\end_inset
trilemma
\begin_inset Quotes erd
\end_inset
is less known.
Managers and other decision makers may
\series bold
risk massive consequences
\series default
, such as
\series bold
going out of business
\series default
, when a
\emph on
certain
\emph default
trilemma is
\emph on
unknown
\emph default
and/or
\emph on
not considered appropriately
\emph default
.
\end_layout
\begin_layout Standard
\noindent
\align center
In short:
\series bold
\size larger
pick any two
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
Management Trilemma
\series bold
Quality-Cost-Delivery
\series default
\size footnotesize
aka
\series bold
Quick-Cheap-Good
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/trilemma-quality-cost-delivery.fig
width 45col%
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
If you switch your management strategy too fast or too often, several
\series bold
employees
\series default
may
\series bold
leave your company each time
\series default
.
People have
\emph on
adapted
\emph default
to the old situation, and cannot switch quickly to a new strategy.
For example, some people have invested large parts of their professional
life into
\begin_inset Quotes eld
\end_inset
Good
\begin_inset Quotes erd
\end_inset
delivery but not
\begin_inset Quotes eld
\end_inset
Fast
\begin_inset Quotes erd
\end_inset
delivery, e.g.
in areas where
\begin_inset Quotes eld
\end_inset
Fast
\begin_inset Quotes erd
\end_inset
is simply
\emph on
impossible
\emph default
.
Such people will then likely go to another employer where their skills
are a better fit.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Datacenter and Geo-Redundancy Trilemma: CAP-Theorem
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/cap-theorem.fig
width 60col%
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset VSpace -6mm
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Do not try to achieve the impossible.
You
\emph on
cannot
\emph default
pick
\emph on
all
\emph default
three.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
It
\emph on
might
\emph default
be possible to change the CAP property of a
\series bold
big operational system
\series default
(e.g.
huge local and/or geo-redundant datacenters) under certain circumstances,
but it
\emph on
can easily
\emph default
grow into
\emph on
more
\emph default
risk during a transitional phase, and will likely eat a lot of time and
resources.
\end_layout
\begin_layout Plain Layout
Details: section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
More examples:
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Trilemma
\end_layout
\end_inset
\end_layout
\begin_layout Section
What is
\emph on
Cloud Storage
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-Cloud-Storage"
\end_inset
\end_layout
\begin_layout Standard
According to a popular definition from
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Cloud_storage
\end_layout
\end_inset
(retrieved June 2018), cloud storage is
\end_layout
\begin_layout Description
(1) Made up of many
\series bold
distributed resources
\series default
, but still
\series bold
acts as one
\series default
.
\end_layout
\begin_layout Description
(2) Highly
\series bold
fault tolerant
\series default
through redundancy and distribution of data.
\end_layout
\begin_layout Description
(3) Highly
\series bold
durable
\series default
through the creation of versioned copies.
\end_layout
\begin_layout Description
(4) Typically
\series bold
eventually consistent
\series default
with regard to data replicas.
\end_layout
\begin_layout Standard
A detailed analysis of consequences from this definition is in sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Suitability-of-Architectures"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Section
What is SDS = Software Defined Storage
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-Software-defined-Storage"
\end_inset
\end_layout
\begin_layout Standard
As explained in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Software-defined_storage
\end_layout
\end_inset
, SDS is a
\series bold
marketing term
\series default
, subsuming a wide variety of offerings from several
\emph on
vendors
\emph default
.
\end_layout
\begin_layout Standard
In essence, it can be
\emph on
almost anything
\emph default
from the storage area, where hardware can be treated independently from
software, or at least some software configuration is available.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\noindent
Even a
\begin_inset Quotes eld
\end_inset
simple
\begin_inset Quotes erd
\end_inset
HDD = Hard Disk Drive device has not only some
\series bold
network interface
\series default
(typically SATA or SAS in place of Ethernet), but also contains some software
called firmware, which
\emph on
could
\emph default
(at least potentially) be exchanged independently.
Believe it or not: even such a
\begin_inset Quotes eld
\end_inset
simple hardware
\begin_inset Quotes erd
\end_inset
device is providing
\series bold
storage virtualization
\series default
, although a rather primitive one.
For example, it maps logical sector numbers (LBNs) to physical coordinates
like CHS = Cylinder / Head / Sector, or similar.
Newer 4k sector disks can emulate old 512 byte sector formats, etc.
Thus such devices would match the fuzzy Wikipedia description of SDS.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In practice, the term SDS is a
\series bold
tautology
\series default
because it can mean almost anything from the storage area, thus the term
is not really useful.
\end_layout
\begin_layout Standard
In order to talk about SDS in technical terms of architecture, here is an
\emph on
attempt
\emph default
to somehow narrow it down, and to somehow relate it to Cloud Storage:
\end_layout
\begin_layout Quote
SDS (in the sense of this guide) is a Cloud Storage system.
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Treating SDS as equivalent to Cloud Storage makes it more useful, but neglects
the opportunity for defining something useful in between Cloud Storage
and
\begin_inset Quotes eld
\end_inset
anything
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Standard
Notice that a Wikipedia search
\begin_inset Quotes eld
\end_inset
storage as a service
\begin_inset Quotes erd
\end_inset
(which could be abbreviated StaaS) is delivering a redirection to
\begin_inset Quotes eld
\end_inset
Cloud Storage
\begin_inset Quotes erd
\end_inset
.
Another missed opportunity for getting some useful structure into the
\series bold
wild-growing jungle
\series default
, and for clearly explaining differences, and for a fruitful discussion
of pros and cons.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Remark
\end_layout
\end_inset
This is an indicator that the storage area is not really mature.
There are more short-sighted hypes than fundamental concepts.
This architecture guide is an attempt to guide
\begin_inset Foot
status open
\begin_layout Plain Layout
German saying, semantically translated to English:
\begin_inset Quotes eld
\end_inset
You cannot see the forest because there are too many trees in front of it.
\begin_inset Quotes erd
\end_inset
\end_layout
\end_inset
you through the hype jungle in a structured way.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Indirect cost of hypes
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Beware of hyped buzzwords like
\begin_inset Quotes eld
\end_inset
storage as a service
\begin_inset Quotes erd
\end_inset
.
It narrows your attention to network-centric architectures, and distracts
your attention from major cost saving opportunities like
\family typewriter
LocalSharding
\family default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\end_inset
\end_layout
\begin_layout Section
What is Scalability
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-Scalability"
\end_inset
\end_layout
\begin_layout Standard
If you know the zones from the following picture, and if you know what the
red
\family sans
\series bold
\color red
X
\family default
\series default
\color inherit
means, you may skip this section:
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/principle-scalability.fig
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
The term
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
can be
\series bold
\emph on
dangerous
\emph default
for your business
\series default
.
Why?
\end_layout
\begin_layout Standard
Many technical people know that
\emph on
tightly coupled
\emph default
computers like SMP = Symmetric Multi Processing systems or NUMA = Non-Uniform
Memory Access systems have some
\series bold
scalability limit
\series default
, also called
\emph on
bottleneck
\emph default
(or
\emph on
RAM bottleneck
\emph default
or
\emph on
von-Neumann bottleneck
\emph default
or similar).
\end_layout
\begin_layout Standard
Unfortunately, this can induce a
\series bold
common mis-belief
\series default
: just design a
\emph on
loosely coupled
\emph default
cluster or grid, and then any scalability limit would be gone; such a system
would have
\begin_inset Quotes eld
\end_inset
no scalability limits
\begin_inset Quotes erd
\end_inset
.
Such a belief is
\series bold
fundamentally broken
\series default
and may induce
\emph on
massive
\emph default
problems and/or cost.
\end_layout
\begin_layout Standard
The fundamental terms
\begin_inset Quotes eld
\end_inset
tightly coupled
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
loosely coupled
\begin_inset Quotes erd
\end_inset
are
\series bold
important
\series default
to know, also for
\emph on
responsible persons
\emph default
like managers.
They can be found in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Multiprocessing#Processor_coupling
\end_layout
\end_inset
.
In 2021, this article mentions an interesting experience, with some emphasis
added by me:
\end_layout
\begin_layout Quote
Tightly coupled systems
\series bold
perform better
\series default
and are physically smaller than loosely coupled systems, but have
\emph on
historically(!)
\emph default
required greater initial investments and
\emph on
may
\emph default
depreciate rapidly [...]
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Nowadays, the
\emph on
historic part
\emph default
of this experience does not apply anymore.
Today's standard servers are
\emph on
tightly coupled
\emph default
within their enclosure, for ordinary prices.
Servers with more than 100 standard SMP CPUs are not extremely expensive,
anymore, while even small loosely coupled
\series bold
clusters
\series default
are typically
\series bold
much more expensive
\series default
.
In a datacenter, rackspace and network cost are not negligible, as well
as further contributors to TCO = Total Cost of Ownership.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Horizontal vs Vertical Scaling, alias scale-out vs scale-up
\end_layout
\end_inset
Some presentations are arguing with questionable claims about
\begin_inset Quotes eld
\end_inset
horizontal scalability
\begin_inset Quotes erd
\end_inset
\begin_inset Formula $\cong$
\end_inset
\begin_inset Quotes eld
\end_inset
scale-out
\begin_inset Quotes erd
\end_inset
vs
\begin_inset Quotes eld
\end_inset
vertical scalability
\begin_inset Quotes erd
\end_inset
\begin_inset Formula $\cong$
\end_inset
\begin_inset Quotes eld
\end_inset
scale-up
\begin_inset Quotes erd
\end_inset
.
These compound notions belong to a
\emph on
taxonomy
\emph default
.
Their origin is from
\emph on
scaling methods
\emph default
, which is not the same as
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
(notice the grammatical suffixes -ing vs -ability).
A rough explanation can be found in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Scalability#Horizontal_(scale_out)_and_vertical_scal
ing_(scale_up)
\end_layout
\end_inset
(retrieved in Feb 2022).
Essentially, these notions refer to
\begin_inset Quotes eld
\end_inset
loosely coupled
\begin_inset Quotes erd
\end_inset
vs
\begin_inset Quotes eld
\end_inset
tightly coupled
\begin_inset Quotes erd
\end_inset
, and thus they belong to the
\series bold
solution space
\series default
,
\emph on
not
\emph default
to requirements.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
As explained in paragraph
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Requirements-vs-Solutions"
plural "false"
caps "false"
noprefix "false"
\end_inset
and several sections like
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Requirements-for-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
, mix-up of requirements with solutions can have
\series bold
adverse effects
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In practice, scale-up by addition of
\begin_inset Quotes eld
\end_inset
more disks
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
more RAM
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
more CPUs
\begin_inset Quotes erd
\end_inset
depends on already deployed hardware.
HDD or SSD addition is possible during operations, provided you have bought
some enclosures with hot-swappable free slots.
RAM or CPU power requires more preparations.
While Intel- or AMD-based standard servers are inducing an effort similar
to a repair, there exists
\emph on
specialized hardware
\emph default
like IBM mainframes which are
\emph on
constructed
\emph default
for scale-up during 24/7/365 operations.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not mix up different hardware classes like mainframes vs contemporary
servers.
\end_layout
\begin_layout Plain Layout
Following are more adverse effects, by incorrect usage of the term
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
:
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Some people believe that the term
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
would be
\emph on
defined as
\emph default
\begin_inset Quotes eld
\end_inset
there are no limits
\begin_inset Quotes erd
\end_inset
.
Such a belief is not only fundamentally wrong, it is
\series bold
dangerous
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Failures and Indirect Cost by ill-usage of
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
\series bold
Scalability isn't for free.
\series default
Mixup of terms and their meaning can easily endanger your company.
We will see an example later.
So: what does
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
or its fuzzy marketing descendants like
\begin_inset Quotes eld
\end_inset
planet scalability
\begin_inset Quotes erd
\end_inset
\emph on
really
\emph default
mean?
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
or
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
: There is no valid general definition of
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
for all science areas.
Even when narrowed down to
\emph on
computer science
\emph default
, there is no generally accepted unique definition.
Even in certain sub-areas like
\emph on
storage systems
\emph default
or certain
\emph on
application classes
\emph default
, the term
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
can easily
\series bold
mislead
\series default
you in your role as a
\series bold
\emph on
responsible manager
\emph default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
For some people like sales persons or so-called
\begin_inset Quotes eld
\end_inset
evangelists
\begin_inset Quotes erd
\end_inset
or advocates / influencers, this creates an
\emph on
opportunity
\emph default
for their
\emph on
marketing
\emph default
(cf footnote
\begin_inset CommandInset ref
LatexCommand vref
reference "fn:faked-scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
Internal projects can be argued
\begin_inset Quotes eld
\end_inset
more convincing
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
more promising
\begin_inset Quotes erd
\end_inset
.
External products or solutions can be sold more easily.
This may
\series bold
tear out money from your pocket
\series default
.
It may imply further bad effects, e.g.
by
\series bold
creation of impressions
\series default
(or un-proved or un-provable claims) about
\end_layout
\begin_layout Itemize
\series bold
arbitrary(!) scalability
\series default
of certain architectures / solution classes / products / etc, and/or
\end_layout
\begin_layout Itemize
\series bold
marketing buzzwords
\series default
, e.g.
like
\begin_inset Quotes eld
\end_inset
planet scale
\begin_inset Quotes erd
\end_inset
(e.g.
avoiding the -ing or -ability so you cannot check or disprove it), and/or
\end_layout
\begin_layout Itemize
several further
\series bold
mis-guidings
\series default
, some of them looking like
\emph on
deliberately not corrected
\emph default
even when counter-arguments had been published (see examples in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Scalability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
According to many physical laws like
\series bold
limited speed of light
\series default
or
\series bold
limited density of matter
\series default
,
\begin_inset Quotes eld
\end_inset
infinite resources
\begin_inset Quotes erd
\end_inset
do not exist.
Thus:
\end_layout
\begin_layout Plain Layout
\align center
\series bold
\size larger
\begin_inset Quotes eld
\end_inset
Infinite
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
Arbitrary
\begin_inset Quotes erd
\end_inset
Scalability does not exist.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
A
\emph on
draft
\emph default
of an
\emph on
attempt
\emph default
for a
\emph on
hopefully
\emph default
better definition of
\begin_inset Quotes eld
\end_inset
*scalable
\begin_inset Quotes erd
\end_inset
can be found in appendix
\begin_inset CommandInset ref
LatexCommand vref
reference "chap:Definition-of-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
According to Einstein and many other fundamental laws,
\emph on
every
\emph default
system on this globe has
\series bold
\emph on
always
\emph default
some scalability limit
\series default
, somewhere.
Even the internet has some limit.
Scalability is
\emph on
always
\emph default
a
\series bold
non-linear
\series default
behaviour.
In general, it cannot be computed or fully predicted due to
\emph on
unknown influence factors
\emph default
.
As a manager, you are trained to deal with
\begin_inset Quotes eld
\end_inset
unknowns
\begin_inset Quotes erd
\end_inset
.
As an architect, you need to deal with this also.
\end_layout
\begin_layout Quote
\noindent
\series bold
\size larger
In order to find a
\emph on
practical
\emph default
limit, you must
\emph on
reach
\emph default
it.
\end_layout
\begin_layout Standard
Consequence: there exists no general
\begin_inset Quotes eld
\end_inset
unlimited scalability
\begin_inset Quotes erd
\end_inset
, only some
\begin_inset Quotes eld
\end_inset
currently scalable
\begin_inset Quotes erd
\end_inset
systems which have
\series bold
not yet
\begin_inset Foot
status open
\begin_layout Plain Layout
There are
\emph on
some
\emph default
cases where the scalability of a certain system is
\emph on
estimated(!)
\emph default
as so high that it will likely never be reached.
Notice: this is not a
\emph on
proof
\emph default
, but just a
\emph on
prediction
\emph default
.
\end_layout
\end_inset
reached their scalability limit
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Example: Fork Bombs and OOM Killer
\end_layout
\end_inset
If
\begin_inset Quotes eld
\end_inset
unlimited scalability
\begin_inset Quotes erd
\end_inset
were possible, then it
\emph on
must
\emph default
be possible to literally
\begin_inset Quotes eld
\end_inset
run a fork bomb
\begin_inset Foot
status open
\begin_layout Plain Layout
Besides classical attacks via fork bombs, there may exist
\emph on
valid use cases
\emph default
for your customers or websurfers for running an
\emph on
application
\emph default
which behaves
\emph on
similar to
\emph default
a fork bomb.
Details are out of scope here.
By ruling out such-alike
\begin_inset Quotes eld
\end_inset
applications
\begin_inset Quotes erd
\end_inset
e.g.
via customer contract, you can avoid much of the adverse effects.
\end_layout
\end_inset
\begin_inset Quotes erd
\end_inset
on your system, whether it runs on a NUMA box providing
\begin_inset Quotes eld
\end_inset
vertical scalability
\begin_inset Quotes erd
\end_inset
, or on a BigCluster providing
\begin_inset Quotes eld
\end_inset
horizontal scalability
\begin_inset Quotes erd
\end_inset
, or any combination, or whatever else.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
\series bold
Infiniteness
\series default
or
\series bold
\begin_inset Quotes eld
\end_inset
unlimited
\begin_inset Quotes erd
\end_inset
\series default
is a
\emph on
very bold claim
\emph default
.
Do not promise suchalike.
Otherwise, you may need to explain why something does not work as expected,
e.g.
\end_layout
\begin_layout Itemize
when your
\series bold
Linux kernel
\series default
reacts with customer-visible major slowdown, or with OOM Killer (= Out Of
Memory Killer), and/or
\end_layout
\begin_layout Itemize
when your
\series bold
BigCluster
\series default
is very quickly running out of capacity, and/or
\end_layout
\begin_layout Itemize
when your
\series bold
network
\series default
is running out of capacity, and/or
\end_layout
\begin_layout Itemize
when your
\series bold
budget
\series default
is running out of capacity, and/or
\end_layout
\begin_layout Itemize
when your customers can
\series bold
sue
\series default
you for unmet contracts, and/or
\end_layout
\begin_layout Itemize
other
\series bold
relevant topics
\series default
.
\end_layout
\begin_layout Plain Layout
Consequence: you might be forced to admit that
\begin_inset Quotes eld
\end_inset
arbitrary scalability
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
planet scalability
\begin_inset Quotes erd
\end_inset
(or whatever you name it)
\emph on
cannot exist
\emph default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Another type of boldness: do not argue at
\emph on
architectural layer
\emph default
that one of the above
\emph on
examples
\emph default
(e.g.
an OOM killer) would
\begin_inset Quotes eld
\end_inset
show
\begin_inset Quotes erd
\end_inset
that
\begin_inset Quotes eld
\end_inset
horizontal scaling
\begin_inset Quotes erd
\end_inset
would be better than
\begin_inset Quotes eld
\end_inset
vertical scaling
\begin_inset Quotes erd
\end_inset
, or would even
\begin_inset Quotes eld
\end_inset
prove
\begin_inset Quotes erd
\end_inset
the opposite, etc.
It is simply
\emph on
unfair
\emph default
to compare
\emph on
incomparable
\begin_inset Foot
status open
\begin_layout Plain Layout
Even some more or less
\begin_inset Quotes eld
\end_inset
comparable
\begin_inset Quotes erd
\end_inset
solutions like
\family typewriter
nginx
\family default
vs
\family typewriter
Apache
\family default
, or mismatches like solutions vs solution
\emph on
classes
\emph default
like Docker-based
\family typewriter
AppEngine
\family default
s vs
\emph on
whatever
\emph default
, should not be used for an
\emph on
architectural
\emph default
argumentation.
\end_layout
\end_inset
solutions
\emph default
with each other, e.g.
by presenting
\emph on
examples
\emph default
of some operational incidents, or similar.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Always take
\series bold
important given context
\series default
(like the use case, your workload, classes of customers, etc)
\series bold
and their distribution / amounts of resources
\series default
into account, as well as
\series bold
all relevant parameters
\begin_inset Foot
status open
\begin_layout Plain Layout
Be sure to check what is
\emph on
really
\emph default
given, and what is predicted by whom, and what is
\emph on
really
\emph default
required, and what is just a
\begin_inset Quotes eld
\end_inset
wish
\begin_inset Quotes erd
\end_inset
.
\end_layout
\end_inset
\series default
like TCO etc.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Do
\series bold
\emph on
not
\emph default
assume linearity
\series default
in complex systems, consisting of a
\emph on
multitude
\emph default
of layers, and/or hundreds of software components needing
\emph on
runtime interactions
\emph default
, typically based on
\series bold
message passing
\series default
(e.g.
over
\emph on
virtual
\emph default
networks).
OOM or communication overload /
\series bold
queueing
\series default
/ deadlocks /
\series bold
self-amplification
\series default
/ message floods / etc aren't the only pitfalls.
Real systems have
\emph on
many
\emph default
places where non-linear or
\series bold
disruptive behaviour
\series default
can appear under (sometimes
\emph on
unpredictable
\emph default
)
\series bold
side conditions
\series default
like (customer-induced) bugs, attacks, etc, which may
\series bold
spread out
\series default
(e.g.
error propagation,
\series bold
incident propagation
\series default
, etc).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
Example: Scalability of MARS
\end_layout
\end_inset
The current
\emph on
official
\emph default
scalability limits of MARS are documented in the appendix of mars-user-manual.pd
f.
They are following a
\emph on
conservative communication strategy
\emph default
, not intended for marketing.
Since MARS is a software component, other factors like hardware properties
cannot be documented in
\family typewriter
mars-user-manual.pdf
\family default
, but only
\emph on
estimated from experiences
\emph default
.
Some
\emph on
practical experiences
\emph default
for
\emph on
certain hardware
\emph default
are documented in the ChangeLog.
These
\emph on
practical
\emph default
limits are
\emph on
much better
\emph default
, but they are depending on the released kernel and MARS version, and on
many other factors.
It is anything but easy to provide
\emph on
generic information
\emph default
on scalability of a certain more or less generic product.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
More
\emph on
examples
\emph default
, some of them showing
\series bold
fatal and/or disruptive incidents
\series default
, and even requiring (major)
\series bold
architectural changes
\series default
in their specific example use case, can be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
Once again, here is the general picture of the important
\series bold
Zones of Scalability
\series default
as explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Scalability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/principle-scalability.fig
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Importance of Scalability Limits
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not believe any people who are not aware of the
\emph on
fact
\emph default
that
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
of a certain offering can only be
\emph on
claimed
\emph default
when the
\series bold
Scalability Limit
\series default
is reported and
\emph on
checkable
\emph default
for a
\series bold
certain workload
\series default
and/or for a
\series bold
workload class
\series default
and/or for an
\series bold
application class
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
\emph on
Ensure
\emph default
that you are
\emph on
really
\emph default
talking about a
\emph on
relevant
\emph default
workload and/or (application) class, fitting your needs.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
When you
\series bold
reach a scalability limit
\series default
for a relevant product operated on system
\begin_inset Formula $S_{i}$
\end_inset
, and when its
\emph on
upgrade
\emph default
to the future workload
\begin_inset Formula $W_{j}$
\end_inset
(definitions see appendix
\begin_inset CommandInset ref
LatexCommand vref
reference "chap:Definition-of-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
) would become
\emph on
too expensive
\emph default
, your company is
\emph on
risking
\emph default
the
\series bold
danger
\series default
of
\series bold
loss of a product line
\series default
, or potentially even of going
\series bold
out of business
\series default
when your competition can bypass you dramatically.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
More details can be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Scalability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Section
What is an Object Store
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-Object-Store"
\end_inset
\end_layout
\begin_layout Standard
The following picture explains the typical
\series bold
Abstract Functionality
\series default
differences between contemporary object store
\emph on
implementations
\emph default
and contemporary filesystem
\emph on
implementations
\emph default
.
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/functionality-object-store-vs-filesystems.fig
width 70col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Caution: there exists a wide range of object store implementations,
as well as filesystem implementations (e.g.
academic prototypes vs industrial).
The sizes of the above
\emph on
typical
\emph default
areas
\emph on
cannot
\emph default
tell you too much about
\emph on
implementation efforts
\emph default
as well as
\emph on
maintenance efforts
\emph default
and other efforts, because other factors like
\series bold
strongly coupled vs loosely coupled
\series default
implementations /
\emph on
ease
\emph default
of caching e.g.
in
\series bold
Cache Coherence
\series default
problems / other architectural differences /
\series bold
Consistency Models
\series default
/ experiences of protagonists /
\series bold
maturity
\series default
at multiple layers and on subsystems / size of the
\series bold
developer community
\series default
e.g.
in small vs big projects like the Linux kernel / common code e.g.
among multiple Linux filesystem implementations / etc are
\emph on
typically
\emph default
much more important.
While well-known classical filesystems are mature technology on tightly
coupled local
\begin_inset Foot
status open
\begin_layout Plain Layout
For
\begin_inset Quotes eld
\end_inset
Distributed Systems
\begin_inset Quotes erd
\end_inset
, we don't count low-level
\begin_inset Quotes eld
\end_inset
local
\begin_inset Quotes erd
\end_inset
connections like
\series bold
short-distance
\series default
SAS cables between disk enclosures and servers.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Attention: some evangelists are wrongly claiming that local storage would
be
\begin_inset Quotes eld
\end_inset
too small
\begin_inset Quotes erd
\end_inset
and/or
\begin_inset Quotes eld
\end_inset
unmanageable
\begin_inset Quotes erd
\end_inset
/ etc for too many use cases.
Notice: contemporary
\begin_inset Quotes eld
\end_inset
local
\begin_inset Quotes erd
\end_inset
hardware RAID networks can easily scale up to 1 petabyte or more, and
can provide competitive IOPS rates.
\end_layout
\end_inset
servers, object stores often try to be
\begin_inset Quotes eld
\end_inset
sexy
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
more intelligent
\begin_inset Quotes erd
\end_inset
by
\begin_inset Quotes eld
\end_inset
less functionality at the surface
\begin_inset Quotes erd
\end_inset
on loosely coupled BigCluster designs.
In reality, the latter are typically
\series bold
more cumbersome
\series default
due to
\series bold
less controllable
\series default
concepts like
\begin_inset Quotes eld
\end_inset
eventually consistent
\begin_inset Quotes erd
\end_inset
or
\series bold
high complexity
\series default
, e.g.
in Distributed Systems / at
\emph on
hidden
\emph default
parts / less visible subsystems, leading to various problems in practice.
\end_layout
\begin_layout Standard
More details can be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Negative-Example:-object"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
\emph on
Real
\emph default
Functionality and TCO behind typical Object Stores
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\emph on
Typical
\emph default
object store implementations are approximately nothing else but
\series bold
special cases
\series default
of classical filesystems, when looking at the
\emph on
abstract
\emph default
functionality.
In reality, their
\begin_inset Quotes eld
\end_inset
additional
\begin_inset Quotes erd
\end_inset
functionality is close to negligible.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In many cases, roughly comparable classical filesystems are
\series bold
more mature
\series default
, and/or
\series bold
more reliable
\series default
, and/or
\series bold
cheaper
\series default
in terms of TCO = Total Cost of Ownership.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Don't let yourself be
\series bold
fooled
\series default
by contrary generalized claims.
\series bold
Always check
\series default
such claims in detail, and by
\emph on
real experts
\emph default
who really know and can not only explain the differences both in terms
of
\series bold
Abstract Functionality
\series default
, but also by knowledge and by enough
\emph on
first-hand
\emph default
experience on
\emph on
implementation details
\emph default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
When unsure, read the details from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Negative-Example:-object"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Section
What is Sharding
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-Sharding"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
Sharding is an architecture addressing
\series bold
Distributed System
\series default
s with a certain
\series bold
horizontal scaling
\series default
strategy (also known as
\emph on
scale-out
\emph default
), which has interesting properties for enterprise-critical workloads (provided
that the method is
\emph on
applicable
\emph default
for your use case):
\end_layout
\begin_layout Itemize
\series bold
Minimization
\series default
of
\series bold
disastrous incidents
\series default
and good
\series bold
reliability
\series default
by avoidance of
\emph on
spreading risks
\emph default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Optimum-Reliability-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Itemize
Well-suited for
\series bold
geo-redundancy
\series default
.
\end_layout
\begin_layout Itemize
High
\series bold
scalability
\series default
.
\end_layout
\begin_layout Itemize
\series bold
Minimization
\series default
of
\series bold
TCO
\series default
= Total Cost of Ownership.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\emph on
Example
\emph default
picture, intended for use cases like DropBox & co, typically for use by
\emph on
masses
\emph default
of end users for copies and/or backup of their private filesystem data:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/sharded-object-store-or-filesystem.fig
width 80col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
An example for another use case (Shared Hosting Linux), in production for
years with an SLA of 99.98% end-to-end for currently ~6 million customers
on ~6 petabytes of filesystem data in ~10 billion files (inodes) can
be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Positive-Example:-ShaHoLin"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
A definition of Sharding (for the context of
\emph on
datacenters
\emph default
e.g.
in cloud / hosting / storage systems & co) can be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Distributed-vs-Local:"
plural "false"
caps "false"
noprefix "false"
\end_inset
and its subsection
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Section
What is active-active
\begin_inset CommandInset label
LatexCommand label
name "sec:What-is-active-active"
\end_inset
\end_layout
\begin_layout Standard
A search in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://www.wikipedia.org
\end_layout
\end_inset
for the term
\begin_inset Quotes eld
\end_inset
active-active
\begin_inset Quotes erd
\end_inset
results in no article (status April 2022).
\end_layout
\begin_layout Standard
To the knowledge of the author, there is no commonly accepted scientific
definition.
There are some related definitions in Distributed Systems, like
\series bold
cache coherency
\series default
or
\series bold
DSM
\series default
=
\series bold
D
\series default
istributed
\series bold
S
\series default
hared
\series bold
M
\series default
emory etc, but obviously these terms are less accessible for a broad audience.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Why
\begin_inset Quotes eld
\end_inset
active-active
\begin_inset Quotes erd
\end_inset
can be misleading
\end_layout
\end_inset
This architecture guide is explicitly focussing on
\emph on
\series bold
geo-redundancy
\series default
\emph default
.
By introduction of
\begin_inset Quotes eld
\end_inset
active-active
\begin_inset Quotes erd
\end_inset
into geo-redundancy, the
\emph on
assumption
\begin_inset Foot
status open
\begin_layout Plain Layout
An external source of a (traditional / folklore) definition, cited with
extra emphasis added by me:
\end_layout
\begin_layout Labeling
\labelwidthstring 00.00.0000
Active-Active Redundancy is a high availability mode of operation whereby
two or more instances of the
\series bold
same network device
\series default
or
\series bold
appliance
\series default
serve clients
\series bold
simultaneously and interchangeably
\series default
.
A client can connect to any one instance and have requests served in the
same way.
If any of the
\series bold
identical instances
\series default
go offline, any of the other instances can
\series bold
seamlessly serve requests
\series default
while client sessions persist.
\end_layout
\begin_layout Plain Layout
Commentary: in addition to
\emph on
fundamental
\emph default
contradictions to the CAP theorem, this
\emph on
forces
\emph default
a certain solution class while excluding other solution classes.
When the term
\begin_inset Quotes eld
\end_inset
client
\begin_inset Quotes erd
\end_inset
is borrowed from the client-server computing paradigm, a solution
\emph on
within the same datacenter
\emph default
is typically possible, but outside the scope of this guide on
\emph on
geo
\emph default
-redundancy.
When the term
\begin_inset Quotes eld
\end_inset
client
\begin_inset Quotes erd
\end_inset
is used interchangeably with
\begin_inset Quotes eld
\end_inset
customer using a geo-redundant service
\begin_inset Quotes erd
\end_inset
, the following is an attempted explanation, touching only
\emph on
some
\emph default
key items out of many others:
\end_layout
\begin_layout Itemize
Multiple instances of the
\emph on
same
\emph default
network device [
\begin_inset Formula $\ldots$
\end_inset
] identical instances: forces DSM or its equivalent not only at lower layers
like the storage layer and the computation layer, but also at networking,
e.g.
the application layer, or at least the session layer.
This is by far harder than required by geo-redundancy.
Essentially, the
\emph on
network itself
\emph default
must be HA.
How to achieve this on the
\emph on
entire internet
\emph default
as such?
\begin_inset Newline newline
\end_inset
In contrast, geo-redundancy just requires
\emph on
core business continuity
\emph default
within a
\emph on
reasonable
\emph default
period of time.
Business continuity in the context of internet / hosting companies will
typically
\emph on
not
\emph default
require that customer sessions (e.g.
originating from mobiles like smartphones) and/or http requests (maybe
while in progress?) must
\emph on
seamlessly
\emph default
survive a
\emph on
geo
\emph default
disaster.
\end_layout
\begin_layout Itemize
Multiple instances of the
\emph on
same(!)
\emph default
\series bold
\emph on
appliance
\series default
\emph default
: this may prefer
\emph on
commercial
\emph default
turn-key solutions (so-called
\emph on
Vendor-Lock-In
\emph default
) over (self-managed) OpenSource components supporting an open /
extensible / migratable manner.
\end_layout
\begin_layout Itemize
\begin_inset Formula $\ldots$
\end_inset
can [sic!] connect to
\emph on
any one
\emph default
instance
\begin_inset Formula $\ldots$
\end_inset
: how to achieve this without knowledge on the instance name, excluding
Location Transparency (section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Location-transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
)?
\end_layout
\begin_layout Itemize
Serve [requests] simultaneously: implies
\emph on
full parallelism
\emph default
between different requests, even when needing to work on the
\emph on
virtually same data
\emph default
.
In Computer Science, this broad research area is called differently, like:
\series bold
Formal Semantics
\series default
of
\emph on
parallelism
\emph default
applied to Memory and/or Storage (and/or networking etc).
Thus the results from these scientific disciplines (and more) will hold
in addition, thereby exceeding the scope of this guide
\emph on
by far
\emph default
.
\end_layout
\end_inset
\emph default
could be something like: during
\emph on
ordinary
\emph default
operations,
\emph on
all
\emph default
hardware from
\emph on
all
\emph default
geo-redundant datacenters should be
\begin_inset Quotes eld
\end_inset
fully employed
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Plain Layout
Here is an attempt to explain why the term
\begin_inset Quotes eld
\end_inset
active-active
\begin_inset Quotes erd
\end_inset
is more or less a
\emph on
contradiction
\emph default
to the term
\begin_inset Quotes eld
\end_inset
geo-redundancy
\begin_inset Quotes erd
\end_inset
.
Please recap from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
, while adding a
\emph on
slight
\emph default
enhancement:
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename images/example-geo-disaster-1.fig
width 40line%
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\series bold
\size larger
OR
\emph on
not predictable
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename images/example-geo-disaster-2.fig
width 40line%
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\series bold
\size larger
AND / OR
\emph on
not predictable
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename images/example-geo-disaster-3.fig
width 40line%
\end_inset
\end_layout
\begin_layout Plain Layout
According to the CAP theorem (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
ff), at least the last scenario cannot be
\emph on
productively
\emph default
handled by a so-called
\begin_inset Quotes eld
\end_inset
active-active
\begin_inset Quotes erd
\end_inset
setup between the two datacenters, without
\series bold
causing split brain
\series default
.
\end_layout
\begin_layout Plain Layout
If you want to survive any of the unpredictable datacenter losses via
\begin_inset Quotes eld
\end_inset
active-active
\begin_inset Quotes erd
\end_inset
, then good luck.
Maybe there exists some kind of
\begin_inset Quotes eld
\end_inset
solution
\begin_inset Quotes erd
\end_inset
, but you likely will be on your own upon further on-top incidents, at least
when bound to the so-called
\begin_inset Quotes eld
\end_inset
active-active
\begin_inset Quotes erd
\end_inset
operation mode even
\emph on
during
\emph default
suchalike long-lasting geo-disasters.
Some kind of
\begin_inset Quotes eld
\end_inset
downgrading
\begin_inset Quotes erd
\end_inset
of your active-active setup during a long-lasting geo disaster
\emph on
might
\emph default
be possible in some way, but again: good luck.
Hopefully you have
\emph on
tested in advance
\emph default
all relevant combinations of possible scenarios.
\end_layout
\begin_layout Plain Layout
More details: sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Performance-Risks-Replication-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Global-Strict-Consistency"
plural "false"
caps "false"
noprefix "false"
\end_inset
ff.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
3-fold Datacenter Replication and 2-fold Geo-Disaster
\end_layout
\end_inset
As a hypothetical example of a geo-redundant datacenter setup, according
to some imagined
\emph on
future
\emph default
EU standards, you will have 3 datacenters A and B and C.
Maybe A will reside in the Netherlands, B might be in Spain, and C
might be in Poland:
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename images/example-3x2-geo-disaster.fig
width 70line%
\end_inset
\end_layout
\begin_layout Plain Layout
You are free to populate these 3 datacenters via any affordable strategy,
like:
\end_layout
\begin_layout Description
(a) active-active-active, or
\end_layout
\begin_layout Description
(b) active-passive-passive.
\end_layout
\begin_layout Plain Layout
Please imagine a future geo disaster: a future impact (e.g.
storm flood) will hit your company, such that
\emph on
two
\emph default
datacenters (e.g.
unpredictable A and B) will get lost, and only one of the datacenters will
survive (e.g.
C).
\end_layout
\begin_layout Plain Layout
Answers are left to the reader.
You are free
\begin_inset Foot
status open
\begin_layout Plain Layout
Outside the scope of this guide: active-passive+ can survive
\begin_inset Formula $(k-1)$
\end_inset
unpredictable datacenter losses out of
\begin_inset Formula $k$
\end_inset
datacenters, while active-active+ can only survive
\begin_inset Formula $k/2-\epsilon$
\end_inset
unpredictable losses, where
\begin_inset Formula $\epsilon\approx1$
\end_inset
in practice.
\end_layout
\end_inset
to select solutions, as far as you will bear the consequences.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Personal Advice for Responsibles
\size scriptsize
(from the author of this guide)
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\series bold
KISS
\series default
=
\series bold
K
\series default
eep
\series bold
I
\series default
t
\series bold
S
\series default
imple and
\series bold
S
\series default
tupid
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Simply do
\series bold
not require
\series default
\emph on
both
\emph default
geo-redundancy
\emph on
and
\emph default
active-active,
\emph on
both at the same time
\emph default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Please do yourself a favour by requiring
\emph on
either
\emph default
geo-redundancy,
\emph on
or
\emph default
by requiring the so-called active-active operation,
\emph on
or
\emph default
(maybe) none
\begin_inset Foot
status open
\begin_layout Plain Layout
Beware in bigger organizations and/or broad product spectrum: different
teams (e.g.
when working on different matter) may decide their local solutions differently.
While it is typical for humans to prefer their own (envisioned) solutions,
it
\emph on
may
\emph default
be an
\emph on
advantage
\emph default
for an enterprise (e.g.
when different use cases can justify it, backed by real experiences),
\emph on
or
\emph default
it may become a
\emph on
disadvantage
\emph default
: any cleanup of suchalike individual decisions may become a (future) challenge.
\end_layout
\end_inset
of them.
This will simplify your life.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Theoretically, you may require the so-called active-active operation mode
for
\emph on
each
\emph default
of two data\SpecialChar softhyphen
cen\SpecialChar softhyphen
ters, as a purely
\emph on
local
\emph default
option, while
\emph on
not
\emph default
spreading
\emph on
across
\emph default
the datacenters.
Theoretically this
\emph on
might
\emph default
be possible for
\begin_inset Formula $k=2$
\end_inset
(or even
\begin_inset Formula $k>2$
\end_inset
) total storage replicas
\begin_inset Foot
status open
\begin_layout Plain Layout
Probably,
\begin_inset Formula $k=4$
\end_inset
total storage replicas in a
\begin_inset Formula $2\times2$
\end_inset
scheme
\emph on
might
\emph default
reduce the
\emph on
overall
\emph default
complexity under certain circumstances, but suchalike will drastically
increase TCO = Total Cost of Ownership.
\end_layout
\end_inset
, but I cannot recommend it.
Preparations against only the 3 depicted scenarios are probably not enough,
since there will be many more opportunities for
\series bold
complicated failures
\series default
(e.g.
upon
\emph on
partial
\emph default
failures in some datacenters, maybe so-called
\begin_inset Quotes eld
\end_inset
rolling disasters
\begin_inset Quotes erd
\end_inset
, etc), and for unintended human error, etc.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Hint: please take a deeper look at the so-called
\series bold
Butterfly operation mode
\series default
(section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
ff), which is a
\emph on
generalization
\emph default
of an
\series bold
active-passive
\series default
scheme.
By definition, it can adapt to geo disasters in a flexible way.
\end_layout
\end_inset
\end_layout
\begin_layout Chapter
Architectural Principles and Properties for Geo-Redundancy and its Storages
\begin_inset CommandInset label
LatexCommand label
name "chap:Architectural-Principles-and-Properties"
\end_inset
\end_layout
\begin_layout Standard
Datacenter architects have no easy job.
Building up some petabytes of data in the wrong way can easily endanger
a company, as will be shown later.
There are some architectural laws to know and some rules to follow.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
As a responsible manager, you will make architectural decisions, even if
you are
\emph on
not aware
\emph default
of them.
Bad decisions, even if you are not aware of their consequences, can endanger
major products, and increase cost by
\emph on
factors
\emph default
.
Once you have committed to a certain architecture, it will be
\emph on
extremely cumbersome
\emph default
to modify it later.
Thus you need to get an architecture right from the start.
Typically, you will have
\series bold
only one shot
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
We need to take a look at the most general possibilities how
\emph on
storage
\emph default
can be architecturally designed (independently from considerations on geo-redun
dancy):
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/storage-classification.fig
width 80col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
An important question is: do we always need to access bigger masses of (typicall
y unstructured) data over a network?
\end_layout
\begin_layout Standard
There is a common belief that both reliability and scalability could
only be achieved this way.
In the past, local storage has often been viewed as
\begin_inset Quotes eld
\end_inset
too simple
\begin_inset Quotes erd
\end_inset
to provide enterprise grade reliability, and scalability, and maintainability.
In the past, this was sometimes true.
\end_layout
\begin_layout Standard
However, this picture has changed with the advent of a new
\series bold
load balancing
\series default
method called
\series bold
LV Football
\series default
, see
\family typewriter
football-user-manual.pdf
\family default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
When Football is combined with a
\family typewriter
FlexibleSharding
\family default
architecture (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:FlexibleSharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
), practically the same flexibility as promised by
\family typewriter
BigCluster
\family default
is possible.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Before looking into storage architectures, we need to consider extremely
important top requirements first.
\end_layout
\begin_layout Section
Fundamental Requirements for Geo-Redundancy
\begin_inset CommandInset label
LatexCommand label
name "sec:Requirements-for-Geo-Redundancy"
\end_inset
\end_layout
\begin_layout Subsection
\emph on
Global
\emph default
Strict Consistency and Latencies vs Geo-Redundancy
\begin_inset CommandInset label
LatexCommand label
name "subsec:Global-Strict-Consistency"
\end_inset
\end_layout
\begin_layout Standard
Experienced sysadmins typically know that Strict Consistency
\emph on
between
\emph default
datacenters (e.g.
\emph on
synchronous
\emph default
replication, or their harder variants like active-active, see
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
, or dual-primary, etc) can induce operational problems when run over long
distances, as is
\emph on
necessary
\emph default
for
\emph on
real
\emph default
geo-redundancy (cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
Since this subsection is on
\emph on
requirements from architecture
\emph default
(as opposed to discussions about solutions and implementations), important
questions are:
\end_layout
\begin_layout Description
1)
\begin_inset space ~
\end_inset
Why can geo-redundancy cause latency problems?
\end_layout
\begin_layout Description
2)
\begin_inset space ~
\end_inset
How can latency problems due to geo-redundancy be resolved at architectural
level?
\end_layout
\begin_layout Standard
For 1) there is a simple answer:
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Why can geo-redundancy cause latency problems?
\end_layout
\end_inset
With
\series bold
increasing distance
\series default
, latencies are following
\series bold
Einstein's law
\series default
of the
\series bold
speed of light
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Distance matters.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not forget to determine distances from market demands, and check
\series bold
governmental requirements
\series default
/
\series bold
laws
\series default
.
Otherwise you are risking consequences like going out of business, or even
consequences by court.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\end_layout
\begin_layout Standard
\noindent
For 2) there are multiple answers:
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
How can latency problems due to geo-redundancy be resolved at architectural
level?
\end_layout
\end_inset
Check which architectures may fit your needs for planning and evaluation:
\end_layout
\begin_layout Description
2a
\begin_inset space ~
\end_inset
Low
\begin_inset space ~
\end_inset
performance
\begin_inset space ~
\end_inset
demands: by deciding that Einstein's law and its impact on network latencies
can be accepted.
You are limiting the product space, and probably are increasing some business
risks.
When demands are low enough, almost any architecture, and almost any correspond
ing solution, will become acceptable.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Exception: anything which
\emph on
contradicts
\emph default
geo-redundancy
\emph on
by construction
\emph default
.
This rules out almost all so-called active-active architectures and solutions,
see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
, even for extremely low performance demands.
\end_layout
\begin_layout Description
2b
\begin_inset space ~
\end_inset
Mid
\begin_inset space ~
\end_inset
performance
\begin_inset space ~
\end_inset
demands
\begin_inset space ~
\end_inset
or
\begin_inset space ~
\end_inset
unknown
\begin_inset space ~
\end_inset
demands: please read this guide carefully and think about all pros and cons
in all relevant dimensions of all affordable solutions.
\end_layout
\begin_layout Description
2c
\begin_inset space ~
\end_inset
High
\begin_inset space ~
\end_inset
performance
\begin_inset space ~
\end_inset
demands: concentrate on active-passive architectures (contrary to section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
Inside of active-passive architectures, select only
\series bold
asynchronous solutions
\series default
, because any comparable synchronous solution will necessarily be worse
(due to Einstein's law).
Further topics and answers can be found in this guide.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Good solutions based on good architecture can deliver almost the same (or
sometimes even better) performance for geo-redundant setups, than local
datacenters have delivered in the past, for comparable TCO = Total Cost
of Ownership.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
\emph on
Global
\emph default
Eventual Consistency vs Geo-Redundancy
\begin_inset CommandInset label
LatexCommand label
name "subsec:Global-Eventual-Consistency"
\end_inset
\end_layout
\begin_layout Standard
Some BigCluster advocates are seemingly trying to use their favourite implementa
tion for geo-distribution, and to adjust requirements accordingly.
There is a
\emph on
fundamental
\emph default
misunderstanding about geo-redundancy.
\end_layout
\begin_layout Standard
It does not suffice to distribute for example a Ceph or Swift cluster over
two geo-locations A and B.
Recall the definition of geo-redundancy from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
: it
\emph on
must
\emph default
be possible to run (at least) the core business from
\series bold
either A or B
\series default
, while the respective other location B or A is
\emph on
not available
\emph default
for several days or weeks, or even when the other location is
\series bold
lost forever
\series default
and needs to be re-constructed
\series bold
physically from scratch
\series default
.
\end_layout
\begin_layout Standard
This also applies to
\series bold
partial unavailability
\series default
of a few servers, or of a few racks, or of a few rooms, or of some of the
three power phases, or to corresponding
\emph on
partial
\emph default
permanent losses.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Nobody can know
\emph on
in advance
\emph default
whether (parts of) datacenter B will be
\emph on
lost
\emph default
during an unpredictable geo disaster, or whether it will be A.
\end_layout
\begin_layout Standard
Consequence for requirements:
\emph on
any
\emph default
replication system claiming to support geo-redundancy
\emph on
must
\emph default
have a
\series bold
recovery operation
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Recovery operations in Linux-based OpenSource components for short and long
replication distances
\end_layout
\end_inset
In DRBD or MARS, the recovery operation is called (fast)
\series bold
full-sync
\series default
.
It can be started with commands like
\family typewriter
drbdadm invalidate
\family default
or
\family typewriter
marsadm invalidate
\family default
, or with replica creation operations like
\family typewriter
{drbd,mars}adm join-resource
\family default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Notice: when you have a few petabytes of data, the recovery operation needs
to transfer a non-trivial amount of data over a cross-datacenter bottleneck
(cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and will take a considerable time, typically weeks, up to months.
During all of this, operation must continue.
\end_layout
\begin_layout Standard
Consequence: during recovery, the data must be
\series bold
alterable
\series default
.
In other words, the recovery must work
\emph on
while
\emph default
the data is being modified by your running applications.
Data must remain
\series bold
logically consistent
\series default
during all of this.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\series bold
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Logical consistency before/during/after recovery
\series default
\end_layout
\end_inset
\series default
You
\emph on
may
\emph default
need two different types of consistency models at the same time, but on
different layers:
\end_layout
\begin_layout Description
Customer-visible
\begin_inset space ~
\end_inset
consistency
\begin_inset space ~
\end_inset
model: typically either Strict Consistency, or Eventual Consistency.
Any of them is needed
\emph on
as well as
\emph default
during ordinary operations
\emph on
as well as
\emph default
during geo disasters
\emph on
as well as
\emph default
during geo recovery
\emph on
as well as
\emph default
afterwards.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Never require only Eventually Consistent although your
\emph on
application class
\emph default
/
\emph on
use case
\emph default
actually requires Strict Consistency.
Suchalike would mean that you would accept
\emph on
inconsistent data
\emph default
from the viewpoint of your customers, by construction.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Requiring Strict Consistency as the customer-visible model is always on
the
\series bold
safe side
\series default
.
Why? Because any strictly consistent solution will also fulfill Eventually
Consistent, but vice versa is
\emph on
not
\emph default
possible in general, due to the
\series bold
laws of Mathematics
\series default
.
Never try to circumvent the firm laws of maths!
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\series bold
Overkill avoidance
\series default
by requirement engineering: do
\emph on
not
\emph default
require Strict Consistency
\emph on
between
\emph default
any distinct customers X and Y although these customers have
\emph on
nothing to do with each other, obviously!
\end_layout
\begin_layout Description
Geo-level
\begin_inset space ~
\end_inset
consistency
\begin_inset space ~
\end_inset
model: typically only Eventually Consistent is possible over long geo-distances,
due to physics (aka Einstein's law / speed of light).
This holds for
\emph on
all
\emph default
operational phases, including ordinary operations, during (geo) disasters,
and of course for
\emph on
\emph default
recovery phases after geo disasters, etc.
It also holds for
\emph on
partial
\emph default
failures, e.g.
at room level, failed power phases, etc.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Consequence: the safe side is to require Strict Consistency at the per-customer
visible layer, while at the same time requiring Eventually Consistent for
the geo layer
\emph on
between
\emph default
your datacenters.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not mix up the two layers: geo-layer vs customer-visible layer.
You need to look at
\emph on
both
\emph default
of them.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Recovery after geo disasters is
\emph on
of top importance
\emph default
for anything claiming to provide geo-redundancy.
Thus any
\emph on
potential
\emph default
violations of
\emph on
future
\emph default
customer-visible consistency
\emph on
must
\emph default
also be repaired by the geo-level recovery operation.
Otherwise your seemingly
\begin_inset Quotes eld
\end_inset
repaired
\begin_inset Quotes erd
\end_inset
datacenter is
\series bold
not really repaired
\series default
, but would
\emph on
not
\emph default
be fully usable for future operations, and the geo recovery would have
\emph on
failed
\emph default
in reality.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Do not miss the following
\series bold
requirement
\series default
: the geo-level recovery operation
\emph on
must
\emph default
also deal with the customer-visible consistency model, working on
\emph on
both
\emph default
consistency layers at the same time, and
\emph on
in parallel
\emph default
to each other.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
All of this must be enterprise-grade, meeting appropriate SLAs.
You
\emph on
cannot assume
\emph default
that a certain storage implementation will reliably be able to cope with
geo-failure scenarios, when it isn't
\series bold
explicitly constructed and
\emph on
tested
\emph default
for geo-redundancy
\series default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Critically: in addition to the storage, enough
\series bold
application servers
\series default
(or
\emph on
combined
\emph default
server hardware containing their local storage) must be present at
\emph on
both
\emph default
locations A and B, and they need to know where their corresponding data
is (remote storage and/or local storage).
When the active side is lost by a spontaneous geo-disaster like an earthquake,
all the application servers, their services, networking functionality,
etc, must be successfully restarted at the other location within a reasonable
timeframe.
It must be guaranteed that all relevant servers and services are running
on the right corresponding data, with the right IP addresses, etc.
\end_layout
\begin_layout Standard
All of this needs
\series bold
prepared processes
\series default
in advance, for
\end_layout
\begin_layout Enumerate
coping with planned handover and unplanned failover scenarios to KTLO =
Keep The Lights On, and
\end_layout
\begin_layout Enumerate
later recovery within a reasonable timeframe.
\end_layout
\begin_layout Standard
These are
\series bold
hard requirements
\series default
.
Recommended soft requirements like Ability for Butterfly are described
in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Never use any replication system inside of VMs! Suchalike attempts are
\emph on
fundamentally broken
\emph default
.
See section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Inappropriate-Replication-Layering"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Important Advice on Geo-Redundancy: Time and Cost
\end_layout
\end_inset
When
\series bold
geo-redundancy
\series default
is required for a certain application class, it
\series bold
must be constructed in
\series default
from the very beginning.
\end_layout
\begin_layout Plain Layout
If you believe that geo-redundancy would be an
\emph on
optional feature
\emph default
which could be added later at any time, you will
\series bold
lose a lot of time and money
\series default
.
\end_layout
\begin_layout Plain Layout
Consequence: any (storage / operations / product) strategy in an enterprise
\emph on
must
\emph default
start with the question whether geo-redundancy is required, or not.
\end_layout
\begin_layout Plain Layout
Any error in the requirement will become
\series bold
extremely expensive
\series default
with respect to a close-to-optimal solution, typically by a factor of 2 or more
for TCO.
When selecting an inappropriate storage+application
\emph on
fundamental architecture
\emph default
like BigCluster, it may grow much higher.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Never start with a particular solution in mind.
Determine and update
\series bold
\emph on
serious requirements.
\end_layout
\end_inset
\end_layout
\begin_layout Section
Architectural Properties of Cloud Storage
\emph on
\begin_inset CommandInset label
LatexCommand label
name "sec:Properties-Cloud-Storage"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Pitfall: Use Cases for Cloud Storage
\end_layout
\end_inset
The following
\series bold
compatibility table
\series default
is important for understanding the
\emph on
consequences
\emph default
from combinations of requirements with solutions:
\end_layout
\begin_layout Plain Layout
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="3" columns="3">
<features tabularvalignment="bottom">
<column alignment="center" valignment="top" width="0pt">
<column alignment="center" valignment="top" width="0pt">
<column alignment="center" valignment="top" width="0pt">
<row>
<cell multirow="3" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Box Frameless
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 0
use_makebox 0
width "20text%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\noindent
\align center
Solution
\emph on
is
\series bold
\emph default
\begin_inset Newline newline
\end_inset
Eventually Consistent
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Box Frameless
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 0
use_makebox 0
width "20text%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\noindent
\align center
Solution
\emph on
is
\series bold
\emph default
\begin_inset Newline newline
\end_inset
Strictly Consistent
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Box Frameless
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 0
use_makebox 0
width "20text%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\noindent
\align center
Task
\begin_inset Newline newline
\end_inset
\emph on
permits
\series bold
\emph default
\begin_inset Newline newline
\end_inset
Eventually Consistent
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Box Frameless
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 0
use_makebox 0
width "20text%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
OK
\series bold
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Box Frameless
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 0
use_makebox 0
width "20text%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
OK
\series bold
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Box Frameless
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 0
use_makebox 0
width "20text%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\noindent
\align center
Task
\begin_inset Newline newline
\end_inset
\emph on
requires
\series bold
\emph default
\begin_inset Newline newline
\end_inset
Strictly Consistent
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Box Frameless
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 0
use_makebox 0
width "20text%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 25
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Box Frameless
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 0
use_makebox 0
width "20text%"
special "none"
height "1in"
height_special "totalheight"
thickness "0.4pt"
separation "3pt"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
OK
\series bold
\begin_inset Newline newline
\end_inset
\begin_inset space ~
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Important: from customer view, Eventually Consistent has
\series bold
\emph on
worse
\series default
functional properties
\emph default
than Strictly Consistent.
Do not believe any propaganda claiming the opposite.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Do
\series bold
not
\series default
place the following business tasks on top of Eventually Consistent cloud
storages:
\end_layout
\begin_layout Itemize
\series bold
Email
\series default
or
\series bold
important documents.
\end_layout
\begin_layout Itemize
\series bold
Financial
\series default
data,
\series bold
Invoices
\series default
,
\series bold
Payment
\series default
records, etc.
\end_layout
\begin_layout Itemize
\series bold
Contracts
\series default
/
\series bold
legal
\series default
/ etc.
\end_layout
\begin_layout Itemize
\series bold
Security
\series default
critical /
\series bold
secret
\series default
data.
\end_layout
\begin_layout Itemize
\series bold
Enterprise-critical
\series default
data.
\end_layout
\begin_layout Itemize
Any other
\series bold
\emph on
risky
\series default
\emph default
data.
\end_layout
\begin_layout Plain Layout
Read the following explanation
\emph on
why
\emph default
Cloud Storage
\emph on
allows
\emph default
Eventually Consistent solutions
\series bold
\emph on
by definition
\series default
\emph default
and thus conforms to the
\emph on
left
\emph default
column and its skull symbol:
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Brief recall from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
According to
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Cloud_storage
\end_layout
\end_inset
and several other definitions in the literature, cloud storage is
\end_layout
\begin_layout Description
(1) Made up of many
\series bold
distributed resources
\series default
, but still
\series bold
acts as one
\series default
.
\end_layout
\begin_layout Description
(2) Highly
\series bold
fault tolerant
\series default
through redundancy and distribution of data.
\end_layout
\begin_layout Description
(3) Highly
\series bold
durable
\series default
through the creation of versioned copies.
\end_layout
\begin_layout Description
(4) Typically
\series bold
eventually consistent
\series default
with regard to
\emph on
data replicas
\emph default
.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Several cloud storage solutions are eventually consistent with respect
to
\series bold
data access
\emph on
during
\emph default
\size large
ordinary operations
\series default
\size default
\begin_inset Formula $\Longrightarrow$
\end_inset
although claimed by advocates, such solutions are actually
\emph on
missing(!)
\emph default
the above definition
\begin_inset Formula $\Longrightarrow$
\end_inset
they are
\series bold
actually not
\series default
cloud storage.
Due to
\emph on
implementation policies
\emph default
and/or due to the famous
\emph on
CAP theorem
\emph default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
) they may be even
\emph on
missing
\emph default
the
\emph on
standard
\emph default
storage semantics as explained in academic textbooks / literature on
\emph on
Formal Semantics
\emph default
(outside the scope of this guide
\begin_inset Formula $\Longrightarrow$
\end_inset
ask the
\emph on
real
\emph default
experts).
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Regularly missing the standard storage semantics
\begin_inset Formula $\Longrightarrow$
\end_inset
such a solution is not even a
\begin_inset Quotes eld
\end_inset
storage
\begin_inset Quotes erd
\end_inset
at all.
For example, it may deliver
\emph on
mathematically wrong data
\emph default
at any
\emph on
unpredictable
\emph default
time
\emph on
without notice
\emph default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
What the Definition of Cloud Storage means
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Confusing cloud storage with other types of storage may cause massive
problems.
When unsure, please read the above definition
\emph on
carefully
\emph default
.
\end_layout
\begin_layout Plain Layout
\noindent
Requirement (4) is clearly stating that replicas
\emph on
need not
\emph default
have
\emph on
realtime
\emph default
consistency properties.
Unfortunately, some advocates have incorrectly propagated that replicas
would need to be updated and/or usable for failover in
\emph on
realtime
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
From (4) it becomes clear that failover in
\emph on
realtime
\emph default
to a
\emph on
strictly consistent
\emph default
replica is explicitly
\emph on
not
\emph default
requested.
Requiring suchalike
\emph on
in addition
\emph default
would lead to a
\emph on
contradiction
\emph default
with the above definition.
This extends to
\emph on
eventually consistent
\emph default
.
Even when respecting the CAP theorem (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
) by preferring A over C,
\emph on
realtime
\emph default
requirements for failover to an
\emph on
old
\emph default
version / replica are
\emph on
not
\emph default
implied.
A
\emph on
realtime
\emph default
interpretation of A simply does not make sense in the presence of (3) and
(4).
In order to remain honest and fair, the timescale requirements for achieving
A must not be artificially tightened beyond those implied by (4).
\end_layout
\end_inset
, otherwise it wouldn't be
\begin_inset Quotes eld
\end_inset
cloud storage
\begin_inset Quotes erd
\end_inset
.
Via propagation of a wrong definition at concept or architecture level,
it is possible to screw up whole product lines, at least in the financial
dimension:
\series bold
realtime properties
\series default
are relatively
\series bold
expensive
\series default
to achieve, leading to
\series bold
unnecessary cost increases
\series default
up to
\emph on
orders of magnitude
\emph default
.
It is one of the
\series bold
central ideas
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
Distribution is mentioned in requirements (1) and (2).
According to the CAP theorem and its sister theorems, distribution is even
an
\series bold
antagonist
\series default
to realtime requirements.
\end_layout
\end_inset
\series bold
of cloud storage
\series default
to get
\emph on
rid
\emph default
of realtime requirements at those places where it is reasonable.
For more on (unnecessary) realtime requirements and their financial consequences,
see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The requirement (1)
\begin_inset Quotes eld
\end_inset
act as one
\begin_inset Quotes erd
\end_inset
implies some appropriate type of
\series bold
location transparency
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Location-transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The definition says nothing about the
\series bold
granularity
\series default
/ sizes of the distributed resources.
See section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Granularity-at-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
for a more detailed discussion of opportunities arising from better informed
decisions about this.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice that the term
\begin_inset Quotes eld
\end_inset
network
\begin_inset Quotes erd
\end_inset
does not occur in this definition.
However, the term
\begin_inset Quotes eld
\end_inset
distributed resources
\begin_inset Quotes erd
\end_inset
is implying
\emph on
some(!)
\emph default
kind of network.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The definition does
\emph on
not
\emph default
imply some
\emph on
specific
\emph default
type of network, such as a costly
\series bold
dedicated storage network
\series default
which must be capable of transporting masses of IO operations in
\series bold
realtime
\series default
.
In general, we are free to use other types of networks, such as cheaper
\series bold
replication networks
\series default
, which need not be dimensioned for realtime IO traffic, but are sufficient
for
\series bold
background data migration
\series default
, and even over long distances, where
\emph on
any
\emph default
network has some bottlenecks.
Requirement (4) is even
\emph on
suggesting
\emph default
that costly realtime requirements are not needed everywhere.
See also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Often, there are
\series bold
restrictions from technology
\series default
.
Not every architecture as discussed in this guide can be easily implemented
via a certain technology.
Example: when a so-called
\series bold
Vendor Lock-In
\series default
is binding you to a certain brand of commercial storage boxes, certain
opportunities will be missed.
By going to self-built and self-administered RAID storage, typically an
investment factor between 3 and 10 can be saved (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Technology"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
On top of this, about another factor of 2 is possible, about
\emph on
halving your total hardware investment
\emph default
, by use of Linux-based local storage + Football in place of network-based
commercial storage, provided it is possible for your use case.
See sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Proprietary-vs-OpenSource"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Local-vs-Centralized"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The definition says nothing concrete about the
\series bold
time scale
\series default
of operations, except (4) which is
\emph on
explicitly permitting
\emph default
a relatively coarse timescale for replicas.
We are
\emph on
explicitly encouraged
\emph default
to implement certain operations, such as
\series bold
background data migration
\series default
, in a rather long timescale (from a human point of view).
This bears an opportunity for
\series bold
major cost reduction
\series default
(see relaxation of realtime requirements in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
), as well as
\series bold
improving reliability
\series default
by decreasing dependencies on (hidden) SPOFs
\begin_inset Foot
status open
\begin_layout Plain Layout
Several people appear to work with the
\emph on
assumption
\emph default
that networks are available all the time.
Although minor network outages can be compensated very well, there remains
a
\series bold
residual risk
\series default
for a major outage, similar to what happened in Fukushima.
Thus such an attitude can endanger both companies and careers.
\end_layout
\end_inset
= Single Points Of Failure.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
In-Datacenter
\emph on
storage
\emph default
\begin_inset space ~
\end_inset
network
\begin_inset space ~
\end_inset
failures
\end_layout
\end_inset
It is clear that a failure of a classical storage network will halt all
services depending on it.
Some people believe that realtime storage networks and/or replication networks
could not be avoided, in order to react to (real or assumed)
\emph on
varying
\emph default
load situations, and thus the whole system would be running much faster
due to load distribution.
This is not the full picture:
\end_layout
\begin_layout Enumerate
Football plus FlexibleSharding can achieve a similar level of elasticity.
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Long-distance
\emph on
replication
\emph default
\begin_inset space ~
\end_inset
network
\begin_inset space ~
\end_inset
failures
\end_layout
\end_inset
Football on top of MARS for background LV migration over both short and
geo-distances.
When the replication network is down, it will just pause for a while, and
MARS will automatically resume once the network is up again.
Football can be configured to also resume the higher-level migration process,
when necessary.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
Load distribution is essentially nothing else but a variant of
\series bold
data striping
\series default
.
If you really need it for performance reasons, you can often do similarly
with certain types of local RAID, such as RAID-10 or RAID-60, and with
a variety of RAID parameters.
Notice that
\emph on
any
\emph default
kind of data striping, whether at block level or at object level, is coming
with some cost
\begin_inset Foot
status open
\begin_layout Plain Layout
For a given redundancy degree
\begin_inset Formula $k$
\end_inset
,
\series bold
reliability is reduced
\series default
by striping.
In case of RAID, this has been well known for decades.
Unfortunately, in case of BigCluster some misleading
\begin_inset Quotes eld
\end_inset
propaganda
\begin_inset Quotes erd
\end_inset
was blurring the public opinion for many years.
Notice that the BigCluster analysis in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sub:Detailed-explanation"
plural "false"
caps "false"
noprefix "false"
\end_inset
is showing up some parallels to the well-known reliability loss caused
by RAID striping, when some granularity differences (block vs object level
etc) are ignored.
\end_layout
\end_inset
.
\end_layout
\begin_layout Enumerate
LocalStorage is even faster (when using a comparable technology yielding
the same size), because IO does not involve
\emph on
any dedicated storage network
\emph default
at all.
Therefore, it is also more reliable (when using comparable technology).
See also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Enumerate
\noindent
Reorg tasks: these may occur in any top-level architecture.
In general, not all operations can run in realtime, by construction.
For example, increasing the number of replicas in an operational Ceph cluster,
already containing a few hundreds of terabytes of data, will not only require
additional storage hardware, but will also take a rather long time, implied
by the very nature of bigger reorganisational tasks.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
When
\series bold
geo-redundancy
\series default
= some minimum distance between datacenters for
\series bold
survival of geo-disasters
\series default
like earthquakes or floods is added to (2) as an additional requirement
(see also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
), some
\emph on
further consequences
\emph default
will arise.
For example, the German government authority BSI recommends a minimum distance
of 200 km between datacenters for
\series bold
critical infrastructures
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
See
\begin_inset Flex URL
status collapsed
\begin_layout Plain Layout
https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Sicherheitsberatung/Standort-Kr
iterien_HV-RZ/Standort-Kriterien_HV-RZ.pdf?__blob=publicationFile&v=5
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
Some press comments on this:
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://www.it-finanzmagazin.de/bsi-rechenzentren-entfernung-bafin-84078/
\end_layout
\end_inset
\end_layout
\end_inset
.
Over suchalike distances, realtime storage networks cannot be used anymore
in general.
Thus some sort of
\begin_inset Quotes eld
\end_inset
migration
\begin_inset Quotes erd
\end_inset
of data over long distances will be needed anyway.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Synergy effects: combination of Background Migration with geo-redundancy
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Since data migration is needed
\emph on
anyway
\emph default
over long distances, there is an opportunity for
\series bold
saving cost
\emph on
and
\emph default
increasing reliability + flexibility
\series default
all at the same time.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Basic idea behind Football on top of a Sharding model:
\series bold
minimize the
\emph on
distances
\series default
\emph default
between your storage spindles and the corresponding data processing.
\end_layout
\begin_layout Plain Layout
When background data migration is automated properly, real-time storage
networks can become superfluous, or at least the corresponding realtime
IO traffic can be drastically reduced.
\end_layout
\begin_layout Plain Layout
\noindent
When minimization is well dimensioned, a pair of storage + application server
residing in the same geo-location can be
\series bold
collapsed into a single box
\series default
.
This is not only a
\series bold
major cost reducer
\series default
, it also
\series bold
improves reliability
\series default
because there are fewer components which can fail.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Missing of Opportunities
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Unfortunately, this opportunity can be easily
\emph on
missed
\emph default
if both system architects and responsible managers are requiring only
DR = Disaster Recovery over long distances, while forgetting to require
the
\series bold
ability for butterfly
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Pitfalls from non-optimum Requirements
\end_layout
\end_inset
Essentially, minimum requirements like those mentioned can be easily interpreted
as
\begin_inset Quotes eld
\end_inset
everything has to be doubled
\begin_inset Quotes erd
\end_inset
in order to survive any geo-disaster
\begin_inset Foot
status open
\begin_layout Plain Layout
A geo-disaster like an earthquake will typically last for weeks, if not
months, until it is fully repaired.
During such a period, a single surviving datacenter must be capable of
providing
\begin_inset Quotes eld
\end_inset
good enough
\begin_inset Quotes erd
\end_inset
SLAs.
These disaster-SLAs can be lower than usual.
For example, in place of an ordinary 99.98% availability, 98% may be a sufficien
t target
\emph on
during
\emph default
such a geo-disaster.
By unnecessarily requiring much more during a very rare corner case, you
can easily explode the cost, even beyond doubling, without reasonable benefit
during ordinary operations.
\end_layout
\end_inset
.
This would double cost in comparison to certain kinds of fully locally
redundant architectures, missing the opportunity for
\series bold
\emph on
splitting
\series default
\emph default
much of the overall redundancy into two geo-locations,
\series bold
instead of doubling
\series default
virtually everything.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Some people are arguing that doubling were unavoidable, which is
\emph on
incorrect in general
\emph default
, as Football can demonstrate as a positive counter-example.
See section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Counter-productive cost arguments are sometimes heard when geo-redundancy
is discussed, without considering newer possibilities such as Football.
As explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
, the
\series bold
granularity of failover
\series default
should not be required as a
\series bold
coarse failover of a full datacenter
\series default
, but explicitly be required as
\series bold
fine-grained cross-geo failover + handover at VM level
\series default
, or at a similar granularity (c.f.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Granularity-at-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
This will force people to think about
\series bold
wide-area distribution
\series default
of resources instead of plainly doubling them (once again
\begin_inset Foot
status open
\begin_layout Plain Layout
Example: commercial storage boxes from NetApp, IBM, etc already have some
\emph on
local redundancy
\emph default
, typically doubling the amount of physical disks you are actually buying
when you buy a single storage box.
Typically, the amount of
\emph on
physical
\emph default
disks is not directly reported as a KPI, although it is a major cost producer.
When introducing geo-replication, you will likely need to buy double the
number of boxes, resulting in a total of about 4x the capacity at the physical
layer.
In contrast, MARS + Football can often be built on top of local RAID-6.
As pointed out in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
, this leads to only about 2.2x the physical capacity you will need to buy.
In addition, the rackspace is much lower when using local storage, reducing
the number of servers to deploy and administer, and reducing networking
cost by omission of dedicated storage networks.
\end_layout
\end_inset
).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Important keyword for flexible cross-geo distribution:
\series bold
ability for butterfly
\series default
, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
There is a
\series bold
tradeoff
\series default
between the effort for implementation of per-VM flexibility, and hardware
cost savings.
Sometimes arguments are heard that a high level of flexibility would be
too costly.
Although this might be true in some relatively small corner cases, the
picture can rapidly change when thousands of servers and/or petabytes of
storage are involved.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
Doubling the overall cost for big datacenters instead of intelligently geo-distr
ibuting resources, is likely much more cost intensive in the long term than
investing once into
\series bold
intelligent abilities
\series default
of the company like Football, which can then
\series bold
scale up
\series default
(for more details see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Scalability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
As a consequence of sufficiently fine-grained handover + failover, the
above definition of cloud storage can be
\series bold
met at geo-datacenter level
\series default
, i.e.
the distributed resources according to (1) will be distributed over
\emph on
multiple geo-redundant
\emph default
locations / datacenters.
As pointed out in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Cost-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
, sometimes this may be even cheaper than building certain types of local
redundancy inside the same datacenter.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The famous CAP theorem (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is one of the motivations behind requirement (4)
\begin_inset Quotes eld
\end_inset
eventually consistent
\begin_inset Quotes erd
\end_inset
.
This is not an accident.
There is a
\emph on
reason
\emph default
for it, although it is not a
\emph on
hard
\emph default
requirement.
\end_layout
\begin_layout Plain Layout
\noindent
Strict consistency is not needed for
\emph on
some
\emph default
applications running on top of cloud storage, e.g.
\emph on
less crucial
\emph default
application data.
In addition, the CAP theorem and some other theorems cited at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/CAP_theorem
\end_layout
\end_inset
are telling us that Strict Consistency would be
\series bold
difficult and expensive
\series default
to achieve at
\emph on
global
\emph default
level in a
\emph on
big
\emph default
and/or
\emph on
long-distance
\emph default
Distributed System, and at the cost of other properties.
However, classical
\emph on
local
\emph default
Unix or Linux filesystems have already implemented Strict Consistency (aka
POSIX semantics), more or less
\series bold
for free
\series default
.
More detailed explanations are in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Section
Suitability of Architectures for Cloud Storage
\begin_inset CommandInset label
LatexCommand label
name "subsec:Suitability-of-Architectures"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
There are some consequences from the above definition of Cloud Storage (see
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
), for each of our high-level storage architectures:
\end_layout
\begin_layout Description
Distributed
\begin_inset space ~
\end_inset
Storage, in particular
\family typewriter
BigCluster
\family default
architectures (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Distributed-vs-Local:"
plural "false"
caps "false"
noprefix "false"
\end_inset
): many of them (with few exceptions) are conforming to all of these requirement
s.
Typical granularity are objects, or chunks, or other relatively small units
of data.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Distributed Storage is the breeding ground where Cloud Storage was invented.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Many contemporary
\family typewriter
BigCluster
\family default
implementations are
\emph on
not really
\emph default
supporting
\series bold
geo-distribution
\series default
of masses of objects over long distances, in the sense of well-proven use
cases (maturity).
Small object granularity and/or strict consistency on top of unreliable
objects are worsening the effects of the CAP theorem and its sister theorems.
Thus object-based architectures are typically only suited for local (non-geo)
operations.
\begin_inset Newline newline
\end_inset
Example: at the moment (mid 2019), Amazon AWS is offering object replication
only over campus distances, which cannot meet the requirements from BSI.
\end_layout
\begin_layout Description
Centralized
\begin_inset space ~
\end_inset
Storage: does not conform to (1) and to (4) by definition
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice that sharding on top of CentralStorage is no longer a CentralStorage
model by definition, but a RemoteSharding model according to section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
.
By introduction of synchronous or asynchronous replication, it can be made
to
\emph on
almost
\emph default
conform, except for (1) where some concept mismatches remain (probably
resolvable by going to a RemoteSharding model on top of CentralStorage,
where CentralStorage is only a
\emph on
sub-component
\emph default
).
Typical granularity is replication of whole internal storage pools, or
of filesystem instances.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In general,
\family typewriter
CentralStorage
\family default
architectures are a
\series bold
mismatch
\series default
to Cloud Storage, by their very nature.
Healing suchalike
\series bold
concept
\series default
mismatches may be close to impossible, or at least very tricky and costly.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Adding asynchronous replication to commercial storage boxes will not only
double the cost, which is anyway at a very high starting level (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Technology"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
In addition, the
\series bold
handover granularity
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
) may not meet the optimum.
\end_layout
\begin_layout Description
LocalStorage, and some further models like
\family typewriter
RemoteSharding
\family default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
)
\begin_inset Newline newline
\end_inset
There is some historical belief that cloud storage cannot be reasonably
built on top of them.
When newer developments and opportunities are taken into account, this
has changed.
Here are some examples, mentioning some example components:
\end_layout
\begin_deeper
\begin_layout Description
(1) can be achieved at LV granularity with Football (see
\family typewriter
football-user-manual.pdf
\family default
), which creates a
\series bold
Big Virtual LVM Pool
\series default
.
Football has been in mass production at 1&1 Ionos since August 2018.
\end_layout
\begin_layout Description
(2) can be achieved at disk granularity with local RAID, and at LV granularity
with DRBD or MARS.
Both have been in mass production for several years.
\end_layout
\begin_layout Description
(3) can be achieved at LV granularity with LVM snapshots, and/or ZFS (or
other filesystem) snapshots, and/or above filesystem layer by addition
of classical backup.
\end_layout
\begin_layout Description
(4) at least
\family typewriter
Eventually Consistent
\family default
or better can be alternatively achieved by one of the components
\end_layout
\begin_deeper
\begin_layout Description
(4a)
\series bold
DRBD
\series default
, which provides
\family typewriter
Strict Consistency
\family default
during
\family typewriter
connected
\family default
state, but works only reliably with passive crossover cables over
\series bold
short distances
\series default
(see CAP theorem in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
DRBD violates any type of consistency within your
\emph on
replicas
\emph default
during (automatic) re-sync, and thus does not
\emph on
fully
\emph default
comply with the above definition of cloud storage in a
\emph on
strong
\emph default
sense.
You may argue at a coarse time granularity scale in order to
\begin_inset Quotes eld
\end_inset
fix
\begin_inset Quotes erd
\end_inset
this.
\end_layout
\begin_layout Description
(4b)
\series bold
MARS
\series default
, which works over
\series bold
long distances
\series default
and provides two different consistency guarantees at different levels,
\emph on
both at the same time
\emph default
:
\end_layout
\begin_deeper
\begin_layout Description
locally:
\family typewriter
Strict Consistency
\family default
at local LV granularity, also
\emph on
within
\emph default
each of the LV replicas.
\end_layout
\begin_layout Description
globally:
\family typewriter
Eventually Consistent
\family default
\emph on
between
\emph default
different LV replicas (global level).
\begin_inset Newline newline
\end_inset
The CAP theorem (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
) says that
\family typewriter
Strict Consistency
\family default
is
\series bold
not possible
\series default
in general at
\emph on
unplanned failover
\emph default
during long-distance network outages (P = Partitioning Tolerance), when
A = Availability is also a requirement.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
However, in case of a
\emph on
planned handover
\emph default
, MARS is also
\family typewriter
Strictly Consistent
\family default
at a global level, but may need some extra time for catching up.
\begin_inset Newline newline
\end_inset
Notice: global
\family typewriter
Strict Consistency
\family default
is also possible at a
\emph on
coarse timescale
\emph default
, in accordance with the CAP theorem, if you decide to sacrifice A = Availabilit
y during such a network incident by simply
\emph on
not
\emph default
executing a failover action.
Just wait until the network outage is gone, and MARS will automatically
resume
\begin_inset Foot
status open
\begin_layout Plain Layout
This automatic MARS behaviour is similar to the behaviour of DRBD in such
situations, when DRBD can automatically go to
\family typewriter
disconnected
\family default
-like state, and you are later manually or automatically resuming the DRBD
connection for an incremental re-sync.
MARS does everything automatically because it has no firmly built-in assumption
s about the actual duration of any network communication.
\end_layout
\end_inset
everything ASAP, and thus you are using MARS
\emph on
only
\emph default
as a protection against
\series bold
fatal
\series default
storage failures / unplanned
\series bold
disasters
\series default
.
\begin_inset Newline newline
\end_inset
Notice: A = Availability is
\emph on
not generally
\emph default
required by the above definition of cloud storage, because from a user's
perspective it would not generally make sense in the global internet where
connection loss may anyway occur at any time.
Thus it is a valid operational strategy to
\emph on
not
\emph default
fail-over your LVs during certain minor, or even during certain types of
major network outages (e.g.
when failover would not improve much).
\begin_inset Newline newline
\end_inset
Notice: long-term
\series bold
disaster tolerance
\series default
(e.g.
perpetual loss of some storage nodes during an earthquake) is
\emph on
not
\emph default
modeled by the CAP theorem, but is more or less required by (2) and (3)
from the above definition of cloud storage.
\end_layout
\end_deeper
\end_deeper
\end_deeper
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\family typewriter
BigCluster
\family default
architectures are creating
\emph on
virtual
\emph default
storage pools out of physically distributed storage servers.
For fairness reasons, creation of a big virtual LVM pool must be considered
as
\emph on
another
\emph default
valid Cloud Storage
\emph on
model
\emph default
, matching the above definition of Cloud Storage.
The main architectural difference is (1)
\series bold
granularity
\series default
, as explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Granularity-at-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and (2) the
\series bold
stacking order of sub-components
\series default
(cf.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Football is creating
\series bold
location transparency
\series default
inside of the distributed virtual LVM pool.
This is an important (though not always required) basic property of
\emph on
any
\emph default
type of clusters and/or grids.
\end_layout
\begin_layout Section
Kirchhoff's Law: Suitability of Storage Networks
\begin_inset CommandInset label
LatexCommand label
name "sec:Kirchhoff-Suitability-of-Storage-Networks"
\end_inset
\end_layout
\begin_layout Standard
When storage networks are used at certain architectures, they will have
some impact which is widely under-estimated.
Disregarding this impact can lead to serious problems, up to major project
failures, and may induce
\series bold
high cost
\series default
for problem compensation.
\end_layout
\begin_layout Standard
The most important fundamental law for any type of network is
\series bold
Kirchhoff's first law
\series default
(see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Kirchhoff's_circuit_laws
\end_layout
\end_inset
), shortly called Kirchhoff's law.
It does not only hold in electrical engineering, but also in information
theory, and in computer networking
\begin_inset Foot
status open
\begin_layout Plain Layout
In a strict sense, Kirchhoff's law is only valid when there is no
\emph on
internal
\emph default
traffic
\emph on
between
\emph default
network switches / routers, and between internal and external nodes.
In practice, ICMP and similar traffic is very low, typically only a few
percent or permille (as long as there are no major misconfigurations or
attacks).
Therefore, we can neglect these special cases and treat Kirchhoff's law
as
\emph on
approximately
\emph default
valid.
\end_layout
\end_inset
.
\end_layout
\begin_layout Quote
The sum of all
\emph on
actually successful
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
In the presence of low-level communication errors or packet loss, there
may be some unsuccessful traffic, which need not obey Kirchhoff's law.
However, these flows do not contribute to the overall functionality provided
by the network.
Thus they can be ignored in this high-level architectural consideration.
\end_layout
\end_inset
ingoing data flows into an arbitrary sub-network, or into a single network
component (e.g.
a router or a switch) equals the sum of all actually outgoing traffic flows.
\end_layout
\begin_layout Standard
For simplicity, the following graphic shows only one way of an
\emph on
actual
\emph default
full-duplex data flow:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/kirchhoff.fig
width 70col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Kirchhoff's law is a very universal natural law.
It holds in many places at the same time.
It is not only valid for each and every single network switch and/or router
(independently from each other), but also for complete sub-networks, and
even approximately for the internet as a whole.
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Datacenter Coupling of 2,000 Servers each
\end_layout
\end_inset
Typical servers have uplinks with a capacity of 10 GBit/s.
For coupling of 2,000 servers in datacenter A with another 2,000 servers
in datacenter B, such that anyone can (at least potentially) talk to anyone
else, a lot of intermediate switches and backbone wires are needed.
Typically, backbones and datacenter interconnects are built with 100 GBit/s
technology.
The following graphic displays the wire capacities:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/example-datacenter-coupling.fig
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice: 2,000 uplinks with a capacity of 10 GBit/s each are summing up
to a total uplink capacity of 20,000 GBit/s, which is bigger than the backbone
capacity of 200 GBit/s by a factor of 100.
Now Kirchhoff's law comes into play: the
\emph on
actual
\emph default
data flow is limited to 200 GBit/s, due to the bottleneck between the
two datacenters.
\end_layout
\begin_layout Plain Layout
\noindent
In other words, the
\series bold
Network Overprovisioning Factor = NOF
\series default
is 20,000 / 200 = 100 in this example
\begin_inset Foot
status open
\begin_layout Plain Layout
In general, the NOF must be computed for each pair of (sender,receiver).
Only in very symmetric corner cases, as explained here for didactic reasons,
the NOF is globally unique.
\end_layout
\end_inset
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
A network bottleneck with a ratio of 1:100 means: if only 1% of the servers
would be sending data with full speed, the whole network would be already
at its limits.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In practice, network traffic is almost never equally distributed, but varies
heavily over time.
\series bold
Timely bursts
\series default
are occurring regularly, flooding their respective uplinks.
If such bursts are occurring at only 1% of the servers, queueing theory
(see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Queueing_theory
\end_layout
\end_inset
) will come into play: somewhere in the overall system,
\series bold
spontaneous queues
\series default
will start to form.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Queueing theory says that any queue will grow
\emph on
indefinitely
\emph default
whenever the arrival rate
\begin_inset Formula $\lambda$
\end_inset
is
\emph on
permanently
\emph default
higher than the departure rate
\begin_inset Formula $\mu$
\end_inset
.
During
\emph on
permanent
\emph default
overload, this is
\emph on
approximately
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
In practice, queues are limited, and congestion control algorithms will
also limit their length (see TCP window sizes etc).
\end_layout
\end_inset
the case.
As a consequence of those spontaneous queues forming anywhere in the complex
system,
\series bold
network latencies
\series default
will rise up.
Example, observed in practice: depending on the number of machines causing
traffic jam by overload, iSCSI latencies may climb up from tenths of millisecon
ds, sometimes up to several seconds (measured with
\family typewriter
blktrace
\family default
and visualized with
\family typewriter
blkreplay
\family default
).
\end_layout
\begin_layout Plain Layout
This is also called
\series bold
jitter
\series default
(see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Jitter
\end_layout
\end_inset
), or PDV = Packet Delay Variation.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
iSCSI and similar TCP-based protocols are known for
\series bold
over-reacting
\series default
on jitter produced by spontaneous queues, since their internal queueing
discipline is FIFO-like.
Result of too high latencies / jitter: customers will be dissatisfied with
their application behaviour.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Often a
\begin_inset Quotes eld
\end_inset
rescue
\begin_inset Quotes erd
\end_inset
comes from the
\series bold
application burst behaviour
\series default
: when bursts are relatively short with respect to pauses between bursts,
the resulting latencies may be less disturbing, because the queues will
get a chance to drain.
However, the network will
\series bold
depend on the application behaviour
\series default
, and thus will become
\series bold
flaky = unreliable
\series default
in general.
When it
\begin_inset Quotes eld
\end_inset
appears to work
\begin_inset Quotes erd
\end_inset
, you are just in
\series bold
good luck
\series default
.
Since application behaviour is non-predictable in general, your luck may
change at any time, forming a
\series bold
risk
\series default
from a management perspective.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
In general, a network dimensioned with NOF
\begin_inset Formula $\gg1$
\end_inset
is
\series bold
not capable of carrying realtime traffic
\series default
.
Consequently, storage networks must not be built in such a way.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Many network protocols from the storage area, like iSCSI, NFS, glusterfs,
etc, are reacting heavily to PDV / jitter produced by spontaneous queues.
Notice that Kirchhoff's law is the real reason behind the observation that
these protocols are often working over point-to-point connections (e.g.
crossover cables) where NOF=1, but often show up some problems in complex
networks when NOF
\begin_inset Formula $\gg1$
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
NOF of a Storage Network
\end_layout
\end_inset
In the following example, we assume to have a
\emph on
dedicated
\emph default
storage network, interconnecting 200 clients to a high-speed storage server.
This time, Kirchhoff's law tells us that the network overprovisioning factor
is
\begin_inset Quotes eld
\end_inset
only
\begin_inset Quotes erd
\end_inset
NOF = 10:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/example-storage-network.fig
width 80col%
\end_inset
\end_layout
\begin_layout Plain Layout
Now the question is: is this type of network capable of realtime IO traffic?
Will this storage network be reliable?
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
There is a clear answer: NO.
With an overprovisioning factor of 10, it is
\emph on
possible
\emph default
to overload the storage network by forming spontaneous queues, for example
when more than 10% of all servers are reading or writing big chunks of
data.
Suchalike can happen very easily, for example when masses of page cache
data are dirtified in a very short time in the Linux kernel.
See the example story in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
where customers pressing the
\begin_inset Quotes eld
\end_inset
save
\begin_inset Quotes erd
\end_inset
button were inducing spontaneous page cache dirtifications in masses.
Another possibility are databases with a huge number of transaction commits
running in parallel.
Although each single commit may involve a relatively low number of IOs,
\emph on
huge masses
\emph default
of them may also lead to network congestion during peaks.
Database administrators will confirm that commits are very sensitive to
PDV / jitter.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The only
\emph on
reliable
\emph default
way to achieve realtime capabilities in a storage network would be lowering
the overprovisioning factor down to NOF
\begin_inset Formula $\approx$
\end_inset
1.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Except when using crossover cables rack-to-rack, this would make the storage
network
\series bold
very expensive
\series default
, as soon as some hundreds or thousands of servers need to be coupled with
storage boxes over a routed network.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Pitfalls: VLANs and ROCEv2 and Spine-Leaf Architectures
\end_layout
\end_inset
As mentioned in this guide at several places, realtime storage networks
should always be built as
\series bold
physically separate networks
\series default
, in order to lower the NOF as much as possible.
Here are some common pitfalls which may easily turn out counter-productive:
\end_layout
\begin_layout Enumerate
VLANs and other methods for
\series bold
network virtualization
\series default
are a method for
\emph on
isolation
\emph default
of multiple sources of traffic from each other.
As a result, realtime storage network traffic can be
\emph on
non-functionally disturbed
\emph default
by other traffic, as soon as they are
\emph on
sharing
\emph default
some physical resources.
Unfortunately, (internal) customers cannot see the reasons for this, because
the
\emph on
virtual
\emph default
network structures are isolating them from each other
\emph on
functionally
\emph default
, although the disturbance is at non-functional level.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
As said: realtime storage networks should always be built as
\series bold
physically separate networks
\series default
, each by each, and over
\series bold
shortest possible physical distances
\series default
.
Their NOF should be as low as possible.
Consequently, VLANs are not necessary, since there would be a 1:1 ratio
between virtual and physical LANs.
\end_layout
\begin_layout Enumerate
At the moment (2020), a hype cycle is starting with new network protocols
like ROCEv2 claiming (or at least psychologically suggesting) that they
would be able to guarantee predictable / realtime behaviour over conventional
IP networks.
Do not believe in such hypes.
Kirchhoff's law is a
\emph on
natural law
\emph default
, which is stronger than any human-made hype.
Notice that Kirchhoff's law does not depend on details of traffic congestion
control, whether packet loss can be observed somewhere, or whether it cannot
be observed anymore because congestion control is
\emph on
hiding
\emph default
the effect.
ROCE and siblings should get their chance to demonstrate their merits in
reducing packet loss
\emph on
overhead,
\emph default
caused by classical IPv4 congestion control algorithms (by shifting congestion
control down in the OSI reference model to some lower layers).
However, these new protocols
\series bold
cannot work miracles
\series default
with respect to Kirchhoff's law.
\end_layout
\begin_layout Enumerate
Notice that the NOF is a
\emph on
relative
\emph default
measure.
It does not change, for example when you upgrade your uplinks from old-style
1 GBit/s to contemporary 10 GBit/s and also upgrade your switches and backbones
from old-style 10 GBit/s to contemporary 100 GBit/s, while keeping all
other parameters like numbers of servers etc.
Intuitive explanation: by increasing the total uplink capacity, you are
also increasing
\emph on
competition
\emph default
.
Not only your server, but all of your
\emph on
neighbor
\emph default
servers can also grasp more network bandwidth, thus competing at the upgraded
backbone in a similar way as before.
\end_layout
\begin_layout Enumerate
\series bold
Network topologies
\series default
(see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Network_topology
\end_layout
\end_inset
) can have a tremendous impact onto the NOF.
For example, star-ring hybrid
\begin_inset Foot
status open
\begin_layout Plain Layout
Classical star topologies have a central hub, which can easily become a
bottleneck in itself.
In general, its internal
\begin_inset Quotes eld
\end_inset
collapsed backbone
\begin_inset Quotes erd
\end_inset
bandwidth must
\emph on
also
\emph default
be considered when computing the NOF.
This also applies to generalizations like spine-leaf topologies.
\end_layout
\end_inset
topologies or generalizations like spine-leaf topologies and many other
models are used for avoiding the hardware cost of a full mesh
\begin_inset Formula $O(n^{2})$
\end_inset
network, when
\begin_inset Formula $n$
\end_inset
servers are interconnected with each other such that anyone can (at least
potentially) talk to anyone else (see also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Scalability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
The idea is to use only a
\emph on
small
\emph default
number of parallel spines, for reduction of
\begin_inset Formula $O(n^{2})$
\end_inset
hardware cost.
However, such topologies increase the risk of producing
\emph on
indirect cost
\emph default
in a different dimension: they tend to worsen the NOF by
\series bold
asymmetry
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
Example: spine-leaf wires look symmetric on paper.
However, the
\emph on
servers
\emph default
are typically attached to the network in clusters or in other types of
\emph on
local aggregates
\emph default
.
When the leafs are interconnected via a ring-like bus structure, different
segments of the bus-like wheel may be loaded differently.
This is like a bicycle wheel, where some
\emph on
external
\emph default
weights are non-centrically attached onto the spines in an
\emph on
asymmetric
\emph default
manner.
In a network, this may easily create
\series bold
hot spots
\series default
in the space dimension, which come
\emph on
on top of
\emph default
the timely load peaks mentioned above.
\end_layout
\end_inset
in the actual load distribution, such that spontaneous bottlenecks may arise
\emph on
unexpectedly
\emph default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Varieties of NOF
\end_layout
\end_inset
It must be stressed that there exists no global NOF limit for all types
of applications and workloads and networks.
Here is an extreme example where a rather huge NOF can be tolerated in
practice:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/example-cellphone-network.fig
width 90col%
\end_inset
\end_layout
\begin_layout Plain Layout
Although each single cellphone has a rather low data rate (1 MBit/s in this
example), there are huge numbers of them spread over the world.
In this example, we have 2 billion of them, leading to a grand total uplink
capacity of 2,000,000 GBit/s.
By assuming that the internet backbone would be totally dedicated to this
type of application (no other shared
\begin_inset Foot
status open
\begin_layout Plain Layout
By assuming that only 10% of the shared backbone capacity is available, the effective
backbone bandwidth would drop to 20 GBit/s.
As a result, the NOF would rise to 100,000 in this example.
\end_layout
\end_inset
traffic occurring there), the NOF would be 2,000,000 / 200 = 10,000.
Although this is a really huge overbooking, it does typically work in practice
(as long as no DDOS attacks from bot nets are mixing up the scene).
Why aren't there similar problems in practice as with the above storage
network example?
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The key point is: smartphone users do not expect realtime behaviour.
The latencies in their local radio networks are anyway in the range of
100ms, sometimes up to seconds, e.g.
when the radio signal strength is bad.
Mobile users are adapted to such mobile network behaviour, without complaining
in huge masses.
Thus, adding a few more ms latency produced by the internet backbone and/or
by the datacenter network will not make a big difference to them.
They are humans, where an additional latency of 100ms or more can be tolerated,
since reading and understanding the web pages will take even longer.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
In contrast, a NOF of 10,000 would be completely unbearable in a loaded
production realtime storage network.
This is a striking example that realtime requirements versus non-realtime
requirements can make a
\emph on
huge
\emph default
difference, requiring totally different technical means / solutions.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Storage networks are typically carrying
\series bold
realtime traffic
\series default
.
For them, low NOF values are a must, otherwise PDV / jitter may easily
grow too high.
In some cases, like attempts to deliver
\series bold
full SSD performance
\series default
to external customers, the NOF requirements might be even lower than 1.
Notice that NFS or iSCSI workloads may have a
\series bold
high internal parallelism degree
\series default
, even for a single customer (see some captured real-life workloads from
\begin_inset Flex URL
status open
\begin_layout Plain Layout
www.blkreplay.org
\end_layout
\end_inset
).
Queueing theory suggests that a NOF of 0.7 or lower might be required in
order to make the performance impact of a non-local storage network
\begin_inset Quotes eld
\end_inset
invisible
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In addition to throughput requirements, the NOF is a major influence factor
at the
\series bold
price tag
\series default
of a storage network.
When mixing realtime traffic with ordinary bulk network traffic, you will
pay the low NOF also for the ordinary traffic part.
This may easily lead to another cost increase, or the PDV / jitter could
grow higher than expected.
Therefore,
\series bold
do not mix realtime traffic with ordinary traffic
\series default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Workarounds against PDV / Jitter in Storage Networks
\end_layout
\end_inset
Here are some countermeasures, in descending order of effectiveness, specificall
y for storage networks:
\end_layout
\begin_layout Enumerate
Simply
\series bold
avoid realtime traffic
\series default
over IP-based networks (and other types of non-realtime capable networks).
For example, use MARS + Football on
\family typewriter
LocalStorage
\family default
.
Do not use realtime traffic requesters like NFS or iSCSI or DRBD etc.
\end_layout
\begin_layout Enumerate
Do
\series bold
not
\series default
build
\series bold
monolithic network topologies
\series default
/ structures, and
\series bold
avoid traffic sharing
\series default
.
Storage networks, when unavoidable, should always be built as a multitude
of
\series bold
dedicated
\series default
and small (local)
\series bold
islands
\series default
, according to the
\series bold
sharding principle
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Enumerate
Check whether there is traffic whose timely behaviour can be improved.
For example, do not start nightly backups all in parallel and all at the
same time, but smear them over a larger time window.
\end_layout
\begin_layout Enumerate
Check whether upgrading of outdated network technology to a contemporary
stage may help for improving the NOF.
Also, changing the network topology in the right direction might help if
you know what you are doing.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Blindly upgrading hardware, or
\begin_inset Quotes eld
\end_inset
throwing with hardware
\begin_inset Quotes erd
\end_inset
, has never been a good strategy.
First check whether other measures mentioned earlier can do it not only
much cheaper, but also much better.
\end_layout
\begin_layout Enumerate
Do not over-estimate the potential of
\series bold
network scheduling
\series default
.
You may try better congestion control algorithms (e.g.
try ROCEv2 whether it
\emph on
really
\emph default
helps), and/or use classical traffic shaping with an appropriate
\series bold
classification of realtime traffic
\series default
.
However notice: Kirchhoff's law cannot be circumvented.
Whenever you are improving the PDV / Jitter of one traffic class somewhere
via packet scheduling, you will
\emph on
necessarily
\emph default
worsen the PDV of
\emph on
another
\emph default
traffic class at the same place.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Potential Benefits of Network Packet Scheduling
\end_layout
\end_inset
Here is a schematic explanation of the zones where scheduling (in general)
can improve things.
It also holds for the special case of network packet scheduling for a given
storage network:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/potential-of-scheduling.fig
width 100col%
\end_inset
\end_layout
\begin_layout Enumerate
As long as the congestion threshold is not exceeded, you typically won't
need a sophisticated scheduling algorithm.
Practically
\emph on
any
\emph default
type of (reasonable) scheduling will do it, even FIFO.
Simply because there is nothing relevant which
\emph on
needs
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice: packet scheduling only makes sense when there currently
\emph on
is
\emph default
a spontaneous queue.
When no queue has formed at a certain point in time, no elaborate scheduling
decision is necessary at all: whenever a packet arrives, just serve it
\emph on
directly
\emph default
without delay.
\end_layout
\end_inset
\emph default
to be scheduled.
By definition of the so-called
\begin_inset Quotes eld
\end_inset
nobrainer zone
\begin_inset Quotes erd
\end_inset
, congestion is so low that the effort for implementation of a sophisticated
scheduling algorithm and its supporting infrastructure will not pay off.
In general, the congestion threshold depends on the application and its
requirements.
When
\series bold
hard realtime
\series default
behaviour would be required, the threshold might be very low, e.g.
at 10% or less.
For ordinary non-realtime applications with a well-behaving distribution
function, classical queuing theory as well as practical experiences are
suggesting that the congestion threshold may be around 30%.
Sometimes even 50% or more is bearable (see the above example characterization
of requirements for mobile phone users).
\end_layout
\begin_layout Enumerate
Next comes the overload threshold.
It is characterized by a simple property: once it is exceeded, even the
\emph on
very best scheduling algorithm in this universe
\emph default
(which would be able to predict the future
\begin_inset Foot
status open
\begin_layout Plain Layout
There are academic papers on realtime scheduling with pre-assumptions on
predictability, such that solutions exist where the overload threshold is
near 100%.
I have never seen such a case in practice, at least not in the area of
storage networks.
In practice,
\emph on
details
\emph default
of application behaviour timings are not predictable.
\end_layout
\end_inset
) will
\series bold
not be able to help
\series default
anymore.
For ordinary non-critical workloads, queueing theory predicts that this
point is around 70%.
For realtime-critical workloads reacting heavily to PDV, it may however be
\emph on
much
\emph default
lower.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The yellow zone is the only one where scheduling can be beneficial.
Improvements by better scheduling algorithms will not change the size of
the zone, but will move the
\emph on
actual incident threshold
\emph default
a little bit rightwards.
Depending on the concrete use case, the size of the yellow zone may be
big, or rather small.
In general, its size may depend on further factors like the
\emph on
inherent parallelism degree of the application workload
\emph default
, and on further factors mentioned in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Influence-Factors-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Warning: in some corner cases, the size of this zone might be even near
zero.
Do not invest huge amounts of money into scheduling unless you have determined
its size
\begin_inset Formula $\approx$
\end_inset
potential in advance.
\end_layout
\begin_layout Enumerate
The incident zone is characterized by the queuing behaviour, where queues
and their effects are dominating the scene.
Scheduling cannot help much here anymore: the system is simply overloaded
too much.
\end_layout
\begin_layout Enumerate
The last zone is easy to understand: you cannot
\emph on
permanently
\emph default
provide fewer resources to your applications
\emph on
in weighted average
\emph default
than is
\emph on
necessary
\emph default
for providing the service / its SLA.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Notice: this
\emph on
looks
\emph default
like a rather
\emph on
static
\emph default
model.
In practice, DDOS attacks and other dynamic runtime behaviour, such as
PDV peaks resulting from a high NOF, may completely change the game.
Thus the graphic has to be interpreted as a
\emph on
dynamic
\emph default
one.
The point
\begin_inset Quotes eld
\end_inset
100% loaded
\begin_inset Quotes erd
\end_inset
must not be determined according to
\emph on
unweighted average
\emph default
behaviour, but according to your
\emph on
concrete SLAs
\emph default
applied to your
\emph on
concrete load distribution
\emph default
, or approximately due to
\series bold
near-worst-case
\begin_inset Foot
status open
\begin_layout Plain Layout
Caution TCO waste: blindly taking worst-case behaviour in place of well-defined
SLAs is a major pitfall.
When peak behaviour is spanning a few orders of magnitude (e.g.
exponential distributions according to Zipf's law), and when there are
no well-defined SLAs such that extreme
\emph on
but short
\emph default
peaks can be ignored, TCO may be worsened by
\emph on
factors
\emph default
due to
\series bold
over-engineering
\series default
.
\end_layout
\end_inset
behaviour
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
When your input application workload (as such) has some
\series bold
load peaks
\series default
which are
\emph on
higher
\emph default
than the average load by
\emph on
several orders of magnitude
\emph default
, scheduling will typically not help much more.
Typical iSCSI workloads from external customers are behaving like this
(e.g.
when many of them are starting their nightly backups around the same time,
and it is not prevented by contract plus some governance).
In the simplest case, load peaks may result from reboots with
\series bold
cold caches
\series default
.
Your complex shared storage network needs to be dimensioned according to
the load peaks, in order to prevent a few
\begin_inset Quotes eld
\end_inset
pig customers
\begin_inset Quotes erd
\end_inset
or a DDOS attack from making you miss your SLAs, or from tearing down masses of
ordinary customers.
This means
\series bold
expensive over-engineering
\series default
by
\emph on
factors
\emph default
.
In the context of factors by
\emph on
orders of magnitude
\emph default
, the
\emph on
relative
\emph default
size of the yellow zone does not play a big role anymore.
Your money pocket needs to be deep anyway.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not trust some frequently heard arguments, claiming that you just would
need huge masses of customers, then the total load distribution would become
smooth.
This need not apply to peaks.
As observed at ShaHoLin, where several thousands of customers are concentrated
onto each LXC container, the
\emph on
average
\emph default
IOPS demand of a container is around 70, while
\series bold
load peaks
\series default
can easily go up to several thousands of IOPS.
This is about 2 orders of magnitude.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
ShaHoLin's storage doesn't suffer from Kirchhoff's law and all the other
problems discussed in this section, since it uses
\family typewriter
LocalStorage
\family default
.
Load balancing is done via MARS + Football, which has no realtime requirements
on the network.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Typical real-life temporal load distributions aren't purely Gaussian or Markov.
They are a
\emph on
mixture
\emph default
of
\emph on
many
\emph default
influences.
Among them is Zipf's law, which means
\series bold
exponential distribution
\series default
within some boundaries.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Of course, you might
\emph on
artificially limit
\emph default
your customer's IOPS in such a way that expensive over-engineering is avoided.
But then you have given up the above goal of high performance, and in some
sense you have relaxed the
\begin_inset Quotes eld
\end_inset
realtime requirements
\begin_inset Quotes erd
\end_inset
on your storage network.
Consequently, you shouldn't buy expensive SSDs anymore in masses
\begin_inset Foot
status open
\begin_layout Plain Layout
This may be a TCO pitfall.
Some people are apparently using SSDs for
\emph on
compensation
\emph default
of a fraction of problems, which are most likely caused by their high-NOF
storage network.
\end_layout
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Advice on Storage Networks
\end_layout
\end_inset
For each type of application workload behaviour, the
\series bold
storage network overprovisioning factor NOF
\series default
is an important key parameter, following from
\series bold
Kirchhoff's law
\series default
.
When it grows too high for a certain use case, the
\series bold
incident rate
\series default
will grow.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Depending on storage network topology, low NOF can turn out very expensive.
Do not neglect its influence on business cases!
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not build monolithic storage networks.
Never build a storage network as a non-dedicated / shared network.
Never mix realtime storage IO traffic with ordinary traffic, because it
would induce a
\series bold
risk
\series default
that both traffic types can disturb each other.
Never use VLANs or similar network virtualization techniques for storage
networks, which can easily create such a traffic-type mix on the physical
wires.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
This guide shows you a solution: how to
\series bold
get rid of realtime-capable storage networks
\series default
\emph on
altogether
\emph default
.
Load balancing via background data migration (e.g.
using Football) requires neither realtime IO behaviour nor
a dedicated storage network.
In addition, background data migration traffic is VLAN / spine-leaf friendly
and can be combined with traffic shaping.
\end_layout
\end_inset
\end_layout
\begin_layout Section
Layering Rules and their Importance
\begin_inset CommandInset label
LatexCommand label
name "subsec:Layering-Rules"
\end_inset
\end_layout
\begin_layout Standard
Complex systems are composed of several layers.
In this section, we will learn how to organize them (close to)
\series bold
optimally
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Non-optimal layering is a major cause of
\series bold
financial losses
\series default
, decreased reliability /
\series bold
increased risk
\series default
,
\series bold
worse scalability
\series default
, etc.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Well-designed systems can be recognized as roughly following Dijkstra's
famous
\series bold
layering rules,
\series default
originating from his pioneering THE project.
Wikipedia article
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/THE_multiprogramming_system
\end_layout
\end_inset
is mentioning an important principle behind Dijkstra's layers, in section
\begin_inset Quotes eld
\end_inset
Design
\begin_inset Quotes erd
\end_inset
:
\end_layout
\begin_layout Quotation
\series bold
higher layers only depend on lower layers
\end_layout
\begin_layout Standard
The original article
\begin_inset Flex URL
status open
\begin_layout Plain Layout
http://www.cs.utexas.edu/users/EWD/ewd01xx/EWD196.PDF
\end_layout
\end_inset
or
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://dl.acm.org/citation.cfm?doid=363095.363143
\end_layout
\end_inset
contains very interesting information, and is a highly recommended reading.
The introduction and the progress report are relevant for today's managers,
optionally the
\begin_inset Quotes eld
\end_inset
design experience
\begin_inset Quotes erd
\end_inset
, and certainly the conclusions.
The section
\begin_inset Quotes eld
\end_inset
System hierarchy
\begin_inset Quotes erd
\end_inset
is relevant for today's system architects, while the rest is mostly of
historical interest for OS and kernel specialists.
Reading the relevant parts after more than 50 years is extremely well-invested
time.
Dijkstra provides solutions for
\series bold
invariant problems
\series default
which are facing us today with the same boring ignorance, even after 50
years.
The heart of his conclusions is
\series bold
timeless
\series default
.
\end_layout
\begin_layout Standard
Dijkstra's methodology has been intensively discussed
\begin_inset Foot
status open
\begin_layout Plain Layout
An important contribution is from Haberman, by clarifying that there exist
several types of hierarchies.
\end_layout
\end_inset
by the scientific OS community, and has been generalized in various ways
to what folklore calls
\begin_inset Quotes eld
\end_inset
Dijkstra's layering rules
\begin_inset Quotes erd
\end_inset
.
Here is a condensed summary of its essence:
\end_layout
\begin_layout Itemize
Layers should be viewed as
\series bold
abstractions
\series default
.
\end_layout
\begin_layout Itemize
Higher layers should only depend on lower layers.
\end_layout
\begin_layout Itemize
Each layer should
\series bold
add
\series default
some
\series bold
new
\series default
functionality.
\end_layout
\begin_layout Itemize
Trivial conclusion by reversing this:
\series bold
Regressions
\series default
should be avoided.
A regression is when some functionality is
\emph on
lost
\emph default
at a higher layer, although it was present at a lower layer.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
This sounds very simple.
However, on a closer look, there are numerous violations of these rules
in modern system designs.
Some examples will follow in the next subsections.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The term
\begin_inset Quotes eld
\end_inset
\series bold
functionality
\series default
\begin_inset Quotes erd
\end_inset
is very abstract, and deliberately not very specific
\begin_inset Foot
status open
\begin_layout Plain Layout
Older schools of software engineering know that
\series bold
design processes
\series default
must
\emph on
necessarily
\emph default
start with unspecific terms, in order to start to bridge the so-called
\series bold
semantic gap
\series default
.
\end_layout
\end_inset
.
It is
\series bold
independent
\series default
from any implementations, programming languages, or programming / user
interfaces, or other matters of
\series bold
representation
\series default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The same functionality may be accessible via
\emph on
multiple
\emph default
different
\series bold
interfaces
\series default
.
Thus a different interface does
\emph on
not imply
\emph default
that functionality is (fundamentally) different.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Nevertheless, people are often confusing functionality with interfaces.
They think that a different interface must provide a different functionality.
As explained, this is not correct in general.
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Pitfalls from Confusion of
\begin_inset Quotes eld
\end_inset
Excellent Slides
\begin_inset Quotes erd
\end_inset
with
\emph on
Reality
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Confusion of interfaces with functionality can be exploited by so-called
\emph on
marketing drones
\emph default
and other types of advertising (e.g.
acquisition of
\series bold
venture capital
\series default
), in order to
\series bold
open your money pocket
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
As a responsible manager, you should always check the
\emph on
functionality
\emph default
behind a certain product and its interfaces: what is
\emph on
really
\emph default
behind the scenes?
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
For enterprise-critical
\begin_inset Quotes eld
\end_inset
marketing slides
\begin_inset Quotes erd
\end_inset
& co: checks of
\emph on
abstract
\emph default
functionality aren't enough in many cases.
Find the
\emph on
right
\emph default
experts for
\emph on
additional
\emph default
checks of the
\emph on
real
\emph default
functionality (for
\emph on
existing
\emph default
and/or
\emph on
future
\emph default
implementations / hardware / etc).
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Negative Example: object store implementations mis-used as backend for block
devices / directory or pointer structures / POSIX filesystems
\begin_inset CommandInset label
LatexCommand label
name "par:Negative-Example:-object"
\end_inset
\end_layout
\begin_layout Standard
Several object store implementations have two or more high-level layers,
each possibly decomposable into several sub-layers.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Pitfalls from Disregarding
\emph on
Nested
\emph default
Sub-Layers
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Simple slides can be produced when the top layers are small and look
\begin_inset Quotes eld
\end_inset
easy
\begin_inset Quotes erd
\end_inset
, but the
\emph on
real
\emph default
functionality is
\emph on
hidden
\emph default
in
\series bold
nested sub-layers
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
At least the high-level layers of object stores are typically following
the client-server paradigm, where servers and clients are interconnected
via some
\begin_inset Formula $O(n^{2})$
\end_inset
storage network (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Distributed-vs-Local:"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
We start by looking at the
\emph on
internal
\emph default
architecture of certain OSD = Object Storage Device (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Object_storage
\end_layout
\end_inset
) implementations.
Some publications are treating them more or less as black boxes (e.g.
as abstract interfaces).
Certain people are selling this as an advantage.
\end_layout
\begin_layout Standard
However, we will check this here.
Thus we need to take a closer look at the
\emph on
internal
\emph default
sub-architecture of certain OSD implementations:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/ceph-layering-server.fig
scale 50
\end_inset
\end_layout
\begin_layout Standard
\noindent
The crucial point is: several OSD implementations are internally using
\series bold
filesystems
\series default
for creating the object abstraction.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
OSD implementation strategies
\end_layout
\end_inset
For implementors, filesystems seem to be a tempting
\begin_inset Foot
status open
\begin_layout Plain Layout
Linux kernel implementations of filesystems typically need at least 10 years,
if not 20 years, to be considered
\begin_inset Quotes eld
\end_inset
mature
\begin_inset Quotes erd
\end_inset
enough for mass production on billions of inodes.
Search the internet for remarks from Linus Torvalds.
\end_layout
\end_inset
shortcut strategy.
Implementing their own object store functionality on top of block devices
could easily take some years or decades until mature enough for production
use.
Linus Torvalds, for example, is measuring the maturity cycles of filesystem
implementations in units of
\emph on
decades
\emph default
, not in years.
Pure object stores would need to solve similar
\emph on
fundamental problems
\emph default
, like
\series bold
fragmentation problems
\series default
, which is a science in itself.
Thus existing kernel-level filesystem implementations are often just re-used
for OSDs.
They seem to be already there,
\begin_inset Quotes eld
\end_inset
for free
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Plain Layout
However, at architectural level, they are
\emph on
not
\emph default
for free.
They are violating Dijkstra's layering rules by causing
\emph on
regressions
\emph default
.
\end_layout
\begin_layout Plain Layout
At abstract functionality level: passive objects, and even some associated
\emph on
rich metadata
\emph default
, are more or less nothing else but
\series bold
restricted files
\series default
, optionally augmented with POSIX EAs = Extended Attributes
\begin_inset Foot
status open
\begin_layout Plain Layout
POSIX EAs = Extended Attributes implementations as provided by classical
filesystems are providing roughly the same functionalities as
\emph on
passive
\emph default
augmented object metadata.
Even active metadata is possible, e.g.
by separate processes run by metadata indexing tools like
\family typewriter
Akonadi
\family default
or
\family typewriter
miner
\family default
or
\family typewriter
baloo
\family default
.
With such a standard addendum, classical filesystems can also be used for
providing active functionality.
\end_layout
\end_inset
.
\end_layout
\begin_layout Itemize
Object IDs can be
\series bold
trivially mapped
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
Example: random hex key
\family typewriter
0123456789ABCDEF
\family default
can be trivially mapped to a path
\family typewriter
/objectstore/0123/4567/89ABCDEF
\family default
in an easily reversible way (bijective mapping)
\end_layout
\end_inset
to filenames / pathnames.
At
\emph on
abstract functionality
\emph default
level, there is almost no difference between pathnames and object IDs,
with the exception that pathnames are
\emph on
more general
\emph default
, e.g.
by allowing deep nesting into subfolders.
\end_layout
\begin_layout Itemize
Newer versions of certain Linux-based filesystems can even automatically
generate random object keys, and even atomically (= free of race conditions
when executed concurrently).
Example: supply the option
\family typewriter
O_TMPFILE
\family default
to
\family typewriter
open()
\family default
, followed by
\family typewriter
linkat()
\family default
.
\end_layout
\begin_layout Itemize
While filesystems are translating file IDs = pathnames into
\series bold
file handles
\series default
before further operations can be carried out, object stores are typically
skipping this intermediate step from a user's viewpoint.
The user needs to supply the
\series bold
object ID
\series default
for
\emph on
any
\emph default
operation.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In the implementation, this can lead to considerable
\series bold
runtime overhead
\series default
, because ID lookup functionality similar to
\family typewriter
open()
\family default
has to be re-executed for each operation.
In contrast, valid file handles are
\emph on
directly
\emph default
referring to the relevant kernel objects in RAM, without need to search
for a filename again.
Extreme example: consider the total runtime overhead by repeatedly appending
1 byte to an object in a loop.
\end_layout
\begin_layout Itemize
Consequently, certain file operations associated with file handles are missing
in pure object stores, such as
\family typewriter
lseek()
\family default
, as well as many other operations.
\end_layout
\begin_layout Itemize
\series bold
Concurrency
\series default
functionality of a POSIX-compliant
\begin_inset Foot
status open
\begin_layout Plain Layout
POSIX requires
\series bold
strict consistency
\series default
for many operations, while weaker consistency models are often
\emph on
sufficient
\emph default
(but not required) for object stores.
\end_layout
\end_inset
filesystem is much more elaborate than actually needed by an object store.
Examples: fine-grained locking operations like
\family typewriter
flock()
\family default
are typically not needed in pure object stores.
The
\family typewriter
rename()
\family default
operation, including its side effects onto concurrency, would even
\emph on
contradict
\emph default
the fundamental idea of immutable object IDs.
\end_layout
\begin_layout Itemize
\series bold
Shared memory
\series default
functionality.
Filesystems need to support
\family typewriter
mmap()
\family default
and relatives.
This is
\emph on
inevitable
\emph default
in modern kernels like Linux, for hardware MMU-supported
\series bold
execution of processes
\series default
, employing the COW = Copy On Write strategy.
See
\family typewriter
fork()
\family default
and
\family typewriter
execve()
\family default
syscalls, and their relatives.
In general, shared memory can be used by several processes concurrently,
and on
\series bold
sparse files
\series default
.
Filesystem implementors need to spend a considerable fraction of their
total effort on this.
Concurrency on shared memory, together with SMP plus NUMA scalability to
a contemporary degree, is what makes implementation really hard, and why
there are only relatively few people in the world mastering this art.
As a responsible manager, please compare with Dijkstra's remarks on required
\series bold
skill levels
\series default
for serious OS work.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Object stores are typically lacking shared memory functionalities completely.
Thus they are not suited as a
\emph on
core component
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Years ago, certain advocates of object stores have claimed that filesystems
would be superseded by object stores / OSDs in future.
This is unrealistic, due to the lack of mentioned basic functionalities.
When missing functionality would be added to object stores, they would
turn into filesystems, or into so-called
\begin_inset Quotes eld
\end_inset
hybrid systems
\begin_inset Quotes erd
\end_inset
.
Consequently, there is no point in claiming that object stores are forming
a fundamental base for operating systems.
They are essentially just a special case, optionally augmented with some
active functionality, which in turn should be attributed to a
\emph on
separate
\emph default
layer, independently from filesystems or object stores.
\end_layout
\end_inset
of a modern OS.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In comparison, creating a different interface for an
\emph on
already existing
\emph default
sub-functionality, and optionally adding some metadata harvesters and filters,
requires lower
\begin_inset Foot
status open
\begin_layout Plain Layout
Roughly, computer science students should be able to do that after a one-semester
OS course.
\end_layout
\end_inset
skills and effort.
\end_layout
\begin_layout Itemize
Several less-used functionalities, like
\series bold
hardlinks
\series default
etc.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Obviously, these functionalities are
\emph on
lost
\emph default
at the object layer and/or at the latest at the exports interface.
Thus we have identified a Dijkstra regression.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
As explained in the detail box:
\series bold
trivial differences
\series default
in an interface, such as usage of intermediate file handles / or not, or
near-trivial
\series bold
representation
\series default
variants like pathnames vs object IDs, are no valid
\emph on
\begin_inset Foot
status open
\begin_layout Plain Layout
Arguing with trivial syscall combinations or trivial parameter passing can
be observed sometimes.
As a responsible manager, you should draw another conclusion: someone arguing
this way is likely fighting for a particular
\series bold
political interest
\series default
in an
\series bold
unfair
\series default
manner, and/or possibly demonstrating a
\series bold
poor skill level
\series default
.
\end_layout
\end_inset
\emph default
arguments for claiming differences in the
\emph on
abstract functionality
\emph default
in the sense of Dijkstra.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
\emph on
Real
\emph default
functionality behind object stores
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Conclusion:
\emph on
passive
\emph default
object stores are approximately nothing else but a
\series bold
special case
\series default
of filesystems.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Here is the picture from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Object-Store"
plural "false"
caps "false"
noprefix "false"
\end_inset
once again:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/functionality-object-store-vs-filesystems.fig
width 70col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
Now let us look at some
\emph on
active
\emph default
functionality of some object stores, such as automatic collection of
\series bold
rich metadata
\series default
, or filtering functionality on top of them: are suchalike functionalities
\emph on
really specific
\emph default
for object stores?
\end_layout
\begin_layout Standard
There is a clear answer: NO.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Active Functionality in Linux
\emph on
on top of
\emph default
Filesystems
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
For example,
\family typewriter
Akonadi
\family default
,
\family typewriter
miner
\family default
,
\family typewriter
baloo
\family default
, and similar standard Linux tools (and several multimedia frameworks like
\family typewriter
gstreamer
\family default
) are indexing the EXIF metadata of images, or metadata of mp3 songs, videos,
etc, residing in a classical filesystem.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not draw wrong conclusions from the fact that the classical Unix Philosophy
(see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Unix_philosophy
\end_layout
\end_inset
) has a long tradition of
\series bold
decomposing
\series default
functionality into
\series bold
separate layers
\series default
, such as the distinction between passive filesystems and active metadata
indexing.
When some object advocates are merging these separate layers into one,
and/or
\series bold
presenting
\series default
some
\series bold
impressive slides
\series default
, this is
\emph on
not
\emph default
an advantage.
On the contrary, there are disadvantages like
\emph on
hidden cartesian product multiplications
\emph default
occurring at (nested) architecture level, and possibly also in implementations.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
\emph on
Real
\emph default
implementation value of OSDs
\begin_inset Formula $\Longrightarrow$
\end_inset
business value
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
For responsibles: when certain advocates are claiming that functionality
mergers, such as more or less
\series bold
trivial combinations
\series default
of filesystem sub-functionality with some metadata harvesters, are constituting
some new product, be
\series bold
cautious
\series default
.
It is about
\series bold
\emph on
your
\emph default
money
\series default
, or about your company's money.
\end_layout
\begin_layout Plain Layout
While it might be a
\begin_inset Quotes eld
\end_inset
new
\begin_inset Quotes erd
\end_inset
product from the perspective of end customers, you should
\series bold
check
\series default
the
\series bold
technical effort
\series default
for
\begin_inset Quotes eld
\end_inset
implementing
\begin_inset Quotes erd
\end_inset
the
\begin_inset Quotes eld
\end_inset
new
\begin_inset Quotes erd
\end_inset
functionality.
There are cases where more than 90% of the functionality is already there.
When it is from OpenSource, do not pay a lot of money for some more or
less trivial adaptors.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
When more than 95% of functionality is already there
\emph on
for free
\emph default
, beware of costly blown-up architectural ill-designs, such as
\begin_inset Formula $O(n^{2})$
\end_inset
client-server BigCluster architectures.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Dijkstra's layering rules can be used as tools for analyzing this, and
for discovery of
\series bold
technical debt
\series default
by unfortunate layering, causing further cost and trouble in the long term.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
When augmented metadata functionality is present (whether actively or passively
), it should
\emph on
not
\emph default
be viewed as an integral part of object stores, but as an
\emph on
optional addendum
\emph default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Reason:
\series bold
rich metadata is
\emph on
conceptually independent
\series default
\emph default
from both filesystems and object stores.
\end_layout
\begin_layout Standard
You may wonder what is the
\emph on
damage
\emph default
caused by Dijkstra regressions at object stores.
\end_layout
\begin_layout Standard
We now look at a certain
\emph on
mis-use
\emph default
of object stores, which was unfortunately advocated by object store
advocates several years ago.
Some advocates appear to have learned from bad experiences with suchalike
setups (see examples in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
), no longer propagating suchalike mis-uses, but focusing on more
appropriate use cases for
\emph on
native
\emph default
object stores instead.
\end_layout
\begin_layout Standard
We continue by looking at the
\emph on
client part
\emph default
of distributed block devices / distributed filesystems
\emph on
on top of
\emph default
OSDs, and/or on top of distributed object stores, or similar.
\end_layout
\begin_layout Standard
\emph on
In general
\emph default
, POSIX-like semantics are
\emph on
not necessarily
\emph default
needed for
\emph on
each and every
\emph default
use case.
See section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
for some examples where POSIX or transactional databases are typically
\emph on
really needed
\emph default
.
Examples for
\emph on
unneeded
\emph default
POSIX are
\begin_inset Quotes eld
\end_inset
simpler
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
less critical
\begin_inset Quotes erd
\end_inset
use cases, like (
\emph on
parts of
\emph default
and/or
\emph on
native
\emph default
) Docker / Kubernetes applications etc, e.g.
for developers or similar customers.
\end_layout
\begin_layout Standard
Filesystem-like functionality typically needed by developers (and their
users) is, for example,
\begin_inset Quotes eld
\end_inset
directory-alike
\begin_inset Quotes erd
\end_inset
\series bold
index functionality
\series default
on
\series bold
file names
\series default
or
\series bold
object names
\series default
, or similar.
\emph on
Full
\emph default
POSIX semantics is typically only required when a certain
\series bold
parallelism degree
\series default
must be delivered, while certain types of
\series bold
race conditions
\series default
must be
\emph on
hidden
\emph default
from the end user.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
\series bold
Strict Consistency
\series default
is a
\emph on
subset
\emph default
of POSIX, and may
\series bold
remain critical
\series default
for some
\emph on
weaker
\emph default
use cases like backends for DropBox & co, or for some non-POSIX-like use
cases e.g.
like
\series bold
banking
\series default
or stock exchange
\series bold
marketplaces
\series default
etc.
Do
\emph on
not misinterpret
\emph default
the following picture where
\family typewriter
(POSIX-like)
\family default
is written in parentheses.
The parentheses do
\emph on
not imply
\emph default
that Strict Consistency can be dropped.
See section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
We start the next example picture with a more elaborate filesystem semantics
on top of pure object stores.
The following example would require POSIX compliance
\begin_inset Foot
status open
\begin_layout Plain Layout
1&1 Ionos has made the experience that a near POSIX-compliant filesystem
called
\family typewriter
nfs
\family default
did not work correctly, causing customer complaints, because it is
\emph on
not fully
\emph default
POSIX-compliant.
\end_layout
\end_inset
for some top-level applications like Apache webhosting with
\family typewriter
ssh
\family default
access, while some other applications wouldn't necessarily require it:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/ceph-layering-client.fig
scale 50
\end_inset
\end_layout
\begin_layout Standard
\noindent
It should catch your eyes that both block-device and filesystem functionality
is re-appearing once again, although it had been already implemented at
OSD level.
Obviously, there are two more Dijkstra regressions.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not over-stress the fact that now we are creating
\emph on
distributed
\emph default
block-devices, or
\emph on
distributed
\emph default
filesystems in place of local ones.
This does
\emph on
not imply
\emph default
that a
\family typewriter
BigCluster
\family default
architecture is needed on top of an
\begin_inset Formula $O(n^{2})$
\end_inset
storage network, or that
\series bold
random replication
\series default
inducing further problems and serious reliability problems (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is needed.
There are near-trivial alternatives at architecture level, see
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
There is another (fourth) Dijkstra regression in further sub-layers, not
depicted here.
Distributed block devices are typically storing 4k sectors or similar
\begin_inset Foot
status open
\begin_layout Plain Layout
Mapping of multiple 4k sectors onto a smaller number of bigger objects (e.g.
128k) opens up another
\series bold
tradeoff
\series default
, called
\series bold
false sharing
\series default
.
This can lead to serious performance degradation of highly random workloads.
\end_layout
\end_inset
\series bold
fixed-size
\series default
entities in the object store, although objects are capable of
\series bold
varying sizes
\series default
.
Thus objects and their
\emph on
dynamic key indirection mechanisms
\emph default
are
\begin_inset Quotes eld
\end_inset
misused
\begin_inset Quotes erd
\end_inset
for a restricted use case where array-like virtual data structures would
be sufficient.
When some petabytes of block device data are created in such a way, a
\series bold
massive overhead
\begin_inset Foot
status open
\begin_layout Plain Layout
For example, an
\family typewriter
xfs
\family default
inode has a typical size of 256 bytes.
When each 4k sector of a distributed block device is stored as 1 object
in an
\family typewriter
xfs
\family default
filesystem consuming 1 inode, there is not only noticeable space overhead.
In addition, random access by large application working sets will need at
least two seeks in total (inode + sector content).
Disregarding caching effects, this just doubles the needed worst-case IOPS.
When taking the lookup functionality into account, the picture will worsen
once again.
\end_layout
\end_inset
\series default
is induced.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
As explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
, do not place Strictly Consistent filesystems and/or object stores on top
of Eventually Consistent object stores.
Suchalike is very
\series bold
dangerous
\series default
at
\series bold
risk
\series default
level.
Even when you would have the time (measured in
\emph on
decades
\emph default
) and the money and the top-grade developer skills to get this implemented
and tested for enterprise grade and rolled out to operations, you could
be investing into a
\emph on
Dijkstra regression
\emph default
.
Other aspects are in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Negative-Example:-directory structures over eventually consistent objects"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
Some damages caused (or at least
\emph on
supported
\emph default
) by Dijkstra regressions:
\end_layout
\begin_layout Itemize
\series bold
Risk
\series default
from ill-belief that Eventually Consistent would be sufficient for a certain
use case, and/or
\series bold
risk
\series default
from stacking Strictly Consistent (hidden) sub-systems
\emph on
on top of
\emph default
other Eventually Consistent (hidden) sub-systems.
\end_layout
\begin_layout Itemize
\series bold
Increased investment
\series default
.
Further reasons like doubled effort are explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Itemize
\series bold
Increased operational cost
\series default
, both manpower and electrical power.
Example: certain Ceph OSD implementations have been estimated as roughly
consuming 1 GHz CPU power and 1 GB RAM per spindle.
Even when newer versions are implemented somewhat more efficiently, there
remains architectural Dijkstra overhead as explained above.
\end_layout
\begin_layout Itemize
\series bold
Decreased reliability
\series default
/
\series bold
increased risk
\series default
, simply caused by
\series bold
additional complexity
\series default
introduced by Dijkstra regressions.
Further reasons are explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Itemize
\series bold
Decreased total performance
\series default
, simply induced by regression overhead.
Some more reasons can be found in sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Itemize
\series bold
Limited scalability
\series default
as explained in sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Scalability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
is further worsened by Dijkstra regressions.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Positive Example: ShaHoLin storage + application stack
\begin_inset CommandInset label
LatexCommand label
name "par:Positive-Example:-ShaHoLin"
\end_inset
\end_layout
\begin_layout Standard
ShaHoLin = Shared Hosting Linux at 1&1 Ionos.
It is a
\series bold
managed product
\series default
, i.e.
the sysadmins can login anywhere as
\family typewriter
root
\family default
.
Notice that this has some influence on the architecture.
In general, layers dealing with
\emph on
unmanaged products
\emph default
need to be constructed somewhat differently.
\end_layout
\begin_layout Standard
ShaHoLin's architecture does not suffer from Dijkstra regressions, since
each layer is adding new functionality, which is also available at higher
layers, or at least provides functionality.
\end_layout
\begin_layout Standard
Because of this, and by using a scalability principle called Sharding (see
sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
), architectural properties are
\series bold
close to optimal
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
ShaHoLin Layering
\series default
\begin_inset CommandInset label
LatexCommand label
name "ShaHoLin-Layering"
\end_inset
\end_layout
\end_inset
The following bottom-up description explains some granularity considerations
at each layer:
\end_layout
\begin_layout Enumerate
Hardware-based RAID-6, with an internal sub-architecture based on SAS networking
\begin_inset Foot
status open
\begin_layout Plain Layout
Certain advocates are overlooking the fact that SAS busses are a small network,
just using the SAS protocol in place of TCP/IP.
When necessary, the SAS network can be dynamically extended, e.g.
by addition of external enclosures.
\end_layout
\end_inset
.
The newest LSI-based chip generation supports 8 GB fast BBU cache, which
has RAM speed.
Depending on the number of disks, this creates one big block device per
RAID set.
Current dimensioning (2019) is between
\begin_inset Formula $\approx$
\end_inset
15 TB on 10 fast spindles in a small pizza box, and 48 large-capacity slower
spindles with a total capacity of
\begin_inset Formula $\approx$
\end_inset
300 TB, spread over 3 RAID sets.
This is somewhat conservative; with current technology higher capacity
would be possible, at the cost of lower IOPS.
\end_layout
\begin_layout Enumerate
LVM = Logical Volume Management.
This is provided by the dm = device mapper infrastructure of the Linux
kernel, and by the standard LVM2 userspace tools.
It is sub-divided into the following sub-layers:
\end_layout
\begin_deeper
\begin_layout Enumerate
PV = Physical Volumes, one per RAID set, with practically the same size
/ granularity.
\end_layout
\begin_layout Enumerate
VG = Volume Group.
All PVs
\begin_inset Formula $\cong$
\end_inset
RAID sets are merged into one local storage pool.
Typical sizes are between 15 and 300 TB, depending on hardware class.
Very old hardware may have only
\begin_inset Formula $\approx$
\end_inset
3 TB, but these machines should go EOL soon.
\end_layout
\begin_layout Enumerate
LV = Logical Volumes, one per VM
\begin_inset Formula $\cong$
\end_inset
LXC container instance.
Typical sizes are between
\begin_inset Formula $\approx$
\end_inset
300 GB and
\begin_inset Formula $\approx$
\end_inset
40 TB.
When necessary, the size can be dynamically increased during runtime.
Typical number of LVs per physical machine (also called
\series bold
hypervisor
\series default
) is between 3 and 14 (or exceptionally only 1 on very small old hardware).
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The number of LVs per hypervisor can change during operations by moving
around some LVs
\begin_inset Formula $\cong$
\end_inset
VMs
\begin_inset Formula $\cong$
\end_inset
LXC containers via Football (see
\family typewriter
football-user-manual.pdf
\family default
).
This is used for multiple purposes, such as decommissioning of old hardware,
or load balancing, or for physical reorganizations, e.g.
defragmentation of racks in some of the datacenters.
\end_layout
\end_deeper
\begin_layout Enumerate
Replication layer for achieving geo-redundancy (see sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Requirements-for-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
) using the OpenSource project MARS (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://github.com/schoebel/mars/docu/mars-user-manual.pdf
\end_layout
\end_inset
).
MARS is the base for planned handover, and for unplanned failover.
Each LV can be switched over individually (ability for butterfly, see
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
In addition to geo-redundancy, MARS provides the base for
\series bold
LV migration during operations
\series default
via Football (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://github.com/schoebel/mars/docu/football-user-manual.pdf
\end_layout
\end_inset
).
The number of replicas is typically between 2 and 4, where higher replication
degrees are only used temporarily, e.g.
during a migration, or for compensation of near-defective / unreliable
hardware instances.
\end_layout
\begin_layout Enumerate
Filesystem layer, typically
\family typewriter
xfs
\family default
mounted locally
\begin_inset Foot
status open
\begin_layout Plain Layout
Only on a few old machines, which are shortly before EOL,
\family typewriter
/dev/mars/vm_name
\family default
is exported via iSCSI and imported into some near-diskless clients.
This is an old architectural model, showing worse reliability (more components
which can fail), and higher cost (more hardware, more power, more rackspace,
etc).
Due to iSCSI, IOPS are much worse than with pure
\family typewriter
LocalStorage
\family default
.
Contrary to some old belief, it is
\emph on
not
\emph default
much more flexible.
The ability for butterfly is already sufficient for rare exceptional overload
situations, or for sporadic hardware failures.
Since Football also works on the old iSCSI-based architecture, load balancing
etc does not need to be done via iSCSI.
\end_layout
\end_inset
.
This layer is extremely important for getting the granularities right:
typically, each xfs instance contains several million customer inodes
and/or files.
In some cases, the number can climb up to several tens of millions.
Reason: shared webhosting has to deal with myriads of extremely small customer
files, intermixed with a lower number of bigger files, up to terabytes
in a handful of scarce corner cases.
\end_layout
\begin_layout Enumerate
LXC containers
\begin_inset Formula $\cong$
\end_inset
VMs.
Each of them has a publicly visible customer IP address, which is shared
by all of its customers (typically a few hundred up to several tens of thousands
per container).
Upon primary handover / failover, this IP is handed over to the sister
datacenter via BGP = Border Gateway Protocol.
Upon Football migrations, this IP is also retained, but just automatically
routed to a different physical network segment.
\end_layout
\begin_layout Enumerate
Application layer.
Here are only some important highlights:
\end_layout
\begin_deeper
\begin_layout Enumerate
Apache, spawning PHP via suexec.
One Apache instance per LXC container is typically sufficient for serving
thousands or tens of thousands of customers.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Some surprising detail:
\family typewriter
fastcgi
\family default
is deliberately
\emph on
not
\emph default
used at the moment, because security /
\series bold
user isolation
\series default
is considered much more important than a few
\emph on
permille(!)
\emph default
of performance gain by saving a few
\family typewriter
fork()
\family default
+
\family typewriter
execve()
\family default
system calls.
While the Linux kernel is highly optimized for them, typical PHP applications
like Wordpress are poorly optimized, for example by clueless runtime inclusion
of
\begin_inset Formula $\approx$
\end_inset
120 PHP include files, cluelessly repeated for each and every PHP request.
Even when
\family typewriter
OpCache
\family default
is enabled, this costs much more than any potential savings by
\family typewriter
fastcgi
\family default
.
\end_layout
\begin_layout Enumerate
EhB = Enhanced Backup.
This is a 1&1-specific proprietary solution, supporting a grand total of
\begin_inset Formula $\approx$
\end_inset
10 billion inodes.
It is also organized via the Sharding principle, but based on a different
granularity.
In order to parallelize daily incremental-forever backups, several measures
are taken.
Among others, customer home directories are grouped into 49 subdirectories
called
\emph on
hashes
\emph default
in 1&1-slang.
Both backups and restores may run in parallel, independently for each hash,
and distributed over multiple shards.
Hashes are thus forming an
\series bold
intermediate granularity
\series default
between xfs instances, and a grand total of
\begin_inset Formula $\approx$
\end_inset
9 million customer home directories.
\end_layout
\end_deeper
\end_inset
\end_layout
\begin_layout Subsection
Negative Example: Inappropriate Replication Layering
\begin_inset CommandInset label
LatexCommand label
name "subsec:Inappropriate-Replication-Layering"
\end_inset
\end_layout
\begin_layout Standard
Several people have independently tried to use MARS within VMs.
This may look like a reasonable idea, but has a number of disadvantages:
\end_layout
\begin_layout Enumerate
It contradicts Dijkstra's layering rules.
\end_layout
\begin_layout Enumerate
It ignores the operational recommendations for MARS.
\end_layout
\begin_layout Paragraph
VM replication and Dijkstra.
\end_layout
\begin_layout Standard
Please be aware that Dijkstra's layering is not a restriction of MARS, but
a fundamental issue for
\emph on
any
\emph default
kind of replication mechanism.
\end_layout
\begin_layout Standard
In general, creation of a
\series bold
separate replication layer at bare metal
\series default
is the strongly recommended solution by Dijkstra, e.g.
using dedicated storage boxes, or directly replicating at hypervisor hardware
when using local storage (e.g.
at ShaHoLin).
\end_layout
\begin_layout Standard
Dijkstra's layering rules are
\emph on
implying
\emph default
that an actively running VM can never replicate
\emph on
itself
\emph default
into
\emph on
another
\emph default
VM, at least not its entire
\begin_inset Foot
status open
\begin_layout Plain Layout
Being unable to replicate the
\emph on
entire
\emph default
VM state is also a violation of the blackbox principle.
\end_layout
\end_inset
internal state.
Trying to do so would lead to an
\series bold
endless nesting recursion
\begin_inset Foot
status open
\begin_layout Plain Layout
A replicator replicating itself would change the state of the VM by its
replication activity, triggering another replication, which in turn would
trigger another replication, and so on.
\end_layout
\end_inset
\series default
of runtime state.
Dijkstra's rules are clearly forbidding cyclic layering.
Therefore, replication must always be considered as a
\emph on
separate
\emph default
layer, and not intermixed with other layers.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
This isn't specific to MARS and its heavy statekeeping in
\family typewriter
/mars
\family default
.
Dijkstra's rules also apply to
\emph on
any other
\emph default
replication system.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In addition to formal layering rules, resource management can easily become
a hell when based on virtual resources instead of on physical ones.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Another outbreak of the hell will happen on highly over-provisioned VM
farms when masses of VMs are starting their (geo-redundant)
\series bold
recovery phase
\series default
after a disaster (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Requirements-for-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
)
\emph on
all in parallel
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
In contrast, a single
\family typewriter
/mars
\family default
instance at storage or hypervisor layer will
\emph on
automatically
\emph default
limit the sync parallelism degree to some reasonable value.
In addition, sync is easily controllable by sysadmins.
\end_layout
\end_inset
.
This is not limited to DRBD or MARS full-sync, it is also a problem for
\emph on
any other
\emph default
replication system trying to be operated in
\emph on
overprovisioned
\emph default
VM environments.
Hypervisor-level sysadmins have no control over internals of external customer
VMs, and thus cannot temporarily suspend the massive IO and network traffic.
Limiting the IO is no good solution, since it will also sacrifice ordinary
application performance.
\series bold
Simply do replication right
\series default
, by implementing it at the
\emph on
right
\emph default
layer of the Dijkstra hierarchy.
\end_layout
\begin_layout Paragraph
Operational environment conditions for MARS.
\end_layout
\begin_layout Standard
With respect to MARS: not only for performance reasons and for resource
allocation reasons, MARS is
\emph on
explicitly
\emph default
constructed for running on
\series bold
bare metal
\series default
\emph on
solely
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
A minor exception is
\emph on
functional component testing
\emph default
inside of KVM (as opposed to end-to-end system testing, aka integration
testing, and as opposed to non-functional testing).
This can be done
\emph on
inside
\emph default
of KVM, provided that
\family typewriter
/dev/mars/mydata
\family default
is not used for further sub-virtualization (except
\emph on
lightweight
\emph default
containers like Docker & co), and only for non-critical
\emph on
test loads
\emph default
.
\end_layout
\end_inset
.
A single storage-level or hypervisor-level MARS instance can
\emph on
share
\emph default
a single
\family typewriter
/mars
\family default
filesystem instance for multiple resources, while a multitude of per-VM
\family typewriter
/mars
\family default
instances would induce a waste of storage space by
\emph on
factors
\emph default
.
See also description of hardware requirements in
\family typewriter
mars-user-manual.pdf
\family default
.
\end_layout
\begin_layout Paragraph
Sysadmin Perspective.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Why Replication inside of VMs is a
\emph on
Bad Idea
\emph default
\end_layout
\end_inset
I never heard of anyone who tried to use DRBD
\emph on
productively
\emph default
inside of VMs.
Apparently, sysadmins understand that this would be a bad idea,
\series bold
worsening performance
\series default
over-proportionally and
\series bold
\emph on
unpredictably
\series default
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Theoretical foundation: queueing theory.
VMs are introducing
\emph on
several
\emph default
queues into workloads, which did not exist without them.
In addition, it becomes impossible to guarantee a maximum service time.
\end_layout
\end_inset
, since the passive side would have to react in
\emph on
realtime
\emph default
, and for each single IO request.
People seem to understand that
\series bold
realtime behaviour
\series default
cannot be expected from ordinary VMs.
Often they already had a bad experience, such as huge performance differences
between para-virtualized device drivers and physical hardware drivers,
both running on so-called
\begin_inset Quotes eld
\end_inset
virtual hardware
\begin_inset Foot
status open
\begin_layout Plain Layout
The term
\begin_inset Quotes eld
\end_inset
virtual hardware
\begin_inset Quotes erd
\end_inset
is a contradiction in itself.
It simply isn't hardware at all.
Hardware is something which creates an
\begin_inset Quotes eld
\end_inset
Ouch
\begin_inset Quotes erd
\end_inset
when falling down onto your feet.
\end_layout
\end_inset
\begin_inset Quotes erd
\end_inset
.
Sometimes, the latter cannot run
\emph on
reliably
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Standard problem: missed interrupts, or interrupts not delivered in-time.
\end_layout
\end_inset
under KVM/qemu, other than for non-critical or minor workstation loads.
Even then, they often work as a CPU burner.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
For unknown reasons, a few people seem to
\emph on
expect
\begin_inset Foot
status open
\begin_layout Plain Layout
From a management perspective, this looks like a
\emph on
broken expectation management.
\end_layout
\end_inset
\emph default
that MARS would be able to work miracles there.
\end_layout
\end_inset
\end_layout
\begin_layout Paragraph
User Perspective.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
End users messing around with IPs
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
I don't know of any virtualization platform where ordinary VM users can
easily configure and use BGP themselves.
Therefore, geo-redundant replication setups under VMs would
\series bold
lack location transparency
\series default
, and provide a
\series bold
crippled user experience
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Leaving geo-replication and BGP handover to be managed by end users would
be a bad idea.
Apart from skills and from a management hell to be mastered by end users,
it would be a
\series bold
waste of IP addresses
\series default
.
When
\emph on
external
\emph default
VM customers would need to control BGP themselves, at least 3 public IP
addresses would be needed: each of both non-location-transparent VMs running
in parallel would require at least 1 public IP for external
\family typewriter
ssh
\family default
access etc, which is 2 in total, and a third public IP for BGP handover,
carrying the workload traffic.
Notice that public IPv4 addresses are a scarce resource.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A good virtualization platform must provide
\series bold
full location transparency
\series default
of the VMs, without user intervention.
Only a single public IP per VM is then required, which automatically follows
the current geo-location of
\emph on
the
\emph default
single per-user
\begin_inset Foot
status open
\begin_layout Plain Layout
At the passive / secondary side, only the LV replica is updated.
No VM is started there.
Thus no additional VM is requiring CPU and RAM resources.
In contrast, 2 non-location-transparent VMs responsible for replication
would essentially
\series bold
double the necessary compute resources
\series default
.
In addition, total disk space allocation for multiple
\family typewriter
/mars
\family default
instances instead of a shared one would be much higher.
All of these would result in a
\series bold
massive cost increase
\series default
.
\end_layout
\end_inset
VM instance running at the same time.
This is already standard for local VM handover in the same datacenter.
No serious VM user would accept manual IP renumbering work, or responsibility
for routing changes, when his VM is suddenly running on a different hypervisor,
just because another customer used some more RAM, or because some hardware
went defective.
For unknown reasons, a few people are however
\emph on
expecting
\begin_inset Foot
status open
\begin_layout Plain Layout
From a management perspective, this looks like a
\emph on
broken expectation management.
\end_layout
\end_inset
\emph default
a similar effort and similar skills from their (internal or external) VM
customers as soon as geo-redundancy comes into play.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
BGP or a sister protocol is a
\emph on
must
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
The 1&1 Ionos ShaHoLin setup (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Positive-Example:-ShaHoLin"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is a striking example that BGP and its control by hypervisors is possible
at large scale.
\end_layout
\end_inset
for geo-redundant VMs.
It should be automatically controlled by the storage or by the hypervisor
layer, instead of by end users.
When storage and hypervisors are anyway managed by sysadmins, users should
not notice where their VM is currently running (see
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Location-transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
In addition, managed geo-control may become a sold feature.
Customers can then
\emph on
trigger
\emph default
automatic handover of the geo-location with a single click (provided that
both locations are healthy).
\end_layout
\begin_layout Paragraph
Management Perspective.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\series bold
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
OPEX Cost Savings by Managed Geo-Location Transparency
\end_layout
\end_inset
\series default
When using a geo-redundant
\family typewriter
RemoteSharding
\family default
or
\family typewriter
FlexibleSharding
\family default
model, passive-side hypervisors do not carry any workload.
Thus they may be powered off, until they are needed again.
Only the corresponding passive storage boxes need to remain powered all
the time.
\end_layout
\begin_layout Plain Layout
However, this can only work when
\emph on
managed
\emph default
geo-location transparency is implemented.
Otherwise, end users would get a
\emph on
pair of
\emph default
VMs instead of a single VM, running all the time, in order to be able to
manage geo-redundancy themselves.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Manager Briefing
\end_layout
\end_inset
Never accept a proposal to use MARS or any other replication system inside
of VMs.
\end_layout
\begin_layout Plain Layout
\series bold
Insist on fully managed geo-location transparency
\series default
from the viewpoint of VM users.
It is even
\series bold
considerably cheaper
\series default
at OPEX, since unnecessary doubling of the number of concurrently running
VM instances is avoided.
\end_layout
\begin_layout Plain Layout
Do not call any VM system
\begin_inset Quotes eld
\end_inset
geo-redundant
\begin_inset Quotes erd
\end_inset
if it misses this simple standard requirement.
It should not require any political discussions at all (since local location
transparency has been standard at local VM farms for decades).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Managed BGP makes you independent from the OS running inside of VMs.
For example, Windows guests will become geo-redundant without modification.
\end_layout
\begin_layout Subsection
Potentially Negative Example: layering directory-alike structures on top
of billions of eventually consistent objects
\begin_inset CommandInset label
LatexCommand label
name "subsec:Negative-Example:-directory structures over eventually consistent objects"
\end_inset
\end_layout
\begin_layout Standard
The following example is about a
\emph on
potentially planned
\emph default
system, which
\emph on
could
\emph default
be deducible from Dijkstra and/or contemporary belief.
We are
\emph on
not discussing
\emph default
direct violations
\begin_inset Foot
status open
\begin_layout Plain Layout
In general, (distributed) object stores
\emph on
can
\emph default
be constructed without major violations of Dijkstra's rules.
However, some contemporary implementations
\emph on
may
\emph default
have some problems in this area, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Negative-Example:-object"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Important: Distributed Systems (aka loosely coupled systems) are
\series bold
much more complicated to program and operate
\series default
than tightly coupled systems.
\end_layout
\end_inset
of Dijkstra's rules (which are discussed e.g.
in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Negative-Example:-object"
plural "false"
caps "false"
noprefix "false"
\end_inset
), but here we discuss a
\emph on
potential misinterpretation
\emph default
of Dijkstra by ignoring some important
\series bold
non-functional properties
\series default
.
\end_layout
\begin_layout Standard
When directory structures are implemented in a straightforward manner on top of object
stores for
\series bold
billions of objects
\series default
, there will be a
\series bold
risk
\series default
from fundamental problems which are
\emph on
known
\emph default
by filesystem and database implementers and their experienced architects,
provided they also know the
\series bold
Theory of Databases
\series default
and/or the
\series bold
Theory of Filesystems
\series default
as published in the traditional research field about filesystems and databases.
For example, a huge bulk of academic research activity was historically
invested into
\series bold
OODB
\series default
= Object Oriented DataBases, while contemporary implementations like Cassandra
& co are typically contradicting (or ignoring) some of their results.
Another huge bulk can be found in traditional
\series bold
VLDB
\series default
= Very Large DataBases.
\end_layout
\begin_layout Standard
The following explanation is referring to
\emph on
very big
\emph default
\series bold
eventually consistent
\series default
object stores (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Object-Store"
plural "false"
caps "false"
noprefix "false"
\end_inset
), independently from local vs distributed types of object stores.
\end_layout
\begin_layout Standard
What is one of the important
\emph on
fundamental problems
\emph default
of directory-alike data structures (among a long list of other fundamental
problems to be solved)?
\end_layout
\begin_layout Standard
Situation: data structures are living
\series bold
inside of objects
\series default
which contain
\series bold
pointers or references to other objects
\series default
.
These may also contain some more pointers or references to further objects,
in a
\series bold
\emph on
transitive
\series default
\emph default
manner.
\end_layout
\begin_layout Standard
Typical userspace programmers will not notice a problem here.
For example, Java enthusiasts or Python enthusiasts are using references
all the time, but they are using their pointers in the
\emph on
virtual address space
\emph default
of a
\series bold
userspace process
\series default
.
These processes are typically
\emph on
much smaller
\emph default
than billions of objects.
In practice, their relatively few long-living objects are stored either
in
\end_layout
\begin_layout Description
(a) databases, or
\end_layout
\begin_layout Description
(b) in POSIX-aware local filesystems plus some object-oriented import /
export layers.
\end_layout
\begin_layout Standard
Result: these programmers will likely confirm that they (almost) never have
seen a
\emph on
fundamental
\emph default
problem caused by their
\series bold
persistence model
\series default
(a) or (b) where some of their relatively few objects are living for a
\emph on
longer
\emph default
time than their userspace processes are typically living.
\end_layout
\begin_layout Standard
\emph on
Why
\emph default
does such a relatively small object pointer structure typically work in
practice, at least when using (a) or (b) (with few exceptions)?
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Hints for Risk Reduction
\end_layout
\end_inset
(a) Traditional databases are following the well-known
\series bold
ACID
\series default
=
\series bold
A
\series default
tomicity +
\series bold
C
\series default
onsistency +
\series bold
I
\series default
solation +
\series bold
D
\series default
urability principle.
There is an easily usable
\series bold
commit
\series default
operation which ensures this by definition.
\end_layout
\begin_layout Plain Layout
(b) POSIX filesystems are lacking the
\series bold
I
\series default
=
\series bold
I
\series default
solation property, but they can typically provide sufficient
\series bold
A
\series default
=
\series bold
A
\series default
tomicity +
\series bold
C
\series default
=
\series bold
C
\series default
onsistency +
\series bold
D
\series default
=
\series bold
D
\series default
urability properties for important use cases.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Reason: some ACID databases must be able to run
\emph on
on top of
\emph default
some classical filesystems.
Thus some members of the OS = Operating System community were
\emph on
traditionally
\emph default
supporting the needs of some members of the database community (and some
old computer science members were even members of
\emph on
both
\emph default
communities in the good old times).
More details can be found in the old literature.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Now comes the 1 million € or $ question:
\end_layout
\begin_layout Description
\noindent
(c)
\series bold
\emph on
What is the problem
\series default
\emph default
if you want to
\emph on
reliably
\emph default
store pointer-like or other referential structures in millions or billions
of
\series bold
eventually consistent objects
\series default
, containing pointers or references to other eventually consistent objects?
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
We leave the answer to this question as an
\emph on
exercise
\emph default
to the reader.
Hint: large parts of a correct answer are already mentioned above.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Another fact is also important: by definition, true
\begin_inset Quotes eld
\end_inset
eventually consistent
\begin_inset Quotes erd
\end_inset
does deliberately
\emph on
not provide
\emph default
(or
\emph on
omit for performance reasons
\emph default
) POSIX-like semantics and/or POSIX operations like global
\family typewriter
sync()
\family default
or their weaker sisters
\family typewriter
fsync()
\family default
or
\family typewriter
msync()
\family default
, at least for each
\emph on
client instance
\emph default
(which is often equivalent to
\begin_inset Quotes eld
\end_inset
in the whole cluster
\begin_inset Quotes erd
\end_inset
), and at least
\emph on
by default
\emph default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Required Skills for Projects
\series default
\size footnotesize
using References on top of Eventually Consistent Object Stores
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do
\series bold
not
\series default
accept important enterprise-level projects which place
\series bold
masses
\series default
of complex pointer or reference structures on top of
\series bold
eventually consistent object
\series default
stores.
You may be confronted with some of the following
\emph on
potential problems
\emph default
:
\end_layout
\begin_layout Itemize
\series bold
Dangling Pointers
\series default
, also called
\series bold
Referential Integrity
\series default
in databases (references to non-existing or lost or even
\emph on
wrong
\emph default
objects, potentially belonging to other customers, thus breaching privacy
/ security / isolation / etc).
\end_layout
\begin_layout Itemize
\series bold
Dangling Objects
\series default
(unused / unreachable objects forgotten to free, filling up your storage
space over time).
\end_layout
\begin_layout Itemize
\series bold
False Sharing
\series default
problems.
\end_layout
\begin_layout Itemize
\series bold
Endless Loops
\series default
/
\series bold
Cycles
\series default
in long reference chains.
\end_layout
\begin_layout Itemize
When so-called
\begin_inset Quotes eld
\end_inset
active-active
\begin_inset Quotes erd
\end_inset
(see section
\begin_inset CommandInset ref
LatexCommand vref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
) parallel writes to
\emph on
logically shared data
\emph default
in distributed object stores are promised to be possible: beware of
\series bold
split brain
\series default
(see CAP theorem) and/or some necessary
\series bold
locking
\series default
(e.g.
usage of
\series bold
distributed lock managers
\series default
) leading to
\series bold
systematic deadlocks
\series default
and/or
\series bold
Transaction Aborts
\series default
similar to OODBs, see also
\series bold
Transaction Theory
\series default
in the database literature.
Notice that
\series bold
Distributed COW
\series default
= Copy on Write may correspond to some known transactional abort behaviour
of
\series bold
MVDB
\series default
=
\series bold
MultiVersion DataBases
\series default
, even when
\emph on
not distributed
\emph default
, also described in the literature on Transaction Theory.
There are known solutions, but they may increase the total effort for reaching
certain SLAs.
\end_layout
\begin_layout Itemize
Further classical problems from the old database and filesystem literature,
like unexpected data loss requiring an
\emph on
equivalent
\emph default
of
\series bold
database recovery
\series default
and/or
\family typewriter
fsck
\family default
.
Beware of running suchalike on some billions of objects in a BigCluster
\series bold
shared pool
\series default
.
Notice that sharding architectures are requiring less effort, because there
is no
\series bold
error propagation
\series default
between relatively small pools.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Exception: under certain circumstances, you
\emph on
might
\emph default
accept such a project proposal when your staff has the
\series bold
proven skills
\series default
\emph on
and
\emph default
\series bold
experiences with
\series default
(a) writing
\emph on
and
\emph default
maintaining an
\series bold
ACID database
\series default
, or (b) writing
\emph on
and
\emph default
maintaining at least a
\series bold
\emph on
journalling(!)
\emph default
filesystem
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
If so, insist on an acceptable
\series bold
project plan
\series default
with appropriate
\series bold
fallback strategies
\series default
, and on competitive TCO.
There may be further requirements from company level, such as geo-redundancy,
typically imposing further conditions and implying
\series bold
serious pitfalls
\series default
(cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Requirements-for-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\end_inset
\end_layout
\begin_layout Section
Granularity at Architecture
\begin_inset CommandInset label
LatexCommand label
name "sec:Granularity-at-Architecture"
\end_inset
\end_layout
\begin_layout Standard
There are several alternative implementation technologies for (cloud) storage
systems.
They can be classified according to the granularity of their basic transfer
units.
\end_layout
\begin_layout Subsection
Granularities for Achieving Strict Consistency
\begin_inset CommandInset label
LatexCommand label
name "subsec:Granularities-for-Strict"
\end_inset
\end_layout
\begin_layout Standard
\emph on
End users
\emph default
are
\emph on
always
\emph default
expecting
\series bold
strict consistency
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
For an overview of consistency models, see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Consistency_model
\end_layout
\end_inset
.
While strict consistency is the most
\begin_inset Quotes eld
\end_inset
natural
\begin_inset Quotes erd
\end_inset
one as expected by humans, most other models are only of academic interest.
\end_layout
\end_inset
from a storage system.
Whenever they are
\begin_inset Quotes eld
\end_inset
saving
\begin_inset Quotes erd
\end_inset
several
\begin_inset Quotes eld
\end_inset
things
\begin_inset Quotes erd
\end_inset
to a (cloud) storage system in a particular order, they are expecting to
always retrieve the
\emph on
newest
\emph default
version of each of them, afterwards.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Anyone who claims that
\begin_inset Quotes eld
\end_inset
eventually consistent
\begin_inset Quotes erd
\end_inset
& co would be sufficient for
\emph on
end users
\emph default
: beware of
\series bold
reputation problems
\series default
, e.g.
articles in
\series bold
test magazines
\series default
/ postings in
\series bold
social media
\series default
/
\series bold
shitstorms
\series default
/ etc.
We are talking about
\emph on
expectations
\emph default
from
\emph on
end-user customers
\emph default
.
The
\series bold
customer is king
\series default
and thus decides on your success!
\end_layout
\begin_layout Standard
Consequences at technical level: here are the most important architectural
differences between object-based storages and LV-based (Logical Volume)
storages, provided that you
\emph on
want to cover comparable use cases
\emph default
:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="14" columns="3">
<features tabularvalignment="middle">
<column alignment="left" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\series bold
Strict Consistency required
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Objects
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
LVs
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Granularity
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
small (typically KiB)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
huge (several TiB)
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Number of instances
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
very high
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
low to medium
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\emph on
Native
\emph default
consistency model
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
weak
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
strict
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Typical access
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
random keys
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
named
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Update in place
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no / yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Resize during operation
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no / yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Object support
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
native
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
on top of
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
LV support
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
on top of
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
native
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Filesystem support
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
on top of
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
on top of
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Scalable
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
at cluster
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
both cluster and grid
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Location distances
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
per datacenter / on campus
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
long distances possible
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Centralized pool management
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
per cluster
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Football uniting clusters
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Easy sharding support
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
cumbersome
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
As explained in sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Negative-Example:-directory structures over eventually consistent objects"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
, there are
\emph on
known problems
\emph default
with object storage's
\series bold
consistency model
\series default
when higher aggregates like LVs or filesystems are
\emph on
requiring
\emph default
\series bold
strict consistency
\series default
, but are built on top of objects which are only
\emph on
eventually consistent
\emph default
due to their inherent nature.
\end_layout
\begin_layout Subsection
Granularity for Achieving Eventually Consistent
\begin_inset CommandInset label
LatexCommand label
name "subsec:Granularity-for-Eventually"
\end_inset
\end_layout
\begin_layout Standard
This section is
\emph on
not
\emph default
about expectations from end users.
It is about implementation-specific
\series bold
weak consistency models
\series default
, such as
\series bold
eventually consistent
\series default
, see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Consistency_model#Eventual_consistency
\end_layout
\end_inset
, or several other weak consistency models and their variants.
\end_layout
\begin_layout Standard
The following table reflects use cases for
\begin_inset Quotes eld
\end_inset
native
\begin_inset Quotes erd
\end_inset
object storage, where eventually consistent (or similar) is sufficient,
or at least
\emph on
claimed
\emph default
to be sufficient:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="11" columns="3">
<features tabularvalignment="middle">
<column alignment="left" valignment="top" width="0pt">
<column alignment="center" valignment="top" width="0pt">
<column alignment="center" valignment="top" width="0pt">
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\series bold
\size small
Eventually Consistent sufficient
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Objects
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
LVs
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Granularity
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
medium (1 object = 1 file)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
huge (several TiB)
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Number of instances
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
medium to very high
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
low to medium
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\emph on
Typical
\emph default
access
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
random keys
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
named + random
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Update in place
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
possible, less common
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Object support
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
native
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
on top of
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Scalable
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
at cluster
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
both cluster and grid
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Location distances
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
per datacenter / on campus
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
long distances possible
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Typical operation mode
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
active - active
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
active - passive
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Centralized pool management
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
per (big) cluster
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Football uniting clusters
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Easy sharding support
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
possible but expensive
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Section
Flexibility of Handover / Failover Granularities
\begin_inset CommandInset label
LatexCommand label
name "subsec:Flexibility-of-Failover"
\end_inset
\end_layout
\begin_layout Standard
This section is also relevant for
\series bold
networking departments
\series default
and their
\series bold
management
\series default
in a bigger enterprise.
\end_layout
\begin_layout Standard
There are three important properties of replication handover / failover:
\end_layout
\begin_layout Enumerate
\series bold
Timing behaviour
\series default
: how fast can it be done?
\end_layout
\begin_layout Enumerate
What is the
\series bold
granularity
\series default
: which are the items that can be switched?
\end_layout
\begin_layout Enumerate
Physical
\series bold
distance
\series default
: both geo-redundancy (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
) and cross-datacenter replication, even when the latter is only over short
distances, are requiring different
\series bold
network support
\series default
than simple handover / failover in the same rack.
\end_layout
\begin_layout Standard
All of these aspects can only be reasonably implemented via Location Transparency.
Location Transparency has been introduced in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Location-transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Here we look into some details of how to implement it.
\end_layout
\begin_layout Subsection
Where to implement Location Transparency
\begin_inset CommandInset label
LatexCommand label
name "sec:Where-implement-Location-Transparency"
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Where location transparency makes sense or not
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In general, it is not necessary to implement location transparency
\emph on
everywhere
\emph default
, for each and every single component / subsystem.
The art of system architecture consists of knowing
\end_layout
\begin_layout Enumerate
\noindent
where it is
\emph on
needed
\emph default
,
\end_layout
\begin_layout Enumerate
\noindent
where it is
\emph on
beneficial
\emph default
for future growth / future requirements in multiple dimensions,
\end_layout
\begin_layout Enumerate
\noindent
where it is (or will be) too expensive to pay off in the mid-term future,
using current technology, but nevertheless
\emph on
cheap provisions for its later introduction
\emph default
can be prepared, and
\end_layout
\begin_layout Enumerate
\noindent
where its lack can be easily (or even
\emph on
trivially
\emph default
) compensated by location transparency at another layer, such that a particular
component does not need to be constructed with location transparency, but
nevertheless the
\emph on
overall system
\emph default
is sufficiently location transparent, and
\end_layout
\begin_layout Enumerate
when there are multiple choices
\emph on
where
\emph default
to implement it, knowing which will be the best one for a family of use
cases, and finally
\end_layout
\begin_layout Enumerate
\emph on
how
\emph default
to implement it.
For example, a common misconception is to believe that storage must always
reside at a storage network.
Football (see
\family typewriter
football-user-manual.pdf
\family default
) demonstrates that sufficient
\begin_inset Foot
status open
\begin_layout Plain Layout
There could be arguments that Football's background migrations might be
too slow or might take too long for certain use cases.
Notice that
\family typewriter
BigCluster
\family default
also needs data migration during operations, e.g.
upon replacement of physical disks.
When the
\family typewriter
FlexibleSharding
\family default
model (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:FlexibleSharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is combined with Football, it provides practically the same timescale
and flexibility as
\family typewriter
BigCluster
\family default
.
\end_layout
\end_inset
location transparency can be achieved on top of local storage, while expensive
and performance-eating dedicated storage networks
\begin_inset Foot
status open
\begin_layout Plain Layout
Anyway, realtime storage networks cannot span long distances.
Thus they are not suitable for achieving location transparency in a geo-redunda
nt setup.
\end_layout
\end_inset
are not generally necessary for achieving location transparency.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In the definition of Cloud Storage in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
, the requirement
\begin_inset Quotes eld
\end_inset
act as one
\begin_inset Quotes erd
\end_inset
is
\emph on
implying
\emph default
some appropriate type of location transparency of the resources.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Consequence: any system not sufficiently implementing location transparency
of the customer's resources (visible layer from outside) should not be
called
\begin_inset Quotes eld
\end_inset
Cloud Storage
\begin_inset Quotes erd
\end_inset
or a
\begin_inset Quotes eld
\end_inset
Cloud Product
\begin_inset Quotes erd
\end_inset
when location transparency is not sufficient from the viewpoint of customers.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
In the rest of this section, we concentrate on cross-datacenter replication
scenarios, including geo-redundancy.
\end_layout
\begin_layout Subsection
Granularity of Cross-Datacenter and Geo-Redundant Handover / Failover
\end_layout
\begin_layout Standard
Typical management buzzwords like DR = Disaster Recovery or CDP = Continuous
Data Protection are neglecting the
\emph on
granularity
\emph default
of the data units to be protected by replication, and the ability for quick
service
\begin_inset Foot
status open
\begin_layout Plain Layout
In the table,
\begin_inset Quotes eld
\end_inset
Backup
\begin_inset Quotes erd
\end_inset
means that only the data is replicated into a different datacenter.
In contrast,
\begin_inset Quotes eld
\end_inset
Replication
\begin_inset Quotes erd
\end_inset
means that both the data and the necessary compute resources are available
in two datacenters.
See also sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Backup"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Replication-vs-Backup"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
handover due to
\series bold
maintenance
\series default
reasons such as power supply maintenance.
The following table explains some differences when granularity aspects
like replication at physical volume (PV) aka physical disk level versus
logical volume (LV) or filesystem level are taken into account:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="17" columns="3">
<features tabularvalignment="middle">
<column alignment="left" valignment="top" width="0pt">
<column alignment="left" valignment="top" width="0pt">
<column alignment="left" valignment="top" width="0pt">
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Method
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Disadvantages
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Advantages
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="3" alignment="left" valignment="middle" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Backup at FS level
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no real data consistency
\end_layout
\end_inset
</cell>
<cell multirow="3" alignment="left" valignment="middle" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
logical copy
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no handover / failover
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no load balancing
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no CDP / high MTTR
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="3" alignment="left" valignment="middle" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Backup via FS snapshots
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
handover cumbersome
\end_layout
\end_inset
</cell>
<cell multirow="3" alignment="left" valignment="middle" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
some point-in-time consistency
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no real load balancing
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
medium to high MTTR
\end_layout
\end_inset
</cell>
<cell multirow="3" alignment="left" valignment="middle" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
logical copy
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
delayed consistency
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="3" alignment="left" valignment="middle" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Replication at PV granularity
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
whole clusters switch
\end_layout
\end_inset
</cell>
<cell multirow="3" alignment="left" valignment="middle" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
easier to set up
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no load balancing
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
physical copy
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
medium MTTR
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="3" alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Replication at LV granularity
\end_layout
\end_inset
</cell>
<cell multirow="3" alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
physical copy
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
load balancing between LVs
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
easy migration / Football
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
full handover consistency
\end_layout
\end_inset
</cell>
</row>
<row>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell multirow="4" alignment="left" valignment="top" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
low MTTR
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
\noindent
In order to implement good flexibility of handover / failover, the network
infrastructure (as well as other infrastructures) must support it.
Here are
\series bold
levels of flexibility
\series default
, in ascending order:
\end_layout
\begin_layout Enumerate
\begin_inset Argument 1
status open
\begin_layout Plain Layout
start=0
\end_layout
\end_inset
(completely inflexible) Statically assigned IP addresses at
\emph on
each
\emph default
server and at
\emph on
both
\emph default
of 2 datacenters, and in particular for
\series bold
customer traffic
\series default
.
This is typical for contemporary backup solutions.
As a consequence, any handover / failover attempt would need massive sysadmin
work, even if there were enough CPU and RAM power at the target datacenter.
Switching whole datacenters or bigger server farms would take days, if
not weeks, to manually reconfigure.
Consequence: sysadmins will heavily dislike this type of work (acceptance
problem of geo-redundancy).
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Some people think this can be easily done at DNS level.
Just update all of your publicly visible DNS records to point to the new
IP addresses.
However, DNS updates have serious drawbacks for public internet traffic.
Although there exists a field TTL = Time To Live for limiting the caching
period of DNS clients, this field is
\emph on
ignored
\emph default
by many clients / DNS caches throughout the world.
In practice it will take days, if not weeks, until the last client has
got the new IP address, even if you try to speed this up by setting a TTL
of 1 minute.
It simply does not work as expected.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Dynamic routing protocols at AS = Autonomous Systems level are your friend,
such as
\series bold
BGP = Border Gateway Protocol
\series default
.
For any
\emph on
serious
\emph default
cross-datacenter scenarios and/or geo-redundancy, it is a
\series bold
must
\series default
.
If you don't have the ability for
\series bold
dynamic routing at the appropriate granularity
\series default
, you had better not claim that you are geo-redundant.
If handover / failover takes far longer than acceptable by customer expectation
s / SLAs (typically minutes), you are
\emph on
not really
\emph default
geo-redundant from the viewpoint of your customers.
\end_layout
\begin_layout Enumerate
(inflexible) Manual or semi-automated routing at datacenter uplink level.
Here the customer traffic is always routed to the
\emph on
same
\emph default
IP visible from outside, while there is a
\emph on
separate
\emph default
static IP per server for sysadmin
\family typewriter
ssh
\family default
access.
The customer traffic routing needs to be changed
\emph on
globally
\emph default
for the
\emph on
complete
\emph default
traffic to
\emph on
any
\emph default
of two datacenters, and thus is very inflexible.
This model protects
\emph on
only
\emph default
against a full datacenter loss, but almost nothing else.
Unfortunately, this model appears very simple to implement, so both staff
and chief executive managers sometimes prefer this
\begin_inset Quotes eld
\end_inset
simple
\begin_inset Quotes erd
\end_inset
model, although it causes headaches at operational level when really needed.
\end_layout
\begin_layout Enumerate
(medium flexibility) Dynamic routing of customer traffic at the granularity
of building blocks, or even per hypervisor / physical server.
When automated appropriately, switchover is a matter of minutes, or even
seconds.
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Requirements for networking
\end_layout
\end_inset
Starting with this level of flexibility,
\series bold
BGP
\series default
= Border Gateway Protocol or similar network protocols are a
\series bold
must
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Anyway, when you have the effort of BGP implementation for this level,
consider to
\series bold
do it right from scratch
\series default
.
Also support the following better levels from the network side of the company.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
(flexible) Dynamic routing of each VM / LV / resource, individually.
This has massive advantages: in case of overload, DDOS attacks, etc, you
can quickly load-balance into a so-called
\series bold
butterfly runtime configuration
\series default
: half of your VMs belonging to the same hypervisor are running in datacenter
A, while the other half is running in datacenter B.
In the following illustration showing one hypervisor per datacenter, green
color denotes the active (primary) side, while white means passive (secondary):
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/replication-butterfly.fig
width 100line%
\end_inset
\begin_inset Newline newline
\end_inset
During butterfly, each of your hypervisor iron has to carry only
\emph on
half
\emph default
of the ordinary workload.
For comparison, here is the normal situation where only datacenter A would
be active:
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/replication-normal.fig
width 100line%
\end_inset
\begin_inset Newline newline
\end_inset
In the above butterfly configuration, you have essentially
\series bold
doubled the available CPU and RAM power
\series default
, when compared to the ordinary situation where side B does not carry any
application workload.
This is a
\emph on
tremendous
\emph default
aid for
\series bold
survival
\series default
of certain types of incidents, such as (unhandled
\begin_inset Foot
status open
\begin_layout Plain Layout
There is no 100% DDOS protection.
Attackers are continuously improving their methods.
Catching all types of novel patterns is not possible in general.
\end_layout
\end_inset
) DDOS attacks.
\end_layout
\begin_layout Enumerate
(most flexible) In addition to dynamic routing at VM level, the VMs
\emph on
themselves
\emph default
are
\series bold
location transparent
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Location-transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
They may transparently migrate to another hypervisor, possibly residing
in another building block, or even residing in a different datacenter.
In its most general form, the number of replicas may be different for each
VM, and may change dynamically, adapting to any needs.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Recommended flexibility
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The
\series bold
ability for butterfly
\series default
is relevant at CTO level.
It is a massive
\series bold
risk reducer
\series default
, even at company and at stock exchange value level.
\end_layout
\begin_layout Plain Layout
In order to really get it implemented in its best form, CTOs should clearly
require
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\series bold
Location Transparency at Application Level
\end_layout
\begin_layout Plain Layout
\noindent
It means that not only your servers, but also your
\series bold
services
\series default
can run in any of more than 1 datacenter, without notice by your customers.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The location of your services is no longer a primary key, but a dependent
runtime attribute which may change at runtime.
Of course, your databases, your dashboards, your monitoring, and other
surrounding tools, must also be able to properly deal with location transparenc
y.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Ability for butterfly
\end_layout
\end_inset
1&1 Ionos ShaHoLin = Shared Hosting Linux has implemented the ability for
butterfly via BGP location transparency on thousands of servers, and on
several petabytes of data.
See
\begin_inset CommandInset ref
LatexCommand nameref
reference "ShaHoLin-Layering"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Chapter
Architectures of Cloud Storage / Software Defined Storage
\begin_inset CommandInset label
LatexCommand label
name "chap:Cloud-Storage"
\end_inset
\end_layout
\begin_layout Standard
This chapter compares several
\emph on
architectural
\emph default
alternatives with each other.
In order to not get lost in the jungle of numerous implementations and
their features, the description focuses on
\emph on
architecture
\emph default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
) wherever possible.
Nevertheless, the principal behaviour of implementations is also discussed.
\end_layout
\begin_layout Section
Performance Arguments and Risks from Architecture
\begin_inset CommandInset label
LatexCommand label
name "sec:Performance-Risk-Arguments-from-Layer"
\end_inset
\end_layout
\begin_layout Subsection
Performance Penalties and Risks from Choice of Replication Layer
\begin_inset CommandInset label
LatexCommand label
name "subsec:Performance-Risks-Replication-Layer"
\end_inset
\end_layout
\begin_layout Standard
Some people think that replication is easily done at filesystem layer.
There exist lots of cluster filesystems and other filesystem-layer solutions
which claim to be able to replicate your data, sometimes even over long
distances.
Some of them may appear under buzzwords like active-active (cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
Trying to replicate several petabytes of data, or some billions of inodes,
is a much bigger challenge than many people can imagine.
\end_layout
\begin_layout Standard
Choosing the wrong
\series bold
layer
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
) for
\series bold
mass data replication
\series default
may get you into trouble.
Layer selection is much more important than any load distribution argument
as frequently heard from certain advocates.
Here is an architectural-level (cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
) explanation why replication at the block layer is easier and less error
prone:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/Layers.pdf
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
The picture shows the main components of a standalone Unix / Linux system.
It conforms to Dijkstra's layering rules explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
In the late 1970s / early 1980s, a so-called
\emph on
Buffer Cache
\emph default
had been introduced into the architecture of Unix.
Today's Linux has refined the concept to various internal caches such as
the
\series bold
Page Cache
\series default
(for data) and the
\series bold
Dentry Cache
\series default
(for metadata lookup).
\end_layout
\begin_layout Standard
All these caches serve one main purpose
\begin_inset Foot
status open
\begin_layout Plain Layout
Another important purpose is
\series bold
providing shared memory
\series default
for syscalls like
\family typewriter
mmap()
\family default
.
\end_layout
\end_inset
: they are speeding up your application while reducing the load onto the
storage by exploitation of fast RAM.
For the vast majority of typical
\begin_inset Foot
status open
\begin_layout Plain Layout
Of course, there exist some exceptions.
For example, capturing 100GBit network traffic in realtime and then writing
such a massive data stream to a local disk will not profit from local caching,
but is
\emph on
slowed down
\emph default
by the overhead of contemporary kernel memory architectures.
See for example Christoph Lameter's presentation at LCA2020.
In this case, data is never accessed twice, thus the
\series bold
locality of reference
\series default
is at its worst-case extreme, where caching cannot help
\emph on
by concept
\emph default
.
At implementation level, a solution is to use Direct IO on big physical
memory chunks.
Direct IO is designed for bypassing the page cache of the kernel, and can
co-exist with other applications utilizing the caches.
Such corner-case exceptions are
\emph on
strengthening
\emph default
Denning's WorkingSet theory (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
), since the observed behaviour is
\emph on
predicted
\emph default
by his theory.
\end_layout
\end_inset
workstation or server applications, which typically show good
\series bold
locality of reference
\series default
, a well-tuned cache can yield high cache hit ratios, typically 99%.
In some cases (as observed in practice) even more than 99.9%.
\end_layout
\begin_layout Standard
Now start distributing the system over long distances.
There are potential cut points A and B and C
\begin_inset Foot
status open
\begin_layout Plain Layout
In theory, there is another cut point D by implementing a generically distribute
d cache.
There exists some academic research on this, but practically usable enterprise-
grade implementations are rare and not wide-spread.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
Cut point A is application specific, and can have advantages because it
has knowledge of the application.
For example, replication of mail queues can be controlled much more fine-graine
d than at filesystem or block layer.
\end_layout
\begin_layout Standard
Cut points B and C are
\emph on
generic
\emph default
, supporting a wide variety of applications, without altering them.
Cutting at B means replication at filesystem layer.
C means replication at block layer.
\end_layout
\begin_layout Standard
When replicating at B, you will notice that the caches are
\emph on
below
\emph default
your cut point.
Thus you will have to re-implement
\series bold
distributed caches
\series default
, and you will have to
\series bold
maintain cache coherence
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Distributed vs local caching vs spindle load distribution
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In best case, distributed caching can never be any better than local caching
(under comparable conditions).
In
\emph on
worst case
\emph default
, distributed caching can however
\series bold
drastically slow down performance
\series default
instead of improving it.
The effect is known in DSM = Distributed Shared Memory research.
It is related to thrashing, and may be called
\series bold
distributed thrashing
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\emph on
Local
\emph default
caching in shared RAM does not suffer from additional distribution effects.
In best case, it can yield up to
\emph on
several orders of magnitude
\emph default
of performance (depending on the workingset behaviour of your application
workload).
The worst case isn't worse than necessary: well-implemented local caches
cooperating with the kernel process scheduler can limit some effects of
local RAM thrashing, in case they should appear.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In contrast to local shared RAM caching, frequently heard spindle load
distribution arguments can only re-distribute the already existing performance
of your spindles, but cannot magically
\begin_inset Quotes eld
\end_inset
create
\begin_inset Quotes erd
\end_inset
new sources of performance out of thin air.
Anyway, their potential is only about 1 order of magnitude.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Therefore, investments in local RAM for shared caching by the kernel may
easily pay off better than investments in a storage network.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Spindle load distribution can also be achieved via local RAID-10, but typically
much cheaper and more performant due to shorter distances between the spindles
and the application RAM.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Compared to local RAID storage, spindle load distribution over a dedicated
realtime storage network is typically
\emph on
costing
\emph default
some performance, by introduction of additional latencies and potential
bottlenecks (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
When replicating at C, the Linux caches are
\emph on
above
\emph default
your cut point.
Thus you will receive much less traffic at C, typically already reduced
by a factor of 100, or even more.
This is much easier to cope with.
\emph on
Local
\emph default
caches and their SMP scaling properties can be implemented much more efficientl
y than distributed ones.
You will also profit from
\series bold
journalling filesystems
\series default
like
\family typewriter
ext4
\family default
or
\family typewriter
xfs
\family default
.
In contrast,
\emph on
truly distributed
\begin_inset Foot
status open
\begin_layout Plain Layout
In this context,
\begin_inset Quotes eld
\end_inset
truly
\begin_inset Quotes erd
\end_inset
means that the POSIX semantics would be always guaranteed cluster-wide,
and even in case of partial failures.
In practice, some distributed filesystems like NFS don't even obey the
POSIX standard
\emph on
locally
\emph default
on 1 standalone client.
We know of projects which have
\emph on
failed
\emph default
right because of this.
\end_layout
\end_inset
\emph default
journalling is typically not available with distributed cluster filesystems.
\end_layout
\begin_layout Standard
A
\emph on
potential
\emph default
drawback of block layer replication is that you will be
\emph on
typically
\emph default
limited to so-called active-passive replication.
So-called active-active operation (see section
\begin_inset CommandInset ref
LatexCommand vref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is probably
\emph on
not impossible
\emph default
at block layer (see combinations of DRBD with
\family typewriter
ocfs2
\family default
), but less common, and less safe to operate.
For example, Linbit does
\emph on
not recommend
\emph default
the so-called active-active mode as supported by DRBD (aka dual-primary
in DRBD speak), while the newer MARS feature called
\begin_inset Quotes eld
\end_inset
Prosumer Device
\begin_inset Quotes erd
\end_inset
may be able to do it
\emph on
sometime in the future
\emph default
for
\emph on
non-critical
\emph default
or
\emph on
non-performant
\emph default
purposes, but this is also
\emph on
not recommended
\emph default
for block-level coupling of so-called active-active filesystems like
\family typewriter
ocfs2
\family default
when highly loaded, or when any non-predictable
\series bold
split-brain
\series default
(see unavoidability in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
) cannot be easily resolved at higher layers.
\end_layout
\begin_layout Standard
This limitation / disrecommendation isn't necessarily caused by the choice
of layer.
It is simply caused by the
\series bold
laws of physics
\series default
: communication is always limited by the speed of light.
A distributed filesystem is essentially nothing else but a persistent
\series bold
DSM = Distributed Shared Memory
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Pitfalls for Managers and Architects
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
When designing or comparing architectures or implementations,
\series bold
do
\emph on
not skip
\series default
\emph default
reading section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
, even if it
\emph on
looks
\emph default
to you like an
\begin_inset Quotes eld
\end_inset
academic
\begin_inset Quotes erd
\end_inset
explanation.
These explanations are about important
\series bold
reasons for project failures
\series default
, and explaining why
\series bold
Distributed Systems are
\emph on
generally harder
\emph default
than Local Systems
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Some decades of research on DSM have shown that there exist applications
/ workloads where the DSM model is
\emph on
inferior
\emph default
to the direct communication paradigm.
Even in short-distance / cluster scenarios.
Long-distance DSM is extremely cumbersome.
\end_layout
\begin_layout Standard
Therefore: you simply shouldn't try to solve
\series bold
short or long-distance communication needs
\series default
via communication over shared filesystems.
Even simple producer-consumer scenarios (one-way communication) are less
performant (e.g.
when compared to plain TCP/IP) when it comes to distributed POSIX semantics.
There is simply too much
\series bold
synchronisation overhead at metadata level
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
If you want mixed operations at different locations in parallel: split your
data set into disjoint filesystem instances (or database / VM instances,
etc).
Then you should achieve the
\series bold
ability for butterfly
\series default
, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
All you need is careful thought about the
\emph on
appropriate
\emph default
\emph on
granularity
\emph default
of your data sets (such as well-chosen
\emph on
sets
\emph default
of user homedirectory subtrees, or database sets logically belonging together,
etc).
An example hierarchy of granularities is described in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Positive-Example:-ShaHoLin"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Further hints can be found in sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Granularity-at-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Sharding (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
) implementations like ShaHoLin (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Positive-Example:-ShaHoLin"
plural "false"
caps "false"
noprefix "false"
\end_inset
) are essentially exploiting the scalability of SMP = Symmetric MultiProcessing,
nowadays typically going into saturation around
\begin_inset Formula $\approx100$
\end_inset
hardware CPU threads for typical workloads, which is executed by
\emph on
hardware
\emph default
inside of your server enclosure.
In contrast, DSM-like solutions are trying to distribute your application
workload over longer distances, involving relatively slow system software
instead of
\series bold
hardware acceleration
\series default
.
Therefore, SMP is preferable over DSM wherever possible.
\end_layout
\begin_layout Standard
Replication at filesystem level is often done at single-file granularity.
If you have several millions or even billions of inodes, you may easily
find yourself in a snakepit.
See also
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Conclusion
\end_layout
\end_inset
So-called
\series bold
active-passive operation
\series default
(cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
) over
\emph on
long
\emph default
distances at
\series bold
block layer
\series default
is an
\series bold
\emph on
advantage
\series default
\emph default
.
It keeps your staff from trying bad / almost impossible things, like DSM
= Distributed Shared Memory over long distances.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Performance Tradeoffs from Load Distribution
\begin_inset CommandInset label
LatexCommand label
name "subsec:Performance-Tradeoffs-from-Load-Distribution"
\end_inset
\end_layout
\begin_layout Standard
A frequent argument from BigCluster advocates is that the so-called Random
Replication would provide better performance.
This argument isn't wrong, but it does not hit the point.
\end_layout
\begin_layout Standard
As analysed in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Similarities-and-differences"
plural "false"
caps "false"
noprefix "false"
\end_inset
, load distribution isn't a unique concept bound to BigCluster or to Random
Replication.
Load distribution has been used for decades in a variety of
\series bold
RAID striping
\series default
methods.
\end_layout
\begin_layout Standard
RAID striping levels like RAID-0 or RAID-10 or RAID-60 have been known for decades,
forming a mature technology.
Also known since the 1980s is that the size of a single striped RAID set
must not grow too big, otherwise reliability will suffer too much.
Larger RAID systems are therefore
\series bold
split
\series default
into multiple
\series bold
RAID sets
\series default
.
\end_layout
\begin_layout Standard
This has some interesting parallels to the BigCluster reliability problems
analyzed in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sub:Detailed-explanation"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and some workarounds, e.g.
as discussed in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Similarities-and-differences"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
Summary: both RAID striping and random replication methods are
\series bold
limited
\series default
by the fundamental law of storage systems, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Optimum-Reliability-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
, in a similar way.
\end_layout
\begin_layout Standard
A detailed performance comparison at architecture level between random replication
of variable-sized objects and striping of block-level sectors is beyond
the scope of this architecture guide.
However, the following should be intuitively clear from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
and from Einstein's laws of the speed of light:
\end_layout
\begin_layout Quote
Fine-grained load distribution over
\series bold
short distances
\series default
and/or at
\series bold
lower layers
\series default
has a
\series bold
bigger performance potential
\series default
than over longer distances and/or at higher layers.
\end_layout
\begin_layout Standard
In other words: local SAS busses are capable of realtime IO transfers over
very short distances (enclosure-to-enclosure), while an expensive IP storage
network with NOF
\begin_inset Formula $\gg1$
\end_inset
isn't realtime (due to congestion control and/or packet loss, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
SAS busses are
\emph on
constructed
\emph default
for dealing with requirements arising from RAID, and have been optimized
for years / decades.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Advice for performance-critical workloads
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Besides
\emph on
local
\emph default
SSDs, also consider some appropriate RAID striping at your (Local)Sharding
storage boxes for performance-critical workloads.
It is not only cheaper than BigCluster load distribution methods, but typically
also more performant (on top of comparable technology and comparable dimensioni
ng).
Tradeoffs of various parameters and measurement methods for system architects
are described at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
http://blkreplay.org
\end_layout
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
RAID-6 is much cheaper
\begin_inset Foot
status open
\begin_layout Plain Layout
Several OSDs are also using SAS or similar local IO busses, in order to
drive a high number of spindles.
Essentially, random replication is involving
\emph on
two
\emph default
different types of networks at the same time.
This also explains why such a combination must necessarily induce some
performance loss.
\end_layout
\end_inset
than RAID-10, and can also provide some striping with respect to (random)
reads.
However, random writes are slower.
For read-intensive workloads, the striping behaviour of RAID-6 is often
sufficient.
A tool for comparison of different RAID setup alternatives can be found
at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
http://www.blkreplay.org
\end_layout
\end_inset
.
\end_layout
\begin_layout Section
Distributed vs Local: Scalability Arguments from Architecture
\begin_inset CommandInset label
LatexCommand label
name "sec:Distributed-vs-Local:"
\end_inset
\end_layout
\begin_layout Standard
Datacenters aren't usually operated for fun or for hobby.
Scalability of an
\emph on
architecture
\emph default
(cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is important, because it can seriously limit your business.
Architectural ill-designs can grow extremely cumbersome and costly.
\end_layout
\begin_layout Standard
Some enterprise system architects are starting with a particular architecture
in mind, called
\begin_inset Quotes eld
\end_inset
Big Cluster
\begin_inset Quotes erd
\end_inset
.
There is a common belief that otherwise
\series bold
scalability
\series default
could not be achieved:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/Architecure_Big_Cluster.pdf
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
The crucial point is the
\series bold
storage network
\series default
:
\begin_inset Formula $n$
\end_inset
storage servers are interconnected with
\begin_inset Formula $m=O(n)$
\end_inset
frontend servers.
The argument of BigCluster advocates is that this
\emph on
would
\emph default
be
\emph on
advantageous
\emph default
for achieving desired properties like scalability, failure tolerance, etc.
We will check this argument.
\end_layout
\begin_layout Standard
According to the idea behind BigCluster,
\emph on
any
\emph default
of the
\begin_inset Formula $m$
\end_inset
frontends needs to access
\emph on
any
\emph default
of the
\begin_inset Formula $n$
\end_inset
storages in realtime.
Thus the storage network must be
\emph on
dimensioned
\emph default
for
\begin_inset Formula $O(n\cdot m)=O(n^{2})$
\end_inset
network connections,
\emph on
potentially
\emph default
running in parallel.
Even if the total network throughput is scaling only with
\begin_inset Formula $O(n)$
\end_inset
, nevertheless
\begin_inset Formula $O(n^{2})$
\end_inset
network connections have to be
\emph on
maintained
\emph default
at connection oriented protocols and at various layers of the operating
software.
The network must be
\emph on
able
\emph default
to
\emph on
switch
\emph default
the packets from
\begin_inset Formula $n$
\end_inset
sources to
\begin_inset Formula $m$
\end_inset
destinations (and their opposite way back) in
\series bold
realtime
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
The
\begin_inset Formula $O(n^{2})$
\end_inset
\series bold
cross-bar functionality
\series default
in
\series bold
realtime
\series default
makes the storage network
\series bold
complicated
\series default
and
\series bold
expensive
\series default
, while decreasing grand-total reliability and thus
\series bold
increasing risk
\series default
(see also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
Factors increasing the risk and cost of storage networks:
\end_layout
\begin_layout Itemize
In order to limit
\series bold
error propagation
\series default
from other networks, the storage network is often built as a
\emph on
physically separate
\emph default
=
\emph on
dedicated
\emph default
network.
\end_layout
\begin_layout Itemize
Because dedicated storage networks are heavily reacting to high latencies
and packet loss (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
), they often need to be dimensioned for the
\series bold
worst case
\series default
(load peaks,
\series bold
packet storms
\series default
, etc), needing one of the best = typically most expensive components for
reducing latency and increasing throughput.
Dimensioning to the worst case instead of an average case plus some safety
margins is an expensive
\series bold
overdimensioning
\series default
/
\series bold
over-engineering
\series default
which has its own
\series bold
pitfalls
\series default
.
\end_layout
\begin_layout Itemize
When
\series bold
multipathing
\series default
is required for improving fault tolerance of the storage network itself,
(parts of) these efforts may easily
\emph on
double
\emph default
, and induce further pitfalls.
\end_layout
\begin_layout Itemize
When
\series bold
geo-redundancy
\series default
is required (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and when it is
\emph on
possible
\emph default
with a certain BigCluster implementation
\emph on
at all
\emph default
, the total effort may easily double another time because in cases of disasters
like terrorist attacks the backup datacenter must be prepared for taking
over for multiple days or weeks.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
A wide-spread pitfall is
\emph on
incorrect belief
\emph default
about section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
and its
\series bold
fundamental requirements
\series default
.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
There are certain products
\emph on
marketed
\emph default
as
\begin_inset Quotes eld
\end_inset
geo-redundancy is
\emph on
possible
\emph default
\begin_inset Quotes erd
\end_inset
while actually not matching the
\emph on
requirements
\emph default
for true geo-redundancy.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In general, storage networks won't work over long distances (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
Even if it would be possible for a certain use case,
\series bold
asymmetry problems
\series default
would be introduced into an architecture which is
\emph on
conceptually symmetric
\emph default
by its very nature.
Thus, and generally in
\begin_inset Formula $n:m$
\end_inset
relationships, failover granularities are tending to
\series bold
stick to coarse
\series default
.
Finer granularities as discussed in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
are much more difficult to achieve, if possible at all.
\end_layout
\begin_layout Standard
Fortunately, there is an alternative called
\begin_inset Quotes eld
\end_inset
\series bold
Sharding Architecture
\series default
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
\series bold
Shared-nothing Architecture
\series default
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Paragraph
Definition of Sharding
\begin_inset CommandInset label
LatexCommand label
name "par:Definition-of-Sharding"
\end_inset
\end_layout
\begin_layout Standard
Notice that the term
\begin_inset Quotes eld
\end_inset
Sharding
\begin_inset Quotes erd
\end_inset
originates from database architecture
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Shard_(database_architecture)
\end_layout
\end_inset
where it has a slightly different meaning than used here.
Our usage of the term
\begin_inset Quotes eld
\end_inset
sharding
\begin_inset Quotes erd
\end_inset
reflects slightly different situations in some webhosting companies
\begin_inset Foot
status open
\begin_layout Plain Layout
According to
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Shared-nothing_architecture
\end_layout
\end_inset
, Google also uses the term
\begin_inset Quotes eld
\end_inset
sharding
\begin_inset Quotes erd
\end_inset
for a particular
\begin_inset Quotes eld
\end_inset
shared-nothing architecture
\begin_inset Quotes erd
\end_inset
.
Although our above definition of
\begin_inset Quotes eld
\end_inset
sharding
\begin_inset Quotes erd
\end_inset
does not fully comply with its original meaning, a similar usage by Google
probably means that our usage of the term is not completely uncommon.
\end_layout
\end_inset
, and can be certainly transferred to more application areas.
\end_layout
\begin_layout Standard
Our more specific use of the term
\begin_inset Quotes eld
\end_inset
sharding
\begin_inset Quotes erd
\end_inset
has the following properties,
\emph on
all at the same time:
\end_layout
\begin_layout Enumerate
User / customer data is
\series bold
partitioned
\series default
.
This is similar to database sharding.
However, the original database term also allows
\emph on
some
\emph default
data to remain unpartitioned.
In webhosting, suchalike may exist also, but typically only for
\emph on
system data,
\emph default
like OS images, including large parts of their configuration data.
Suchalike system data is typically
\emph on
replicated
\emph default
to thousands of nodes from a central
\begin_inset Quotes eld
\end_inset
golden image
\begin_inset Quotes erd
\end_inset
in an
\emph on
offline
\emph default
fashion, e.g.
via regular
\family typewriter
rsync
\family default
cron jobs, etc.
Typically, it comprises only a few gigabytes per instance and is mostly
read-only with a slow change rate, while total customer data is typically
in the range of some petabytes with a higher total change rate.
For smaller
\begin_inset Formula $n$
\end_inset
in the range of a few hundreds of shards, sysadmins are typically preferring
more convenient tools like
\family typewriter
Chef
\family default
or
\family typewriter
puppet
\family default
& co.
\end_layout
\begin_layout Enumerate
The system has (almost
\begin_inset Foot
status open
\begin_layout Plain Layout
In general, there are some more
\emph on
natural
\emph default
single points of contention, such as the physical space of a datacenter,
which might be destroyed by an explosion, for example.
\end_layout
\end_inset
)
\series bold
no single point of contention
\series default
, and thus the partitions are
\series bold
completely independent
\series default
from each other, like in
\series bold
shared-nothing
\series default
architectures
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Shared-nothing_architecture
\end_layout
\end_inset
.
However, the original term
\begin_inset Quotes eld
\end_inset
shared-nothing
\begin_inset Quotes erd
\end_inset
has also been used for describing
\emph on
replicas
\emph default
, e.g.
DRBD mirrors.
In our context of
\begin_inset Quotes eld
\end_inset
sharding
\begin_inset Quotes erd
\end_inset
, the shared-nothing principle
\emph on
only
\emph default
refers to the
\begin_inset Quotes eld
\end_inset
\series bold
no single point of contention
\series default
\begin_inset Quotes erd
\end_inset
principle at
\emph on
partitioning
\emph default
level, which means it
\emph on
only
\emph default
refers to the
\emph on
partitioning
\emph default
of the user data, but
\emph on
not
\emph default
to their replicas.
\end_layout
\begin_layout Enumerate
Shared-nothing replicas (e.g.
in the sense of some DRBD descriptions) may be also present (and in fact
they are at 1&1 Shared Hosting Linux), but these
\series bold
replicas
\series default
are considered
\series bold
orthogonal to sharding
\series default
.
Customer data replicas form an
\emph on
independent
\emph default
dimension called
\begin_inset Quotes eld
\end_inset
replication layer
\begin_inset Quotes erd
\end_inset
.
The replication layer also obeys the shared-nothing principle in the original
sense, but it is
\emph on
not
\emph default
meant by our term
\begin_inset Quotes eld
\end_inset
sharding
\begin_inset Quotes erd
\end_inset
in order to avoid confusion
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice that typically
\family typewriter
BigCluster
\family default
architectures are also abstracting away their replicas when talking about
their architecture.
\end_layout
\end_inset
between these two independent dimensions.
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Conceptual separation of replication from sharding has some advantages.
For example, control over the replication degree
\begin_inset Formula $k$
\end_inset
can be more fine-grained than at physical shard level.
For example, both DRBD and MARS are supporting this, by allowing a different
number of replicas for each logical resource.
\end_layout
\begin_layout Standard
The sharding architecture does not need a dedicated storage network in general,
at least when built and dimensioned properly.
Instead, it
\emph on
should have
\emph default
(but not always needs) a so-called
\series bold
replication network
\series default
which can, when present, be dimensioned much smaller because it does neither
need realtime operations nor scalability to
\begin_inset Formula $O(k\cdot n)$
\end_inset
or
\begin_inset Formula $O(n^{2})$
\end_inset
:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/Architecure_Sharding.pdf
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
Sharding architectures are extremely well suited when
\emph on
both
\emph default
the input traffic
\emph on
and
\emph default
the data is
\series bold
already partitioned
\series default
.
For example, when several thousands or even millions of customers are operating
on
\emph on
disjoint
\emph default
data sets, like in web hosting where each webspace is residing in its own
home directory, or when each of millions of MySQL database instances has
to be isolated from its neighbour.
Masses of customers are also appearing at cloud storage applications like
Cloud Filesystems (e.g.
Dropbox or similar).
\end_layout
\begin_layout Standard
Even in cases when any customer may potentially access any of the data items
residing in the whole storage pool (e.g.
like in a search engine), sharding can be often applied.
The trick is to create some relatively simple content-based dynamic switching
or redirect mechanism in the input network traffic, similar to HTTP load
balancers or redirectors.
\end_layout
\begin_layout Standard
Only when partitioning of input traffic plus data is not possible in a reasonabl
e way, big cluster architectures as implemented for example in Ceph or Swift
(and partly even possible with MARS when restricted to the block layer)
\emph on
may
\emph default
have a use case.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
When sharding is possible, it is the preferred model due to reliability
and cost and performance reasons.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Another good explanation can be found at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architectur
e/
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
Here is a
\series bold
simple example architecture
\series default
for internet-based object stores / filesystems (differences explained in
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Object-Store"
plural "false"
caps "false"
noprefix "false"
\end_inset
), here intended for similar use cases like Dropbox & co (which are typically
used by masses of end users for copies and/or backup of their private filesyste
m data):
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/sharded-object-store-or-filesystem.fig
width 100col%
\end_inset
\end_layout
\begin_layout Subsection
Variants of Sharding
\begin_inset CommandInset label
LatexCommand label
name "subsec:Variants-of-Sharding"
\end_inset
\end_layout
\begin_layout Description
LocalSharding The simplest possible sharding architecture is simply putting
both the storage and the compute CPU power onto the same iron.
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Dimensioning of 1&1 Shared Hosting Linux (ShaHoLin)
\end_layout
\end_inset
We have dimensioned several variants of this.
\end_layout
\begin_layout Enumerate
We are using 1U pizza boxes with local hardware RAID controllers with fast
hardware BBU cache and ~ 10 local disks for the majority of LXC container
instances where the
\begin_inset Quotes eld
\end_inset
small-sized
\begin_inset Quotes erd
\end_inset
customers (up to ~100 GB webspace per customer) are residing.
Since most customers have very small home directories with extremely many
but small files, this is a very cost-efficient model.
\end_layout
\begin_layout Enumerate
Less than 1 permille of all customers have > 250 GB (up to 2TB) per home
directory.
For these few customers we are using another dimensioning variant of the
same architecture: 4U servers with 48 high-capacity spindles on 3 RAID
sets, delivering a total PV capacity of ~300 TB, which are then cut down
to ~10 LXC containers of ~30 TB each.
\end_layout
\begin_layout Enumerate
(currently in planning stage) An intermediate dimensioning between both
extremes could save some more cost, and hopefully improve reliability even
more, due to better pre-distribution of customer behaviour.
The so-called midclass could be dimensioned as 90 TB per 2U pizza box,
roughly on 12 spindles.
It would carry the customers between ~50 and ~250 GB webspace each.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
In order to operate this model at a bigger scale, you should consider the
\begin_inset Quotes eld
\end_inset
container football
\begin_inset Quotes erd
\end_inset
method as described in section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Principle-of-Background"
\end_inset
and in
\family typewriter
football-user-manual.pdf
\family default
.
\end_layout
\begin_layout Description
RemoteSharding This variant needs a (possibly dedicated) storage network,
which is however only
\begin_inset Formula $O(n)$
\end_inset
in total.
Each storage server exports a block device over iSCSI (or over another
transport like MARS' prosumer device) to at most
\begin_inset Formula $O(k)$
\end_inset
dedicated compute nodes where
\begin_inset Formula $k$
\end_inset
is some
\series bold
constant
\series default
.
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Hint 1
\end_layout
\end_inset
It is advisable to build this type of storage network with
\series bold
local switches
\series default
and no routers in between, in order to avoid
\begin_inset Formula $O(n^{2})$
\end_inset
-style network architectures and traffic.
This reduces error propagation upon network failures.
Keep the storage and the compute nodes locally close to each other, e.g.
in the same datacenter room, or even in the same rack.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Hint 2
\end_layout
\end_inset
Additionally, you can provide some (low-dimensioned) backbone for
\series bold
exceptional(!)
\series default
cross-traffic between the local storage switches.
Don't plan to use any realtime cross-traffic
\emph on
regularly
\emph default
, but only for clear cases of emergency! See also explanation of NOF in
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In this model, a shard typically consists of one storage node plus
\begin_inset Formula $k+1$
\end_inset
or
\begin_inset Formula $k+2$
\end_inset
compute servers, introducing some additional failure redundancy
\emph on
within
\emph default
such a shard, while retaining the
\begin_inset Quotes eld
\end_inset
no single point of contention
\begin_inset Quotes erd
\end_inset
property
\emph on
between
\emph default
the shards (according to section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Description
FlexibleSharding This is a dynamic combination of LocalSharding and RemoteShardi
ng, dynamically re-configurable, as explained below.
\end_layout
\begin_layout Description
BigClusterSharding The sharding model can also be placed
\series bold
on top of
\series default
a BigCluster model, or possibly
\begin_inset Quotes eld
\end_inset
internally
\begin_inset Quotes erd
\end_inset
in such a model, leading to a similar effect.
Whether this makes sense needs some discussion.
It can be used to reduce the
\emph on
logical
\emph default
BigCluster size from
\begin_inset Formula $O(n)$
\end_inset
to some
\begin_inset Formula $O(k)$
\end_inset
, such that it is no longer a
\begin_inset Quotes eld
\end_inset
big cluster
\begin_inset Quotes erd
\end_inset
but a
\begin_inset Quotes eld
\end_inset
small cluster
\begin_inset Quotes erd
\end_inset
, and thus reducing the serious problems described in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
to some degree.
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Some use cases for BigClusterSharding
\end_layout
\end_inset
This could make sense in the following use cases:
\end_layout
\begin_layout Itemize
When you
\series bold
already have
\series default
invested into a big cluster, e.g.
Ceph or Swift, which does not really scale and/or does not really deliver
the expected reliability.
Some possible reasons for this are explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
and subsection
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Itemize
When you really need a
\emph on
single
\emph default
LV which is necessarily
\series bold
bigger
\series default
than can be reasonably built on top of local LVM.
This means, you are likely claiming that you really need
\series bold
strict consistency
\series default
as provided by a block device on more than 1 PB with current technology
(2018).
Examples are very
\series bold
big enterprise databases
\series default
like classical SAP (c.f.
section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Local-vs-Centralized"
\end_inset
), or if you really need
\series bold
POSIX-compliance
\series default
on a single big filesystem instance.
Be cautious when you think this is the only solution to your problem.
Double-check or triple-check whether there is
\emph on
really
\emph default
no other solution than creating such a huge block device and/or such a
huge filesystem instance.
Such huge SPOFs are tending to create similar problems
\begin_inset Foot
status open
\begin_layout Plain Layout
Running
\family typewriter
fsck
\family default
or its Windows equivalents on huge filesystems is certainly no fun.
\end_layout
\end_inset
as described in section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Reliability-Arguments-from"
\end_inset
for similar reasons.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
When building a
\series bold
new
\series default
storage system, be sure to check the following use cases.
You should seriously consider a LocalSharding / RemoteSharding / FlexibleShardi
ng model in favor of BigClusterSharding when ...
\end_layout
\begin_layout Itemize
...
when more than 1 LV instance would be placed onto your
\begin_inset Quotes eld
\end_inset
small cluster
\begin_inset Quotes erd
\end_inset
shards.
Then a
\series bold
{Local,Remote,Flexible}Sharding
\series default
model could be likely used instead.
Then the total overhead (
\series bold
total cost of ownership
\series default
) introduced by a BigCluster
\emph on
model
\emph default
but actually stripped down to a
\begin_inset Quotes eld
\end_inset
SmallCluster
\begin_inset Quotes erd
\end_inset
\emph on
implementation / configuration
\emph default
should be examined separately.
Does it really pay off?
\end_layout
\begin_layout Itemize
...
when there are
\series bold
legal requirements
\series default
that you can tell at any time where your data is.
Typically, this is anything but easy on a BigCluster model, even when stripped
down to SmallCluster size.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
FlexibleSharding
\begin_inset CommandInset label
LatexCommand label
name "subsec:FlexibleSharding"
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice that MARS' new prosumer device feature (formerly called
\emph on
remote device
\emph default
, like a kind of replacement for iSCSI) can not only be used for a
\family typewriter
RemoteSharding
\family default
model, but
\emph on
could
\emph default
also be used for implementing some sort of
\begin_inset Quotes eld
\end_inset
big cluster
\begin_inset Quotes erd
\end_inset
model at block layer.
However, consider the warnings for certain use cases from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
If you desire a very similar level of flexibility as promised by
\family typewriter
BigCluster
\family default
, read on.
\end_layout
\begin_layout Standard
Models re-introducing some kind of
\begin_inset Formula $O(n^{2})$
\end_inset
\begin_inset Quotes eld
\end_inset
big dedicated storage network
\begin_inset Quotes erd
\end_inset
, considering the
\emph on
potential
\emph default
connections, and
\begin_inset Formula $O(n)$
\end_inset
considering the
\emph on
actual
\emph default
realtime connections during runtime, are
\series bold
not
\series default
the preferred model for MARS operations in large scale.
Following is a compromise, which tries to minimize the NOF explained in
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The basic idea is that each server
\emph on
can
\emph default
(as far as necessary) operate
\emph on
both
\emph default
in server
\emph on
and
\emph default
in client role, both at the same time, and individually for each resource.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
Following is a
\series bold
super-model
\series default
which combines both the
\begin_inset Quotes eld
\end_inset
big cluster
\begin_inset Quotes erd
\end_inset
and sharding models at block layer in a very flexible way, without fully
depending on
\begin_inset Formula $O(n)$
\end_inset
realtime network connections.
The result is a flexibility similar to that promised by BigCluster.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The following example shows only two servers from a pool consisting of hundreds
or thousands of servers:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/MARS_Cluster_on_Demand.pdf
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
The main difference to
\family typewriter
BigCluster
\family default
is to use iSCSI or the MARS prosumer device
\emph on
only where necessary
\emph default
.
Preferably, local storage is divided into multiple Logical Volumes (LVs)
via LVM, which should be
\emph on
directly
\emph default
used
\emph on
locally
\emph default
by Virtual Machines (VMs), whenever possible.
At abstract architectural level, detail technologies KVM/qemu vs filesystem-base
d local LXC containers make no real difference
\begin_inset Foot
status open
\begin_layout Plain Layout
A way for abstracting many details between KVM and LXC is for example provided
by
\family typewriter
libvirt
\family default
.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
In the above example, the left machine has relatively less CPU power or
RAM than storage capacity.
Therefore, not
\emph on
all
\emph default
LVs could be instantiated locally at the same time without causing operational
problems, but
\emph on
some
\emph default
of them can be run locally.
The example solution is to
\emph on
exceptionally(!)
\emph default
export LV3 to the right server, which has some otherwise unused CPU and
RAM capacity.
\end_layout
\begin_layout Standard
Notice that local operation of VMs doesn't produce any storage network
traffic at all.
Therefore, this is the preferred runtime configuration.
\end_layout
\begin_layout Standard
Only in cases of resource imbalance, such as (transient) CPU or RAM peaks
(e.g.
caused by DDOS attacks), and only when the
\series bold
ability for butterfly
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is not available
\begin_inset Foot
status open
\begin_layout Plain Layout
This may happen when a disaster has already destroyed one of your datacenters,
and thus you are forced to run in the surviving datacenter.
\end_layout
\end_inset
or is not sufficient, only then the following
\series bold
fallback strategy
\series default
is used:
\emph on
Some
\emph default
VMs or containers may then be run somewhere else over the network.
In a well-balanced and well-dimensioned system, this will be the
\series bold
vast minority
\series default
, and should be only used for dealing with temporary load peaks, unforeseeable
customer behaviour, etc.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\series bold
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
Running (geo-)redundant VMs directly on the same servers as their storage
devices is a major cost reducer.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
You simply don't need to buy and operate
\begin_inset Formula $2\cdot(n+m)$
\end_inset
servers, but only about
\begin_inset Formula $2\cdot(\max(n,m)+m\cdot\epsilon)$
\end_inset
servers, where
\begin_inset Formula $\epsilon$
\end_inset
corresponds to some relatively small extra resources needed by MARS.
\end_layout
\begin_layout Standard
In addition,
\series bold
shared memory
\series default
can be exploited more efficiently.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In addition to this and to reduced networking cost, there are further cost
savings at power consumption, air conditioning, Height Units (HUs), number
of HDDs, operating cost, etc as explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Cost-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Subsection
Principle of Background Migration
\begin_inset CommandInset label
LatexCommand label
name "subsec:Principle-of-Background"
\end_inset
\end_layout
\begin_layout Standard
The sharding model needs a different approach to load balancing of storage
space than the big cluster model.
There are several possibilities at different layers, each addressing different
\series bold
granularities
\series default
, starting from finest to coarsest:
\end_layout
\begin_layout Itemize
Moving per-customer data, typically at filesystem or database level via
\family typewriter
rsync
\family default
or
\family typewriter
mysqldump
\family default
or similar.
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Fine-grained migration of customer home directories
\end_layout
\end_inset
At 1&1 Shared Hosting Linux, we have about 9 million customer home directories.
We also have a script
\family typewriter
movespace.pl
\family default
using incremental
\family typewriter
tar
\family default
or
\family typewriter
rsync
\family default
for their moves.
Now, if we would try to move around
\emph on
all
\emph default
of them this way, it could easily take years or even decades for millions
of extremely small home directories, due to overhead like DNS updates etc.
However, there exists a small handful of large customer home directories
in the terabyte range.
For these, and only for these, it is a clever idea to use
\family typewriter
movespace.pl
\family default
because thereby the size of a LV can be regulated more fine grained than
at LV level.
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
Dynamically growing the sizes of LVs during operations.
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Medium-grained extension of LVs
\end_layout
\end_inset
Football's
\family typewriter
expand
\family default
operation roughly does the following:
\family typewriter
lvresize
\family default
followed by
\family typewriter
marsadm resize
\family default
followed by
\family typewriter
xfs_growfs
\family default
or some equivalent filesystem-specific operation.
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
Moving whole LVs via MARS + Football, as shown in the following example:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/MARS_Background_Migration.pdf
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
The idea of Football's
\family typewriter
migrate
\family default
operation is to dynamically create
\emph on
additional
\emph default
LV replicas for the sake of
\series bold
background migration
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
using MARS as replication engine
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
If not yet done, you should transparently introduce MARS
\begin_inset Foot
status open
\begin_layout Plain Layout
When necessary, create the first MARS replica with
\family typewriter
marsadm create-resource
\family default
on your already-existing LV data, which will be retained unmodified, and
restart your application again.
\end_layout
\end_inset
into your LVM-based stack.
If you don't want more than
\begin_inset Formula $k=1$
\end_inset
replicas in general, you can use the so-called
\begin_inset Quotes eld
\end_inset
standalone mode
\begin_inset Quotes erd
\end_inset
of MARS.
\end_layout
\begin_layout Itemize
Optionally: once you have MARS in place, you may use iSCSI or the MARS prosumer
device or another means for exporting
\family typewriter
/dev/mars/lv3
\family default
to another hypervisor.
This might be the same hypervisor you want to migrate the data to, or it
could be another machine.
This is not generally needed, but it helps to achieve an elasticity similar
to the one promised by
\family typewriter
BigCluster
\family default
.
\end_layout
\begin_layout Itemize
Now, for the sake of migration, you just create an additional replica at
your target server via
\family typewriter
marsadm join-resource
\family default
.
Optionally, this may be the same server where the remote VM is already
running at the moment.
Wait until the additional mirror has been fully
\series bold
synced
\series default
in background, while your application is continuously running and while
the content of the LV is modified
\emph on
in parallel
\emph default
by your ordinary applications running inside the VM.
\end_layout
\begin_layout Itemize
Then you do a
\series bold
primary handover
\series default
to your mirror (or to
\emph on
any
\emph default
of multiple mirrors).
This is usually a matter of seconds.
Newer versions of the prosumer device will allow this without shutdown
of your VM.
With standard
\begin_inset Foot
status open
\begin_layout Plain Layout
There are some iSCSI features like ALUA which
\emph on
should
\emph default
be able to handover an active session to another storage box without interrupti
on.
However, the corresponding Linux documentation looks very sparse, and the
maturity status for Linux initiators / targets is unclear at the moment.
\end_layout
\end_inset
iSCSI, you will typically have to briefly shut down the VM and restart
it a few seconds later.
\end_layout
\begin_layout Itemize
Once the application is running again at the old location or at another
location, you may delete the old replica via
\family typewriter
marsadm leave-resource
\family default
and
\family typewriter
lvremove
\family default
.
\end_layout
\begin_layout Itemize
Finally, you may re-use the freed-up space for something else (e.g.
\family typewriter
lvresize
\family default
of
\emph on
another
\emph default
LV followed by
\family typewriter
marsadm resize
\family default
followed by
\family typewriter
xfs_growfs
\family default
or similar).
Or, you may later migrate
\emph on
another
\emph default
(smaller) LV to this server, in order to re-use the free space, or similar.
\end_layout
\begin_layout Itemize
For the sake of
\series bold
hardware lifecycle
\series default
, you may run a slightly different strategy: evacuate the original source
server completely via Football, and eventually decommission it.
\end_layout
\begin_layout Itemize
In case you already have a redundant LV copy somewhere else, you may run
a similar procedure, but starting with
\begin_inset Formula $k=2$
\end_inset
replicas, and temporarily increasing the number of replicas to either
\begin_inset Formula $k'=3$
\end_inset
when moving each replica step-by-step, or you may even directly go up to
\begin_inset Formula $k'=4$
\end_inset
in one step, thereby moving
\emph on
pairs
\emph default
at once.
Example: the latter variant is the default in the ShaHoLin configuration
variant of Football, internally called Tetris.
\begin_inset Newline newline
\end_inset
Technical details: see
\family typewriter
football.sh
\family default
in the
\family typewriter
football/
\family default
directory of MARS, which is a checkout of the Football sub-project, and
\family typewriter
football-user-manual.pdf
\family default
.
\end_layout
\begin_layout Itemize
When already starting with
\begin_inset Formula $k\geq3$
\end_inset
LV replicas in the starting position, you may have the luxury of using
a lesser variant.
For example, we have some mission-critical servers at 1&1 Ionos which are
running
\begin_inset Formula $k=4$
\end_inset
replicas all the time on relatively small but important LVs for extremely
increased safety.
Only in such a case, you may have the freedom to temporarily decrease from
\begin_inset Formula $k=4$
\end_inset
to
\begin_inset Formula $k'=3$
\end_inset
and then go up to
\begin_inset Formula $k''=4$
\end_inset
again, before starting primary handover.
This has the advantage of requiring less temporary storage space for
\emph on
swapping
\emph default
some LV replicas.
\end_layout
\end_inset
\end_layout
\begin_layout Section
Reliability Arguments from Architecture
\begin_inset CommandInset label
LatexCommand label
name "sec:Reliability-Arguments-from"
\end_inset
\end_layout
\begin_layout Standard
A contemporary common belief is that big clusters and their
\series bold
random replication
\series default
methods would provide better reliability than anything else.
There are some practical observations at 1&1 and its daughter companies
which cannot confirm this.
\end_layout
\begin_layout Standard
Similar experiences are part of a USENIX paper about copysets, see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://www.usenix.org/system/files/conference/atc13/atc13-cidon.pdf
\end_layout
\end_inset
.
Their proposed solution is different from the solution proposed here, but
interestingly their
\emph on
problem analysis
\emph default
part contains not only similar observations, but also comes to similar
conclusions about random replication.
Citation from the abstract:
\end_layout
\begin_layout Quote
However, random replication is
\series bold
almost guaranteed
\series default
to lose data in the common scenario of simultaneous node failures due to
cluster-wide power outages.
\size footnotesize
[emphasis added by me]
\end_layout
\begin_layout Standard
Stimulated by practical experiences from truly less disastrous scenarios
than mass power outage, theoretical explanations were sought.
Surprisingly, they clearly show by mathematical arguments that
\family typewriter
LocalSharding
\family default
is superior to
\family typewriter
BigCluster
\family default
under practically important preconditions.
\end_layout
\begin_layout Standard
We start with an intuitive explanation.
A detailed mathematical description of the model can be found in appendix
\begin_inset CommandInset ref
LatexCommand vref
reference "chap:Mathematical-Model-of"
\end_inset
.
\end_layout
\begin_layout Subsection
Storage Server Node Failures
\end_layout
\begin_layout Subsubsection
Simple Intuitive Explanation in a Nutshell
\begin_inset CommandInset label
LatexCommand label
name "subsec:Simple-intuitive-explanation"
\end_inset
\end_layout
\begin_layout Standard
Block-level replication systems like DRBD are constructed for LV or disk
failover in local redundancy scenarios.
Or, when using MARS, even for geo-redundant failover scenarios.
They are traditionally dealing with
\series bold
pairs
\series default
of servers, or with triples, etc.
In order to get a storage incident with them,
\emph on
both
\emph default
sides of a DRBD or MARS small-cluster (also called
\series bold
shard
\series default
in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
) must have an incident
\emph on
at the same time
\emph default
.
\end_layout
\begin_layout Standard
In contrast, the
\series bold
random replication
\series default
concept of big clusters is spreading huge masses of objects over a huge
number of nodes
\begin_inset Formula $O(n)$
\end_inset
, with some redundancy degree
\begin_inset Formula $k$
\end_inset
denoting the number of object replicas.
As a consequence,
\emph on
any
\emph default
\begin_inset Formula $k$
\end_inset
node failures out of
\begin_inset Formula $O(n)$
\end_inset
will make
\emph on
some
\emph default
objects inaccessible, and thus produce an incident.
For example, when
\begin_inset Formula $k=2$
\end_inset
and
\begin_inset Formula $n$
\end_inset
is equal for both models, then
\emph on
any
\emph default
combination of two node failures occurring at the same time will lead to
an incident:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/Incident_Probabilities.pdf
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
Intuitively, it is easy to see that hitting both members of the
\emph on
same
\emph default
sharding pair at the same time is less likely than hitting
\emph on
any
\emph default
two nodes of a big cluster.
Therefore,
\series bold
sharding provides better reliability
\series default
, when built on top of comparable technology.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
In addition: even when
\begin_inset Formula $1$
\end_inset
shard out of
\begin_inset Formula $n$
\end_inset
shards has an incident, the other
\begin_inset Formula $n-1$
\end_inset
shards will continue to run.
In contrast, when a
\family typewriter
BigCluster
\family default
has an incident,
\emph on
all
\emph default
application instances are affected, due to
\emph on
uniform
\emph default
object distribution.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
Another advantage of sharded pairs is
\series bold
smaller incident size
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
If you are curious about some more details and more concrete behaviour,
read on.
\end_layout
\begin_layout Subsubsection
Detailed Explanation of
\family typewriter
BigCluster
\family default
Reliability
\begin_inset CommandInset label
LatexCommand label
name "sub:Detailed-explanation"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The following analysis reveals some parallels to the well-known reliability
loss caused by RAID striping.
The main difference is granularity: variable-sized objects are used in
place of fixed-size blocks.
Therefore, this section is in reality about a
\series bold
fundamental property of data distribution / striping
\series default
.
\end_layout
\begin_layout Standard
It is only formulated in terms of
\family typewriter
BigCluster
\family default
and random replication for didactic reasons, because in the context of
this architecture guide we need to compare with
\family typewriter
LocalSharding
\family default
.
\end_layout
\begin_layout Standard
For the sake of simplicity, the following more detailed model is based on
the following assumptions:
\end_layout
\begin_layout Itemize
We are looking at
\series bold
storage node
\series default
failures only.
As observed from practice, this is the most important failure granularity
for causing incidents.
\end_layout
\begin_layout Itemize
Disk failures are regarded as already solved (e.g.
by local RAID-6 or by the well-known compensation mechanisms of big clusters).
Only in case they don't work, they are mapped to node failures, and are
already included in the probability of storage node failures.
\end_layout
\begin_layout Itemize
We only look at
\series bold
data replication
\series default
with a redundancy degree of a relatively small
\begin_inset Formula $k$
\end_inset
.
CRC methods are not modeled across storage nodes, but may be present
\emph on
internally
\emph default
at some storage nodes, e.g.
RAID-5 or RAID-6 or similar methods, or may be present internally in some
hardware devices, like SSDs or HDDs.
Notice that
\emph on
distributed
\emph default
CRC methods generally involve very high overhead, and won't work in realtime
across long distances (geo-redundancy).
\end_layout
\begin_layout Itemize
We restrict ourselves to temporary /
\series bold
transient
\series default
failures, without regarding permanent data loss.
Otherwise, the following differences between local-storage sharding architectur
es and big clusters would become even worse.
When losing some physical storage nodes forever in a big cluster, it is
typically anything but easy to determine which data of which application
instances / customers have been affected, and which will need a restore
from backup.
\end_layout
\begin_layout Itemize
Storage network failures (parts, or as a whole) are ignored.
Otherwise a fair comparison between the architectures would become difficult.
If they were taken into account, the advantages of
\family typewriter
LocalSharding
\family default
would become even bigger.
\end_layout
\begin_layout Itemize
We assume that the storage network (when present) forms no bottleneck.
Network implementations like TCP/IP versus Infiniband or similar are thus
ignored.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
This is roughly equivalent to NOF = 1 as explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Although technically possible, this can grow extremely expensive.
\end_layout
\begin_layout Itemize
Software failures / bugs are also ignored
\begin_inset Foot
status open
\begin_layout Plain Layout
When assuming that the probability of bugs is increased by increased architectur
al complexity, a
\family typewriter
LocalSharding
\family default
model would likely win here also.
However, such an assumption is difficult to justify, and might be wrong,
depending on many (unknown) factors.
\end_layout
\end_inset
.
We are only comparing
\emph on
architectures
\emph default
here, not their various implementations (see
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Itemize
The x axis shows the number of basic storage units
\begin_inset Formula $n=x$
\end_inset
from an
\emph on
application
\emph default
perspective, meaning
\begin_inset Quotes eld
\end_inset
usable storage
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
net amount of storage
\begin_inset Quotes erd
\end_inset
.
For simplicity of the model, one basic application storage unit equals
the total disk space provided by one physical storage node in the special
case of
\begin_inset Formula $k=1$
\end_inset
replicas.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Stated simply, this means that there is exactly 1 LV = 1 PV per each applicatio
n unit present at the x axis.
So we have a total of exactly
\begin_inset Formula $x$
\end_inset
LVs.
Of course, you might create a more elaborate model by introduction of some
constant
\begin_inset Formula $l\geq1$
\end_inset
for a grand total of
\begin_inset Formula $l\cdot x$
\end_inset
LVs on top of
\begin_inset Formula $x=n$
\end_inset
PVs, but we don't want to complexify our model unnecessarily.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Attention! when increasing the number of replicas
\begin_inset Formula $k$
\end_inset
, the total number of storage nodes needs to be
\series bold
increased accordingly
\series default
.
Typically, you will need to deploy
\begin_inset Formula $k\cdot n$
\end_inset
physical storage nodes in order to get
\begin_inset Formula $n$
\end_inset
net storage units from a user's perspective.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Attention!
\begin_inset space ~
\end_inset
\begin_inset Formula $k$
\end_inset
has a strong influence on the
\series bold
price tag
\series default
of any of the competing architectures.
You cannot assume an
\begin_inset Quotes eld
\end_inset
infinite amount of money
\begin_inset Quotes erd
\end_inset
.
Therefore, only relatively small
\begin_inset Formula $k$
\end_inset
are bearable for business cases.
\end_layout
\begin_layout Itemize
As already stated, we assume that the number of application instances is
linearly scaling with
\begin_inset Formula $n$
\end_inset
.
For simplicity, we assume that the number of applications running on the
whole pool is
\emph on
exactly
\emph default
\begin_inset Formula $n$
\end_inset
.
Of course, you might also introduce some
\emph on
coupling constant
\emph default
here, but we don't want to complexify the model unnecessarily.
\end_layout
\begin_layout Itemize
We assume that the storage nodes are (almost completely) filled with data
(sectors with RAID, and/or objects with
\family typewriter
BigCluster
\family default
).
Otherwise, the game would be pointless on empty clusters / shards.
\end_layout
\begin_layout Itemize
We assume that the number of sectors / objects per storage node is
\begin_inset Quotes eld
\end_inset
very large
\begin_inset Quotes erd
\end_inset
.
Some examples: a logical volume of 4 TB has 1,000,000,000 sectors or objects,
each 4 KB in size.
A physical storage node providing 40 TB of storage will then provide 10
billion sectors / objects.
\end_layout
\begin_layout Itemize
For the
\family typewriter
BigCluster
\family default
architecture, we assume that all objects are always distributed to
\begin_inset Formula $O(n)$
\end_inset
nodes.
We will later discuss some variants where it is distributed to
\emph on
fewer
\emph default
nodes.
This assumption is only for explaining the
\series bold
principal behaviour of data distribution / striping
\series default
, and also for one of its variants called
\series bold
random replication
\series default
.
For simplicity of the model, we assume a distribution via a
\emph on
uniform
\emph default
hash function.
In general, the principal behaviour would also work for many other distribution
functions, such as RAID striping, or even certain non-uniform hash functions
over
\begin_inset Formula $O(n)$
\end_inset
nodes.
As discussed later, totally different hash functions (e.g.
distributing only to a constant number of nodes) would no longer model
a
\family typewriter
BigCluster
\family default
architecture in our sense.
\begin_inset Newline newline
\end_inset
In the below example, we assume a uniform object distribution to
\emph on
exactly
\emph default
\begin_inset Formula $n$
\end_inset
nodes.
Notice that any other
\begin_inset Formula $n'=O(n)$
\end_inset
with
\begin_inset Formula $n'<n$
\end_inset
will produce similar results for
\begin_inset Formula $n'\rightarrow\infty$
\end_inset
, but may be better in detail for smaller
\begin_inset Formula $n'$
\end_inset
.
\end_layout
\begin_layout Itemize
When random distribution / random replication methods are used at BigCluster
object stores, we assume that for any pair (or
\begin_inset Formula $k$
\end_inset
-tuple) of storage nodes, the total number of objects is so high that there
always
\emph on
exist
\emph default
some objects which are present at
\emph on
all
\emph default
of the nodes of any pair /
\begin_inset Formula $k$
\end_inset
-tuple for any reasonable (small)
\begin_inset Formula $k$
\end_inset
.
This means, we assume not only uniformity in random replication, but also
that the total number of objects is practically
\begin_inset Quotes eld
\end_inset
infinite
\begin_inset Quotes erd
\end_inset
compared to relatively small practical values of
\begin_inset Formula $k$
\end_inset
.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
For mathematically interested readers: be careful when trying to argue
with the probability to hit some object intersection for some given
\begin_inset Formula $k$
\end_inset
-tuple of storage nodes while
\begin_inset Formula $n$
\end_inset
is a growing parameter.
Even when such a
\emph on
single
\emph default
probability is declining with growing both
\begin_inset Formula $k$
\end_inset
and
\begin_inset Formula $n$
\end_inset
, and even when the
\emph on
single
\emph default
probability for the existence of an intersection eventually gets lower than
\begin_inset Formula $1$
\end_inset
, this has an impact on the
\emph on
total
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Mathematical probabilities are always about a huge number of
\emph on
repetitions
\emph default
of a certain experiment.
Even when a single
\begin_inset Quotes eld
\end_inset
failure experiment
\begin_inset Quotes erd
\end_inset
does
\emph on
not always
\emph default
lead to an incident from a customer's perspective, it can contribute to
the overall incident probability, when there is a
\emph on
chance
\emph default
, even when the chance is very low.
\end_layout
\end_inset
incident probability of the
\emph on
whole
\emph default
BigCluster.
In
\emph on
general
\emph default
, the
\emph on
number
\emph default
of such tuples is growing with
\begin_inset Formula $O(\binom{k\cdot n}{k})=O((k\cdot n)!)$
\end_inset
, which is even worse than an exponential growth.
So, don't forget to sum up
\emph on
all
\emph default
probabilities even if a single one appears to be
\begin_inset Quotes eld
\end_inset
negligible
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Itemize
For the LocalSharding architecture, called DRBDorMARS in the following graphics,
we assume that only local storage is used.
For higher replication degrees
\begin_inset Formula $k=2,\ldots$
\end_inset
, the only occurring communication is
\emph on
among
\emph default
the pairs / triples / and so on (shards), but no communication to other
shards is necessary (cf
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
The following assumptions are not part of the model, but are simplifying
the below
\emph on
example
\emph default
graphics.
You may choose other parameter values than the following ones, without
changing the principal behaviour of the model, but then the
\emph on
example
\emph default
would become less intuitive for
\series bold
humans
\series default
.
\end_layout
\begin_layout Itemize
For simplicity of the
\emph on
example
\emph default
, we assume that any single storage server node used in either architecture,
including all of its local disks, has a reliability of 99.99% (four nines).
This means, the probability of a storage node failure is uniformly assumed
as
\begin_inset Formula $p=0.0001$
\end_inset
.
\end_layout
\begin_layout Itemize
This means, during an observation period of
\begin_inset Formula $T=10,000$
\end_inset
operation hours, we will have a total downtime of 1 hour per server in
statistical average.
For simplicity, we assume that the failure probability of a single server
depends neither on previous
\begin_inset Foot
status open
\begin_layout Plain Layout
Mathematically, we are using some Poisson process model here.
Of course, it would be possible to use more sophisticated models, but this
might turn out as a
\emph on
major
\emph default
research undertaking.
\end_layout
\end_inset
failures nor on the operating conditions of any other server.
It is known that this is not true in general, but otherwise our model would
become extremely complex.
\end_layout
\begin_layout Itemize
More intuitively, our observation period of
\begin_inset Formula $T=10,000$
\end_inset
operation hours corresponds to about 13 months, or slightly more than a
year.
\end_layout
\begin_layout Itemize
Consequence: when operating a pool of 10,000 storage servers, then in statistica
l
\emph on
average
\emph default
there will be
\emph on
almost always
\emph default
one node which is failed at the moment.
The overall behaviour is like a
\begin_inset Quotes eld
\end_inset
permanent incident
\begin_inset Quotes erd
\end_inset
which has to be solved by the competing storage architectures.
\end_layout
\begin_layout Itemize
Hint: the term
\begin_inset Quotes eld
\end_inset
statistical average
\begin_inset Quotes erd
\end_inset
is somewhat vague here, in order to not confuse readers
\begin_inset Foot
status open
\begin_layout Plain Layout
The problem is that sometimes more servers than average can be down, and
sometimes less.
Average values should not be used in the mathematical model, but exact
ones.
However, humans can often imagine things better when provided with
\begin_inset Quotes eld
\end_inset
average behaviour
\begin_inset Quotes erd
\end_inset
, so we use it here just for ease of understanding.
\end_layout
\end_inset
.
A more elaborate statistical model can be found in appendix
\begin_inset CommandInset ref
LatexCommand vref
reference "chap:Mathematical-Model-of"
\end_inset
.
\end_layout
\begin_layout Standard
Let us start the comparison with a simple corner case: plain old servers
with no further redundancy, other than their local RAIDs.
This naturally corresponds to
\begin_inset Formula $k=1$
\end_inset
replicas when using the DRBDorMARS architecture.
\end_layout
\begin_layout Standard
Now we apply the corner case of
\begin_inset Formula $k=1$
\end_inset
replicas to both competing architectures, in order to shed some light
on the fundamental properties of the architectures.
\end_layout
\begin_layout Standard
Under the precondition of
\begin_inset Formula $k=1$
\end_inset
replicas, a failure at
\emph on
any one
\emph default
of the
\begin_inset Formula $n$
\end_inset
servers has two possible ways to influence the downtime from an application's
perspective:
\end_layout
\begin_layout Enumerate
LocalSharding (DRBDorMARS): downtime of 1 storage node only influences 1
application unit depending on 1 basic storage unit.
This is the case with the DRBDorMARS model, because there is no communication
between shards, and we assumed that 1 storage server unit also carries
exactly 1 application unit.
\end_layout
\begin_layout Enumerate
BigCluster: here the downtime of 1 storage node will
\series bold
tear down more
\series default
than 1 application unit, because any of the application units have spread
their storage to more than 1 storage node via uniform hashing (see assumptions
above).
\end_layout
\begin_layout Standard
For ease of understanding, let us zoom into the special case
\begin_inset Formula $n=2$
\end_inset
and
\begin_inset Formula $k=1$
\end_inset
for a moment.
These are the smallest numbers where you already can see the effect.
In the following table, we denote 4 possible status combinations out of
2 servers A and B, where the cells are showing the number of application
units influenced:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
hfill
\end_layout
\end_inset
\begin_inset Tabular
<lyxtabular version="3" rows="3" columns="3">
<features tabularvalignment="middle">
<column alignment="right" valignment="top" width="0pt">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
LocalSharding
\size tiny
(DRBDorMARS)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
A up
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
A down
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
B up
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
0
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
1
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
B down
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
1
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
2
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
hfill
\end_layout
\end_inset
\begin_inset Tabular
<lyxtabular version="3" rows="3" columns="3">
<features tabularvalignment="middle">
<column alignment="right" valignment="top" width="0pt">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
BigCluster
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
A up
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
A down
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
B up
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
0
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
2
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="right" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
B down
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
2
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
2
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
hfill
\end_layout
\end_inset
\begin_inset space ~
\end_inset
\end_layout
\begin_layout Standard
\noindent
What is the heart of the difference? While a single node failure at LocalShardin
g (DRBDorMARS) will tear down only the local application, the teardown produced
at BigCluster will spread to
\emph on
all
\emph default
of the
\begin_inset Formula $n=2$
\end_inset
application units, because of the uniform hashing and because we have only
\begin_inset Formula $k=1$
\end_inset
replica.
\end_layout
\begin_layout Standard
Would it help to increase both
\begin_inset Formula $n$
\end_inset
and
\begin_inset Formula $k$
\end_inset
to larger values?
\end_layout
\begin_layout Standard
Let us first stay at
\begin_inset Formula $k=1$
\end_inset
, looking at the behaviour when
\begin_inset Formula $n\rightarrow\infty$
\end_inset
.
The generalization to bigger redundancy degrees
\begin_inset Formula $k$
\end_inset
will follow later.
\end_layout
\begin_layout Standard
In the following graphics, the thick red line shows the behaviour for
\begin_inset Formula $k=1$
\end_inset
PlainServers (which is the same as
\begin_inset Formula $k=1$
\end_inset
DRBDorMARS) with increasing number of storage units
\begin_inset Formula $n,$
\end_inset
ranging from 1 to 10,000 storage units = number of servers for
\begin_inset Formula $k=1$
\end_inset
.
Higher values of
\begin_inset Formula $k\in[1,4]$
\end_inset
are also displayed in different colors, but we will discuss them later.
All lines corresponding to the same
\begin_inset Formula $k$
\end_inset
are drawn in the same color.
Notice that both the x and y axis are logscale:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/SERVICE_Comparison_of_Reversible_StorageNode_Failures.pdf
lyxscale 200
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
First, we look at the red lines, corresponding to
\begin_inset Formula $k=1$
\end_inset
.
The behaviour of the thick red line should be rather clear in double logscale:
with increasing number of servers at the x axis, the total downtime y is
also increasing.
This forms a straight line in double logscale, where the slope is 1 (proportion
al to
\begin_inset Formula $n$
\end_inset
), and the distances between the start of the other colored lines are multiples
of
\begin_inset Formula $1/p$
\end_inset
for the given incident probability
\begin_inset Formula $p$
\end_inset
.
\end_layout
\begin_layout Standard
Next, we are looking at the thin solid red line for
\family typewriter
BigCluster
\family default
\begin_inset Formula $k=1$
\end_inset
.
Why is it converging against the dotted grey line around
\begin_inset Formula $n=10000$
\end_inset
?
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
At
\begin_inset Formula $n\geq10000$
\end_inset
servers, there is a
\begin_inset Quotes eld
\end_inset
permanent incident
\begin_inset Quotes erd
\end_inset
.
In statistical average, there is approximately
\emph on
always
\emph default
some server down.
Due to
\begin_inset Formula $k=1$
\end_inset
replica, the whole cluster will then be down from a user's perspective.
The thin dotted grey line denotes the total number of operation hours to
be executed for each
\begin_inset Formula $n$
\end_inset
, so this is the limes line we are converging against for big enough
\begin_inset Formula $n$
\end_inset
.
\end_layout
\begin_layout Standard
This does not look nice from a user's perspective.
Can we heal the problem by deploying more replicas
\begin_inset Formula $k$
\end_inset
?
\end_layout
\begin_layout Standard
Let us look at the green solid lines, corresponding to
\begin_inset Formula $k=2$
\end_inset
replicas.
Why is the thin green BigCluster line also converging against the same
dotted limes? And why is this happening around the same point, around
\begin_inset Formula $n\approx10000$
\end_inset
?
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
When you want to operate
\begin_inset Formula $n=10000$
\end_inset
application instances with a replication degree of
\begin_inset Formula $k=2$
\end_inset
replicas, then you will need to deploy
\begin_inset Formula $k\cdot n=20000$
\end_inset
storage servers.
When you have 20000 storage servers, in statistical average about
\begin_inset Formula $2$
\end_inset
of them will be down at the same time.
When
\begin_inset Formula $k=2$
\end_inset
servers are down at the same time, again the whole cluster will be down
from a user's perspective.
Thus the green line is also converging against the grey dotted limes line,
roughly also around
\begin_inset Formula $n\approx10000$
\end_inset
.
\end_layout
\begin_layout Standard
Why is the thicker green DRBDorMARS line much better?
\end_layout
\begin_layout Standard
In double logscale plot, it forms a
\emph on
parallel
\emph default
line to the corresponding red line.
The distance is conforming to
\begin_inset Formula $1/p$
\end_inset
.
This means that the incident probability for hitting
\emph on
both
\emph default
members of the
\emph on
same
\emph default
shard is
\emph on
improved
\emph default
by a factor of 10,000.
\end_layout
\begin_layout Standard
Finally, we look at all the other solid lines in any color.
All the thin solid
\family typewriter
BigCluster
\family default
lines are converging against the same limes line, regardless of replication
degree
\begin_inset Formula $k$
\end_inset
, and around the same
\begin_inset Formula $n\approx10000$
\end_inset
.
Why is this the case?
\end_layout
\begin_layout Standard
Because our BigCluster model as defined above will distribute
\emph on
all
\emph default
objects to
\emph on
all
\emph default
servers uniformly, there will almost always
\emph on
exist
\emph default
some objects for which no replica is available at almost any given point
in time.
This means, you will almost always have a
\series bold
permanent incident
\series default
involving the same number of nodes as your replication degree
\begin_inset Formula $k$
\end_inset
, and in turn
\emph on
some
\emph default
of your objects will not be accessible at all.
This means, at around
\begin_inset Formula $x=10{,}000$
\end_inset
application units you will lose almost any advantage from increasing the
number of replicas.
Adding more replicas will no longer help at
\begin_inset Formula $x\geq10{,}000$
\end_inset
application units.
\end_layout
\begin_layout Standard
Notice that the
\emph on
solid
\emph default
lines are showing the probability of
\emph on
some
\emph default
incident, disregarding the
\series bold
size of the incident
\series default
.
\end_layout
\begin_layout Standard
What about the
\emph on
dashed
\emph default
lines showing much better behaviour for BigCluster?
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Under some further preconditions, it would be possible to argue with the
\emph on
size
\emph default
of incidents.
However, now a big fat warning.
\end_layout
\begin_layout Standard
\noindent
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Size-weighted incident probabilities
\end_layout
\end_inset
When you are
\series bold
responsible
\series default
for operations of
\series bold
thousands of servers
\series default
, you should be very conscious about preconditions for size-weighted downtime
probabilities (dashed lines).
Otherwise you could risk both the health of your business, and your career.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Some preconditions for size-weighted incident probabilities
\end_layout
\end_inset
In short:
\end_layout
\begin_layout Itemize
When your application, e.g.
a smartphone app, consists of accessing only 1 object at all during a reasonabl
y long timeframe (say once per day), you can safely
\series bold
assume that there is no interdependency
\series default
between all of your objects.
In addition, you have to assume (and you should check) that your cluster
operating software as a whole does not introduce any further
\series bold
hidden / internal
\begin_inset Foot
status open
\begin_layout Plain Layout
Several distributed filesystems are separating their metadata from application
data.
Advocates are selling this as an advantage.
However, in terms of
\series bold
reliability
\series default
this is clearly a
\series bold
disadvantage
\series default
.
It increases the
\emph on
breakdown surface
\emph default
.
Some distributed filesystems are even
\emph on
centralizing
\emph default
their metadata, sometimes via an ordinary database system, creating a SPOF
= Single Point Of Failure.
In case of inconsistencies between data and metadata, e.g.
resulting from an incident or from a software bug, you will need the equivalent
of a
\series bold
distributed
\family typewriter
fsck
\family default
\series default
.
Suchalike can easily turn into
\series bold
data loss
\series default
and other nightmares, such as node failures during the consistency check,
for example when your hardware is flaky and produces intermittent errors.
\end_layout
\end_inset
interdependencies
\series default
.
Only in this case, and only then, you can take the dashed lines arguing
with the number of inaccessible objects instead of with the number of distorted
application units.
\end_layout
\begin_layout Itemize
Whenever your application uses
\series bold
bigger structured logical objects
\series default
, such as filesystems or block devices (cf. section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Negative-Example:-object"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and/or whole VMs / containers requiring
\series bold
strict consistency
\series default
, then you will get
\series bold
interdependent objects
\series default
at your big cluster storage layer.
\begin_inset Newline newline
\end_inset
Practical example: experienced sysadmins will confirm that even a data loss
rate of only 1/1,000,000 of blocks in a classical Linux filesystem like
\family typewriter
xfs
\family default
or
\family typewriter
ext4
\family default
will likely imply the need of an offline filesystem check (
\family typewriter
fsck
\family default
), which is a major incident for the affected filesystem instance.
\begin_inset Newline newline
\end_inset
Theoretical explanation: servers are running for a very long time, and filesyste
ms are typically also mounted for a long time.
Notice that the probability of hitting any vital filesystem data roughly
equals the probability of hitting any other data.
Sooner or later, any defective sector in the metadata structures or in
freespace management etc will stop your whole filesystem, and in turn will
stop your application instance(s) running on top of it.
\begin_inset Newline newline
\end_inset
Similar arguments hold for transient failures: most classical filesystems
are not constructed for compensation of hanging IO, typically leading to
\series bold
system hangs
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Blindly taking the dashed lines will expose you to a high
\series bold
risk
\series default
of error.
Practical experience shows that there are often
\series bold
hidden dependencies
\series default
in many applications, often also at application level.
You cannot necessarily see them when inspecting their data structures!
You will only notice some of them by analyzing their
\series bold
runtime behaviour
\series default
, e.g.
with tools like
\family typewriter
strace
\family default
.
Notice that in general the runtime behaviour of an arbitrary program is
\series bold
undecidable
\series default
.
Be cautious when drawing assumptions out of thin air!
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Conversely, the assumption that
\emph on
any
\emph default
inaccessible object will halt your application, might be too strong for
\emph on
some
\emph default
use cases.
Therefore, some practical behaviour may be in between the solid thin lines
and the dashed lines of some given color.
Be extremely careful when constructing such an intermediate case.
Remember that the plot is in logscale, where constant factors will not
make a huge difference.
The above example of a loss rate of 1/1,000,000 of sectors in a classical
filesystem should not be extended to lower values like 1/1,000,000,000
without knowing exactly how the filesystem works, and how it will react
\emph on
in detail
\emph default
.
The grey zone between the extreme cases thin solid vs dashed is a
\series bold
dangerous zone
\series default
!
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
As a manager, if you want to stay on the
\series bold
safe side
\series default
, simply obey the fundamental law as explained in the next section:
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Optimum Reliability from Architecture
\begin_inset CommandInset label
LatexCommand label
name "subsec:Optimum-Reliability-from"
\end_inset
\end_layout
\begin_layout Standard
Another potential argument from influencers could be: don't distribute the
BigCluster objects to exactly
\begin_inset Formula $n$
\end_inset
nodes, but to fewer nodes.
Would the result be better than DRBDorMARS-like LocalSharding?
\end_layout
\begin_layout Standard
Actually, several BigCluster implementations are taking similar measures,
in order to
\emph on
work around
\emph default
problems analyzed in the previous subsections of
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
There are various terms for suchalike measures, like placement groups (Ceph),
copysets (Facebook), buckets (generic object stores), etc.
\end_layout
\begin_layout Standard
When distributing to
\begin_inset Formula $O(k')$
\end_inset
nodes with some constant
\begin_inset Formula $k'$
\end_inset
, we no longer have a BigCluster architecture, but a mixed BigClusterSharding
form in our terminology.
\end_layout
\begin_layout Standard
As can be generalized from the above tables, the reliability of
\series bold
any
\series default
BigCluster on
\begin_inset Formula $k'>k$
\end_inset
nodes is
\series bold
always worse
\series default
than the reliability of LocalSharding on exactly
\begin_inset Formula $k$
\end_inset
nodes, where
\begin_inset Formula $k$
\end_inset
is also the redundancy degree.
In general:
\end_layout
\begin_layout Quote
\series bold
\size large
The LocalSharding model is the
\emph on
optimum model
\emph default
for reliability of operation, compared to any other model truly distributing
its data
\emph on
and
\emph default
operations over truly more nodes, like RemoteSharding or BigClusterSharding
or BigCluster does.
\end_layout
\begin_layout Standard
There exists no better model because shards consisting of exactly
\begin_inset Formula $k$
\end_inset
nodes where
\begin_inset Formula $k$
\end_inset
is the redundancy degree, are already the
\emph on
smallest possible shards
\emph default
under the assumptions of section
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Detailed-explanation"
\end_inset
.
Any other model truly involving
\begin_inset Formula $k'>k$
\end_inset
nodes for distribution of objects at any shard is
\emph on
always worse
\emph default
in the dimension of reliability.
Thus the above sentence follows by induction.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
The above sentence is formulating a
\series bold
fundamental law of storage systems
\series default
.
An intuitive formulation for humans:
\end_layout
\begin_layout Quote
\series bold
\size large
Spread your per-application data to as few nodes as possible.
\end_layout
\begin_layout Plain Layout
This includes unnecessary spreading between dedicated client and server
machines, in place of local storage.
Thus
\family typewriter
LocalSharding
\family default
is the best architectural model.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
This is intuitive: the more nodes are involved for storing the
\emph on
same
\emph default
data belonging to the
\emph on
same
\emph default
application instance (i.e.
belonging to the same LV), the higher the
\series bold
risk
\series default
that
\emph on
any
\emph default
of them can fail.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Consequence: the
\series bold
\emph on
concept
\emph default
of random replication
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
A very picky argument might be: random distribution could be viewed as
\emph on
orthogonal
\emph default
to random replication, by separating the concept
\begin_inset Quotes eld
\end_inset
distribution
\begin_inset Quotes erd
\end_inset
from the concept
\begin_inset Quotes eld
\end_inset
replication
\begin_inset Quotes erd
\end_inset
.
Then the above sentence should be re-formulated, using
\begin_inset Quotes eld
\end_inset
random distribution
\begin_inset Quotes erd
\end_inset
instead.
However notice that
\emph on
random
\emph default
replication + distribution on exactly
\begin_inset Formula $n\cdot k$
\end_inset
nodes would degenerate, since it no longer is really
\begin_inset Quotes eld
\end_inset
random
\begin_inset Quotes erd
\end_inset
, but only has the freedom degree of a
\begin_inset Quotes eld
\end_inset
permutation
\begin_inset Quotes erd
\end_inset
.
\end_layout
\end_inset
tries to do the
\emph on
opposite
\emph default
of this, by its very nature.
Thus:
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
The
\emph on
concept
\emph default
of
\series bold
random replication does not work as expected
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
This does not imply that random replication does not generally work at
all.
Section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
mentions a few use cases where it appears to work in practice.
However, after
\series bold
investing a lot
\series default
of effort / energy / money into a very complicated architecture and several
implementations, the outcome is
\series bold
worse = non-optimal
\series default
in the dimension of reliability.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
There exist some
\emph on
workarounds
\emph default
as discussed in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Similarities-and-differences"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
These can only patch the most urgent architectural problems, such that
operation remains
\emph on
bearable
\emph default
in practice.
They cannot
\emph on
fix
\emph default
the
\series bold
Dijkstra regression overhead
\series default
explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Negative-Example:-object"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
The above plot explains why even workarounds are
\series bold
far from optimal
\series default
for a given fixed
\begin_inset Foot
status open
\begin_layout Plain Layout
As explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
, several
\family typewriter
BigCluster
\family default
best practices are typically requiring
\begin_inset Formula $k=3$
\end_inset
replicas.
Some advocates have taken this as granted.
For a
\series bold
fair comparison
\series default
with Sharding, they will need to compare with
\begin_inset Formula $k=3$
\end_inset
LV replicas.
\end_layout
\end_inset
redundancy degree
\begin_inset Formula $k$
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Real storage management cost from workarounds
\end_layout
\end_inset
There was an original promise from BigCluster advocates years ago: near-zero
technical management effort for storage.
Just add an arbitrary number of new storage nodes to the storage network,
at more or less arbitrary physical locations, and done.
The rest should have been configured and load-balanced automatically.
\end_layout
\begin_layout Plain Layout
This promise has been fulfilled only
\emph on
partly
\emph default
.
In addition to increased CAPEX cost from at least
\begin_inset Formula $k=3$
\end_inset
replicas, workarounds like
\series bold
placement groups
\series default
(and internal modelling of physical placement hierarchies like segments
/ rooms / racks and physical network topologies) are
\series bold
inducing OPEX cost
\series default
: some human effort for dimensioning of hardware and placement / planning
will remain, at least in bigger installations.
\end_layout
\begin_layout Plain Layout
So an original BigCluster USP = Unique Selling Point does not really work
as originally expected, due to fundamental laws of storage systems.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Summary from a management viewpoint
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Under comparable conditions for big installations, random replication is
requiring
\series bold
more investment
\series default
than Sharding (e.g.
more client/server hardware and an
\begin_inset Formula $O(n^{2})$
\end_inset
realtime storage network), in order to get a
\series bold
\emph on
worse result
\series default
\emph default
in the
\series bold
risk dimension
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Error Propagation to Client Mountpoints
\begin_inset CommandInset label
LatexCommand label
name "subsec:Error-Propagation-to"
\end_inset
\end_layout
\begin_layout Standard
This section deals with a
\emph on
pathological
\emph default
setup.
Best practice is to avoid such pathologies.
\end_layout
\begin_layout Standard
The following is only applicable when
\series bold
filesystems
\series default
or whole
\series bold
object pools
\series default
(buckets) are exported over a storage network, in order to be
\series bold
mounted
\series default
\emph on
in parallel
\emph default
at
\begin_inset Formula $O(n)$
\end_inset
mountpoints
\emph on
each
\emph default
.
\end_layout
\begin_layout Standard
In other words: somebody is trying to make
\emph on
all
\emph default
server data available at
\emph on
all
\emph default
clients.
In spirit, this is also some BigCluster-like
\series bold
way of thinking
\series default
.
It just relates to the filesystem layer, cf.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
In such a scenario, any problem / incident
\emph on
inside
\emph default
of your
\emph on
storage pool
\emph default
and/or the storage-side filesystem instances (e.g.
exported via NFS & co) will be spread to
\begin_inset Formula $O(n)$
\end_inset
clients, leading to an increase of the incident size by a factor of
\begin_inset Formula $O(n)$
\end_inset
when measured in
\series bold
number of affected mountpoints
\series default
.
Notice that this may be different from the number of clients.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Notice the
\series bold
slopes
\series default
in the following plot.
Some correspond to
\begin_inset Formula $n^{2},$
\end_inset
and thus are even worse than in the previous plot:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/MOUNTPOINTS_Comparison_of_Reversible_StorageNode_Failures.pdf
lyxscale 200
width 100col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
As a result, we now have a total of
\begin_inset Formula $O(n^{2})$
\end_inset
mountpoints = our new basic application units
\begin_inset Foot
status open
\begin_layout Plain Layout
If you like, please create another mathematical model in terms of number
of clients, instead of the number of mountpoints.
Though the plot curves will be different, and certainly will explain an
interesting behaviour, the management conclusions will not change too much.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
The problem is worse than explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
, or in
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
where a disaster already occurred at
\begin_inset Formula $n=6$
\end_inset
.
Suchalike
\begin_inset Formula $O(n^{2})$
\end_inset
architectures are simply
\series bold
hazardous
\series default
.
Thus a clear warning: don't try to build systems in such a way.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not believe that the problem can be fixed by
\begin_inset Formula $O(k)$
\end_inset
spreading mounts in place of
\begin_inset Formula $O(n)$
\end_inset
, when
\begin_inset Formula $k$
\end_inset
is assumed as a
\begin_inset Quotes eld
\end_inset
small
\begin_inset Quotes erd
\end_inset
constant
\begin_inset Formula $>1$
\end_inset
.
It
\emph on
may
\emph default
be possible to
\emph on
reduce
\emph default
the
\emph on
size
\emph default
of the problem space.
But it
\emph on
cannot remove
\emph default
the fundamental law of storage systems.
Even
\begin_inset Formula $k=2$
\end_inset
is typically an unnecessary factor for the
\series bold
incident expectance value
\series default
.
Do not
\emph on
plan
\emph default
unnecessary spreading, whether some more or less
\begin_inset Quotes eld
\end_inset
constant
\begin_inset Quotes erd
\end_inset
\begin_inset Formula $k$
\end_inset
or some
\begin_inset Quotes eld
\end_inset
unbounded
\begin_inset Quotes erd
\end_inset
\begin_inset Formula $n$
\end_inset
.
Just do not play with fire when HA is important for enterprise-critical
use cases.
\end_layout
\begin_layout Standard
Notice: DRBD or MARS are traditionally used for running the application
on the same box as the storage.
Thus they are not vulnerable to these kinds of
\series bold
failure propagation over network
\series default
.
Even with
\emph on
traditional
\emph default
iSCSI exports over DRBD or MARS, you won't have suchalike problems, because
the traditional iSCSI model disallows multiple mounts of the same filesystem
\emph on
in parallel
\emph default
.
Your only chance to
\emph on
increase
\emph default
the error propagation over
\emph on
multiple
\emph default
clients is
\begin_inset Formula $O(k)$
\end_inset
or
\begin_inset Formula $O(n)$
\end_inset
NFS or
\family typewriter
glusterfs
\family default
& co exports to
\begin_inset Formula $O(k)$
\end_inset
or
\begin_inset Formula $O(n)$
\end_inset
clients
\emph on
each
\emph default
, leading to a total number of
\begin_inset Formula $O(k\cdot n)$
\end_inset
or
\begin_inset Formula $O(n^{2})$
\end_inset
mountpoints, or similar setups.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Clear advice
\end_layout
\end_inset
Do not plan
\begin_inset Formula $O(k\cdot n)$
\end_inset
or
\begin_inset Formula $O(n^{2})$
\end_inset
mountpoints in total.
It is a bad idea.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Similarities and Differences to Copysets
\begin_inset CommandInset label
LatexCommand label
name "subsec:Similarities-and-differences"
\end_inset
\end_layout
\begin_layout Standard
This section is mostly of academic interest.
You can skip it when looking for practical advice.
\end_layout
\begin_layout Standard
The USENIX paper about copysets (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://www.usenix.org/system/files/conference/atc13/atc13-cidon.pdf
\end_layout
\end_inset
) relates to our analysis of
\family typewriter
BigCluster
\family default
vs
\family typewriter
Sharding
\family default
in the following way:
\end_layout
\begin_layout Paragraph
Similarities
\end_layout
\begin_layout Standard
Both conclude: the concept of Random Replication of the storage data
to a large number of machines will reduce reliability.
When choosing too big sets of storage machines, then the storage system
as a whole will become practically unusable.
This is common ground between the USENIX paper and the analysis from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sub:Detailed-explanation"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Paragraph
Differences
\end_layout
\begin_layout Standard
The USENIX paper and many other Cloud Storage approaches are
\emph on
presuming
\emph default
that there exists a storage network, allowing real-time distribution of
replicas over this kind of network.
\end_layout
\begin_layout Standard
In contrast, the Sharding Approach to Cloud Storage tries to
\emph on
avoid
\emph default
real-time storage networks
\emph on
as much as possible
\emph default
.
Notice that RemoteSharding and further variants (including future improvements)
do
\emph on
not
\emph default
preclude it, but are trying to
\emph on
avoid
\emph default
real-time storage network traffic, due to reasons explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Instead, the load-balancing problem is addressed via
\series bold
background data migration
\series default
.
\end_layout
\begin_layout Standard
This changes the
\emph on
temporal granularity
\emph default
of data access: while BigCluster is transferring
\emph on
each
\emph default
IO request over the storage network in
\emph on
realtime
\emph default
, nothing is transferred over an external network at LocalSharding, provided
that no migration is necessary.
Typically, migrations are a
\series bold
rare exception
\series default
.
Normally, the data is already
\series bold
close to the consumer
\series default
.
Only in rare situations when migration is needed, local IO transfers are
\emph on
shifted over
\emph default
to external migration processes.
The outcome of a successful migration is that local IO is then sufficient
again.
\end_layout
\begin_layout Standard
In essence, Football is an
\series bold
optimizer for data proximity
\series default
: always try to keep the data as close
\begin_inset Foot
status open
\begin_layout Plain Layout
When the many local SAS busses are also viewed as a network, and when these
are logically united with the replication network to a bigger
\emph on
logical
\emph default
network which is
\emph on
heterogeneous
\emph default
at physical level: Football does nothing else but trying to
\series bold
offload
\series default
all IO requests to the local SAS networks, instead of overloading the wide-area
IP network.
In essence, this is a specialized traffic scheduling strategy for a two-level
network.
\end_layout
\end_inset
to the consumers as possible.
\end_layout
\begin_layout Standard
In detail, there are some more differences to the USENIX paper.
Some examples:
\end_layout
\begin_layout Itemize
Terminology: the scatter width
\begin_inset Formula $S$
\end_inset
is defined (see page 39 of the paper) as: each node's data is split
\emph on
uniformly
\emph default
across a group of
\begin_inset Formula $S$
\end_inset
\emph on
other
\emph default
nodes.
In contrast, we neither assume uniformity, nor do we require the data
to be distributed to
\emph on
other
\emph default
nodes.
By using the term
\begin_inset Quotes eld
\end_inset
other
\begin_inset Quotes erd
\end_inset
, the USENIX paper (as well as many other BigCluster approaches) are probably
presuming something like a distinction between
\begin_inset Quotes eld
\end_inset
client
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
server
\begin_inset Quotes erd
\end_inset
machines: while data processing is done on a
\begin_inset Quotes eld
\end_inset
client machine
\begin_inset Quotes erd
\end_inset
, data storage is on a
\begin_inset Quotes eld
\end_inset
server machine
\begin_inset Quotes erd
\end_inset
.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In contrast, MARS uses the client-server paradigm at a different granularity:
each machine can act in client role and/or in server role
\emph on
at the same time
\emph default
, and
\emph on
individually
\emph default
for each LV.
Thus it is possible to use local storage.
\end_layout
\begin_layout Itemize
We don't disallow conventional network-centric client-server machines in
variants like
\family typewriter
RemoteSharding
\family default
or
\family typewriter
FlexibleSharding
\family default
and so on, but we gave some arguments why we are trying to
\emph on
avoid
\emph default
this.
\end_layout
\begin_layout Itemize
It seems that some definitions in the USENIX paper may implicitly relate
to
\begin_inset Quotes eld
\end_inset
each chunk
\begin_inset Quotes erd
\end_inset
.
In contrast, the Sharding Approach typically relates to LVs = Logical Volumes.
Probably, LVs could be viewed as a special case of
\begin_inset Quotes eld
\end_inset
chunk
\begin_inset Quotes erd
\end_inset
, e.g.
by minimizing the number of chunks in a system.
However notice: there exist definitions of
\begin_inset Quotes eld
\end_inset
chunk
\begin_inset Quotes erd
\end_inset
where it is the basic transfer unit.
An LV has the fundamental property that small-granularity
\series bold
updates in place
\series default
(at any offset inside the LV) can be executed.
\end_layout
\begin_layout Itemize
Notice: we do not preclude further fine-grained distribution of LV data
at lower levels, such as at LVM level and/or below, but this is something
which should be
\emph on
avoided
\emph default
if not absolutely necessary (see
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Optimum-Reliability-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
Preferred method in typical practical use cases: some storage servers may
have some spare RAID slots to be populated later, by resizing the PVs =
Physical Volumes before resizing LVs.
Another alternative is dynamic runtime extension of SAS busses, by addition
of external enclosures.
\end_layout
\begin_layout Itemize
Notice that a typical local RAID system
\emph on
is also
\emph default
a Distributed System, according to some reasonable definition.
Typical RAID implementations just involve SAS cables instead of Ethernet
cables or Infiniband cables.
Notice that this also applies to many
\begin_inset Quotes eld
\end_inset
Commodity Hardware
\begin_inset Quotes erd
\end_inset
approaches, like Ceph storage nodes driving dozens of local HDDs connected
over SAS or SATA.
The main difference is just that instead of a hardware RAID controller,
a hardware HBA = Host Bus Adapter is used.
Instead of Ethernet switches, SAS multiplexers in backplanes are used.
Anyway, this forms a locally distributed sub-system.
\end_layout
\begin_layout Itemize
The USENIX paper needs to treat the following parameters as more or less
fixed (or only slowly changeable)
\series bold
constants
\series default
, given by the system designer: the replication degree
\begin_inset Formula $R$
\end_inset
, and the scatter width
\begin_inset Formula $S$
\end_inset
.
In contrast, the replication degree
\begin_inset Formula $k$
\end_inset
of our Sharding Approach is not necessarily firmly given by the system,
but can be
\series bold
dynamically changed
\series default
at runtime at per-LV granularity.
For example, during background migration via MARS the command
\family typewriter
marsadm join-resource
\family default
is used for dynamically creating additional per-LV replicas.
However notice: this freedom is limited by the total number of deployed
hardware nodes.
If you want
\begin_inset Formula $k=3$
\end_inset
replicas at the
\emph on
whole
\emph default
pool, then you will need to (dynamically) deploy at least about
\begin_inset Formula $k\cdot x$
\end_inset
nodes in general.
\end_layout
\begin_layout Itemize
The USENIX paper defines its copysets on a per-chunk basis.
Similarly to before, we might transfer this definition to a Sharding Approach
by relating it to a per-LV basis.
As a side effect, a copyset can then trivially become identical to
\begin_inset Formula $S$
\end_inset
when the definition of
\begin_inset Formula $S$
\end_inset
is also changed to a per-LV basis, analogously.
In the Sharding Approach, a distinction is not absolutely necessary, while
the USENIX paper has to invest some effort into clarifying the relationship
between
\begin_inset Formula $S$
\end_inset
and copysets as defined on a BigCluster model.
\end_layout
\begin_layout Itemize
Neglecting the mentioned differences, we see our typical use case (LocalSharding
) roughly equivalent to
\begin_inset Formula $S=R$
\end_inset
in the terminology of the USENIX paper, or to
\begin_inset Formula $S=k$
\end_inset
(our number of replicas) in our terminology.
\end_layout
\begin_layout Itemize
This means: LocalSharding tries to
\emph on
minimize
\emph default
the
\emph on
size
\emph default
of
\begin_inset Formula $S$
\end_inset
for any given per-LV
\begin_inset Formula $k$
\end_inset
, which will lead to the best possible reliability (under the conditions
described in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sub:Detailed-explanation"
plural "false"
caps "false"
noprefix "false"
\end_inset
) as has been shown in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Optimum-Reliability-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Another parallel comes to mind: classical RAID striping introduced
the concept of
\series bold
RAID sets
\series default
decades ago.
Similarly to random replication, RAID striping is motivated by
\emph on
load distribution
\emph default
.
Similarly to our previous discussion, this induces some
\series bold
cost
\series default
.
This is not only about RAID-0 vs RAID-10 by introduction of some more replicas
\begin_inset Foot
status open
\begin_layout Plain Layout
Random replication is more like RAID-01: first
\emph on
all
\emph default
the physical disks are striped, then replicas are created
\emph on
on top
\emph default
of it.
Reversing this order would be more similar to RAID-10, and could lead to
an improvement of random replication.
However, this would contradict a basic idea of BigCluster, that you
can add
\emph on
any
\emph default
number of storage nodes at any time.
Instead of adding an
\emph on
odd
\emph default
number of OSDs, each potentially of different size, now an
\emph on
even
\emph default
number needs to be added for
\begin_inset Formula $k=2$
\end_inset
replicas, or equal-sized triples for
\begin_inset Formula $k=3$
\end_inset
, etc.
\end_layout
\end_inset
.
It is a general problem caused by too high stripe spreading.
When a single striped RAID set would grow too big, reliability would suffer
too much.
Thus multiple smaller RAID sets are traditionally used in place of a single
big one
\begin_inset Foot
status open
\begin_layout Plain Layout
Practical example from experience: for RAID-60, a typical RAID-6 sub-set
should not exceed 12 to 15 spindles.
\end_layout
\end_inset
.
This is somewhat similar to copysets, when taking the scatter width
\begin_inset Formula $S$
\end_inset
as analogous to the RAID set size, by using objects in place of sector stripes,
and a few other differences like using some well-known
\emph on
stripe distribution function
\emph default
in place of random replication.
Compare with section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Optimum-Reliability-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
: RAID sets are just another example workaround for consequences from the
fundamental law of storage systems.
\end_layout
\begin_layout Subsection
Explanations from DSM and WorkingSet Theory
\begin_inset CommandInset label
LatexCommand label
name "subsec:Explanations-from-DSM"
\end_inset
\end_layout
\begin_layout Standard
When looking for practical advice, just read the example use cases below,
and skip the rest, which is mostly of academic interest.
\end_layout
\begin_layout Standard
This section tries to explain the BigCluster incidents observed at some
1&1 Ionos daughter company from a different perspective.
In the OS literature and community, DSM = Distributed Shared Memory and
Denning's workingset theory from the 1960s are typically attributed to
a different research area.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Example use cases for
\family typewriter
BigCluster
\family default
\series default
\begin_inset CommandInset label
LatexCommand label
name "Example-use-cases-Bigcluster"
\end_inset
\end_layout
\end_inset
Personal discussions with some prominent promoters of Ceph found some informal
agreements about some use cases where BigCluster appears to be well suited:
\end_layout
\begin_layout Itemize
Large collections of audio / video files.
These are never modified in place, but written once, and then
\series bold
\emph on
streamed
\series default
\emph default
.
Thus it is possible to use relatively large object sizes, or even 1 video
file = 1 object.
Then streaming involves only a low number of objects at the same time,
down to a per-application parallelism degree of typically only 1.
\end_layout
\begin_layout Itemize
Measurement data like in CERN physics experiments, where often some
\emph on
streaming model
\emph default
is predominant.
\end_layout
\begin_layout Itemize
Backups and long-term archives, when also accomplished via
\emph on
streaming
\emph default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Example problems for
\family typewriter
BigCluster
\family default
\series default
\begin_inset CommandInset label
LatexCommand label
name "Example-problems-Bigcluster"
\end_inset
\end_layout
\end_inset
In contrast to this, here are some other use cases where BigCluster did
not meet expectations of some people at 1&1 Ionos:
\end_layout
\begin_layout Itemize
Virtual block devices involving
\series bold
strict consistency
\series default
on top of a very high number of small
\begin_inset Quotes eld
\end_inset
unreliable
\begin_inset Quotes erd
\end_inset
/ eventually consistent objects.
\end_layout
\begin_layout Itemize
CephFS with
\series bold
highly parallel random updates
\series default
to a huge number of files / inodes, also involving strict consistency in
some places (e.g.
concurrent metadata updates belonging to the same directory).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Here is a
\emph on
first attempt
\emph default
to explain these behavioural observations from a more generalized viewpoint.
The author is open to discussion, and will modify this part upon better
understanding.
\end_layout
\begin_layout Standard
For the following, you will need profound
\begin_inset Foot
status open
\begin_layout Plain Layout
In addition to standard Operating System text books like Silberschatz or
Tanenbaum, you may need to consult some of the original work of further
authors mentioned above.
\end_layout
\end_inset
knowledge in Operating System Principles (aka Theory of Operating Systems).
\end_layout
\begin_layout Standard
Ceph & co are apparently shining at use cases where the
\emph on
object paradigm
\emph default
is naturally well-suited for the
\emph on
application behaviour
\emph default
.
\end_layout
\begin_layout Standard
Application behaviour has been studied in the 1970s.
Theorists know that in general it is
\emph on
unpredictable
\emph default
due to Turing Completeness, but practical observations are revealing some
frequent
\emph on
behavioural pattern
\emph default
s.
Otherwise, caching would not be beneficial in practice.
\end_layout
\begin_layout Standard
While Denning had studied and modelled application behaviour for typical
drum storage devices of his era, later DSM people stumbled over similar
problems: the
\emph on
frequency of access to needed data
\emph default
can grow much higher than the channel / transport capacities can
\begin_inset Foot
status open
\begin_layout Plain Layout
In general, this is unavoidable.
In a
\series bold
storage pyramid
\series default
, the CPU is always able to access RAM pages with a much higher frequency
than any (R)DMA transport can supply.
\end_layout
\end_inset
provide.
Denning and Saltzer coined a term for this:
\series bold
thrashing
\series default
.
\end_layout
\begin_layout Standard
Thrashing means that more time is spent by
\emph on
fetching
\emph default
data than by
\emph on
working
\emph default
with it, because the transports are
\emph on
overloaded
\emph default
.
As Denning observed, thrashing essentially means that the system becomes
\emph on
unusable by customers
\emph default
.
Thrashing is a highly non-linear
\series bold
self-amplifying effect
\series default
, similar to traffic jams on highways: once it has started, it will worsen
itself.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 4
status open
\begin_layout Plain Layout
Although some historic descriptions of thrashing are mentioning contemporary
hardware devices like drum storage, the
\emph on
concept
\emph default
is very universal.
Thrashing can be transferred and
\series bold
generalized
\series default
to modern instances of
\series bold
storage pyramids
\series default
, and/or also to remote access over
\series bold
network bottlenecks
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Saltzer found a workaround for his contemporary batch operating systems:
limit the parallelism degree of concurrently running batch jobs.
In his Multics project, this was also transferred to interactive systems,
by limiting the swap-in parallelism degree of his contemporary segment
swapping methods.
Although this may sound counter-intuitive for modern readers: by introduction
of a certain type of
\series bold
artificial limitation
\series default
at or around the non-linear regression point, the
\series bold
user experience was
\emph on
improved
\series default
\emph default
.
\end_layout
\begin_layout Standard
Now comes a conclusion: when thrashing occurs in a modern BigCluster model
for whatever reason, the self-amplification will likely be worse than in
a LocalSharding model, due to the following reasons:
\end_layout
\begin_layout Itemize
\series bold
Overload propagation
\series default
: when some parts of the
\begin_inset Formula $O(n^{2})$
\end_inset
storage network are overloaded, other parts may also become affected in
turn, due to sharing of network resources, such as cross-traffic lines.
Once queueing has started somewhere, it is likely to worsen, and likely
to induce further queueing at other parts of the shared network.
The more other parts are affected transitively, the more parts will get
overloaded.
So the overload, once it has started somewhere, has a higher probability
for
\emph on
spreading out
\emph default
even to parts which were not overloaded before (self-amplification at BigCluste
r level).
\end_layout
\begin_layout Itemize
Random replication of objects adds
\emph on
artificial randomness
\emph default
to the
\series bold
\emph on
locality of reference
\series default
\emph default
, as described by Denning.
\end_layout
\begin_layout Itemize
Original DSM was trying to provide a strict or near-strict consistency model
for application programmers.
Later research then tried some weaker consistency models, without getting
a final breakthrough for general use cases.
BigCluster is similarly organized to DSM, but on slow
\emph on
remote storage
\emph default
instead of logically shared remote RAM over fast RDMA.
Thus we can expect similar problems as observed by the DSM community, like
\series bold
single points of contention
\series default
, etc.
These might become even worse once they have appeared.
\end_layout
\begin_layout Standard
In a nutshell:
\series bold
system stability
\series default
under overload conditions, once they have started somewhere, is highly
non-linear, and tends to spread
\begin_inset Foot
status open
\begin_layout Plain Layout
In the past, advocates of BigCluster have placed the argument that BigCluster
can
\emph on
equally distribute
\emph default
the total application load onto
\begin_inset Formula $O(n)$
\end_inset
storage servers, so a single overloaded client will get better performance
than in a sharding model.
This argument contains the
\emph on
implicit assumption
\emph default
that load distribution is behaving
\series bold
linearly
\series default
, or close to that.
However, Denning and Saltzer found that system reaction due to overload
by workingset behaviour is
\emph on
extremely
\emph default
non-linear, and may
\emph on
completely
\emph default
tear down systems even when only
\emph on
slightly
\emph default
overloaded.
Although there may exist some areas where the assumption of linearity is
correct and may lead to improvements by better load distribution,
\begin_inset Quotes eld
\end_inset
unpredictable
\begin_inset Quotes erd
\end_inset
behaviour due to self-amplification of overload at BigCluster level may
result in the
\emph on
opposite
\emph default
.
Denning has provided a mathematical model for this, which could probably
be transferred to modern application behaviour.
\end_layout
\end_inset
, and to self-amplify.
\end_layout
\begin_layout Standard
In contrast, sharding models are not spreading any overload to other shards
by definition.
So the total availability from the viewpoint of the
\emph on
total
\emph default
set of customers is less vulnerable to impacts.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Risk characterization in a nutshell
\end_layout
\end_inset
While BigCluster increases the risk of spread-out of overload and other
stability problems similarly to a
\series bold
domino effect
\series default
, Sharding is restricting those risks by
\series bold
fencing
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In the above use cases where BigCluster is shining, overload is unlikely,
since the
\emph on
parallelism of object access
\emph default
is limited.
This is somewhat similar to Saltzer's historic workaround for thrashing.
\emph on
Streaming
\emph default
at application behaviour level will translate into streaming at the network
layer.
Classical TCP networks dealing with a relatively low number of high-throughput
streaming connections are just
\emph on
constructed
\emph default
for dealing with packet loss, such as caused by overload, e.g.
by their
\series bold
congestion control
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
Recommended reading: the papers from Sally Floyd.
\end_layout
\end_inset
algorithms.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In contrast, an extremely high number of parallel short connections would
be similar to a
\begin_inset Quotes eld
\end_inset
SYN flood attack
\begin_inset Quotes erd
\end_inset
, or similar to a classical UDP packet storm.
It would allow for a much higher parallelism degree, but will be more vulnerabl
e to packet loss / packet storm effects / etc, and more vulnerable to self-ampli
fication.
These application behaviour types are avoided in the above use case examples
for BigCluster.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In addition, storing video files as immutable BLOBs will limit the
\series bold
randomness
\series default
of
\emph on
locality of reference
\emph default
, while splitting into millions of very small objects may easily lead to
an explosion of randomness by some orders of magnitude.
\end_layout
\begin_layout Section
Scalability Arguments from Architecture
\begin_inset CommandInset label
LatexCommand label
name "sec:Scalability-Arguments-from"
\end_inset
\end_layout
\begin_layout Standard
In this section, the term
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
is used in its contemporary traditional form,
\emph on
not yet
\emph default
conforming to the new definition as proposed in appendix
\begin_inset CommandInset ref
LatexCommand vref
reference "chap:Definition-of-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Otherwise, the
\series bold
contemporary confusion about scalability
\series default
could be even intensified.
We start with some background about this confusion.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
Some Sources of Confusion
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
People are less prone to confusion if they (a) know Amdahl's law (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Amdahl%27s_law
\end_layout
\end_inset
), and (b) know and obey the
\emph on
preconditions
\emph default
when this law is
\emph on
applicable
\emph default
, and when it
\emph on
must not
\emph default
be applied.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
People are typically more prone to confusion if they (a) are only citing
Gustafson's law (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Gustafson%27s_law
\end_layout
\end_inset
) without also considering Amdahl's law and further laws / theories including
their
\series bold
preconditions
\series default
and
\series bold
applicability
\series default
, and/or (b) are
\series bold
misinterpreting
\series default
the slope parameter at Gustafson, and/or (c) do not account for the
\series bold
context
\series default
/
\series bold
use cases
\series default
for which Gustafson's law was formulated.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Importance of Scalability
\end_layout
\end_inset
Scalability is important for
\series bold
mass data
\series default
/ mass production.
It determines the
\series bold
technical limits
\series default
of
\series bold
scaling effects
\series default
.
Bad scalability can seriously
\series bold
limit the business
\series default
, and its resolution can produce
\series bold
high cost
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Unfortunately, in some circles, seriously wrong habits have become established.
I know of examples causing unnecessary problems and cost in the range of
\series bold
millions of €
\series default
.
\end_layout
\begin_layout Standard
Some people are talking about scalability by (1) looking at a relatively
small
\emph on
example
\emph default
cluster
\emph on
implementation
\emph default
of their respective (pre-)chosen
\emph on
architecture
\emph default
having
\begin_inset Formula $n$
\end_inset
machines or
\begin_inset Formula $n$
\end_inset
network components or running
\begin_inset Formula $n$
\end_inset
application instances, and then (2) extrapolating its behaviour to bigger
\begin_inset Formula $n$
\end_inset
.
They think if it runs with small
\begin_inset Formula $n$
\end_inset
, it will also run for bigger
\begin_inset Formula $n$
\end_inset
.
\end_layout
\begin_layout Standard
This way of thinking and acting is completely broken, and can endanger both
companies and careers.
\end_layout
\begin_layout Standard
This is not only because of confusion of
\begin_inset Quotes eld
\end_inset
architecture
\begin_inset Quotes erd
\end_inset
with
\begin_inset Quotes eld
\end_inset
implementation
\begin_inset Quotes erd
\end_inset
, cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
It is also
\series bold
fundamentally broken
\series default
because it assumes some
\begin_inset Quotes eld
\end_inset
linearity
\begin_inset Quotes erd
\end_inset
in a field which is
\series bold
non-linear
\emph on
by definition
\series default
\emph default
.
\end_layout
\begin_layout Standard
If scalability were linear, the term would not be useful at all, because
there would be
\emph on
no limit
\emph default
.
However, limits exist in practice, and the term
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
is a
\series bold
\emph on
means
\emph default
for describing the behaviour at or around the limit
\series default
.
\end_layout
\begin_layout Standard
Another
\emph on
incorrect
\emph default
way of ill-defining / ill-using the term
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
is looking at some relatively big
\series bold
\emph on
example
\series default
\emph default
cluster, which is working in practice for some
\series bold
particular use case
\series default
, and then concluding that it will also work in
\series bold
another use case
\series default
.
Arguing with an
\emph on
example
\emph default
of a working system is wrong by construction.
In general, examples can only be used for
\series bold
\emph on
disproving
\series default
\emph default
something, but never as a positive proof of concept
\begin_inset Foot
status open
\begin_layout Plain Layout
Unfortunately, the term PoC = Proof Of Concept is used wrongly in large
parts of the industry.
It should be termed PoI = Proof of Implementation, or VoI = Validation
of Implementation or CoI = Check of Implementation instead.
A concept can have multiple implementations, but only
\emph on
one
\emph default
of them has been actually checked in most cases.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
Examples of such examples: section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
mentions some use cases where
\family typewriter
BigCluster
\family default
architecture implementations via Ceph are shining.
These example use cases are showing some commonalities, like relatively
low performance demands at the storage, and relatively low IO parallelism
degree
\begin_inset Foot
status open
\begin_layout Plain Layout
Example: many people are not aware that Apache is acting like a
\series bold
fork bomb
\series default
.
When thousands of Apache processes are running in parallel, a parallelism
of several thousands of IO requests
\emph on
may
\emph default
occasionally occur during
\emph on
peaks
\emph default
, although caches will serve them
\emph on
most
\emph default
of the time.
Certain storage systems are reacting with
\series bold
instability
\series default
, sometimes even when
\begin_inset Quotes eld
\end_inset
hammered
\begin_inset Quotes erd
\end_inset
only once with a very short but massive overload peak.
\end_layout
\end_inset
, and streaming-like
\series bold
access patterns
\series default
.
However, it also mentions some
\emph on
other
\emph default
use cases, where it did
\emph on
not
\emph default
work as expected.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Humans are
\series bold
selective
\series default
in their perception.
Evolution has created this, for our protection against overload in the
information flood.
Unfortunately, looking only at some positive use case examples, while either
not knowing or ignoring other counter-examples, can be dangerous.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\emph on
Every
\emph default
storage system on this globe has
\series bold
\emph on
always
\emph default
some scalability limit
\series default
, somewhere.
Even the internet has some limit.
Scalability is
\emph on
always
\emph default
a
\series bold
non-linear
\series default
behaviour.
In order to find the practical limit, you must
\emph on
reach
\emph default
it.
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/principle-scalability.fig
width 80col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
Typically, the overall scalability behaviour can be divided into several
\emph on
zones
\emph default
.
Only in the scaling zone, some near-linear behaviour can be expected.
Next comes the saturation zone, where the effects of inherent system limits
are already retarding progress.
After exceeding the scalability limit, typically no further progress is
happening.
Upon overload,
\emph on
many(!)
\emph default
systems are even reacting with a
\emph on
regression
\emph default
.
\end_layout
\begin_layout Standard
Any serious study should consciously deal with
\emph on
all
\emph default
of these zones, possibly
\emph on
only
\emph default
except the regression
\begin_inset Foot
status open
\begin_layout Plain Layout
Entering the regression zone might possibly lead to destruction of certain
systems, or to other damages.
Then it is acceptable to not enter it.
It would be
\emph on
honorable
\emph default
to mention any
\series bold
risks
\series default
associated with suchlike overload behaviour.
\end_layout
\end_inset
zone when measuring real-life systems which need to conform to some SLAs.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
There exists
\series bold
no excuse for omission
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fn:faked-scalability"
\end_inset
Several years ago, I attended a presentation at an OpenSource conference.
It was about scalability of a Java programming environment for some SMP
machines.
The presenter showed some nice graphics, however showing
\emph on
only
\emph default
the
\emph on
scaling zone
\emph default
of the measured performance of his demo application.
He publicly claimed
\begin_inset Quotes eld
\end_inset
arbitrary scalability
\begin_inset Quotes erd
\end_inset
.
After the talk, I tried to meet him downstairs in the arena of the theater,
in order to ask him in private whether he had seen some limit somewhere.
His
\emph on
very short
\emph default
answer was
\begin_inset Quotes eld
\end_inset
there is no limit
\begin_inset Quotes erd
\end_inset
, completely
\emph on
unwilling
\emph default
to talk to me at all, and
\emph on
very quickly
\emph default
leaving the theater upstairs from the arena (by skipping intermediate stairs
like in sports, and taking 2 or 3 stairs with his legs at each of his very
long steps).
\end_layout
\begin_layout Plain Layout
Before that, no attendee had publicly asked a similar question, and the
very short public discussion was only about a high number of use cases
where the new tool would be highly beneficial.
\end_layout
\begin_layout Plain Layout
Theoretically, he
\emph on
could
\emph default
have
\emph on
meant
\emph default
the scalability of his
\emph on
tool
\emph default
, independently from running on any hardware.
However, he did
\emph on
not
\emph default
talk about this, and his presented measurement data was about
\emph on
end-to-end performance
\emph default
of his
\emph on
demo application
\emph default
, measured on a certain hardware.
Thus his
\series bold
claim was
\series default
definitely
\series bold
wrong
\series default
.
\end_layout
\begin_layout Plain Layout
Unfortunately, a similar
\begin_inset Quotes eld
\end_inset
methodology
\begin_inset Quotes erd
\end_inset
seems to have been copied by more and more presenters.
The problem is not only wrong claims.
The problem is that managers and other decision makers can lose a lot of
money when believing
\series bold
false claims
\series default
or even
\series bold
fake results
\series default
.
\end_layout
\end_inset
of the limit.
When the limit is
\series bold
unknown
\series default
, then you simply
\series bold
must not claim a certain scalability
\series default
!
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Example use cases are principally insufficient for proving general scalability
behaviour, as well as for comparing the scalability of architectures and/or
of certain implementations.
Examples can only be used for
\emph on
disproving
\emph default
scalability.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Caution: when a particular
\emph on
implementation
\emph default
does not work as expected, this does not generally prove that the corresponding
concept / architecture does not work at all.
There may be
\series bold
bugs
\series default
and other
\series bold
sources of error
\series default
in the particular implementation, which just need to be
\series bold
\emph on
fixed
\series default
\emph default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Required skill level for architects
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The
\series bold
suitability of architectures for certain use cases
\series default
needs to be checked separately.
This is an expert task, requiring high levels of skills and experience.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Cross-checking by several experts may lead to systematic ill-designs
by
\series bold
information bubbles
\series default
.
Well-foundation of arguments, well-founded measurements on basis of solid
methodology, etc, are much more important than number of votes!
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Example Failures of Scalability
\begin_inset CommandInset label
LatexCommand label
name "subsec:Example-Failures-Scalability"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Recommended reading
\end_layout
\end_inset
The following example is a
\series bold
must read
\series default
not only for
\series bold
responsibles
\series default
, but also for system architects, and also for sysadmins.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The numbers and some details are from my memory, thus they need not be 100%
accurate in all places.
\end_layout
\begin_layout Standard
It is about an operation environment for a
\emph on
new
\emph default
product, which was a proprietary web page editor running under a very complicat
ed variant of a LAMP
\begin_inset Foot
status open
\begin_layout Plain Layout
LAMP = Linux Apache Mysql PHP
\end_layout
\end_inset
stack.
\end_layout
\begin_layout Standard
The setup started with a
\family typewriter
BigCluster
\family default
\emph on
architecture
\emph default
, but actually sized as a
\family typewriter
\begin_inset Quotes eld
\end_inset
SmallCluster
\begin_inset Quotes erd
\end_inset
\family default
implementation.
\end_layout
\begin_layout Paragraph
Setup 1 (NFS)
\end_layout
\begin_layout Standard
The first setup consisted of
\begin_inset Formula $n=6$
\end_inset
storage servers, each replicated to another datacenter via DRBD.
Each server was exporting its filesystems via NFS to about the same number
of client servers, where Apache/PHP was supposed to serve the HTTP requests
from the customers, which were entering the client cluster via a HTTP load
balancer.
The load balancer was supposed to spread the HTTP load to the client servers
in a
\series bold
round-robin
\series default
fashion.
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
At this point, eager readers may notice some similarity with the error propagati
on problem treated in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Error-Propagation-to"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Notice that this is about
\emph on
scalability
\emph default
instead, but you should compare with that, to find some similarities.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
After the complicated system was built up and was working well enough, the
new product was launched via a marketing campaign with free trial accounts,
limited to some time.
\end_layout
\begin_layout Standard
So the number of customers was ramping up from 0 to about 20,000 within
a few weeks.
When about 20,000 customers were running on the client machines, system
hangs were noticed, from a customer's perspective.
When too many customers were pressing the
\begin_inset Quotes eld
\end_inset
save
\begin_inset Quotes erd
\end_inset
button in parallel on reasonably large web page projects, a big number
of small files, including a huge bunch of small image files, was generated
over a short period of time.
A few customers were pressing the
\begin_inset Quotes eld
\end_inset
save
\begin_inset Quotes erd
\end_inset
button several times a minute, each time re-creating all of these files
again and again from the proprietary web page generator.
Result: the whole system appeared to hang.
\end_layout
\begin_layout Standard
However, all of the servers, including the storage servers, were almost
\emph on
idle
\emph default
with respect to CPU consumption.
RAM sizes were also no problem.
\end_layout
\begin_layout Standard
After investigating the problem for a while, it was noticed that the
\series bold
\emph on
network
\series default
\emph default
was the bottleneck, but not in terms of throughput.
The internal sockets were forming some
\series bold
queues
\series default
which were
\emph on
delaying
\emph default
the NFS requests in some
\series bold
ping-pong
\series default
like fashion, almost resulting in a
\begin_inset Quotes eld
\end_inset
deadlock
\begin_inset Quotes erd
\end_inset
from a customer's perspective (a better term would be
\series bold
distributed livelock
\series default
or
\series bold
distributed thrashing
\series default
, c.f.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Paragraph
Setup 2 (
\family typewriter
ocfs2
\family default
)
\end_layout
\begin_layout Standard
Due to some external investigations and recommendations, the system was
converted from NFS to
\family typewriter
ocfs2
\family default
.
Now DRBD was operated in so-called active-active (see explanation in section
\begin_inset CommandInset ref
LatexCommand vref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
) resp.
dual-primary mode.
Only one system software component was replaced with another one, without
altering the
\family typewriter
BigCluster
\family default
architecture, and without changing the number of servers, which remained
a stripped-down
\family typewriter
SmallCluster
\family default
implementation.
\end_layout
\begin_layout Standard
Result: the problem with the
\begin_inset Quotes eld
\end_inset
hangs
\begin_inset Quotes erd
\end_inset
disappeared.
\end_layout
\begin_layout Standard
However, after the number of customers had exceeded the
\series bold
next scalability limit
\series default
of about 30,000 customers, the
\begin_inset Quotes eld
\end_inset
hang
\begin_inset Quotes erd
\end_inset
problem appeared once again, in a similar way.
The system showed systematical incidents again.
\end_layout
\begin_layout Paragraph
Setup 3 (
\family typewriter
glusterfs
\family default
as a substitute for NFS /
\family typewriter
ocfs2
\family default
)
\end_layout
\begin_layout Standard
After investigating the network queueing behaviour and the lock contention
problems of
\family typewriter
ocfs2
\family default
, the next solution was
\family typewriter
glusterfs
\family default
.
\end_layout
\begin_layout Standard
However, when the number of customers exceeded the
\series bold
\emph on
next
\emph default
scalability limit
\series default
, which was about 50,000 customers, some of them hammering the cluster with
their
\begin_inset Quotes eld
\end_inset
save
\begin_inset Quotes erd
\end_inset
button, the
\begin_inset Quotes eld
\end_inset
hangs
\begin_inset Quotes erd
\end_inset
appeared again.
\end_layout
\begin_layout Paragraph
Setup 4 (
\family typewriter
glusterfs
\family default
replication as a substitute for DRBD)
\end_layout
\begin_layout Standard
After analyzing the problem once again, it was discovered by accident that
\family typewriter
drbdadm disconnect
\family default
\emph on
appeared
\emph default
to
\begin_inset Quotes eld
\end_inset
solve
\begin_inset Quotes erd
\end_inset
the problem.
\end_layout
\begin_layout Standard
Therefore DRBD was replaced with
\family typewriter
glusterfs
\family default
replication.
There exists a
\family typewriter
glusterfs
\family default
feature allowing replication of files at filesystem level.
\end_layout
\begin_layout Standard
This attempt was
\emph on
immediately
\emph default
resulting in an
\series bold
almost fatal disaster
\series default
, and thus was stopped immediately: the cluster completely broke down.
Almost nothing was working anymore.
\end_layout
\begin_layout Standard
The problem was even worse: switching off the
\family typewriter
glusterfs
\family default
replication and rollback to DRBD did not work.
The system remained
\series bold
unusable
\series default
.
\end_layout
\begin_layout Standard
As a temporary workaround,
\family typewriter
drbdadm disconnect
\family default
was improving the situation enough for some limping operation.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Retrospective explanation: some of the reasons can be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Behaviour-of-DRBD"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\family typewriter
glusterfs
\family default
replication does not scale at all because it stores its replication information
at
\series bold
per-inode granularity
\series default
in EAs (extended attributes).
This must
\emph on
necessarily
\emph default
be worse than DRBD, because there were some hundreds of millions of them
in total as reported by
\family typewriter
df -i
\family default
(see the cut point discussion in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Granularity-at-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
Overnight in some cron jobs, these EAs had to be deleted in reasonably
sized batches in order to become more or less
\begin_inset Quotes eld
\end_inset
operable
\begin_inset Quotes erd
\end_inset
again.
\end_layout
\end_inset
\end_layout
\begin_layout Paragraph
Setup 5 (Sharding on top of DRBD)
\end_layout
\begin_layout Standard
After the almost fatal incident had been resolved to a less critical one,
the responsibility for setup was taken over by another person.
After the
\begin_inset Formula $O(n^{2})$
\end_inset
behaviour from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Distributed-vs-Local:"
plural "false"
caps "false"
noprefix "false"
\end_inset
had been understood, and after it was clear that sharding is only
\begin_inset Formula $O(k)$
\end_inset
from a customer's perspective, it was the final solution.
Now the problem was resolved at
\series bold
\emph on
architectural level
\series default
\emph default
, no longer by just replacing some components with some others (c.f.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
The system was converted to a variant of a
\family typewriter
RemoteSharding
\family default
model (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and some
\family typewriter
migrate
\family default
scripts were introduced for load balancing of customer homedirectories
and databases between shards.
\end_layout
\begin_layout Standard
As a side effect, the load balancer took on a new role: instead of spreading
\emph on
all
\emph default
of the HTTP requests to
\emph on
all
\emph default
of the client servers in a round-robin fashion, it now acted as a redirection
mechanism at
\emph on
shard granularity
\emph default
, e.g.
when one of the client servers was handed over to another one for maintenance.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Retrospective explanation: DRBD was definitely
\emph on
not
\emph default
the real reason for the critical incident.
The replication traffic per shard is so low on average that until today,
no replacement by MARS was absolutely necessary
\begin_inset Foot
status open
\begin_layout Plain Layout
Many sysadmins are running a conservative strategy: never touch a running
system...
\end_layout
\end_inset
, although the distance is over 50 km.
If you wonder why such low write traffic demands can cause such a big incident:
look at the
\series bold
cache reduction
\series default
graphics in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Today, the
\begin_inset Quotes eld
\end_inset
save
\begin_inset Quotes erd
\end_inset
buttons of the customers are just triggering some
\emph on
extra
\emph default
\series bold
writebacks
\series default
from the Page Cache of the kernel into the block layer, after some
\emph on
delay
\emph default
.
These writebacks are not performance critical in reality, because the Page
Cache is running them
\series bold
\emph on
asynchronously in background
\series default
\emph default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In contrast, distributed filesystems like
\family typewriter
NFS
\family default
or
\family typewriter
ocfs2
\family default
or
\family typewriter
glusterfs
\family default
are not working asynchronously in many places, but will often schedule
their requests
\emph on
synchronously
\emph default
into ordinary network queues, which form a
\series bold
sequential bottleneck
\series default
, competing with other high-frequent filesystem operations.
In addition, the
\begin_inset Quotes eld
\end_inset
save
\begin_inset Quotes erd
\end_inset
button triggers masses of metadata / inode updates in a short time, often
residing in the same directory.
Such a directory may thus form a
\begin_inset Quotes eld
\end_inset
global
\begin_inset Quotes erd
\end_inset
bottleneck.
When suchalike competing
\series bold
metadata updates
\series default
are distributed via a round-robin load balancer, the problem can easily
become critical by the
\series bold
cache coherence problem
\series default
.
While local filesystems can smoothen such application behaviour via the
Dentry Cache plus Inode Cache, which also show some asynchronous writeback
behaviour, network filesystems are often unable to deal with this performantly.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Although DRBD has a similar sequential bottleneck at the low-frequency
block layer by its write-through strategy into its replica, this does not
really matter: all other writebacks from the Page Cache are
\emph on
also
\emph default
started asynchronously, and triggered low-frequently, and are occurring
after some
\emph on
delay
\emph default
(which in turn will smoothen the
\series bold
spikes
\series default
caused by
\series bold
mass dirtification
\series default
of many small files and inodes in a short time as caused by the
\begin_inset Quotes eld
\end_inset
save
\begin_inset Quotes erd
\end_inset
button), and thus are not really performance critical for this particular
use case.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
This is a striking example why careful
\series bold
selection of granularity level
\series default
(filesystem vs block layer, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is essential.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
This is also a striking example why asynchronous operations can form a
huge advantage in certain use cases.
\end_layout
\begin_layout Standard
The sharding setup is still working today, scaling up to the current number
of customers, which is more than an order of magnitude higher, in the range of
about a million customers.
Of course, the number of shards had to be increased, but this is just what
sharding is about.
\end_layout
\begin_layout Subsection
Properties of Storage Scalability
\begin_inset CommandInset label
LatexCommand label
name "subsec:Properties-Scalability"
\end_inset
\end_layout
\begin_layout Subsubsection
Influence Factors at Scalability
\begin_inset CommandInset label
LatexCommand label
name "subsec:Influence-Factors-Scalability"
\end_inset
\end_layout
\begin_layout Standard
In general, scalability of storage systems may depend on the following factors
(list may be incomplete):
\end_layout
\begin_layout Enumerate
The
\series bold
application class
\series default
, in particular its principal
\series bold
workingset behaviour
\series default
(in both dimensions: timely and locality).
More explanations about workingsets can be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
and at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
http://blkreplay.org
\end_layout
\end_inset
.
\end_layout
\begin_layout Enumerate
The
\series bold
size
\series default
\begin_inset Formula $x$
\end_inset
of the application data and/or the
\series bold
number of application instances
\series default
(possibly also denoted by
\begin_inset Formula $x$
\end_inset
), and the amount of storage needed for it (could be also termed
\begin_inset Formula $x$
\end_inset
).
Besides the data itself, the corresponding
\series bold
metadata
\series default
(inodes, indexes, etc) can form an important factor, or can even
\emph on
dominate
\emph default
the whole story.
Typically, critical datacenter application data is tremendously differently
sized from workstation data.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Caution! Some people think erroneously that scalability would be
\emph on
linearly
\emph default
depending on
\begin_inset Formula $x$
\end_inset
.
However, as is known at least since the 1960s (read some ancient papers
from Saltzer and/or from Denning), scalability is
\series bold
never linear
\series default
, but sometimes even
\series bold
\emph on
disruptive
\series default
\emph default
, in particular when RAM size is the bottleneck.
IO queues and/or networking queues are often also reacting to overload
in a disruptive fashion.
This means: after exceeding the
\series bold
scalability limit
\series default
of a particular system for its particular class of applications, the system
will very likely
\series bold
break down
\series default
from a customer's perspective, sometimes almost completely, and sometimes
even
\series bold
\emph on
fatally
\series default
\emph default
.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
On the other hand, some other systems are reacting with
\series bold
graceful degradation
\series default
.
Whether a particular system reacts to a particular type of (over)load,
either with graceful degradation, or with fatal disruption, or with some
intermediate behaviour, is some sort of
\begin_inset Quotes eld
\end_inset
quality property
\begin_inset Quotes erd
\end_inset
of the system and/or of the application.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
EVERY SYSTEM, even sharded systems, and even the internet as a whole, has
\emph on
always
\emph default
some scalability limit
\emph on
somewhere
\emph default
.
There exists
\series bold
no
\begin_inset Quotes eld
\end_inset
infinitely scaling
\begin_inset Quotes erd
\end_inset
system
\series default
on earth!
\end_layout
\begin_layout Enumerate
The
\series bold
\emph on
distribution
\series default
\emph default
of the application behaviour in both
\series bold
timely
\series default
and
\series bold
locality
\series default
dimensions.
Depending on the application class, this is often an
\emph on
exponential
\emph default
distribution according to Zipf's law.
By erroneously
\emph on
assuming
\emph default
an equal distribution (or a Gaussian distribution) instead of actually
measuring the distribution in both dimensions, you can easily induce zillions
of costly problems for big
\begin_inset Formula $x$
\end_inset
, or even fatal failure of the whole system / project.
\end_layout
\begin_layout Enumerate
The
\series bold
transformation
\series default
of the application workingset behaviour at architectural level, sometimes
caused by certain components resp their specific implementation or parameteriza
tion.
Examples are intermediate virtualization layers, e.g.
vmware
\family typewriter
*.vmdk
\family default
or KVM
\family typewriter
*.qcow2
\family default
container formats which can completely change the game, not only in extreme
cases.
Another example is
\series bold
random distribution
\series default
to (or
\series bold
random replication
\series default
inside of) object stores, which can turn some uncomplicated sequential
workloads into highly problematic
\emph on
random IO
\emph default
workloads.
See also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Similarities-and-differences"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Don't overlook such potential pitfalls!
\end_layout
\begin_layout Enumerate
The storage
\series bold
architecture
\series default
to be chosen, such as
\family typewriter
CentralStorage
\family default
vs
\family typewriter
BigCluster
\family default
vs
\family typewriter
*Sharding
\family default
.
Choice of the wrong architecture can be fatal for big
\begin_inset Formula $n$
\end_inset
and/or for certain timely / spatial application behaviour.
Changing an architecture during operations on some petabytes of data and/or
some billions of inodes can be almost impossible, and/or can consume a
lot of time and money.
\end_layout
\begin_layout Enumerate
The
\series bold
number
\series default
of storage
\series bold
nodes
\series default
\begin_inset Formula $n$
\end_inset
.
In some architectures, addition of more nodes can make the system
\emph on
worse
\emph default
instead of better, c.f.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Enumerate
In case of architectures relying on a storage network: choice of
\series bold
layer
\series default
for cut point, e.g.
filesystem layer vs block layer, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and/or introduction of an additional intermediate object storage layer
(which can result in major degradation from an architectural view).
Due to fundamental differences in distributed vs local
\series bold
cache coherence
\series default
, suchalike can have a
\emph on
tremendous
\emph default
effect on scalability.
\end_layout
\begin_layout Enumerate
The chosen
\series bold
implementation
\series default
of the architecture.
Be sure to understand the difference between an
\emph on
architecture
\emph default
and an
\emph on
implementation
\emph default
of that architecture (section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Enumerate
The size and types / properties of various
\series bold
caches
\series default
at various layers.
You need to know the general properties of
\series bold
inclusive
\series default
vs
\series bold
exclusive
\series default
cache architecture.
You absolutely need to know what
\series bold
thrashing
\series default
is, and under which conditions it can occur.
\begin_inset Newline newline
\end_inset
It is advantageous for system architects to know
\begin_inset Foot
status open
\begin_layout Plain Layout
Reading a few Wikipedia articles does not count as
\begin_inset Quotes eld
\end_inset
knowledge
\begin_inset Quotes erd
\end_inset
.
You need to be able to
\emph on
apply
\emph default
your knowledge to enterprise level systems (as opposed to workstation-sized
systems),
\emph on
sustainable
\emph default
and
\emph on
reproducible
\emph default
.
Therefore you need to have
\emph on
actually worked
\emph default
in the matter and gained some extraordinary experiences, on top of deep
understanding of the matter.
\end_layout
\end_inset
pre-loading strategies, as well as replacement strategies.
It is advantageous to know what
\family typewriter
LRU
\family default
or
\family typewriter
MFU
\family default
means, what their induced
\emph on
overhead
\emph default
is, and how they
\emph on
really
\emph default
work on
\emph on
actual
\emph default
data, not just on some artificial lab data.
You also should know what an
\series bold
anomaly
\series default
is, and how it can be produced not only by
\family typewriter
FIFO
\family default
strategies, but also by certain types of ill-designed multi-layer caching.
Beware: there are places where
\family typewriter
FIFO
\family default
-like behaviour is almost impossible to avoid, such as networks.
All of this is outside the scope of this MARS manual.
You should
\emph on
measure
\emph default
, when possible, the
\series bold
overhead
\series default
of cache implementations.
I know of
\emph on
examples
\emph default
where caching is
\emph on
counter-productive
\emph default
.
For example, certain types and implementations of SSD caches are over-hyped.
Removing a certain cache will then
\emph on
improve
\emph default
the situation.
Notice: caches are conceptually based on some type of
\series bold
associative memory
\series default
, which is either very fast but costly when directly implemented in hardware,
or it can suffer from tremendous performance penalties when implemented
inappropriately in software.
\end_layout
\begin_layout Enumerate
\series bold
Hardware dimensioning
\series default
of the implementation: choice of storage hardware, for each storage node.
This includes SSDs vs HDDs, their attachment (e.g.
SAS multiplexing bottlenecks), RAID level, and controller limitations,
etc.
\end_layout
\begin_layout Enumerate
Only for architectures relying on a dedicated realtime storage network:
network
\series bold
throughput
\series default
and network
\series bold
latencies
\series default
, and network
\series bold
bottlenecks
\series default
, including the
\series bold
queueing
\series default
behaviour /
\series bold
congestion control
\series default
/
\series bold
packet loss
\series default
behaviour upon overload.
The latter is often neglected, leading to unexpected behaviour at load
peaks, and/or leading to costly over-engineering (examples see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and theoretical explanation in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Enumerate
\series bold
\emph on
Hidden
\emph default
bottlenecks
\series default
of various types.
A complete enumeration is almost impossible, because there are too many
\begin_inset Quotes eld
\end_inset
opportunities
\begin_inset Quotes erd
\end_inset
.
To reduce the latter, my general advice is to try to build bigger systems
as
\emph on
simple
\emph default
as possible.
This is why you should involve some
\emph on
real
\emph default
experts in storage systems, at least on critical enterprise data.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
\emph on
Any
\emph default
of these factors can be dangerous when not carefully thought about and
treated, depending on your use case.
\end_layout
\begin_layout Subsection
Case Study: Example Scalability Scenario
\begin_inset CommandInset label
LatexCommand label
name "subsec:Example-Scalability-Scenario"
\end_inset
\end_layout
\begin_layout Standard
To get an impression what
\begin_inset Quotes eld
\end_inset
enterprise critical data
\begin_inset Quotes erd
\end_inset
can mean in a concrete example, here are some characteristic numbers from
1&1 Ionos ShaHoLin (Shared Hosting Linux) around spring 2018.
\end_layout
\begin_layout Standard
When the whole system would have to be re-constructed from scratch at a
green field, the following numbers from the current implementation would
be
\emph on
required input parameters
\emph default
for
\emph on
any
\emph default
potential solution architecture, such as
\family typewriter
CentralStorage
\family default
vs
\family typewriter
BigCluster
\family default
vs
\family typewriter
Sharding
\family default
:
\end_layout
\begin_layout Itemize
Webhosting very close to 24/7/365.
\end_layout
\begin_layout Itemize
Overall customer-visible HA target of 99.98%, including WAN outages.
Technically, a much better system-only HA target would be possible, but
there are also some
\emph on
external
\emph default
incident sources like frequent updates of userspace software and a variety
of application software libraries, frequent security updates, etc.
Although managed by ITIL processes, these sources are outside of the scope
of this
\emph on
system architecture
\emph default
guide.
\end_layout
\begin_layout Itemize
About 9 millions of customer home directories.
\end_layout
\begin_layout Itemize
About 10 billions of inodes, with daily incremental backup.
\end_layout
\begin_layout Itemize
More than 4 petabytes of
\emph on
net
\emph default
data (total
\family typewriter
df
\family default
filling level) in spring 2018, with a growth rate of 21% per year.
\end_layout
\begin_layout Itemize
All of this permanently replicated into a second datacenter.
\end_layout
\begin_layout Itemize
In catastrophic failure scenarios,
\emph on
all
\emph default
resources must be switchable within a short time.
\end_layout
\begin_layout Standard
In order to not rule out too many competing solutions via preconditions,
the following is treated as a nice-to-have feature (only for the sake of
the following sandbox game, while in reality the sysadmins would vote for
a
\emph on
hard requirement
\emph default
instead):
\end_layout
\begin_layout Itemize
Ability for butterfly, cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
For simplicity of our architectural sandbox game, we assume that all of
this is in one campus.
In reality, about 30% is residing at another continent.
Introducing this as an additional input parameter would not fundamentally
change the game.
Many other factors, like dependencies from existing infrastructure, are
also neglected.
\end_layout
\begin_layout Subsubsection
Theoretical Solution:
\family typewriter
CentralStorage
\end_layout
\begin_layout Standard
Let us assume somebody would try to operate this on classical
\family typewriter
CentralStorage
\family default
, and let us assume that migration of this amount of data including billions
of inodes would be no technical problem.
What would be the outcome?
\end_layout
\begin_layout Standard
With current technology, finding a single
\family typewriter
CentralStorage
\family default
appliance would be anything but easy.
Dimensioning would be needed for the
\emph on
lifetime
\emph default
of such a solution, which is at least 5 years.
In five years, the data would grow by a factor of about
\begin_inset Formula $1.21^{5}=2.6$
\end_inset
, which is then about
\begin_inset Formula $10.5$
\end_inset
petabytes.
This is only the
\emph on
net
\emph default
capacity; at hardware layer much more is needed for spare space and for
local redundancy.
The single
\family typewriter
CentralStorage
\family default
instance will need to scale up to at least this number, in each datacenter
(under the simplified game assumptions).
\end_layout
\begin_layout Standard
The current number of client LXC containers is about
\begin_inset Formula $2600$
\end_inset
, independently from location.
You will have to support growth in number of them.
For maintenance, these need to be switchable to a different geo-datacenter
at any time (e.g.
risk mitigation of power supply maintenance in a datacenter), at least
at hypervisor granularity.
As explained in sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Location-transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
, handover
\emph on
should be
\emph default
at per-VM granularity, otherwise you would cause a regression in operability.
The number of bare metal servers running the total workload can vary with
hardware architecture / hardware lifecycle, and with growth, such as already
demonstrated during the course of internal
\begin_inset Quotes eld
\end_inset
Efficiency projects
\begin_inset Quotes erd
\end_inset
.
You will need to dimension a dedicated storage network for all of this,
such that the NOF as explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
will not grow too high.
\end_layout
\begin_layout Standard
If you find a solution which can do this with current
\family typewriter
CentralStorage
\family default
technology for the next 5 years, then you will have to ensure that restore
from backup
\begin_inset Foot
status open
\begin_layout Plain Layout
Local snapshots, whether LVM or via some COW filesystem, do not count as
backups (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Replication-vs-Backup"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
You need a
\emph on
logical
\emph default
copy, not a
\emph on
physical
\emph default
one, in case your production filesystem instance gets fatally damaged,
such that
\family typewriter
fsck
\family default
won't help anymore.
\end_layout
\end_inset
can be done in less than 1 day in case of a fatal disaster, see also treatment
of
\family typewriter
CentralStorage
\family default
reliability in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Reliability-Differences-CentralStorage"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Notice that the current self-built backup solution for a total of 15 billions
of inodes is based on a sharding model; converting this to some more or
less centralized solution would turn out as another challenge.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Attention! Buying 10 or 50 or 100 CentralStorage instances does not count
as a
\family typewriter
CentralStorage
\family default
architecture.
By definition, suchalike would be
\family typewriter
RemoteSharding
\family default
instead.
Notice that the current 1&1 solution is already a mixture of
\family typewriter
LocalSharding
\family default
and
\family typewriter
RemoteSharding
\family default
, so you would win
\emph on
nothing
\emph default
at architectural level.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In case you actually would want to build a RemoteSharding model on top
of commercial storage, you need to consider
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Technology"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
In your business case, you would need to justify the price difference between
the current component-based hardware solution (horizontally extensible
by
\emph on
scale-out
\emph default
) and
\family typewriter
CentralStorage
\family default
/
\family typewriter
RemoteSharding
\family default
, which is about a factor of 10 per terabyte according to the table in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Technology"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Even if you manage to find a vendor who is willing to subsidize to a factor
of only 3, this is not all you need.
You need to add the cost for the dedicated storage network.
On top of this, you need to account for the
\emph on
migration cost
\emph default
after the lifetime of 5 years has passed, where the full data set needs
to be migrated to a successor storage system.
\end_layout
\begin_layout Standard
Notice that classical argumentations with
\series bold
\emph on
manpower
\series default
\emph default
will not work.
The current operating team is about 10 persons, with no dedicated storage
admin.
This relatively small team is not only operating a total of more than 6,000
shared boxes in all datacenters, but also some tenthousands of managed
dedicated servers, running essentially the same software stack, with practicall
y fully automated mass deployment.
Most of their tasks are related to central software installation, which
is then automatically distributed, and to operation / monitoring / troubleshoot
ing of masses of client servers.
Storage administration tasks in isolation are costing only a
\emph on
fraction
\emph default
of this.
Typical claims that
\family typewriter
CentralStorage
\family default
would require less manpower will not work here.
Almost everything which is needed for
\emph on
mass automation
\emph default
is already automated.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Neglecting the tenthousands of managed dedicated servers would be a catastrophi
c ill-design.
Their hardware is already given, by existing customer contracts, some of
them decades old.
Although it may be possible to modify
\emph on
some
\emph default
of these contracts, you simply cannot fundamentally change
\emph on
all
\emph default
the hardware of these customers including their
\emph on
dedicated
\emph default
local disks, which was / is their
\emph on
main selling point
\emph default
.
You cannot simply convert them to a shared
\family typewriter
CentralStorage
\family default
, even if it would be technically possible, and if it would deliver similar
IOPS rates as tenthousands of local spindles (and if you could reach
the bundled performance of local SSDs from newer contracts), and even if
you would introduce some interesting
\series bold
storage classes
\series default
for all of this.
A dedicated server on top of a shared storage is no longer a dedicated
one.
You would have to migrate these customers to another product, with all
of its consequences.
Alone for these machines,
\emph on
most
\begin_inset Foot
status open
\begin_layout Plain Layout
Only a few out of >1000 self-built or customized Debian packages are dealing
with MARS and/or with the clustermanager
\family typewriter
cm3
\family default
.
\end_layout
\end_inset
\emph default
of the current automation of
\family typewriter
LocalStorage
\family default
is needed
\emph on
anyway
\emph default
, although they are not geo-redundant at current stage.
\end_layout
\begin_layout Standard
Conclusion:
\family typewriter
CentralStorage
\family default
is simply
\emph on
unrealistic
\emph default
.
\end_layout
\begin_layout Subsubsection
Theoretical Solution:
\family typewriter
BigCluster
\end_layout
\begin_layout Standard
The main problem of
\family typewriter
BigCluster
\family default
is
\series bold
reliability
\series default
, as explained intuitively in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and graphically in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sub:Detailed-explanation"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and mathematically in appendix
\begin_inset CommandInset ref
LatexCommand vref
reference "chap:Mathematical-Model-of"
\end_inset
, and as observed in several installations not working as expected.
It would be a bad idea to ignore the explanations from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
Let us assume that all of these massive technical problems were solved,
somehow.
Then the business case would have to deal with the following:
\end_layout
\begin_layout Standard
The total number of servers would need to be roughly
\emph on
doubled
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
One of the problems of the current Ceph
\emph on
implementation
\emph default
is its massive consumption of CPU power and RAM.
Even if this would be improved in future, the
\emph on
architectural
\emph default
drawbacks will remain.
\end_layout
\end_inset
.
Not only their CAPEX, but also the corresponding OPEX (electrical power,
rackspace, manpower) would increase.
Alone their current electrical power cost, including cooling, is more than
the current sysadmin manpower cost.
Datacenter operations would also increase.
On top, a properly dimensioned dedicated storage network and its administration
cost would also be needed.
\end_layout
\begin_layout Standard
With respect to the tenthousands of managed dedicated servers and their
customer contracts, a similar argument as above holds.
You simply cannot convert them to
\family typewriter
BigCluster
\family default
.
\end_layout
\begin_layout Standard
Conclusion:
\family typewriter
BigCluster
\family default
is also
\emph on
unrealistic
\emph default
.
There is nothing to win, but a lot to lose.
\end_layout
\begin_layout Subsubsection
Current Solution:
\family typewriter
LocalSharding
\family default
, sometimes
\family typewriter
RemoteSharding
\end_layout
\begin_layout Standard
Short story: the architecture as well as its current implementation has worked for
decades, and is both cheap and robust since geo-redundancy had been added
around 2010.
\end_layout
\begin_layout Standard
With the advent of Football (see
\family typewriter
football-user-manual.pdf
\family default
), the
\family typewriter
LocalSharding
\family default
architecture is rising to be on par with the most important management abilities
of
\family typewriter
CentralStorage
\family default
and
\family typewriter
BigCluster
\family default
/ Software Defined Storage.
\end_layout
\begin_layout Standard
Pre-configured
\family typewriter
RemoteSharding
\family default
on top of dedicated Linux-based storage boxes is currently being reduced
in favour of the cheaper and more reliable
\family typewriter
LocalSharding
\family default
combined with Football.
The dedicated storage boxes are almost EOL due to their age, and should
vanish some day.
\end_layout
\begin_layout Standard
There is another story about tenthousands of managed dedicated servers:
without the traditional ShaHoLin sharding architecture and all of its automatio
n, including the newest addition called Football, the product
\begin_inset Quotes eld
\end_inset
managed dedicated servers
\begin_inset Quotes erd
\end_inset
would not be possible in this scale.
By definition, the dedicated server product
\emph on
is
\emph default
a sharding implementation.
Thanks to Football, further business opportunities like migration onto
virtualized shared hardware (with optional
\series bold
resource partitioning
\series default
) are possible.
\end_layout
\begin_layout Standard
Summary: the sharded
\begin_inset Quotes eld
\end_inset
shared
\begin_inset Quotes erd
\end_inset
product enables another
\begin_inset Quotes eld
\end_inset
dedicated
\begin_inset Quotes erd
\end_inset
product, which is sharded by definition, and it actually is known to scale
up by at least another order of magnitude (in terms of number of servers).
\end_layout
\begin_layout Subsection
Scalability of Filesystem Layer vs Block Layer
\begin_inset CommandInset label
LatexCommand label
name "subsec:Filesystem-Layer-vs"
\end_inset
\end_layout
\begin_layout Standard
The following factors are known to be responsible for better architectural (cf
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
) scalability of the block layer vs the filesystem layer, with a few exceptions
(list may be incomplete):
\end_layout
\begin_layout Enumerate
\series bold
Granularity
\series default
of access:
\series bold
metadata
\series default
is often smaller than the content data it refers to, but access to data
is typically not possible without accessing corresponding metadata
\emph on
first
\emph default
.
When
\emph on
masses
\emph default
of metadata are present (e.g.
some billions of inodes like in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Scalability-Scenario"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and/or when metadata is accessed
\series bold
more frequently
\series default
than the corresponding data (e.g.
in stateless designs like Apache), it is likely to become the bottleneck.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Neglecting metadata and their access patterns is a major source of ill-designs.
I know of projects which have failed (in their original setup) because
of this.
Repair may involve some non-trivial architectural changes.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
By default, the block layer itself has almost
\begin_inset Foot
status open
\begin_layout Plain Layout
There may be tiny metadata, such as describing the size of the whole block
device.
\end_layout
\end_inset
no metadata at all.
Therefore it has an
\emph on
inherent advantage
\emph default
over the filesystem layer in such use cases.
\end_layout
\begin_layout Enumerate
\series bold
Caching
\series default
: shared memory caches in kernelspace (e.g.
Linux page cache + dentry cache) vs distributed caches over loose coupling
via networks.
See the picture in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
: caches are one of the
\emph on
most important
\emph default
performance boosters, which are more or less
\emph on
required
\emph default
for today's performance expectations / requirements (at least for non-trivial
load patterns).
While local caches are nowadays typically scaling reasonably well to more
than 100 CPUs on modern SMP / NUMA systems, distributed caches have a long-stan
ding academic history of expectation failures, or even project failures.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
There exist
\emph on
examples
\emph default
where shared distributed caches did not work at all.
Frequently, this has to do with strict consistency requirements, and with
runtime access patterns.
I know of
\emph on
several
\emph default
projects which have failed.
A project other than the one mentioned in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
has failed because of violations of POSIX filesystem semantics.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Conversely, the absence of POSIX-like requirements for filesystems and/or
relaxed consistency like
\begin_inset Quotes eld
\end_inset
eventually consistent
\begin_inset Quotes erd
\end_inset
does
\emph on
not prove
\emph default
that distributed caches will work better than local ones.
Please recall the picture from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
and compare with the
\series bold
thrashing problems
\series default
described in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Thrashing will typically
\emph on
not
\emph default
be improved when running over loosely coupled RAM-to-RAM-to-Disk transfers,
in place of traditional local caching.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In addition, multiple
\series bold
nested caches
\series default
over several instances and their layers are typically not only in conflict
with Dijkstra's rules.
They can easily increase the
\series bold
total complexity
\series default
of Distributed Systems and their
\series bold
implementation / testing overhead
\series default
.
Loosely coupled systems are typically prone to
\series bold
more complex failure scenarios
\series default
(cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Short summary: distributed
\emph on
exclusive
\emph default
caches are typically worsening the total reliability, while distributed
\emph on
inclusive
\emph default
caches are often complexifying the recovery of node failures.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Attention: do not believe that node failures cannot create more or less
\begin_inset Quotes eld
\end_inset
artificial
\begin_inset Quotes erd
\end_inset
data inconsistencies.
The famous
\series bold
CAP theorem
\series default
is also valid for distributed caches!
\end_layout
\begin_layout Enumerate
Only in non-trivial distributed systems: the
\series bold
cache coherence problem
\series default
, both on metadata and on data, and/or on
\emph on
interactions
\emph default
between them.
Depending on load patterns, this can lead to tremendous performance degradation
, see example in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
To repeat once again: loosely coupled systems and their caches are typically
prone to more complex failure scenarios (cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
), making
\series bold
failure recovery
\series default
more complex.
\end_layout
\begin_layout Enumerate
Dimensioning of the
\series bold
network
\series default
: throughput, latencies, queueing behaviour.
See NOF in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
There exist a few known exceptions (list may be incomplete, please report
further examples if you know some):
\end_layout
\begin_layout Itemize
Databases: these are typically operating on specific container formats,
where no frequent
\emph on
external
\emph default
metadata access is necessary, and where no sharing of the
\emph on
container as such
\emph default
is necessary.
Typically, there is no big performance difference between storing them
in block devices vs local filesystems (although it could be viewed as a
minor Dijkstra regression).
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Exception from the exception: MyISAM is an old design from the 1980s, originall
y based on DBASE data structures under MSDOS.
Don't try to access them over NFS or similar.
Or, better, try to avoid them altogether if possible.
\end_layout
\begin_layout Itemize
VM images: these are logical BLOBS, so there is typically no big difference
whether they are in an intermediate
\emph on
true
\emph default
filesystem layer, or not.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Filesystems on top of object stores (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Granularity-at-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
) are no true intermediate filesystems.
They are typically violating Dijkstra's important layering rules (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
) at
\emph on
several
\emph default
places.
A similar argument holds for block devices on top of object stores.
Another layering violation may result from VM container formats like
\family typewriter
*.vmdk
\family default
or
\family typewriter
*.qcow2
\family default
, which cannot always be avoided.
Be warned that such container formats
\emph on
themselves
\emph default
can act as game changers with respect to performance, parallelism degree,
reliability, etc.
This does not mean that you have to avoid them generally.
Layering violations just create an additional
\emph on
risk
\emph default
, which need not always materialize, and need not always be fatal.
However, be sure to
\series bold
check their influence
\series default
, and don't forget to measure their
\emph on
workingset
\emph default
and their
\emph on
caching behaviour
\emph default
(which can go both into positive and into negative direction), in order
to really
\emph on
know what you are doing.
\end_layout
\begin_layout Standard
There exist a few cases where a distributed filesystem, sometimes even actually
with
\begin_inset Formula $O(n^{2})$
\end_inset
behaviour according to section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Error-Propagation-to"
plural "false"
caps "false"
noprefix "false"
\end_inset
,
\emph on
must
\emph default
be used, because there exists a
\emph on
hard requirement
\emph default
for it.
Some examples (list is certainly incomplete):
\end_layout
\begin_layout Itemize
HPC =
\series bold
High Performance Computing
\series default
on modern supercomputers, consisting of a high number of
\begin_inset Formula $n$
\end_inset
compute nodes, often requires access to a shared persistent data pool,
where each of the
\begin_inset Formula $n$
\end_inset
nodes must sometimes be able to access the same persistent data, sometimes
both for reading and writing.
Therefore, several supercomputers are using cluster filesystems like Lustre.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Care must be taken that high-frequency / fine granularity communication
over the distributed filesystem and its dedicated storage network does
not take place, but instead occurs over the ordinary low-latency communication
fabrics each modern supercomputer is relying on.
True
\begin_inset Formula $O(n^{2})$
\end_inset
storage access behaviour should be avoided as far as possible (given by
the problem to be solved).
When absolutely necessary, location transparency (as possible with cluster
filesystems like Lustre) as well as its DSM = Distributed Shared Memory
model must be given up, and an
\series bold
explicit communication model
\series default
must be used instead, which allows explicit control over replicas and their
communication paths (e.g.
propagation in a binary tree fashion), although it results in much more
work for the programmers.
Only low frequency / coarse granularity transfers of
\emph on
bulk data
\emph default
with
\emph on
high locality
\emph default
should run over distributed filesystems, preferably in
\emph on
streaming
\emph default
mode (c.f.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
The total frequency of metadata access should be low, because metadata
consistency may form a bottleneck when updated too frequently.
The programmers of the distributed application software need to take care
of this.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice that certain supercomputer workloads may be crying for a RemoteSharding
or FlexibleSharding storage architecture in place of a BigCluster architecture.
However, this is very application specific.
\end_layout
\begin_layout Itemize
Student pools at universities, or location-independent workplaces at companies.
This is exactly the usecase which NFS was originally constructed for.
Typically,
\series bold
workstation workloads
\series default
are neither performance critical, nor prone to actual
\begin_inset Formula $O(n^{2})$
\end_inset
behaviour (although the network infrastructure would
\emph on
allow
\emph default
for it), because each user has her own home directory which is typically
\emph on
not shared
\emph default
with others, and she cannot split herself and sit in front of multiple
workstations at the same time.
Thus the
\emph on
local per-workstation
\emph default
NFS caching strategies have a good chance to hide much of the network latencies
, and thus the actual total network workload is typically only
\begin_inset Formula $O(n).$
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
This can lead to a dangerous misinterpretation: because it apparently works
even for a few thousand workstations, people conclude
\emph on
wrongly
\emph default
that the network filesystem
\begin_inset Quotes eld
\end_inset
must be scalable
\begin_inset Quotes erd
\end_inset
.
Some people are then applying their experience to completely different
usecases, where much higher metadata traffic by several orders of magnitude
is occurring (such as in webhosting), or even where true
\begin_inset Formula $O(n^{2})$
\end_inset
runtime behaviour is occurring (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
In general: when something works for usecase A, this
\series bold
does
\emph on
not
\emph default
prove
\series default
that it will also work for another usecase B.
See explanations from start of section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Scalability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Section
Point-in-time Replication via ZFS Snapshots
\begin_inset CommandInset label
LatexCommand label
name "subsec:Example:-ZFS-Replication"
\end_inset
\end_layout
\begin_layout Standard
Some ZFS advocates believe that ZFS snapshots, which were originally designed
for backup-like use cases, are also appropriate solutions for achieving
geo-redundancy (cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
The basic idea is to run incremental ZFS snapshots in an endless loop,
e.g.
via some simple scripts, and to ship them to another host where the snapshots
are then applied to another ZFS instance.
When there is little data to be shipped, loop cycle times should go down
to a few seconds.
When much data is written at the primary site, loop cycle times will rise.
According to some advocates, this should be no problem.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Important: ZFS is
\series bold
not
\series default
an entirely free OpenSource component.
According to
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/ZFS
\end_layout
\end_inset
it is a
\emph on
mixture
\emph default
of OpenSource with
\series bold
proprietary
\series default
sub-components.
Oracle is its current project owner, and is
\emph on
known
\emph default
in the OpenSource scene for first
\emph on
marketing
\emph default
something as
\begin_inset Quotes eld
\end_inset
free
\begin_inset Quotes erd
\end_inset
, but some years later
\emph on
may
\emph default
suddenly decide some
\series bold
fees
\series default
for some
\emph on
sub
\emph default
-functionality, forcing you to pay if this strategy was
\emph on
successful
\emph default
in creating some sort of
\series bold
Vendor Lock-In
\series default
to some of the
\emph on
sub
\emph default
-components over the years.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Unfortunately, the mentioned English Wikipedia article does not clearly
specify this.
When possible, read the corresponding German article in
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://de.wikipedia.org/wiki/ZFS_(Dateisystem)
\end_layout
\end_inset
.
In 2021, there is a footnote text
\begin_inset Quotes eld
\end_inset
Fabian A.
Scherschel: Linus Torvalds erteilt ZFS im Linux-Kernel erneute Absage.
In: Heise online.
10.
Januar 2020.
Abgerufen am 22.
Mai 2020.
\begin_inset Quotes erd
\end_inset
pointing at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://heise.de/-4633302
\end_layout
\end_inset
which tells you that Linus Torvalds has
\series bold
\emph on
refreshed
\series default
\emph default
in 2020 his previous
\series bold
\emph on
decision
\series default
\emph default
that the out-of-tree ZFS Linux kernel module will
\series bold
not
\series default
be included into the
\series bold
upstream
\series default
Linux kernel.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
\emph on
Long-Term
\emph default
ZFS Strategy
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
When possible, and for
\emph on
new
\emph default
projects:
\series bold
do
\emph on
not
\emph default
rely on
\series default
the external
\series bold
ZFS
\series default
non-upstream
\series bold
Linux kernel module
\series default
for
\series bold
enterprise-critical
\series default
use cases.
History has shown that such non-upstream projects
\emph on
may
\emph default
at some point slip into some non-maintained state.
For a manager, this would more or less lead to some EOL = End Of Life state,
or increase your own maintenance effort.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The following table tries to explain why geo-redundancy is not as simple
to achieve under Linux as some people seem to believe, at least without
addition of
\emph on
highly sophisticated
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice: so-called
\begin_inset Quotes eld
\end_inset
Orchestration Layers
\begin_inset Quotes erd
\end_inset
\series bold
cannot
\series default
achieve the same level of geo-redundancy as DRBD and MARS can do.
Even if so-called Orchestrations were built geo-redundant in themselves
in some way, they would form some kind of SPOF = Single Point Of Failure.
Notice that they would need
\emph on
their own
\emph default
geo-redundancy, otherwise they would violate Dijkstra's layering rules
(see
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
)
\end_layout
\end_inset
\emph default
additional means
\begin_inset Foot
status open
\begin_layout Plain Layout
ZFS advocates often argue with many features which aren't present in other
filesystem types.
The above table shows some dimensions not dealing with properties of local
filesystems, but with
\emph on
problems / tasks
\emph default
arising in long-distance distributed systems involving masses of enterprise-cri
tical storage.
\end_layout
\end_inset
.
The table compares the built-in functionality at component level.
While DRBD and MARS are rated as they are supported by their creators,
ZFS gets some (more or less
\begin_inset Quotes eld
\end_inset
unfair
\begin_inset Quotes erd
\end_inset
)
\emph on
advantage
\emph default
by adding some local sysadmin-alike scripts which are then
\series bold
responsible
\series default
for geo-redundancy, together with the external ZFS Linux kernel module.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
From a management viewpoint, ZFS-based replication may easily lead to dependencies
on the necessary cooperation of the following responsible parties:
\end_layout
\begin_layout Enumerate
Linux kernel upstream.
\end_layout
\begin_layout Enumerate
External ZFS kernel module.
\end_layout
\begin_layout Enumerate
(Local) sysadmins and/or developers who are responsible for the geo-redundancy
functionality (both development + operations), which is
\series bold
not
\series default
provided by the previous participants.
\end_layout
\begin_layout Plain Layout
In contrast, here is the future
\emph on
envisioned
\emph default
responsibility for MARS geo-redundancy:
\end_layout
\begin_layout Enumerate
Linux kernel upstream, where Linus Torvalds is the boss and the MARS developers
are members of his community, producing and maintaining
\series bold
generic
\series default
sub-components usable
\emph on
everywhere
\emph default
in the world.
\end_layout
\begin_layout Enumerate
Local sysadmins, responsible for
\series bold
operations
\series default
of specific Linux-based
\series bold
instances
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="17" columns="4">
<features tabularvalignment="middle">
<column alignment="left" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
(non-)OpenSource Component
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
DRBD
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
MARS
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
ZFS (+scripts)
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Synchronicity (on average)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
delay
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
delay * 1.5
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Generic solution
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
FS-specific
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Granularity
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
LVs
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
LVs
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
subvolumes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Built-in snapshots
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Long distances
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Replication parallelism (per gran.)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $1$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\geq2$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $1$
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Built-in primary/secondary roles
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no (+hard)
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Built-in handover (planned)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
mostly
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no (+hard)
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Built-in failover (unplanned)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no (+hard)
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Built-in delta-overflow handling
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
unnecessary
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no, missing
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Unnoticed data loss due to delta overflow
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
possible
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\emph on
Higher
\emph default
space for
\emph on
long-lasting
\emph default
fullsync
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes,
\begin_inset Formula $\lessapprox*2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Built-in split-brain awareness
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no (+hard)
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Execute split-brain resolution
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no (+costly)
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
S-B resolution transfer granularity
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
sector
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
sector
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
unknown
\begin_inset Foot
status collapsed
\begin_layout Plain Layout
In worst case, a
\emph on
full
\emph default
snapshot may be needed for a complete ZFS
\emph on
full
\emph default
sync.
In worst case, this might roughly double the total required storage space,
which may be needed
\emph on
temporarily
\emph default
during a long-lasting
\emph on
full
\emph default
sync.
In contrast, DRBD and MARS can
\series bold
incrementally
\series default
run a (fast) fullsync in parallel to running IO, without need for temporary
snapshot space.
\end_layout
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Protect against illegal data modification
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
no
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Quotes eld
\end_inset
Illegal data modification
\begin_inset Quotes erd
\end_inset
means that ZFS by itself does not protect against applications running amok
and/or tools modifying the secondary (backup) side in parallel to the
replication process (at least not by default).
Workarounds might be possible, but are not easy to create and to test for
enterprise-critical applications.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In simple words: the
\series bold
ability for butterfly
\series default
is non-trivial to achieve.
It can easily turn into a nightmare, if you would try to establish it on
top of larger
\family typewriter
zfs
\family default
installations.
Although termed
\begin_inset Quotes eld
\end_inset
replication
\begin_inset Quotes erd
\end_inset
, it is more similar to
\begin_inset Quotes eld
\end_inset
backup
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Some contemporary
\family typewriter
zfs
\family default
replication setups at sister companies of 1&1 Ionos are lacking the butterfly ability,
likely due to these difficulties.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Why are ZFS-based roles / handover / failover / butterfly / split-brain
awareness + resolution operations
\emph on
harder
\emph default
than you might expect?
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Look at line
\begin_inset Quotes eld
\end_inset
Granularity
\begin_inset Quotes erd
\end_inset
: when
\emph on
multiple
\emph default
subvolumes are hosted by the
\emph on
same
\emph default
zpool instance, but are
\emph on
required
\emph default
to do some DRBD-alike or MARS-alike operations
\emph on
independently from each other
\emph default
, and
\series bold
in parallel to running / unfinished replication
\series default
tasks, this may easily become a
\series bold
challenge
\series default
.
Hopefully the subvolumes are
\series bold
not nested
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A few workarounds may be possible by a general 1:1 correspondence between
zpools and (sub)volumes.
However, this could increase the sysadmin workload.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Even more
\series bold
hairy
\series default
: when there exist multiple zpools at one side, and/or different zpools
at different geo-redundant sides, and/or different assignments of subvolumes
to zpools, then you might need a prayer, in particular when the
\series bold
CAP theorem
\series default
comes also into play and/or when the other side is
\series bold
not reachable during a geo-incident
\series default
, and/or when
\series bold
multiple impacts
\series default
are occurring in parallel at the same time (so-called
\series bold
rolling disasters
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
MARS is regularly tested for many cascading impacts, to react
\emph on
as best as possible
\emph default
(best-effort principle).
\end_layout
\end_inset
).
Possibly, all of this can be resolved, but don't under-estimate the
\series bold
total implementation and test effort
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice that
\family typewriter
zfs
\family default
\series bold
snapshots
\series default
(without adding fs-layer replication on top of it) can be
\series bold
easily combined
\series default
with block-layer DRBD or MARS replication.
Reason:
\family typewriter
zfs
\family default
snapshots are
\emph on
necessarily
\emph default
residing at
\emph on
filesystem
\emph default
layer, while DRBD / MARS replicas are located at the lower
\emph on
block
\emph default
layer (see the picture in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
Due to the original Unix architecture,
\series bold
cartesian products of layers
\series default
are possible in many cases.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Unfortunately, some ZFS advocates have been told that
\emph on
layer merging
\emph default
between block layer and FS layer
\series bold
would
\series default
be an
\begin_inset Quotes eld
\end_inset
advantage
\begin_inset Quotes erd
\end_inset
.
However, this contradicts
\series bold
Parnas' modularization rules
\series default
when combined with Dijkstra's layering rules.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Combination of ZFS with DRBD or MARS
\end_layout
\end_inset
Idea: create your zpools
\emph on
on top
\emph default
of DRBD or MARS resources = virtual devices, and use
\family typewriter
zpool import
\family default
/
\family typewriter
export
\family default
\emph on
individually
\emph default
at handover / failover of each LV instance.
A relatively easy way for implementation is the
\family typewriter
systemd
\family default
interface of MARS (see the according section in
\family typewriter
mars-user-manual.pdf
\family default
).
You just need to write
\emph on
once
\emph default
a small unit template file, containing a few
\family typewriter
zpool
\family default
commands.
This small template will then be automatically instantiated for each resource
by the
\family typewriter
marsadm
\family default
macro processor, as often as needed.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
As a side effect of
\family typewriter
zpool import
\family default
and its sisters, a whole
\emph on
bunch
\emph default
of subvolumes can be activated with 1 shot.
This means: your handover / failover
\series bold
granularity
\series default
may be configured
\series bold
more coarse
\series default
than your more fine-grained hierarchy of ZFS snapshots.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Another side effect: butterfly and other geo-redundancy operations are
becoming easy, just by a 1:1 correspondence between DRBD / MARS resources
and zpools.
Then your ZFS snapshots are
\series bold
orthogonal
\series default
to the geo-redundancy.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
There is a
\series bold
\emph on
fundamental
\emph default
architectural difference
\series default
between zpools and classical RAID / LVM stacked architectures.
Some zfs advocates are propagating zpools as a replacement for both RAID
and LVM.
However, there is a massive difference in architecture, as illustrated
in the following example (10 logical resources over 48 physical spindles),
achieving practically the
\series bold
\emph on
same
\series default
zfs snapshot functionality
\emph default
from a user's perspective, but in a different way:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename images/raid-lvm-architecture.fig
height 6cm
\end_inset
\begin_inset Graphics
filename images/zpool-architecture.fig
height 6cm
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
When RAID functionality is executed by zfs, it will be located at the
\emph on
top
\emph default
of the hierarchy.
On one hand, this easily allows for different RAID levels for each of the
10 different logical resources.
On the other hand, this
\emph on
exposes
\emph default
the
\series bold
physical spindle configuration
\series default
to the topmost filesystem layer (48 spindles in this example).
There is no easy way for replication of these
\emph on
physical properties
\emph default
in a larger / heterogeneous distributed system, e.g.
when some hardware components are replaced over a longer period of time
(hardware lifecycle, or LV Football as explained in
\family typewriter
football-user-guide.pdf
\family default
).
Essentially, only replication of
\emph on
logical
\emph default
structures like snapshots remains as the only reasonable option, with its
drawbacks as explained above.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
There is another argument: ZFS tries to
\emph on
hide
\emph default
its internal structures and interfaces from the sysadmins, forming a more
or less
\series bold
monolithic
\begin_inset Foot
status open
\begin_layout Plain Layout
Some sysadmins acting as
\family typewriter
zfs
\family default
advocates are claiming this as an advantage.
Apparently, they need to learn and understand only a
\emph on
single
\emph default
tool for managing
\begin_inset Quotes eld
\end_inset
everything
\begin_inset Quotes erd
\end_inset
.
However, this may turn into a short-sighted argument when it comes to
\emph on
true
\emph default
flexibility as offered by a component-based system, where multiple types
of hardware / software RAID, multiple types of LVM functionality, and much
more can be almost orthogonally combined in a very flexible way.
\end_layout
\end_inset
architecture
\series default
as seen from outside.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
This violates the classical
\emph on
layering rules
\emph default
from Dijkstra (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
In contrast, classical LVM-based configurations (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Positive-Example:-ShaHoLin"
plural "false"
caps "false"
noprefix "false"
\end_inset
or the example setup in
\family typewriter
mars-user-manual.pdf
\family default
) are
\series bold
component oriented
\series default
, according to the
\series bold
Unix Philosophy
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Section
Local vs Centralized Storage
\begin_inset CommandInset label
LatexCommand label
name "sec:Local-vs-Centralized"
\end_inset
\end_layout
\begin_layout Standard
There is some historical belief that only centralized storage systems, as
typically sold by commercial storage vendors, could achieve a high degree
of reliability, while local storage would be inferior by far.
In the following, we will see that this is only true for an
\series bold
\emph on
unfair
\series default
\emph default
comparison involving different classes of storage systems.
\end_layout
\begin_layout Subsection
Internal Redundancy Degree
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
Centralized commercial storage systems are typically built up from highly
redundant
\emph on
internal
\emph default
components:
\end_layout
\begin_layout Enumerate
Redundant power supplies with UPS.
\end_layout
\begin_layout Enumerate
Redundancy at the storage HDDs / SSDs.
\end_layout
\begin_layout Enumerate
Redundancy at internal transport busses.
\end_layout
\begin_layout Enumerate
Redundant RAM / SSD caches.
\end_layout
\begin_layout Enumerate
Redundant network interfaces.
\end_layout
\begin_layout Enumerate
Redundant compute heads.
\end_layout
\begin_layout Enumerate
Redundancy at control heads / management interfaces.
\end_layout
\begin_layout Plain Layout
What about local hardware RAID controllers? Some people think that these
relatively cheap units were massively inferior at practically each of these
points.
Please take a
\emph on
really deep
\emph default
look at what classical RAID chip manufacturers like LSI / Avago / Broadcom
and their competitors are offering as configuration variants of their top
notch models.
The following enumeration is in the same order as above (item by item):
\end_layout
\begin_layout Enumerate
Redundant hardware RAID cards with BBU caches, each with local goldcaps
surviving power outages, their BBU caches cross-coupled via high-speed
interconnects.
\end_layout
\begin_layout Enumerate
HDD / SSD redundancy: almost any RAID level you can think of.
\end_layout
\begin_layout Enumerate
Redundant SAS cross-cabling: any head can access any device.
\end_layout
\begin_layout Enumerate
BBU caches are redundant and cross-coupled, similarly to RDMA.
When SSD caches are added to both cards, you also get redundancy there.
\end_layout
\begin_layout Enumerate
When using cross-coupled redundant cards, you automatically get redundant
host bus interfaces (HBAs).
\end_layout
\begin_layout Enumerate
The same story: you also get two independent RAID controller instances which
can do RAID computations independently from each other.
Some implementations do this even in hardware (ASICs).
\end_layout
\begin_layout Enumerate
Ditto: both cards may be plugged into two different servers, thereby creating
redundancy at control level.
As a side effect, you may also get functionality similar to DRBD.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Redundancy degree of RAID vs commercial appliances
\end_layout
\end_inset
When dimensioned appropriately, real architectural and functional differences
at block layer are smaller than certain people are claiming.
For many block layer use cases, redundancy is
\series bold
roughly comparable
\series default
.
\end_layout
\begin_layout Plain Layout
If you compare typical prices for both competing systems, you will notice
a
\emph on
huge
\emph default
difference in favour of RAID.
See also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Cost-Arguments-from-Technology"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Capacity Differences
\end_layout
\begin_layout Standard
There is another hard-to-die myth: commercial storage would provide higher
capacity.
Please read the data sheets.
It is
\emph on
possible
\emph default
(but not generally recommended) to put several hundreds of spindles into
several external HDD enclosures, and then connect them to a redundant cross-cou
pled pair of RAID controllers via several types of SAS busses.
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Maximum possible RAID capacity
\end_layout
\end_inset
By filling a rack this way, RAID can easily reach similar, if not higher
capacities than commercial storage boxes, for a
\emph on
fraction
\emph default
of the price.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
However, this is not the recommended way for
\emph on
general
\emph default
use cases, but could be an option for low demands like archiving.
The big advantage of RAID-based local storage is
\series bold
massive scale-out by sharding,
\series default
as explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Distributed-vs-Local:"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Subsection
Caching Differences
\end_layout
\begin_layout Standard
A frequent argument is that centralized storage systems had bigger caches
than local RAID systems.
While this argument is often true, it neglects an important point:
\end_layout
\begin_layout Standard
Local RAID systems often
\emph on
don't need
\emph default
bigger caches, because they are typically located at the
\emph on
bottom
\emph default
of a cache hierarchy, playing only a
\emph on
particular
\emph default
role in that hierarchy.
There exist
\emph on
further
\emph default
caches which are
\series bold
erroneously not considered
\series default
by such an argument!
\end_layout
\begin_layout Standard
Example, see also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
for more details: At 1&1 Shared Hosting Linux (ShaHoLin), a typical LXC
container containing several thousands to tens of thousands of customer home
directories, creates a long-term
\emph on
average(!)
\emph default
IOPS load at block layer of about 70 IOPS.
No, this isn't a typo.
It is not 70,000 IOPS.
It is only 70 IOPS.
\end_layout
\begin_layout Standard
Reason: the standard Linux kernel has two main caches, the Page Cache for
file content, and the Dentry Cache (plus Inode slave cache) for metadata.
Both caches are residing in
\series bold
RAM
\series default
, which is the
\emph on
fastest
\emph default
type of cache you can get.
Some more details are in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
Nowadays, typical servers have several hundreds of gigabytes of RAM, sometimes
even up to terabytes, resulting in an incredible caching behaviour which
can be measured
\begin_inset Foot
status open
\begin_layout Plain Layout
Caution: this requires
\emph on
extremely solid
\emph default
expert knowledge and experience.
It can be easily done wrongly.
When managers are believing
\series bold
fake results
\series default
, whether produced by accident from people stuck to
\series bold
second-order ignorance
\series default
, or whether produced for some
\series bold
political reasons
\series default
: This can be
\series bold
dangerous for companies
\series default
.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
Many people appear to neglect these caches, sometimes not knowing of their
existence, and erroneously assuming that 1 application
\family typewriter
read()
\family default
or
\family typewriter
write()
\family default
operation will also lead to 1 IOPS at block layer.
As a consequence, they are demanding 50,000 IOPS or 100,000 or even 1,000,000
IOPS.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
IOPS over-engineering
\end_layout
\end_inset
IOPS over-engineering by some orders of magnitudes can cause
\emph on
considerable
\emph default
unnecessary expenses.
Be sure to carefully
\series bold
check real demands
\series default
!
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
Some (but not all) commercial storage systems can deliver similar IOPS rates,
because they have
\series bold
internal RAM caches
\series default
in the same order of magnitude.
Notice that persistent RAM is the
\series bold
most expensive
\series default
type of scalable storage you can buy.
\end_layout
\begin_layout Plain Layout
People who are demanding such systems are typically falling into some of
the following classes (list is probably incomplete):
\end_layout
\begin_layout Itemize
some people know this, but price does not matter - the more caches, the
better.
Wasted money for doubled caches does not count for them, or is even viewed
as an advantage to them (personally).
Original citation of an anonymous person:
\begin_inset Quotes eld
\end_inset
only the best and the most expensive storage is good enough for us
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Itemize
using NFS, which has extremely poor filesystem caching behaviour because
the Linux nfs client implementation does not take full advantage of the
dentry cache.
Sometimes people know this, sometimes not.
Please read an important paper on the Linux implementation of nfs.
Please search the internet for
\begin_inset Quotes eld
\end_inset
Why nfs sucks
\begin_inset Quotes erd
\end_inset
from Olaf Kirch (who is one of the original Linux nfs implementors), and
\emph on
read
\emph default
it.
Your opinion about nfs might change.
\end_layout
\begin_layout Itemize
have transactional databases, where high IOPS may be
\emph on
really
\emph default
needed, but
\series bold
\emph on
exceptionally
\series default
\emph default
(!) for this class of application.
For very big enterprise databases like big SAP installations, there may
be a very valid justification for big RAM caches at storage layers.
However: smaller transactional loads, as in webhosting, are
\emph on
often
\emph default
(not always) hammering a
\emph on
low
\emph default
number of
\series bold
hot spots
\series default
, where
\emph on
big
\emph default
caches are not really needed.
Relatively small BBU caches of RAID cards will also do the job.
Often people don't notice this because they don't measure the
\series bold
workingset behaviour
\series default
of their application, as could be done for example with
\family typewriter
blkreplay
\family default
(see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://blkreplay.org
\end_layout
\end_inset
).
\end_layout
\begin_layout Itemize
do not notice that
\emph on
well-tuned
\emph default
filesystem caches over iSCSI are typically demanding far fewer IOPS, sometimes
by several orders of magnitude, and are wasting money with caches at commercial
boxes they don't need (classical
\series bold
over-engineering
\series default
).
\end_layout
\begin_layout Itemize
\series bold
political interest
\series default
, often supported by storage vendors.
\end_layout
\begin_layout Plain Layout
Anyway, local storage can be augmented with various types of local caches
with various dimensioning.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
There is no point in accessing the fastest possible type of RAM cache remotely
over a network.
RAM is best
\series bold
invested money
\series default
when installed
\series bold
locally
\series default
,
\emph on
directly
\emph default
for your applications / services / compute nodes.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Even expensive hardware-based RDMA (e.g.
over Infiniband) cannot deliver the same performance as
\series bold
directly caching
\series default
your data in the
\series bold
\emph on
same
\emph default
RAM
\series default
where your application is running.
The Dentry Cache in the Linux kernel provides highly optimized
\series bold
shared metadata
\series default
in SMP and NUMA systems (nowadays scaling to more than 100 processor cores),
while the Page Cache provides
\series bold
shared memory
\series default
via hardware MMU.
This is crucial for the performance of classical local filesystems.
\end_layout
\begin_layout Standard
The physical laws of Einstein and others are telling us that neither this
type of caching, nor its shared memory behaviour, can be transported over
whatever type of network without causing
\series bold
performance degradation
\series default
.
\end_layout
\begin_layout Subsection
Latencies and Throughput
\begin_inset CommandInset label
LatexCommand label
name "subsec:Latencies-and-Throughput"
\end_inset
\end_layout
\begin_layout Standard
First of all: today there exist only a small number of HDD manufacturers
in the world.
The number of SSD manufacturers will likely decline in the long run.
Essentially, commercial storage vendors are more or less selling you the
same HDDs or SSDs as you could buy and deploy yourself.
If at all, there are only some minor technical differences.
\end_layout
\begin_layout Standard
In the meantime, many people agree with a Google paper that the
\emph on
ratio
\emph default
of market prices (price per terabyte) between HDDs and SSDs is unlikely
to change in a fundamental
\begin_inset Foot
status open
\begin_layout Plain Layout
In folklore, there exists a
\series bold
fundamental empirical law
\series default
, fuzzily called
\begin_inset Quotes eld
\end_inset
Storage Pyramid
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
Memory Hierarchy Law
\begin_inset Quotes erd
\end_inset
or similar, which is well-known at least in German OS academic circles.
The empirical law (extrapolated from
\series bold
observations
\series default
, similarly to Moore's law) tells us that faster storage technology is always
\series bold
more expensive
\series default
than slower storage technology, and that capacities of faster storage are
typically smaller than capacities of slower storage.
This observation has been roughly valid for more than 50 years now.
You can find it in several German lecture scripts.
Unfortunately, the Wikipedia article
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Memory_hierarchy
\end_layout
\end_inset
(retrieved in June 2018) does not cite this very important fundamental
law about
\series bold
cost
\series default
.
In contrast, the German article
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://de.wikipedia.org/wiki/Speicherhierarchie
\end_layout
\end_inset
about roughly the same subject is mentioning
\begin_inset Quotes eld
\end_inset
Kosten
\begin_inset Quotes erd
\end_inset
which means
\begin_inset Quotes eld
\end_inset
cost
\begin_inset Quotes erd
\end_inset
, and
\begin_inset Quotes eld
\end_inset
teuer
\begin_inset Quotes erd
\end_inset
which means
\begin_inset Quotes eld
\end_inset
expensive
\begin_inset Quotes erd
\end_inset
.
\end_layout
\end_inset
way during the next 10 years.
Thus, most large-capacity enterprise storage systems are built on top of
HDDs.
\end_layout
\begin_layout Standard
Typically, HDDs and their mechanics are forming the overall bottleneck.
\end_layout
\begin_layout Itemize
by construction, a
\emph on
local
\emph default
HDD attached via HBAs or a hardware RAID controller will show the least
\emph on
additional
\emph default
overhead in terms of
\emph on
additional
\emph default
latencies and throughput degradation caused by the attachment.
\end_layout
\begin_layout Itemize
When the
\emph on
same
\emph default
HDD is
\emph on
indirectly
\emph default
attached via Ethernet or Infiniband or another rack-to-rack transport,
both latencies and throughput will become worse.
Depending on further factors and influences, such as a too high NOF (see
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
), the overall bottleneck may shift to the network, and it may become worse
over-proportionally.
\end_layout
\begin_layout Standard
The laws of information transfer are telling us:
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
With
\series bold
increasing distance
\series default
, both latencies (laws of Einstein) and throughput (laws of energy needed
for compensation of SNR = signal to noise ratio) are becoming worse.
Distance matters.
Kirchhoff's law (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
) also matters.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Because of this fundamental law, Football+MARS is
\series bold
minimizing IO distances
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The number of intermediate components, like routers / switches and their
\series bold
queuing
\series default
, matters too.
\end_layout
\begin_layout Standard
Consequently, local storage has
\emph on
always
\emph default
an architectural
\begin_inset Foot
status open
\begin_layout Plain Layout
In order to be fair, an architectural comparison must be made under the
assumption of comparable low-level technologies.
\end_layout
\end_inset
advantage over any attachment via network.
Centralized storages are bound to some network, and thus suffer from disadvanta
ges in terms of latencies and throughput.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
What is the expected long-term future? Will additional latencies and throughput
of centralized storages become better over time?
\end_layout
\begin_layout Plain Layout
It is difficult to predict the future.
Let us first look at the past evolution.
The following graphics has taken its numbers from Wikipedia articles
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/List_of_device_bit_rates
\end_layout
\end_inset
and
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/History_of_hard_disk_drives
\end_layout
\end_inset
, showing that HDD capacities have grown
\series bold
over-proportionally
\series default
by about 2 orders of magnitude over about 30 years, when compared to the
relative growth of network bandwidth.
\end_layout
\begin_layout Plain Layout
In the following graphics, effects caused by decreasing form factors have
been neglected, which would even
\emph on
amplify
\emph default
the trend.
For fairness, bundling of parallel disks or parallel communication channels
\begin_inset Foot
status open
\begin_layout Plain Layout
It is easy to see that the slopes of
\family typewriter
HDD.capacity
\family default
vs
\family typewriter
Infiniband.rates
\family default
are different.
Parallelizing by bundling of Infiniband wires will only lift the line a
little upwards, but will not alter its slope in logarithmic scale.
For extrapolated time
\begin_inset Formula $t\rightarrow\infty$
\end_inset
, the extrapolated empirical long-term behaviour is rather striking.
\end_layout
\end_inset
have been ignored.
All comparisons are in logarithmic y axis scale:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename BitRates/Capacity-BitRate-Comparison.pdf
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
What does this mean when extrapolated into the future?
\end_layout
\begin_layout Plain Layout
It means that concentrating more and more capacity into a single rack due
to increasing data density will likely lead to more problems in future.
Accessing more and more data over the network will become increasingly
more difficult when concentrating high-capacity HDDs or SSDs
\begin_inset Foot
status open
\begin_layout Plain Layout
It is difficult to compare the space density of contemporary SSDs in a fair
way.
There are too many different form factors.
For example, M2 cards are typically consuming even less
\begin_inset Formula $cm^{3}/TB$
\end_inset
than classical 2.5 inch form factors.
This trend is likely to continue in future.
\end_layout
\end_inset
into the same space volume as before.
\end_layout
\begin_layout Plain Layout
In other words: centralized storages are already not a good idea today, and will likely
become an even worse idea in the future.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
Risky central storage architecture
\end_layout
\end_inset
There was a major incident at a German web hosting company at the beginning
of the 2000s.
Their entire webhosting main business was running on a single proprietary
highly redundant CentralStorage solution, which failed.
Restore from backup took way too long from the viewpoint of a huge number
of customers, leading to major press attention.
Before this incident, they were the #1 webhoster in Germany.
A few years later, 1&1 was the #1 instead.
You can speculate whether this has to do with the incident.
But anyway, the later geo-redundancy strategy of 1&1 based on a sharding
model (originally using DRBD, later MARS) was motivated by conclusions
drawn from this incident.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
Non-competing scalability of central storage
\end_layout
\end_inset
In the 1980s, a CentralStorage
\begin_inset Quotes eld
\end_inset
dinosaur
\begin_inset Foot
status open
\begin_layout Plain Layout
With the advent of NVMe, SSDs are almost directly driven by DMA.
Accessing any high-speed DMA device by default via network is a foolish
idea, just as foolish as playing games via an expensive high-end gamer
graphics card which is then
\emph on
indirectly
\emph default
attached via RDMA, or even via Ethernet.
Probably no serious gamer would ever
\emph on
try
\emph default
to do that.
But some storage vendors do, for strategic reasons.
Probably for their own survival, their customers are to be misguided to
overlook the blinking red indicators that centralized SSD storage is likely
nothing but an expensive dead end in the history of dinosaur architectures.
\end_layout
\end_inset
\begin_inset Quotes erd
\end_inset
architecture called SLED = Single Large Expensive Disk was propagated with
huge marketing noise and effort, but its historic fate was predictable
for neutral experts not bound to particular interests: SLED finally lost
against its contemporary RAID competition.
Nowadays, many people don't even remember the term SLED.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Strategic advice
\end_layout
\end_inset
Today's
\series bold
future
\series default
is likely dominated by
\series bold
scaling-out architectures
\series default
like
\series bold
sharding
\series default
, as explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Distributed-vs-Local:"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Reliability Differences CentralStorage vs Sharding
\begin_inset CommandInset label
LatexCommand label
name "subsec:Reliability-Differences-CentralStorage"
\end_inset
\end_layout
\begin_layout Standard
In this section, we look at
\emph on
fatal
\emph default
failures only, ignoring temporary failures.
A fatal failure of a storage is an incident which needs to be corrected
by
\series bold
restore from backup
\series default
.
\end_layout
\begin_layout Standard
By definition, even a
\emph on
highly redundant
\emph default
CentralStorage is
\emph on
nevertheless
\emph default
a SPOF = Single Point of Failure.
This also applies to fatal failures.
\end_layout
\begin_layout Standard
Some people incorrectly counter this by pointing to redundancy.
The problem is that
\emph on
any
\emph default
system, even a highly redundant one, can fail fatally.
There exists no perfect system on earth.
One of the biggest known sources of fatal failure is
\series bold
human error
\series default
.
\end_layout
\begin_layout Standard
In contrast, sharded storage (for example the LocalSharding model, see also
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
) has MPOF = Multiple Points Of Failure.
It is unlikely that many shards are failing fatally at the same time, because
shards are
\emph on
independent
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
When all shards are residing in the same datacenter, there exists a SPOF
by power loss or other impacts onto the whole datacenter.
However, this applies to both the CentralStorage and to the LocalSharding
model.
In contrast to CentralStorage, LocalSharding can be more easily distributed
over multiple datacenters.
\end_layout
\end_inset
from each other by definition (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Definition-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
for disambiguation of terms
\begin_inset Quotes eld
\end_inset
sharding
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
shared-nothing
\begin_inset Quotes erd
\end_inset
).
\end_layout
\begin_layout Standard
What is the difference from the viewpoint of customers of the services?
\end_layout
\begin_layout Standard
When a CentralStorage is failing fatally, a
\emph on
huge
\emph default
number of customers will be affected for a
\emph on
long
\emph default
time (see the example German webhoster mentioned in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Latencies-and-Throughput"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
Reason: restore from backup will take extremely long because huge masses
of data have to be restored =
\series bold
copied
\series default
over a network.
MTBF = Mean Time Between Failures is (hopefully) longer thanks to redundancy,
but MTTR = Mean Time To Repair is also very long.
\end_layout
\begin_layout Standard
With (Local)Sharding, the risk of
\emph on
some
\emph default
fatal incident
\emph on
somewhere
\emph default
in the sharding pool is higher, but the
\series bold
\emph on
size
\series default
\emph default
of such an incident is smaller in three dimensions at the same time:
\end_layout
\begin_layout Enumerate
There are far
\series bold
fewer customers affected
\series default
(typically only
\begin_inset Formula $1$
\end_inset
shard out of
\begin_inset Formula $n$
\end_inset
shards).
\end_layout
\begin_layout Enumerate
\series bold
MTTR
\series default
= Mean Time To Repair is typically much better because there is much less
data to be restored.
\end_layout
\begin_layout Enumerate
\series bold
Residual risk
\series default
plus resulting fatal damage by
\series bold
un-repairable problems
\series default
is thus lower.
\end_layout
\begin_layout Standard
What does this mean from the viewpoint of an investor of a big
\begin_inset Quotes eld
\end_inset
global player
\begin_inset Quotes erd
\end_inset
company?
\end_layout
\begin_layout Standard
As is promised by the vendors, let us assume that failure of CentralStorage
might be occurring less frequently.
But
\emph on
when
\emph default
it happens on
\series bold
enterprise-critical mass data
\series default
, the stock exchange value of the affected company will be exposed to a
\series bold
hazard
\series default
.
This is not bearable from the viewpoint of an investor.
\end_layout
\begin_layout Standard
In contrast, the (Local)Sharding model is
\emph on
distributing
\emph default
the
\series bold
unavoidable incidents
\series default
(because
\series bold
perfect systems do not exist
\series default
, and
\series bold
perfect humans do not exist
\series default
) to a lower number of customers with higher frequency, such that the
\series bold
total impact onto the business
\series default
becomes bearable.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Risk analysis of CentralStorage
\end_layout
\end_inset
Risk analysis for
\series bold
enterprise-critical
\series default
use cases is summarized in the following table:
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="8" columns="3">
<features tabularvalignment="middle">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top" width="0pt">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
CentralStorage
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
(Local)Sharding
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Probability of
\emph on
some
\emph default
fatal incident
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
lower
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
higher
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
# Customers affected
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
very high
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
very low
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
MTBF per storage
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
higher
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
lower
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
MTTR per storage
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
higher
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
lower
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Unrepairable residual risk
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
higher
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
lower
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Total impact
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
higher
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
lower
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Investor's risk
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\series bold
unbearable
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
stock exchange compatible
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Conclusions: CentralStorage is something for
\end_layout
\begin_layout Itemize
\noindent
Small to medium-sized companies which don't have the
\series bold
manpower
\series default
and the
\series bold
skills
\series default
for professionally building and operating a (Local)Sharding (or similar)
system for their enterprise-critical mass data their business is relying
upon.
\end_layout
\begin_layout Itemize
\series bold
\emph on
Monolithic
\emph default
enterprise applications
\series default
like classical SAP which are anyway bound to a specific vendor, where you
cannot select a different solution (so-called
\series bold
Vendor Lock-In
\series default
).
\end_layout
\begin_layout Itemize
When your application
\series bold
is neither shardable
\series default
by construction (c.f.
section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Distributed-vs-Local:"
\end_inset
), or when doing so would require too much effort,
\series bold
nor going to BigCluster
\begin_inset Foot
status open
\begin_layout Plain Layout
Theoretically, BigCluster can be used to create 1 single huge remote LV
(or 1 single huge remote FS instance) out of a pool of storage machines.
Double-check, better triple-check that such a
\series bold
big
\emph on
logical
\emph default
SPOF
\series default
is
\emph on
really
\emph default
needed, and cannot be circumvented by any means.
Only in such a case, the current version of MARS cannot help (yet), because
its
\emph on
current
\emph default
\emph on
focus
\emph default
is on a big number of machines each having relatively small LVs.
At 1&1 ShaHoLin, the biggest LVs are 40TiB at the moment, running for years
now, and bigger ones are certainly possible.
Only when current local RAID technology with external enclosures cannot
easily create a single LV in the petabyte scale, BigCluster is probably
the better solution (c.f.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\end_inset
\series default
(e.g.
Ceph / Swift / etc, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
) is an option.
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
If you have an
\emph on
already sharded
\emph default
system, e.g.
independent VMs or webhosting, don't convert it to a non-shardable one,
and don't introduce SPOFs needlessly.
You will introduce
\series bold
technical debt
\series default
which is likely to come back and hurt you sometime in the future!
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
As a really big
\begin_inset Quotes eld
\end_inset
global player
\begin_inset Quotes erd
\end_inset
, or as a company being part of such a structure, you should be careful
when listening to
\begin_inset Quotes eld
\end_inset
marketing drones
\begin_inset Quotes erd
\end_inset
of proprietary CentralStorage vendors.
Always check your
\emph on
concrete
\emph default
use case.
Never believe in wrongly generalized claims, which are only valid in some
specific context, but do not really apply to your use case.
It could be about your
\emph on
life
\emph default
.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Proprietary vs OpenSource
\begin_inset CommandInset label
LatexCommand label
name "subsec:Proprietary-vs-OpenSource"
\end_inset
\end_layout
\begin_layout Standard
In theory, the following dimensions are orthogonal to each other:
\end_layout
\begin_layout Description
Architecture: LocalStorage vs CentralStorage vs DistributedStorage
\end_layout
\begin_layout Description
Licensing: Proprietary vs OpenSource
\end_layout
\begin_layout Standard
In practice, however, many vendors of proprietary storage systems are selecting
the CentralStorage model.
This way, they can avoid inter-operability with their competitors.
This opens the door for the so-called
\series bold
Vendor Lock-In
\series default
.
\end_layout
\begin_layout Standard
In contrast, the OpenSource community is based on
\emph on
cooperation
\emph default
.
Opting for OpenSource means that you can
\series bold
combine and exchange
\series default
numerous
\series bold
components
\series default
with each other.
\end_layout
\begin_layout Standard
Key OpenSource players are
\emph on
basing
\emph default
their business on the
\series bold
usefulness
\series default
of their software components for you, their customer.
Please search the internet for further explanations from Eric S.
Raymond.
\end_layout
\begin_layout Standard
Therefore
\series bold
interoperability
\series default
is a
\emph on
must
\emph default
in the opensource business.
For example, you can relatively easily migrate between DRBD and MARS, back
and forth, see
\family typewriter
mars-user-manual.pdf
\family default
.
The
\emph on
generic
\emph default
block devices provided by both DRBD and MARS (and by the kernel LVM2 implementa
tion, and many others
\begin_inset Formula $\ldots$
\end_inset
) can interact with zillions of filesystems, VMs, applications, and so forth.
\end_layout
\begin_layout Standard
Summary:
\series bold
genericity
\series default
is a highly desired property in OpenSource communities, while proprietary
products often try to control their usage by limiting either technical
interoperability at certain layers, and/or legally by contracts.
Trying to do so with OpenSource would make no sense, because
\emph on
you
\emph default
, the customer, are the
\emph on
real
\emph default
king who can
\emph on
really
\emph default
select and combine components.
You can form a
\series bold
really customized system
\series default
to your
\series bold
\emph on
real needs
\series default
\emph default
, not as just promised but not always actually delivered by so-called
\begin_inset Quotes eld
\end_inset
marketing drones
\begin_inset Quotes erd
\end_inset
from commercial vendors who actually prefer the needs of their employer
over yours.
\end_layout
\begin_layout Standard
There is another fundamental difference between proprietary software and
OpenSource: the former is bound to some company, which may
\emph on
vanish
\emph default
from the market.
Commercial storage systems may be
\series bold
discontinued
\series default
.
\end_layout
\begin_layout Standard
This can be a serious threat to your business relying on the value of your
data.
In particular, buying storage systems from
\emph on
small
\emph default
vendors may increase this risk
\begin_inset Foot
status open
\begin_layout Plain Layout
There is a risk of a
\emph on
domino effect
\emph default
: once there is a critical incident on highly redundant CentralStorage boxes
from a particular (smaller) vendor, this may lead to major public media
attention.
This may form the
\emph on
root cause
\emph default
for such a vendor to vanish from the market.
Thus you may be left alone with a buggy system, even if you aren't the
victim of the concrete incident.
\end_layout
\begin_layout Plain Layout
In contrast, bugs in an OpenSource component can be fixed by a larger community
of interested people, or by yourself if you hire somebody for this.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
OpenSource is different: it cannot die, even if the individual, or the (small)
company which produced it, no longer exists.
The source code is
\series bold
public
\series default
.
It just could get
\emph on
outdated
\emph default
over time.
However, as long as there is enough public interest, you will always find
somebody who is willing to adapt and to
\emph on
maintain
\emph default
it.
Even if you were the only one having such an interest, you can
\emph on
hire
\emph default
a maintainer for it, specifically for your needs.
You aren't
\series bold
helpless
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Long-term strategy
\end_layout
\end_inset
When an appropriate OpenSource solution, or some suitable OpenSource components,
are available, their long-term TCO will typically be better than that of
solutions from proprietary vendors.
\end_layout
\end_inset
\end_layout
\begin_layout Section
Cost Arguments
\begin_inset CommandInset label
LatexCommand label
name "sec:Cost-Arguments-from"
\end_inset
\end_layout
\begin_layout Standard
A common prejudice is that
\begin_inset Quotes eld
\end_inset
big cluster
\begin_inset Quotes erd
\end_inset
is the cheapest scaling storage technology when built on so-called
\begin_inset Quotes eld
\end_inset
commodity hardware
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Standard
While this is very often true for the
\begin_inset Quotes eld
\end_inset
commodity hardware
\begin_inset Quotes erd
\end_inset
part, it is often
\emph on
not
\emph default
true for the
\begin_inset Quotes eld
\end_inset
big cluster
\begin_inset Quotes erd
\end_inset
part.
Let us first look at the
\begin_inset Quotes eld
\end_inset
commodity
\begin_inset Quotes erd
\end_inset
part.
\end_layout
\begin_layout Subsection
Cost Arguments from Technology
\begin_inset CommandInset label
LatexCommand label
name "subsec:Cost-Arguments-from-Technology"
\end_inset
\end_layout
\begin_layout Subsubsection
Raw Storage Price Comparison
\end_layout
\begin_layout Standard
Here are some rough market prices for basic storage as determined around
end of 2016 / start of 2017:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="6" columns="3">
<features tabularvalignment="middle">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
Technology
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
Enterprise-Grade
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
Price in € / TB
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
Consumer SATA disks via on-board SATA controllers
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
no (small-scale)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
< 30 possible
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
SAS disks via SAS HBAs (e.g.
in external 14
\begin_inset Quotes erd
\end_inset
shelves)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
partially
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
< 80
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
SAS disks via hardware RAID + LVM (+DRBD/MARS)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
80 to 150
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
Commercial storage appliances via iSCSI
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
around 1000
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
Cloud storage, S3 over 5 years lifetime
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
yes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size small
3000 to 8000
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
\noindent
You can see that any self-built and self-administered storage (whose price
varies with slower high-capacity disks versus faster low-capacity disks)
is much cheaper than any commercial offering by about a factor of 10 or
even more.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
If you need to operate several petabytes of data, self-built storage is
\emph on
always
\emph default
cheaper than commercial one, even if some more manpower is needed for commissio
ning and operating, than for communications with the storage provider.
You don't have to pay the shareholders of the storage provider.
Instead, the savings will benefit your
\emph on
own
\emph default
shareholders.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Here we just assume that the storage is needed permanently for at least
5 years, as is the case in web hosting, databases, backup / archival systems,
and many other application areas.
\end_layout
\begin_layout Standard
Commercial offerings of cloud storage are way too much hyped.
Apparently some people don't seem to know that the generic term
\begin_inset Quotes eld
\end_inset
Cloud Storage
\begin_inset Quotes erd
\end_inset
refers to a
\emph on
storage class
\emph default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
), not to a particular
\emph on
instance
\emph default
like original Amazon S3, and that it is possible to build and operate almost
any instance of any storage class yourself.
\end_layout
\begin_layout Standard
From a commercial perspective,
\series bold
outsourcing
\series default
of
\emph on
huge masses
\emph default
of enterprise-critical storage (to whatever class of storage) usually pays
off
\series bold
only when
\series default
your storage demands are either
\emph on
relatively moderate
\emph default
, or are
\emph on
extremely
\emph default
varying over time, and/or when you need some
\emph on
extra
\emph default
capacity only
\emph on
temporarily
\emph default
for a
\emph on
very
\emph default
short time.
\end_layout
\begin_layout Subsubsection
Waste-Corrected Storage Price Comparison
\begin_inset CommandInset label
LatexCommand label
name "subsec:Waste-Corrected-Storage-Price"
\end_inset
\end_layout
\begin_layout Standard
The granularity of storage (pool sizes) has some influence on
cost.
BigCluster or CentralStorage advocates are often emphasizing that larger
storage pools can save cost by
\series bold
flexible assignment
\series default
, which in turn can
\series bold
reduce waste
\series default
(at least
\emph on
potentially
\emph default
).
\end_layout
\begin_layout Standard
FlexibleSharding (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:FlexibleSharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
) in combination with Football can lead to a similar or even better
\begin_inset Foot
status open
\begin_layout Plain Layout
Typical RemoteSharding over CentralStorage lacks easy movement of LVs between
shards, while Football is providing this functionality on LocalStorage.
\end_layout
\end_inset
flexibility in storage assignment, and thus to a similar reduction of waste
under comparable conditions.
\end_layout
\begin_layout Standard
However, pure local storage models like LocalSharding (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
) are less flexible from a
\emph on
human
\emph default
point of view.
Do they lead to more waste from a technical viewpoint? Moving around LVs
via Football
\emph on
can
\emph default
be used for flexibility at runtime, but this is less instant, and it cannot
easily compensate for bigger misdimensioning between CPU capacity and storage
capacity.
\end_layout
\begin_layout Standard
Experiences and statistics at 1&1 Ionos ShaHoLin with an LV to PV ratio
of
\begin_inset Formula $\approx$
\end_inset
7:1 (January 2020) are suggesting that the average storage waste caused
by non-fully automated Football
\begin_inset Foot
status open
\begin_layout Plain Layout
Without a pool-optimizer, but more or less optimized
\begin_inset Quotes eld
\end_inset
by hand
\begin_inset Quotes erd
\end_inset
.
\end_layout
\end_inset
is around 8.1 PB allocated LV space from 10.7 PB of totally installed PV
space
\begin_inset Foot
status open
\begin_layout Plain Layout
Without geo-redundancy.
Grand totals must be taken
\begin_inset Formula $\times2$
\end_inset
.
\end_layout
\end_inset
, which is around 24% waste in the space dimension (better to be called
\series bold
spare space
\series default
, since it is
\emph on
usable
\emph default
).
\end_layout
\begin_layout Standard
Notice that this comes close to the annual ShaHoLin data growth rate, which
is around 21%.
Essentially, the current spare space is similar to that.
It is a good idea to keep some spare space for unforeseeable impacts.
Also notice that this
\begin_inset Quotes eld
\end_inset
waste
\begin_inset Quotes erd
\end_inset
comes close to an intended PV filling level of around 80%, which was a
deliberate political decision of some advocates, and has no true technical
reasons.
Technically, higher filling levels up to the theoretical fragmentation
limit of 95% (see scientific literature on fragmentation) would be
possible, but for practical reasons more than 90% PV
\begin_inset Foot
status open
\begin_layout Plain Layout
All the above discussion relates to block level solely.
Similar arguments hold for filesystem layer, but the latter is independent
from architectures and thus can be completely factored out from this discussion.
\end_layout
\end_inset
filling level cannot be recommended, for
\emph on
any
\emph default
storage system.
So the current ShaHoLin waste is not far from optimal.
\end_layout
\begin_layout Standard
Some advocates might argue that the real waste would be higher than 24%,
because there would be CPU waste
\begin_inset Foot
status open
\begin_layout Plain Layout
In March 2020, the relative CPU consumption of all primary-side new multicontain
er machines was 37.1% in
\emph on
timely + pool average
\emph default
, with a climbing tendency.
Queueing theory suggests that an average 70% CPU utilization should not
be exceeded much during DDOS attacks and load peaks, in order to prevent
rising service times (which are rather strong SLAs monitored minutely,
while DDOS attacks and high-load periods typically last for hours, sometimes
for days).
Therefore, a day-and-night average of around 70 / 2 = 35% is roughly a
desired target value.
Both queuing theory and practical observation tell us that after exceeding
70% CPU utilization, the system is reacting in a heavily
\series bold
non-linear
\series default
fashion.
The rather strong SLAs force us to a moderate average CPU utilization.
Do not linearly extrapolate anything under such conditions! For lower SLAs,
somewhat higher density and thus higher CPU utilization would be possible,
but the potential is lower than one might expect, due to non-linearity.
Notice that LXC containers have almost negligible CPU overhead, while
KVM / vmware would eat a noticeable amount.
Do not compare statistics measured inside of VMs with ones gathered from
LXC (or other) hypervisors.
Do not use VM utilization
\emph on
at all(!)
\emph default
for conclusions about
\emph on
hardware
\emph default
.
\series bold
VM-level measurements can be completely meaningless fake results
\series default
, telling almost nothing about the hardware!
\end_layout
\end_inset
.
Until future FlexibleSharding is implemented, the current LocalSharding
leads to a fixed relationship between storage and CPU power.
Better dimensioning of CPU capacity would allow for bigger localstorage
RAID sets.
However, this is a non-storage price argument, using an incomparable measure.
As a courtesy to those advocates, we will now
\emph on
assume(!)
\emph default
that the
\begin_inset Quotes eld
\end_inset
waste
\begin_inset Quotes erd
\end_inset
produced by LocalStorage were around 30%
\begin_inset Foot
status open
\begin_layout Plain Layout
Even higher
\begin_inset Quotes eld
\end_inset
estimations
\begin_inset Quotes erd
\end_inset
of waste differences between local and central storage would not be realistic.
In
\emph on
any
\emph default
of the architectures,
\series bold
spare CPU power
\series default
must be deployed.
Otherwise, DDOS attacks and other types of load peaks cannot be handled
gracefully.
In pure compute farms using remote storage, spare CPUs are typically not
counted for statistics, while at ShaHoLin both the storage and the CPU
power are always fully counted.
Do not compare statistics based on different foundations.
In order to really get a fundamental difference outweighing the CAPEX
advantages of self-built vs commercial storage, the LocalSharding model
would need to be
\series bold
misdimensioned
\series default
.
Arguing with misdimensioning would be
\series bold
unfair
\series default
.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
This number has to be correlated with the waste produced by other models.
In small CentralStorage installations, higher wastes are common, due to
the low number of building blocks.
The existing building blocks need to be set up with enough spare space
for future data growth.
When CentralStorage technology (commercial storage boxes) is used for
RemoteSharding on top of CentralStorage, the waste may
\emph on
potentially
\emph default
decline.
However, there remains a fundamental problem: LVs cannot easily be moved
\emph on
between
\emph default
CentralStorage shards.
Therefore, some waste is necessary for allowing resizing of existing LVs
during runtime.
As a courtesy to those advocates, we now
\emph on
assume(!)
\emph default
that the waste in such a RemoteSharding over CentralStorage architecture
would be only 10%.
So the difference in waste would be 30%
\begin_inset Formula $-$
\end_inset
10% = 20%.
\end_layout
\begin_layout Standard
Now what is the total price difference? As shown above, the
\emph on
raw
\emph default
price difference between commercial storage and self-built local storage
is between 300% and 1000%.
When multiplying this with an assumed(!)
\emph on
additional
\emph default
waste of 20%, the
\series bold
cost for additionally wasted space
\series default
would be higher for commercial storage.
For CAPEX invest on the
\emph on
total
\emph default
storage space, there would remain an advantage for LocalSharding, even
if the localstorage waste were assumed to be an unrealistic 100% (total factor
2).
\end_layout
\begin_layout Standard
\begin_inset VSpace smallskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Real cost of waste
\end_layout
\end_inset
Do not take isolated arguments like waste as a central criterion for price
comparisons.
Always try to determine
\series bold
TCO = Total Cost of Ownership
\series default
as close as possible.
\end_layout
\begin_layout Plain Layout
Another pitfall: do not count localstorage / LocalSharding cost by inclusion
of CPU power, while neglecting CPU and/or network cost for RemoteSharding
etc.
Do not fall into the trap of
\series bold
unfair
\series default
comparisons.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Cost Arguments from Architecture
\begin_inset CommandInset label
LatexCommand label
name "subsec:Cost-Arguments-from-Architecture"
\end_inset
\end_layout
\begin_layout Standard
In addition to basic storage prices, many further factors come into play
when roughly comparing big cluster architectures versus sharding.
The following table bears the
\emph on
unrealistic assumption
\emph default
that BigCluster can be reliably operated with 2 replicas (
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none
the suffix
\begin_inset Formula $\times2$
\end_inset
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\uuline default
\uwave default
\noun default
\color inherit
means with additional geo-redundancy):
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="5" columns="5">
<features tabularvalignment="middle">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
BC
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
SHA
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
BC
\begin_inset Formula $\times2$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
SHA
\begin_inset Formula $\times2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
# of Disks
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
>200%
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
<120%
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
>400%
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
<240%
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
# of Servers
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times1.1$
\end_inset
possible
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times4$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2.2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Power Consumption
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times1.1$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times4$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2.2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
HU Consumption
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times1.1$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times4$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2.2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
\noindent
As shown in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and as recommended by several advocates, two replicas are typically not
sufficient for BigCluster.
Even addicts of BigCluster are typically recommending 3 replicas in so-called
\begin_inset Quotes eld
\end_inset
best practices
\begin_inset Quotes erd
\end_inset
, leading to the following more realistic table:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="5" columns="5">
<features tabularvalignment="middle">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
BC
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
SHA
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
BC
\begin_inset Formula $\times2$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
SHA
\begin_inset Formula $\times2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
# of Disks
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
>300%
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
<120%
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
>600%
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
<240%
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
# of Servers
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times3$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times1.1$
\end_inset
possible
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times6$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2.2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Power Consumption
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times3$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times1.1$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times6$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2.2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
HU Consumption
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times3$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times1.1$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times6$
\end_inset
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\begin_inset Formula $\approx\times2.2$
\end_inset
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
\noindent
The crucial point is not only the number of extra servers needed for dedicated
storage boxes, but also the total number of HDDs.
While big cluster implementations like Ceph or Swift can
\emph on
theoretically
\emph default
use some erasure encoding
\begin_inset Foot
status open
\begin_layout Plain Layout
There is a reason why erasure encoding is not practical for many
\family typewriter
BigCluster
\family default
use cases.
The number of total IO requests sent to the internal disks is much higher
than the number of IO requests sent to the storage by your application,
in order to update additional redundancy information.
Like RAID-6, this is typically by
\emph on
factors
\emph default
.
While RAID-6 is
\series bold
offloading
\series default
this additional workload to a small
\emph on
specialized
\emph default
and realtime-capable network called SAS bus,
\family typewriter
BigCluster
\family default
is typically spreading this workload over an unreliable IP network with
packet loss, spanning much larger distances, and involving more switches
/ routers.
\end_layout
\end_inset
for avoiding full object replicas, their
\emph on
practice
\emph default
as seen in internal 1&1 Ceph clusters is similar to RAID-10, but just on
objects instead of block-based sectors.
\end_layout
\begin_layout Standard
Therefore a big cluster typically needs >300% disks to reach the same net
capacity as a simple sharded cluster.
The latter can typically take advantage of hardware RAID-60 with a significantl
y smaller disk overhead, while providing sufficient failure tolerance at
disk level.
\end_layout
\begin_layout Standard
There is a surprising consequence from this: geo-redundancy is not as expensive
as many people believe.
It just needs to be built with the proper architecture.
A sharded geo-redundant pool based on hardware RAID-60 (last column
\begin_inset Quotes eld
\end_inset
SHA
\begin_inset Formula $\times2$
\end_inset
\begin_inset Quotes erd
\end_inset
) costs typically
\emph on
less
\emph default
than a non-georedundant big cluster with typically needed / recommended
number of replicas (column
\begin_inset Quotes eld
\end_inset
BC
\begin_inset Quotes erd
\end_inset
).
A geo-redundant sharded pool provides even better failure compensation
(see sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Flexibility-of-Failover"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and comparable flexibility when combined with Football (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Principle-of-Background"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
Notice that geo-redundancy implies by definition (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
) that an unforeseeable
\series bold
full datacenter loss
\series default
(e.g.
caused by
\series bold
disasters
\series default
like a terrorist attack or an earthquake) must be compensated for
\series bold
several days or weeks
\series default
.
Therefore it is
\emph on
not
\emph default
sufficient to take a big cluster and just spread it to two different locations.
\end_layout
\begin_layout Standard
In any case, a MARS-based geo-redundant sharding pool with a reasonable
size is cheaper than using commercial storage appliances, which are much
more expensive by their nature.
\end_layout
\begin_layout Part
MARS for Consultants and Architects
\begin_inset Newline newline
\end_inset
\size footnotesize
Plus Background for Interested Managers / Responsibles / Project Members
/ Sysadmins / etc
\end_layout
\begin_layout Chapter
Use Cases for MARS
\begin_inset CommandInset label
LatexCommand label
name "chap:Use-Cases-for"
\end_inset
\end_layout
\begin_layout Standard
DRBD has a long history of successfully providing HA features to many users
of Linux.
With the advent of MARS, many people are wondering what the difference
is.
They ask for recommendations.
In which use cases should DRBD be recommended, and in which other cases
is MARS the better choice?
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Use cases MARS vs DRBD
\end_layout
\end_inset
The following table is a short guide to the most important cases where the
decision is rather clear:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="6" columns="2">
<features tabularvalignment="middle">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Use Case
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Recommendation
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
server pairs, each directly connected via
\series bold
crossover cables
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
DRBD
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\series bold
active-active
\series default
/ dual-primary, e.g.
\family typewriter
\series bold
gfs2
\family default
\series default
,
\family typewriter
\series bold
ocfs2
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
DRBD
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
distance
\series bold
> 50km
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
MARS
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\series bold
> 100 server pairs
\series default
over a short-distance
\series bold
shared
\series default
line
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
MARS
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
all else / intermediate cases
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
read the following details
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
There exist a few use cases where DRBD is clearly better than the current
version of MARS.
1&1 has a long history of experiences with DRBD where it works very well,
in particular coupling Linux devices rack-to-rack via crossover cables.
DRBD is just
\emph on
constructed
\emph default
for that use case (RAID-1 over network).
In such a scenario, DRBD is better than MARS because it uses up less disk
space resources.
In addition, newer DRBD versions can run over high-speed but short-distance
interconnects like Infiniband (via the SDP protocol).
Another use case for DRBD is the so-called active-active / dual-primary
mode (see section
\begin_inset CommandInset ref
LatexCommand vref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
), e.g.
similar to
\family typewriter
ocfs2
\family default
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice that
\family typewriter
ocfs2
\family default
is apparently not constructed for long distances.
1&1 has some experiences on a specific short distance cluster where the
\family typewriter
ocfs2
\family default
/
\family typewriter
DRBD
\family default
combination scaled a little bit better than
\family typewriter
NFS
\family default
, but worse than
\family typewriter
glusterfs
\family default
(using 2 clients in both cases; notice that
\family typewriter
glusterfs
\family default
showed extremely bad performance when trying to enable so-called active-active
\family typewriter
glusterfs
\family default
replication between 2 server instances, therefore we ended up using active-pass
ive DRBD replication below a single
\family typewriter
glusterfs
\family default
server).
Conclusion:
\family typewriter
NFS
\family default
<
\family typewriter
ocfs2
\family default
<
\family typewriter
glusterfs
\family default
< sharding.
We found that
\family typewriter
glusterfs
\family default
on top of active-passive DRBD scalability was about 2 times better than
\family typewriter
NFS
\family default
on top of active-passive DRBD, while
\family typewriter
ocfs2
\family default
on top of
\family typewriter
DRBD
\family default
in the so-called active-active mode (also called dual-primary) was somewhere
in between.
All cluster comparisons were made with an increasing workload over time (measured
as number of customers which could be safely operated).
Each system was replaced by the next one when the respective scalability
was at its respective end, each time leading to operational problems.
The ultimate solution was to replace all of these clustering concepts by
the general concept of
\series bold
sharding
\series default
.
\end_layout
\end_inset
over short
\begin_inset Foot
status open
\begin_layout Plain Layout
So-called active-active replication (see section
\begin_inset CommandInset ref
LatexCommand vref
reference "sec:What-is-active-active"
plural "false"
caps "false"
noprefix "false"
\end_inset
) has been experienced to show massive problems over long distances, because
of (a) high network latencies (cf chapter
\begin_inset CommandInset ref
LatexCommand ref
reference "chap:Cloud-Storage"
\end_inset
and section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
) and/or (b) network failures or other failures leading to split brain with
non-zero probability.
Theoretically, for replication of whole clusters over long distances DRBD
and MARS could be stacked: using DRBD on top of MARS for so-called active-acti
ve clustering of
\family typewriter
gfs2
\family default
or
\family typewriter
ocfs2
\family default
, and a MARS instance
\emph on
below
\emph default
for failover of
\emph on
one
\emph default
of the DRBD replicas over long distances.
No experiences, and not recommended for now.
\end_layout
\end_inset
distances.
\end_layout
\begin_layout Standard
On the other hand, there exist other use cases where DRBD did not work as
expected, leading to incidents and other operational problems.
We analyzed them for our specific use cases.
The later author of MARS came to the conclusion that they could only be
resolved by fundamental changes in the internal architecture of DRBD.
The development of MARS started at the personal initiative of the author,
first in form of a personal project during holidays, and later became operation
al at 1&1.
\end_layout
\begin_layout Standard
MARS and DRBD simply have
\series bold
different application areas
\series default
.
\end_layout
\begin_layout Standard
In the following, we will discuss the pros and cons of each system in particular
situations and contexts, and we shed some light on their conceptual and
operational differences.
\end_layout
\begin_layout Section
Network Bottlenecks
\begin_inset CommandInset label
LatexCommand label
name "sec:Network-Bottlenecks"
\end_inset
\end_layout
\begin_layout Subsection
Behaviour of DRBD
\begin_inset CommandInset label
LatexCommand label
name "subsec:Behaviour-of-DRBD"
\end_inset
\end_layout
\begin_layout Standard
In order to describe the most important problem we found when DRBD was used
to couple whole datacenters (each encompassing thousands of servers) over
metro distances, we strip down that complicated real-life scenario to a
simplified laboratory scenario in order to demonstrate the effect with
minimal means.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice that the following DRBD effect does not appear at crossover cables.
The following scenario covers a non-standard case of DRBD.
DRBD works fine when no network bottleneck appears.
\end_layout
\begin_layout Standard
The following picture illustrates an effect which has been observed in 1&1
datacenters when running masses of DRBD instances through a single network
bottleneck.
In addition, the effect is also reproducible by an older version of the
MARS test suite
\begin_inset Foot
status open
\begin_layout Plain Layout
The effect has been demonstrated some years ago with DRBD version 8.3.13.
By construction, it is independent from any of the DRBD series 8.3.x, 8.4.x,
or 9.0.x.
\end_layout
\end_inset
:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/network-bottleneck-drbd.fig
width 80col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
The simplified scenario is the following:
\end_layout
\begin_layout Enumerate
DRBD is loaded with a low to medium, but constant rate of write operations
for the sake of simplicity of the scenario.
\end_layout
\begin_layout Enumerate
The network has some throughput bottleneck, depicted as a red line.
For the sake of simplicity, we just linearly decrease it over time, starting
from full throughput, down to zero.
The decrease is very slow over time (some minutes, or even hours).
\end_layout
\begin_layout Standard
What will happen in this scenario?
\end_layout
\begin_layout Standard
As long as the actual DRBD write throughput is lower than the network bandwidth
(left part of the horizontal blue line), DRBD works as expected.
\end_layout
\begin_layout Standard
Once the maximum network throughput (red line) starts to fall short of the
required application throughput (first blue dotted line), we get into trouble.
By its very nature, DRBD works
\series bold
synchronously
\series default
.
Therefore, it
\emph on
must
\emph default
transfer all your application writes through the bottleneck, but now it
is impossible
\begin_inset Foot
status open
\begin_layout Plain Layout
This is independent from the DRBD protocols A through C, because it depends
on an information-theoretic argument which holds independently of any protocol.
We have a fundamental conflict between network capabilities and application
demands here, which cannot be circumvented due to the
\series bold
synchronous
\series default
nature of DRBD.
\end_layout
\end_inset
due to the bottleneck.
\end_layout
\begin_layout Standard
As a consequence, the application running on top of DRBD will see increasingly
higher IO latencies and/or stalls / hangs.
We found practical cases (at least with former versions of DRBD) where
IO latencies exceeded practical monitoring limits such as
\begin_inset Formula $5$
\end_inset
s by far, up to the range of
\emph on
minutes
\emph default
.
Experienced sysadmins will know what happens next: your application will
run into an
\series bold
incident
\series default
, and your customers will be dissatisfied.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
In order to deal with such situations, DRBD has lots of tuning parameters.
In particular, the
\family typewriter
timeout
\family default
parameter and/or the
\family typewriter
ping-timeout
\family default
parameter will determine when DRBD will give up in such a situation and
simply drop the network connection as an emergency measure.
Dropping the network connection is roughly equivalent to an automatic
\family typewriter
disconnect
\family default
, followed by an automatic re-connect attempt after
\family typewriter
connect-int
\family default
seconds.
During the dropped connection, the incident will appear as being resolved,
but at some hidden cost
\begin_inset Foot
status open
\begin_layout Plain Layout
By appropriately tuning various DRBD parameters, such as
\family typewriter
timeout
\family default
and/or
\family typewriter
ping-timeout
\family default
, you can keep the impact of the incident below some viable limit.
However, the automatic disconnect will then happen earlier and more often
in practice.
Flaky or overloaded networks may easily lead to an enormous number of automatic
disconnects.
\end_layout
\end_inset
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
What happens next in our scenario? During the
\family typewriter
disconnect
\family default
, DRBD will record all positions of writes in its bitmap and/or in its activity
log.
As soon as the automatic re-connect succeeds after
\family typewriter
connect-int
\family default
seconds, DRBD has to do a partial re-sync of those blocks which were marked
dirty in the meantime.
This leads to an
\emph on
additional
\emph default
bandwidth demand
\begin_inset Foot
status open
\begin_layout Plain Layout
DRBD parameters
\family typewriter
sync-rate
\family default
resp
\family typewriter
resync-rate
\family default
may be used to tune the height of the additional demand.
In addition, the newer parameters
\family typewriter
c-plan-ahead
\family default
,
\family typewriter
c-fill-target
\family default
,
\family typewriter
c-delay-target
\family default
,
\family typewriter
c-min-rate
\family default
,
\family typewriter
c-max-rate
\family default
and friends may be used to dynamically adapt to
\emph on
some
\emph default
situations where the application throughput
\emph on
could
\emph default
fit through the bottleneck.
These newer parameters were developed in a cooperation between 1&1 and
Linbit, the maker of DRBD.
\end_layout
\begin_layout Plain Layout
Please note that lowering / dynamically adapting the resync rates may help
in lowering the
\emph on
probability
\emph default
of occurrences of the above problems in practical scenarios where the bottlenec
k would recover to viable limits after some time.
However, lowering the rates will also increase the
\emph on
duration
\emph default
of re-sync operations accordingly.
The
\emph on
total amount of re-sync data
\emph default
simply does not decrease when lowering
\family typewriter
resync-rate
\family default
; it even tends to increase over time when new requests arrive.
Therefore, the
\emph on
expectancy value
\emph default
of problems caused by
\emph on
strong
\emph default
network bottlenecks (i.e.
when not even the ordinary application rate is fitting through) is
\emph on
not
\emph default
improved by lowering or adapting
\family typewriter
resync-rate
\family default
, but rather the expectancy value mostly depends on the
\emph on
relation
\emph default
between the amount of holdback data versus the amount of application write
data, both measured for the duration of some given strong bottleneck.
\end_layout
\end_inset
as indicated by the upper dotted blue box.
\end_layout
\begin_layout Standard
Of course, there is
\emph on
absolutely no chance
\emph default
to get the increased amount of data through our bottleneck, since not even
the ordinary application load (lower dotted lines) could be transferred.
\end_layout
\begin_layout Standard
Therefore, you run a
\series bold
very high risk
\series default
that the re-sync cannot finish before the next
\family typewriter
timeout
\family default
/
\family typewriter
ping-timeout
\family default
cycle will drop the network connection again.
\end_layout
\begin_layout Standard
What will be the final result when that risk becomes true? Simply, your
secondary site will be
\emph on
permanently
\emph default
in state
\family typewriter
inconsistent
\family default
.
This means, you have lost your redundancy.
In our scenario, there is no chance at all to become consistent again,
because the network bottleneck declines more and more, slowly.
It is simply
\emph on
hopeless
\emph default
, by construction.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
In case you lose your primary site now, you are completely lost.
\end_layout
\begin_layout Standard
\noindent
Some people may argue that the probability for a similar scenario is low.
We don't agree with such an argument.
Not only because it really happens in practice, and it may even last some
days until problems are fixed.
In case of so-called
\series bold
\begin_inset Quotes eld
\end_inset
rolling disasters
\series default
\begin_inset Quotes erd
\end_inset
, the network is very likely to become flaky and/or overloaded shortly before
the final damage.
Even in other cases, you can easily end up with inconsistent secondaries.
It occurs not only in the lab, but also in practice if you operate some
hundreds or even thousands of DRBD instances.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Resilience of DRBD
\end_layout
\end_inset
The point is that you can produce an ill behaviour
\emph on
systematically
\emph default
just by overloading the network a bit for some sufficient duration.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
When coupling whole datacenters via some thousands of DRBD connections
(see the example scenario in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
), any (short) network loss will almost certainly increase the re-sync network
load each time the outage appears to be over.
As a consequence, overload may be
\emph on
provoked
\emph default
by the re-sync repair attempts.
This may easily lead to self-amplifying
\series bold
throughput storms
\series default
in some resonance frequency (similar to self-destruction of a bridge when
an army is marching over it in lockstep).
\end_layout
\begin_layout Standard
The only way for reliable prevention of loss of secondaries is to start
any re-connect
\emph on
only
\emph default
in such situations where you can
\emph on
predict in advance
\emph default
that the re-sync is
\emph on
guaranteed
\emph default
to finish before any network bottleneck / loss will cause an automatic
disconnect again.
We don't know of any method which can reliably predict the future behaviour
of a complex network.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Risks from non-crossover DRBD
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Conclusion: in the presence of network bottlenecks, you run a considerable
risk that your DRBD mirrors get destroyed just in that moment when you
desperately need them.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice that
\emph on
classical
\emph default
crossover cables usually do not show a behaviour like depicted by the red
line.
Traditional crossover cables are
\emph on
passive components
\emph default
which normally
\begin_inset Foot
status open
\begin_layout Plain Layout
Exceptions might be mechanical jiggling of plugs, or electromagnetic
interference.
We never noticed any of them.
\end_layout
\end_inset
either work, or not.
The binary connect / disconnect behaviour of DRBD has no problems to cope
with that.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
However, some newer Ethernet cable technologies like SFP+ and faster are
no longer passive.
They have some internal chips inside of their plugs.
Thus they may
\series bold
fail independently
\series default
from your storage nodes.
Then you run at least the risks from the CAP theorem, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
In addition to CAP effects, intermittent errors such as flaky electrical
contacts may raise the above risk of permanent data loss.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
or
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Linbit recommends a
\series bold
workaround
\series default
for the inconsistencies during re-sync: LVM snapshots.
We tried it, but found a
\emph on
performance penalty
\emph default
which made it prohibitive for our concrete application.
A problem seems to be the cost of destroying snapshots.
LVM uses by default a BOW strategy (Backup On Write, which is the counterpart
of COW = Copy On Write).
BOW increases IO latencies during ordinary operation.
Retaining snapshots is cheap, but reverting them may be very costly, depending
on workload.
We didn't fully investigate that effect, and our experience is a few years
old.
You might come to a different conclusion for a different workload, for
newer versions of system software, or for a different strategy if you carefully
investigate the field.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
DRBD problems usually arise
\emph on
only
\emph default
when the network throughput shows some
\begin_inset Quotes eld
\end_inset
awkward
\begin_inset Quotes erd
\end_inset
analog behaviour, such as overload, or as occasionally produced by various
switches / routers / transmitters, or other potential sources of packet
loss.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Behaviour of MARS
\begin_inset CommandInset label
LatexCommand label
name "subsec:Behaviour-of-MARS"
\end_inset
\end_layout
\begin_layout Standard
The behaviour of MARS in the above scenario:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/network-bottleneck-mars.fig
width 80col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
When the network is restrained, an asynchronous system like MARS will continue
to serve the user IO requests (dotted green line) without any impact /
incident while the actual network throughput (solid green line) follows
the red line.
In the meantime, all changes to the block device are recorded at the transactio
n logfiles.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Here is one point in favour of DRBD: MARS stores its transaction logs on
the filesystem
\family typewriter
/mars/
\family default
.
When the network bottleneck is lasting very long (some days or even some
weeks), the filesystem will eventually run out of space some day.
\family typewriter
mars-user-manual.pdf
\family default
discusses countermeasures against that in detail.
In contrast to MARS, DRBD allocates its bitmap
\emph on
statically
\emph default
at resource creation time.
It uses up less space, and you don't have to monitor it for (potential)
overflows.
The space for transaction logs is the price you have to pay if you want
or need anytime consistency, or asynchronous replication in general.
\end_layout
\begin_layout Standard
In order to really grasp the
\emph on
heart
\emph default
of the difference between synchronous and asynchronous replication, we
look at the following modified scenario:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/network-flaky-mars.fig
width 80col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
This time, the network throughput (red line) is varying
\begin_inset Foot
status open
\begin_layout Plain Layout
In real life, many long-distance lines or even some heavily used metro lines
usually show fluctuations of their network bandwidth by an order of magnitude,
or even higher.
We have measured them.
The overall behaviour can be characterized as
\begin_inset Quotes eld
\end_inset
\series bold
chaotic
\series default
\begin_inset Quotes erd
\end_inset
.
\end_layout
\end_inset
in some unpredictable way.
As before, the application throughput served by MARS is assumed to be constant
(dotted green line, often superseded by the solid green line).
The actual replication network throughput is depicted by the solid green
line.
\end_layout
\begin_layout Standard
As you can see, a network dropdown undershooting the application demand
has no impact onto the application throughput, but only onto the replication
network throughput.
Whenever the network throughput is held back due to the flaky network,
it simply catches up as soon as possible by overshooting the application
throughput.
The amount of lag-behind is visualized as shaded area: downward shading
(below the application throughput) means an increase of the lag-behind,
while the upwards shaded areas (beyond the application throughput) indicate
a decrease of the lag-behind (catch-up).
Once the lag-behind has been fully caught up, the network throughput suddenly
jumps back to the application throughput (here visible in two cases).
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Note that the existence of lag-behind areas is roughly corresponding to
DRBD disconnect states, and in turn to DRBD inconsistent states of the
secondary as long as the lag-behind has not been fully caught up.
The very rough
\begin_inset Foot
status open
\begin_layout Plain Layout
Of course, this visualization is not exact.
On one hand, the DRBD inconsistency phase may start later than depicted here,
because it only starts
\emph on
after
\emph default
the first automatic disconnect, upon the first automatic re-connect.
In addition, the amount of resync data may be smaller than the amount of
corresponding MARS transaction logfile data, because the DRBD bitmap will
coalesce multiple writes to the same block into one single transfer.
On the other hand, DRBD will transfer no data at all during its disconnected
state, while MARS continues doing its best.
This leads to a prolongation of the DRBD inconsistent phase.
Depending on properties of the workload and of the network, the real duration
of the inconsistency phase may be either shorter or longer.
\end_layout
\end_inset
duration of the corresponding DRBD inconsistency phase is visualized as
magenta line at the time scale.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Optimum throughput via MARS
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
MARS utilizes the existing network bandwidth as best as possible in order
to pipe through as much data as possible, provided that there exists some
data requiring expedition.
Conceptually, there exists no better way due to information theoretic limits
(besides data compression).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Note that
\emph on
on average
\emph default
during a longer period of time, the network must have enough capacity for
transporting
\emph on
all
\emph default
of your data.
MARS cannot magically break through information-theoretic limits.
It cannot magically transport terabytes of data in a few seconds over very
slow modem
\begin_inset Foot
status open
\begin_layout Plain Layout
A certain colleague at 1&1 is using MARS for a private application: CDP
= Continuous Data Protection of a critical Windows VM over his home DSL
line.
\end_layout
\end_inset
lines.
Only
\emph on
relatively short
\emph default
network problems / packet loss can be compensated, depending on the capacity
of the
\family typewriter
/mars
\family default
filesystem.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In case of lag-behind, the version of the data replicated to the secondary
site corresponds to some time in the past.
Since the data is always transferred in the same order as originally submitted
at the primary site, the secondary never gets inconsistent.
Your mirror always remains usable.
Your only potential problem could be the outdated state, corresponding
to some state in the past.
However, the
\begin_inset Quotes eld
\end_inset
as-best-as-possible
\begin_inset Quotes erd
\end_inset
approach to the network transfer ensures that your version is always
\emph on
as up-to-date as possible
\emph default
even under ill-behaving network bottlenecks.
\series bold
There is simply no better way to do it.
\series default
In presence of temporary network bottlenecks such as network congestion,
there exists no better method than prescribed by the information theoretic
limit (red line, neglecting data compression).
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In order to get all of your data through the line, at some point the network
must be healthy again.
Otherwise, data will be recorded until the capacity of the
\family typewriter
/mars/
\family default
filesystem is exhausted, leading to an emergency mode (see
\family typewriter
mars-user-manual.pdf
\family default
).
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Risk reduction via MARS
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
MARS' property of never sacrificing local data consistency (at the possible
cost of actuality, as long as you have enough capacity in
\family typewriter
/mars/
\family default
) is called
\series bold
Anytime Consistency
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Even when the capacity of
\family typewriter
/mars/
\family default
is exhausted and thus emergency mode is entered, the replicas will
\emph on
not
\emph default
become inconsistent by themselves.
However, when the emergency mode is later
\emph on
cleaned up
\emph default
for a replica via
\family typewriter
marsadm invalidate
\family default
, it will become
\emph on
temporarily
\emph default
inconsistent during the fast full sync.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
When you have a total of
\begin_inset Formula $k\geq3$
\end_inset
replicas, you don't need to invalidate them
\emph on
all in parallel
\emph default
.
By cascading the full syncs sequentially, you can retain some consistent,
but outdated replica for the meantime, until all syncs have finished.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Conclusion: you can even use
\series bold
traffic shaping
\series default
on MARS' TCP connections in order to globally balance your network throughput
(of course at the cost of actuality, but without sacrificing local data
consistency).
If you would try to do the same with DRBD, you could easily provoke a disaster.
MARS simply tolerates any network problems, provided that there is enough
disk space for transaction logfiles.
Even in case of completely filling up your disk with transaction logfiles
after some days or weeks, you will not lose local consistency anywhere.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Simple traffic shaping by default
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Newer versions of MARS are automatically setting the so-called TOS fields
in standard TCP/IP packets for you, which is backwards compatible with
the newer DSCP feature.
You just need to properly configure your network equipment for this type
of traffic shaping, unless it is already enabled by default by your
network vendor.
In the latter case, you don't need to do anything, in order to get some
improvements by automatic traffic shaping for free.
Details are in
\family typewriter
mars-user-manual.pdf
\family default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Finally, here is yet another scenario where MARS can cope with the situation:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/network-constant-mars.fig
width 80col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
This time, the network throughput limit (solid red line) is assumed to be
constant.
However, the application workload (dotted green line) shows some heavy
peaks.
We know from our 1&1 datacenters that such an application behaviour is
very common (e.g.
in case of certain kinds of DDOS attacks etc).
\end_layout
\begin_layout Standard
When the peaks are exceeding the network capacities for some short time,
the replication network throughput (solid green line) will be limited for
a short time, stay a little bit longer at the limit, and finally drop down
again to the normal workload.
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Resilience against load peaks
\end_layout
\end_inset
In other words, you get a flexible buffering behaviour, coping with application
load peaks.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Similar scenarios (where both the application workload has peaks and the
network is flaky to some degree) are rather common.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
If you used DRBD in place of MARS, you would be likely to run into regular
application performance problems and/or frequent automatic disconnect cycles,
depending on the height and on the duration of the peaks, and on network
resources.
As observed at 1&1, even permanent data loss is possible, with some residual
probability.
\end_layout
\begin_layout Section
Long Distances / High Latencies
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
In general and in some theories, latencies are conceptually independent
from throughput, at least to some degree.
There exist all 4 possible combinations:
\end_layout
\begin_layout Enumerate
There exist communication lines with high latencies but also high throughput.
Examples are raw fibre cables at the ground of the Atlantic.
\end_layout
\begin_layout Enumerate
High latencies on low-throughput lines are very easy to achieve.
If you never saw it, you never ran interactive
\family typewriter
vi
\family default
over
\family typewriter
ssh
\family default
in parallel to downloads on your old-fashioned modem line.
\end_layout
\begin_layout Enumerate
Low latencies need not be incompatible with high throughput.
See Myrinet, InfiniBand or high-speed point-to-point interconnects, such
as modern RAM busses.
\end_layout
\begin_layout Enumerate
Low latency combined with low throughput is also possible: in an ATM system
(or another pre-reservation system for bandwidth), just increase the multiplex
factor on low-capacity but short lines, which is only possible at the cost
of assigned bandwidth.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
In the
\emph on
internet
\emph default
practice, it is very likely that
\series bold
high network latencies will also lead to worse throughput
\series default
, because of the
\emph on
congestion control algorithms
\emph default
running all over the world.
\end_layout
\begin_layout Standard
We have experimented with extremely large TCP send/receive buffers plus
various window sizes and congestion control algorithms over long-distance
lines between the USA and Europe.
Yes, it is possible to improve the behaviour to some degree.
But magic does not happen.
Natural laws like Einstein's laws and Kirchhoff's laws (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
) will always hold.
You simply cannot travel faster than the speed of light, and you cannot
bypass the information theoretic limits of your transport media.
\end_layout
\begin_layout Standard
Our experience leads to the following rule of thumb, not formally proven
by anything, but just observed in practice:
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Safety rule for synchronous replication
\end_layout
\end_inset
In general,
\emph on
synchronous
\emph default
data replication (not limited to applications of DRBD) works reliably only
over distances
\begin_inset Formula $<50$
\end_inset
km, or sometimes even less.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
There may be some exceptions
\begin_inset Foot
status open
\begin_layout Plain Layout
We have heard of cases where even
\emph on
less
\emph default
than 50 km were not working with DRBD.
It depends on application workload, on properties of the line, and on congestio
n caused by other traffic.
Some other people told us that according to
\emph on
their
\emph default
experience, much shorter distances should be considered operable, only in
the range of a few single kilometers.
However, they agree that DRBD is rock stable when used on crossover cables.
\end_layout
\end_inset
, e.g.
when dealing with low-end workstation loads.
But when you are
\series bold
responsible
\series default
for a whole datacenter and/or for
\series bold
enterprise-critical data
\series default
, don't waste your time by trying (almost) impossible things.
We recommend using MARS in such use cases.
\end_layout
\begin_layout Section
Explanation via CAP Theorem
\begin_inset CommandInset label
LatexCommand label
name "sec:Explanation-via-CAP"
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/cap-theorem.fig
width 60col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
The famous CAP theorem, also called Brewer's theorem, is important for a
deeper understanding of architectural setups, and of important differences
between synchronous and asynchronous replication.
It is mandatory for understanding the implementations DRBD and MARS.
It is a
\emph on
trilemma
\emph default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-a-Trilemma"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
A good explanation of the CAP Theorem can be found at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/CAP_theorem
\end_layout
\end_inset
(retrieved July 2018).
\end_layout
\begin_layout Standard
The CAP theorem states that only 2 out of 3 properties can be achieved at
the same time, when an
\emph on
arbitrary
\emph default
Distributed System is under pressure: C = Consistency means
\series bold
\emph on
Strict
\series default
\emph default
Consistency at the level of the
\emph on
distributed
\emph default
system (which is
\emph on
not
\emph default
the same as strict consistency
\emph on
inside
\emph default
of one of the
\emph on
local
\emph default
systems), A = Availability = intuitively clear from a user's perspective,
and P = Partitioning Tolerance = the network may have its own outages at
any time (which is a
\emph on
negative criterion
\emph default
and thus will
\series bold
complicate resolutions like hell
\series default
).
\end_layout
\begin_layout Standard
As explained in the Wikipedia article, the P = Partitioning Tolerance is
a property which is important at least in
\emph on
wide-distance
\emph default
data replication scenarios, and possibly / very likely also in other scenarios.
The property P cannot generally be chosen at runtime, but is
\emph on
given
\emph default
by
\series bold
\emph on
setup
\series default
\emph default
of the Distributed System.
\end_layout
\begin_layout Subsection
CAP Differences between DRBD and MARS
\begin_inset CommandInset label
LatexCommand label
name "subsec:CAP-Differences"
\end_inset
\end_layout
\begin_layout Standard
If you are considering only short distances like passive crossover cables
between racks,
\emph on
then
\emph default
(and
\emph on
only then
\emph default
) you may
\emph on
assume(!)
\emph default
that no effort for achieving property P is required, because it is already
given for free.
Then, and only then, you can get both A and C at the same time, without
sacrificing P, because P is already for free by
\emph on
assumption
\emph default
.
In such a passive crossover cable scenario, getting all three properties
C and A and P is possible, similarly to an explanation in the Wikipedia
article.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Newer types of network cables for 10 GBit and more (e.g.
SFP+) may have some active chips internally in their plugs.
Suchalike technologies are no longer passive.
Consequently, the assumption
\begin_inset Quotes eld
\end_inset
passive component which cannot fail
\begin_inset Quotes erd
\end_inset
is no longer true by construction.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Some companies are claiming that 100% reliability of (certain) networks
can be achieved.
Please check these claims
\emph on
seriously
\emph default
, and look at the price tag.
Physics says that everything in this universe has a lifetime, and will
fail at some point.
Nothing is
\emph on
actually
\emph default
for free.
\end_layout
\begin_layout Standard
Relying on the assumption
\begin_inset Quotes eld
\end_inset
P is for free = the network cannot fail
\begin_inset Quotes erd
\end_inset
leads us to classical use cases for DRBD: when both DRBD replicas are always
staying physically connected via a passive crossover cable (which is
\emph on
assumed
\emph default
to never break down), you
\emph on
could potentially
\emph default
get both strict global consistency and availability.
\end_layout
\begin_layout Standard
Whether this is real in practice for DRBD, is a different story.
It depends on the
\emph on
implementation
\emph default
of DRBD.
Some sysadmins at 1&1 Ionos have found that there is no 100%
CAP guarantee, regardless of DRBD protocol configuration, while they were
testing only some cases where only
\emph on
one
\emph default
of the DRBD nodes was failing
\begin_inset Foot
status open
\begin_layout Plain Layout
In addition, you will need some further components like Pacemaker, iSCSI
failover, etc.
These might also be involved in the practically observed behaviour.
\end_layout
\end_inset
.
Both C and A are provided by DRBD during
\family typewriter
connected
\family default
state, while P is
\emph on
assumed
\emph default
to be provided by a passive component.
\end_layout
\begin_layout Standard
By addition of iSCSI failover (e.g.
ALUA and similar technologies), it
\emph on
should
\emph default
be possible to achieve A, even in case of single storage node failures,
while retaining C from the viewpoint
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice: the CAP theorem does not deal with node failures, only with
\emph on
network
\emph default
failures.
Node failures would always violate C by some
\begin_inset Quotes eld
\end_inset
strong
\begin_inset Quotes erd
\end_inset
definition.
By some
\begin_inset Quotes eld
\end_inset
weaker
\begin_inset Quotes erd
\end_inset
definition, the downtime plus recovery time (e.g.
DRBD re-sync) can be taken out of the game.
Notice: while a node can always
\begin_inset Quotes eld
\end_inset
know
\begin_inset Quotes erd
\end_inset
whether it has failed (at least after reboot), network failures cannot
be distinguished from failures of remote nodes in general.
Therefore node failures and network failures are fundamentally different
by their nature.
\end_layout
\end_inset
of the application.
\end_layout
\begin_layout Standard
This is explained by the thick line in the following variant of the graphics,
which is only valid for passive crossover cables where P need not be guaranteed
by the replication because it is already assumed for free:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/cap-drbd-operational.fig
width 60col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
Now look at the case of a truly Distributed System, where P cannot be assumed
as for free.
For example, try to use the component DRBD in a long-distance replication
scenario.
There we cannot assume P as already given.
We
\series bold
must
\emph on
tolerate
\series default
\emph default
replication network outages.
DRBD is reacting to this differently in two different modes.
\end_layout
\begin_layout Standard
First we look at the (short) time interval
\emph on
before
\emph default
DRBD recognizes the replication network incident, and before it leaves
the
\family typewriter
connected
\family default
state.
During this phase, the application IO will
\series bold
hang
\series default
for some time, indicating the (temporary) sacrifice (from a user's perspective)
by a red X:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/cap-drbd-connected.fig
width 60col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
Because Availability is one of the highest goods of enterprise-critical
IT operations, you will typically configure DRBD such that it automatically
switches to some variant of a
\family typewriter
disconnected
\family default
state after some timeout, thereby giving up consistency between both replicas.
The red X indicates not only loss of global strict consistency in the sense
of the CAP theorem, but also that your replica will become
\family typewriter
Inconsistent
\family default
during the following re-sync:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/cap-drbd-disconnected.fig
width 60col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
You may wonder what the difference to MARS is.
As explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
, the component MARS is not only intended for wide distances, but also for
\series bold
Cloud Storage
\series default
where no strict consistency is required at global level by definition,
but instead
\series bold
Eventually Consistent
\series default
is the preferred model for the Distributed System.
Therefore,
\emph on
strict
\emph default
consistency (in the sense of the CAP theorem) is
\emph on
not required by definition
\emph default
.
\end_layout
\begin_layout Standard
Consequently, the red X is not present in the following graphics, showing
the state where the component MARS is remaining
\emph on
locally consistent
\emph default
all the time
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice that the
\emph on
initial
\emph default
full sync is not considered here, neither for DRBD, nor for MARS.
\emph on
Setup
\emph default
of the Distributed System is its own scenario, not considered here.
\emph on
Repair
\emph default
of a
\emph on
damaged
\emph default
system is also a different scenario, also not considered here.
Notice that MARS' emergency mode also belongs to the class of
\begin_inset Quotes eld
\end_inset
damages
\begin_inset Quotes erd
\end_inset
, as well as DRBD's disk failure modes, where it has some additional functionalit
y compared to the current version of MARS.
\end_layout
\end_inset
, even when a network outage occurs:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/cap-mars.fig
width 60col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice: the component MARS does not guarantee strict consistency
\emph on
between
\emph default
LV replicas at the level of the Distributed System, but only Eventually
Consistent.
However,
\emph on
at the same time
\emph default
it
\emph on
also
\emph default
guarantees strict consistency as a
\emph on
local component
\emph default
, and even at
\emph on
each
\emph default
of the passive replicas, each by each.
Please don't confuse these different levels.
There are two different consistency guarantees at different levels, both
at the same time.
This might be confusing if you are not looking at the system at different
levels: (1) overall Distributed System versus (2) each of the local system
instances.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Why does the component MARS do this? Because a better way is not possible
at all.
The CAP theorem tells us that there exists no better way when both A has
to be guaranteed (as almost everywhere in enterprise-critical IT operations
except database systems), and P has to be ensured in geo-redundant datacenter
disaster scenarios or some other scenarios.
Similarly to natural laws like Einstein's laws of the speed of light, there
\emph on
does not exist
\emph default
a better way.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Solution classification of DRBD
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Conclusion from the CAP theorem: when P is a
\emph on
hard
\emph default
\emph on
requirement
\emph default
, don't use the component DRBD (or any other
\emph on
synchronous
\emph default
replication implementation) for long-distance and/or true Cloud Storage
scenarios.
It is only well-suited for short-distance crossover cable scenarios.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The red X is in particular problematic during re-sync, after the network
has become healthy again (cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Behaviour-of-DRBD"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
The component MARS has no red X at C because of its
\series bold
Anytime Consistency
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
There is an exception: during
\emph on
repair
\emph default
via
\family typewriter
marsadm invalidate
\family default
, the current replica is reported as
\family typewriter
InConsistent
\family default
.
Repair means: the sysadmin has noticed that something is
\emph on
damaged
\emph default
(e.g.
the underlying HDDs, or the underlying RAID system, etc), or that a split-brain
situation needs to be repaired, and thus has triggered the repair operation
as provided by the replication subsystem.
Of course, repair of damaged components must be always possible (see also
explanation of
\series bold
recovery
\series default
in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Global-Eventual-Consistency"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
A similar re-sync is also supported by DRBD.
\end_layout
\end_inset
, which refers to
\emph on
local
\emph default
consistency, and which is violated by DRBD during certain important phases
of its regular operation.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Impossible requirements
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
If you think that you require all three properties C+A+P, but you don't
have passive crossover cables over short distances, you are requiring something
which is
\series bold
impossible
\series default
in general.
You need to give up one of them, at least with a certain probability.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
There exists no solution, with whatever component, or from whatever commercial
storage vendor.
Although some
\begin_inset Quotes eld
\end_inset
marketing drones
\begin_inset Quotes erd
\end_inset
are claiming the impossible, e.g.
by citing
\emph on
examples
\emph default
, which are then incorrectly generalized.
You might have luck, and there might be
\emph on
exceptional examples
\emph default
where all three C+A+P were ok,
\series bold
by chance
\series default
.
But there remains a
\series bold
risk
\series default
.
The CAP theorem is as hard as Einstein's natural laws are.
\end_layout
\begin_layout Standard
You need a conscious decision about
\series bold
priorities
\series default
, which property to drop first.
Please rethink your complete concept, from end to end.
Something is wrong, somewhere.
Ignoring a fundamental law like CAP on enterprise-critical use cases can
endanger a company and/or your career.
\end_layout
\begin_layout Subsection
CAP Commonalities between DRBD and MARS
\begin_inset CommandInset label
LatexCommand label
name "subsec:CAP-Commonalities"
\end_inset
\end_layout
\begin_layout Standard
In this subsection, we look at the case that P is not for free, but has
to be ensured by the Distributed Storage system.
\end_layout
\begin_layout Standard
You may have noticed that MARS' ordinary CAP behaviour is similar to DRBD's
CAP picture in
\family typewriter
disconnected
\family default
state, or during similar states when the replication network is interrupted.
\end_layout
\begin_layout Standard
Replication network interruption is also known as
\begin_inset Quotes eld
\end_inset
Network Partitioning
\begin_inset Quotes erd
\end_inset
.
This is where property P = Partitioning Tolerance comes into play.
\end_layout
\begin_layout Standard
When a network partition has
\emph on
actually occurred
\emph default
, both DRBD and MARS allow you to do the same: you may
\series bold
forcefully switch
\series default
the
\family typewriter
primary
\family default
role, which means activation of a former
\family typewriter
secondary
\family default
node.
In such a situation, you can issue commands like
\family typewriter
drbdadm primary --force
\family default
or
\family typewriter
marsadm primary --force
\family default
.
It is no accident that both commands are looking similar to each other.
\end_layout
\begin_layout Standard
The outcome will be the same: you will most likely get a
\family typewriter
\series bold
SplitBrain
\family default
\series default
situation.
\end_layout
\begin_layout Standard
The possibility of getting a split brain is no specific property of either
DRBD or MARS.
It will also happen with any other replication system, whether synchronous
or asynchronous.
\end_layout
\begin_layout Standard
It is one of the consequences from the CAP theorem when (1a) P has to be
assured, and (1b) a network partition has
\emph on
actually occurred
\emph default
, and (2) when A = Availability is enforced at both sides of the network
partition.
The result is that C =
\emph on
global
\emph default
Consistency may be violated, by creation of two or more versions of the
data.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
There are some
\emph on
ideas
\emph default
for
\emph on
dynamic
\emph default
control of
\family typewriter
SplitBrain
\family default
at
\emph on
runtime
\emph default
.
The decision about forceful creation of SplitBrain can be made
\emph on
dynamically dependent
\emph default
on further external factors, like current customer demands, or forecasts,
etc.
Please evaluate them carefully before going into mass production.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Careful: at least for some application classes, it is a bad idea to systematica
lly create split brain via automatic cluster managers, e.g.
Pacemaker or similar.
As explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Inappropriate-Clustermanger"
plural "false"
caps "false"
noprefix "false"
\end_inset
, some cluster managers were originally constructed for truly shared disk
scenarios, where no split brain can occur by construction.
Using them in masses on versioned data in truly distributed systems can
result in existential surprises, once a bigger network partition and/or
a flaky replication network (so-called
\series bold
flipping
\series default
or
\series bold
flapping
\series default
) can trigger them in masses, and possibly at unexpected moments.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
Split brain should not be provoked when not
\emph on
absolutely
\emph default
necessary.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Split brain resolution is anything but easy
\emph on
in general
\emph default
.
When the data is in a generic block device, you typically will have no
general means for
\emph on
merging
\emph default
both versions.
This means, split brain resolution is typically only possible by
\series bold
throwing away
\series default
some of the versions.
\end_layout
\begin_layout Standard
This kind of split brain resolution problem is not specific for DRBD or
MARS.
It is a fundamental property of Distributed Systems, and the difficulty
of resolution is an inherent property of generic block devices
\begin_inset Foot
status open
\begin_layout Plain Layout
Some people have claimed that the block device layer would be obsolete in
future, to be replaced by filesystems.
Please read sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Filesystem-Layer-vs"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
DRBD and MARS have some commands like
\family typewriter
drbdadm invalidate
\family default
or
\family typewriter
marsadm invalidate
\family default
for this.
Again, the similarity is no accident.
\end_layout
\begin_layout Standard
Notice that classical filesystems aren't typically better than raw block
devices.
There are even more possibilities for tricky types of
\series bold
conflicts
\series default
(e.g.
on path names in addition to file content).
Anyway, long-distance replication should not be done at filesystem layer,
see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Performance-Risks-Replication-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
Similarly, BigCluster object stores are often suffering from similar (or
even worse) problems, because higher application layers may have some hidden
internal dependencies between object versions, while the object store itself
is agnostic of version dependencies in general
\begin_inset Foot
status open
\begin_layout Plain Layout
There exist lots of types of potential dependencies between objects.
Temporal ones are easy to capture, but this is not sufficient in general
for everything.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Caution: when stacking block devices or filesystems, or any other complex
\emph on
structured aggregates
\emph default
on top of some BigCluster object store, you are creating another fundamental
risk, in addition to Dijkstra regressions explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "par:Negative-Example:-object"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Several types
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice: BigCluster architectures are typically discriminating between
client servers and storage servers.
This will typically introduce some more possibilities into the game, such
as forced client failover, independently from forced storage failover.
\end_layout
\end_inset
of object stores will not magically resolve any split brain for you.
Check whether your favorite object store implementation has some kind of
equivalent of a
\family typewriter
primary --force
\family default
command.
If it doesn't have one, or only a restricted one, you should be
\series bold
\emph on
alerted
\series default
\emph default
.
In case of a
\emph on
long-lasting(!)
\emph default
storage network partition, you might need suchalike
\emph on
desperately
\emph default
for ensuring A, even at the cost of C
\begin_inset Foot
status open
\begin_layout Plain Layout
\noindent
Notice that the C functionality is often not implemented by the object store
itself (which typically provides only
\emph on
eventually consistent
\emph default
at object granularity), but implemented by the distributed block device
or distributed filesystem, if it is implemented at all.
There is a fundamental problem with at least 3 different granularities
to be resolved: in order to guarantee strict consistency at (1) aggregate
granularity, which is independent from the (2) network partition granularity,
in general multiple versions of objects may be required at (3) object granulari
ty.
Does your object store have a means for this, similarly to multiversion
databases, e.g.
multiversion timestamp ordering?
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Check: whether you need this heavily depends on the
\series bold
\emph on
application class
\series default
\emph default
(see also the Cloud Storage definition in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Properties-Cloud-Storage"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
If you
\emph on
would
\emph default
need it, but you are
\series bold
not prepared for suchalike scenarios at your enterprise-critical data
\series default
, it could cost your company a lot of money and/or reputation and/or even
its existence.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice: the
\emph on
concept
\emph default
of
\family typewriter
SplitBrain
\family default
is occurring almost everywhere in truly Distributed Systems when C can
be violated in favour of A+P.
It is a very general consequence
\begin_inset Foot
status open
\begin_layout Plain Layout
There exist only few opportunities for generic conflict resolution, even
in classical databases where
\emph on
some
\emph default
knowledge about the structure of the data is available.
Typically, there exist some more
\emph on
hidden
\emph default
dependencies than people are expecting.
Lossless
\family typewriter
SplitBrain
\family default
resolution will thus need to be implemented at application layer, if it
is possible at all.
\end_layout
\end_inset
of the CAP theorem.
\end_layout
\begin_layout Standard
The only reliable way for avoiding split brain in truly distributed systems
would be: don't insist on A = Availability.
Notice that there exist only a few application classes, like certain types
of banking, where C is typically a higher good than A.
\end_layout
\begin_layout Standard
Notice that both DRBD and MARS are supporting suchalike application classes
also: just
\emph on
don't
\emph default
add the option
\family typewriter
--force
\family default
to the
\family typewriter
primary
\family default
switch command.
\end_layout
\begin_layout Standard
However: even in banking, some
\emph on
extremely extraordinary
\emph default
scenarios might occur, where sacrifice of C in favour of A could be necessary
(e.g.
when
\emph on
manual cleanup
\emph default
of C is cheaper than long-lasting violations of A).
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Summary CAP decisions
\end_layout
\end_inset
Both DRBD and MARS have some emergency measure for killing C in favour of
A.
It requires your
\series bold
conscious decision
\series default
whether / where / when to use it,
\emph on
or not
\emph default
.
\end_layout
\end_inset
\end_layout
\begin_layout Section
Higher Consistency Guarantees vs Actuality
\end_layout
\begin_layout Standard
We already saw in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Network-Bottlenecks"
plural "false"
caps "false"
noprefix "false"
\end_inset
that certain types of network bottlenecks can easily (and reproducibly)
destroy the consistency of your DRBD secondary, while the component MARS
will preserve local consistency at the cost of actuality (
\series bold
anytime consistency
\series default
).
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
Some people, often located at database operations, are obtrusively arguing
that actuality is such a high good that it must not be sacrificed under
any circumstances.
\end_layout
\begin_layout Plain Layout
Anyone arguing this way has at least the following choices (list may be
incomplete):
\end_layout
\begin_layout Enumerate
None of the above use cases for MARS apply.
For instance, short distance replication over crossover cables is sufficient,
and the network is reliable enough such that bottlenecks can never occur
(e.g.
because the total load is extremely low, or conversely the network is extremely
overengineered / expensive), or the occurrence of bottlenecks can
\emph on
provably
\emph default
be taken into account.
In such cases, DRBD is clearly the better solution than MARS, because it
provides better actuality than the current version of MARS, and it uses
up less disk resources.
\end_layout
\begin_layout Enumerate
In the presence of network bottlenecks, some people didn't notice and/or
didn't understand and/or underestimated the risk of accidental invalidation
of their DRBD secondaries.
They should carefully check that risk.
They should convince themselves that the risk is
\emph on
really
\emph default
bearable.
Once they are hit by a
\emph on
systematic chain
\emph default
of events (e.g.
a certain instance of so-called
\series bold
rolling disasters
\series default
) which
\emph on
reproducibly
\emph default
provoke the bad effect, it will be too late
\begin_inset Foot
status open
\begin_layout Plain Layout
Some people seem to need a bad experience before they get the difference
between risk caused by reproducible effects and inverted luck.
\end_layout
\end_inset
.
\end_layout
\begin_layout Enumerate
In the presence of network bottlenecks, people found a solution such that
DRBD does not automatically re-connect after the connection has been dropped
due to network problems (c.f.
\family typewriter
ko-count
\family default
parameter).
So the risk of inconsistency
\emph on
appears
\emph default
to have vanished.
In some cases, people did not notice that the risk has
\emph on
not completely
\begin_inset Foot
status open
\begin_layout Plain Layout
Hint: what's the
\emph on
conceptual
\emph default
difference between an automatic and a manual re-connect? Yes, you can try
to
\emph on
lower
\emph default
the risk in some cases by transferring risks to human analysis and human
decisions, but did you take into account the possibility of human errors?
\end_layout
\end_inset
\emph default
vanished, and/or they did not notice that now the actuality produced by
DRBD is even drastically worse than that of MARS (in the same situation).
It is true that DRBD provides better actuality in
\family typewriter
connected
\family default
state, but for a
\emph on
full picture
\emph default
the actuality in
\family typewriter
disconnected
\family default
state must not be neglected
\begin_inset Foot
status open
\begin_layout Plain Layout
Hint: a potential hurdle may be the fact that the current format of
\family typewriter
/proc/drbd
\family default
displays neither the timestamp of the first
\emph on
relevant
\emph default
network drop nor the total amount of lag-behind user data (which is
\emph on
not
\emph default
the same as the number of dirty bits in the bitmap), while
\family typewriter
marsadm view
\family default
can display it.
So it is difficult to judge the risks.
Possibly a chance is inspection of DRBD messages in the syslog, but quantificat
ion could remain hard.
\end_layout
\end_inset
.
So they didn't notice that their argumentation on the importance of actuality
may be fundamentally wrong.
A possible way to overcome that may be re-reading section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Behaviour-of-MARS"
plural "false"
caps "false"
noprefix "false"
\end_inset
and comparing its outcome with the corresponding outcome of DRBD in the
same situation.
\end_layout
\begin_layout Enumerate
People might not know the CAP theorem (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and are trying to require something which simply is
\series bold
impossible
\series default
.
\end_layout
\begin_layout Enumerate
People are stuck in contradictory requirements because the current version
of MARS does not yet support synchronous or pseudo-synchronous operation
modes.
Hopefully, this will be resolved some day.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A common misunderstanding is about the actuality guarantees provided by
filesystems.
The buffer cache / page cache uses by default a
\series bold
writeback strategy
\series default
for performance reasons.
Even modern journalling filesystems will (by default) provide only consistency
guarantees, but no strong actuality guarantee.
In case of power loss, some transactions may be even
\emph on
rolled back
\emph default
in order to restore consistency.
According to POSIX
\begin_inset Foot
status open
\begin_layout Plain Layout
The above argumentation also applies to Windows filesystems in an analogous way.
\end_layout
\end_inset
and other standards, the only
\emph on
reliable
\emph default
way to achieve actuality is usage of system calls like
\family typewriter
sync()
\family default
,
\family typewriter
fsync()
\family default
,
\family typewriter
fdatasync()
\family default
, flags like
\family typewriter
O_DIRECT
\family default
, or similar.
For performance reasons, the
\emph on
vast majority of applications
\emph default
don't use them at all, or are using them only sparingly.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
It makes no sense to require strong actuality guarantees from any block
layer replication (whether DRBD or future versions of MARS) while higher
layers such as filesystems or even applications are already sacrificing
them!
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In summary, the
\series bold
anytime consistency
\series default
provided by MARS is an argument you should consider, even if you need an
extra hard disk for transaction logfiles.
\end_layout
\begin_layout Chapter
Requirements for Long-Distance Replication
\end_layout
\begin_layout Section
Avoiding Inappropriate Clustermanager Types for Medium and Long-Distance
Replication
\begin_inset CommandInset label
LatexCommand label
name "sec:Inappropriate-Clustermanger"
\end_inset
\end_layout
\begin_layout Standard
This section addresses some wide-spread misconceptions.
Its main target audience is
\emph on
userspace
\emph default
developers, but others may profit from
\series bold
detailed explanations of problems and pitfalls
\series default
.
When the problems described in this section are solved at some point in the future,
this section will be shortened and some relevant parts moved to the appendix.
\end_layout
\begin_layout Standard
Doing
\series bold
HA = High Availability
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-HA"
plural "false"
caps "false"
noprefix "false"
\end_inset
) wrong at
\emph on
concept level
\emph default
may easily get you into trouble, and may cost you several millions of €
or $ in larger installations, or even knock you out of business when disasters
are badly dealt with at higher levels such as clustermanagers.
\end_layout
\begin_layout Subsection
General Cluster Models
\end_layout
\begin_layout Standard
The most commonly known cluster model is called
\series bold
shared-disk
\series default
, and typically controlled by clustermanagers like
\family typewriter
PaceMaker
\family default
:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/shared-disk-model.fig
width 50col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
The most important property of shared-disk is that there exists only a single
disk instance.
Nowadays, this disk often has some
\emph on
internal
\emph default
redundancy such as RAID.
At
\emph on
system
\emph default
architecture layer / network level, there exists no redundant disk at all.
Only the application cluster is built redundantly.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
It should be immediately clear that shared-disk clusters are only suitable
for short-distance operations in the same datacenter, or better in the
same room / rack.
Although running one of the data access lines over short distances between
very near-by datacenters (e.g.
1 km) would be theoretically possible, there would be no sufficient protection
against failure of a whole datacenter.
\end_layout
\begin_layout Standard
Both DRBD and MARS belong to a different architectural model called
\series bold
shared-nothing
\series default
:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/shared-nothing-model.fig
width 50col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
The characteristic feature of a shared-nothing model is (additional)
\series bold
data redundancy at network level
\series default
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Shared-nothing
\begin_inset Quotes eld
\end_inset
clusters
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice that the term
\begin_inset Quotes eld
\end_inset
cluster computing
\begin_inset Quotes erd
\end_inset
usually refers to short-distance only.
Long-distance coupling should be called
\begin_inset Quotes eld
\end_inset
grid computing
\begin_inset Quotes erd
\end_inset
in preference.
As known from the scientific literature, grid computing requires different
concepts and methods in general.
Only for the sake of simplicity, we use
\begin_inset Quotes eld
\end_inset
cluster
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
grid
\begin_inset Quotes erd
\end_inset
interchangeably.
\end_layout
\end_inset
\begin_inset Quotes erd
\end_inset
could theoretically be built for
\emph on
any
\emph default
distances, from short to medium to long distances.
However, concrete technologies of disk coupling such as synchronous operation
may pose practical limits on the distances (see chapter
\begin_inset CommandInset ref
LatexCommand nameref
reference "chap:Use-Cases-for"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
In general, clustermanagers must fit to the model.
Some clustermanagers can be configured to fit to multiple models.
If so, this must be done properly, or you may get into serious trouble.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
Some people don't seem to know, or they seemingly don't believe even when
told, that different architectural models like shared-disk or shared-nothing
will
\emph on
require
\emph default
an
\emph on
appropriate
\emph default
type of clustermanager and/or at least a different configuration.
Failing to do so, by selection of an inappropriate clustermanager type
and/or an inappropriate configuration may be
\series bold
hazardous
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Pitfall: such problems typically appear
\series bold
only during / after incidents
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
It is dangerous to conclude from
\begin_inset Quotes eld
\end_inset
stable ordinary operation
\begin_inset Quotes erd
\end_inset
that a system is reliable.
The real
\series bold
risk
\series default
is that
\series bold
data inconsistencies
\series default
are showing up at the
\series bold
wrong moment
\series default
, e.g.
when the clustermanager has to execute the right actions for compensation
of a certain component failure.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Selection of the right model alone is not sufficient.
Some, if not many, clustermanagers have not been designed for long distances
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
As explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Special-Requirements-for"
plural "false"
caps "false"
noprefix "false"
\end_inset
, long distances have further
\series bold
hard requirements
\series default
.
Disregarding them may also be hazardous!
\end_layout
\begin_layout Subsection
Handover / Failover Reasons and Scenarios
\end_layout
\begin_layout Standard
From a sysadmin perspective, there exist a number of different
\series bold
reasons
\series default
why the application workload must be switched from the currently active
side A to the currently passive side B:
\end_layout
\begin_layout Enumerate
Some
\series bold
defect
\series default
has occurred at cluster side A or at some corresponding part of the network.
\end_layout
\begin_layout Enumerate
Some
\series bold
maintenance
\series default
has to be done at side A which would cause a longer downtime (e.g.
security kernel update or replacement of core network equipment or maintenance
of UPS or of the BBU cache etc - hardware isn't 24/7/365 in practice, although
some vendors
\emph on
claim
\emph default
it - it is either not really true, or it becomes
\emph on
extremely
\emph default
expensive).
\end_layout
\begin_layout Standard
Both reasons are valid and should
\begin_inset Foot
status open
\begin_layout Plain Layout
Automatics should be preferred, but there are exceptional cases.
For example, certain scenarios of hardware defects may require
\emph on
manual switchoff
\emph default
of some (parts of) automatics.
\end_layout
\end_inset
be automatically
\emph on
handled
\emph default
(but not necessarily automatically
\emph on
triggered
\emph default
) in larger installations.
In order to deal with all of these reasons, the following basic mechanisms
can be used in either model:
\end_layout
\begin_layout Enumerate
\series bold
Failover
\series default
(triggered either manually or automatically)
\end_layout
\begin_layout Enumerate
\series bold
Handover
\series default
(triggered manually
\begin_inset Foot
status open
\begin_layout Plain Layout
Automatic triggering could be feasible for prophylactic treatments.
\end_layout
\end_inset
)
\end_layout
\begin_layout Standard
It is important to not confuse handover with failover at concept level.
Not only the reasons / preconditions are very different, but also the
\emph on
requirements
\emph default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
Precondition for handover is that
\emph on
both
\emph default
cluster sides are healthy, while precondition for failover is that
\emph on
some really relevant(!)
\emph default
failure has been
\emph on
detected
\emph default
somewhere (whether this is
\emph on
really
\emph default
true is another matter).
Typically, failover must be able to run in masses, while planned handover
often has lower scaling requirements.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Not all existing clustermanagers are dealing with all of these cases (or
their variants) equally well, and some are not even dealing with some of
these cases / variants
\emph on
at all
\emph default
.
\end_layout
\begin_layout Standard
Some clustermanagers cannot easily express the concept of
\begin_inset Quotes eld
\end_inset
automatic triggering
\begin_inset Quotes erd
\end_inset
versus
\begin_inset Quotes eld
\end_inset
manual triggering
\begin_inset Quotes erd
\end_inset
of an action.
There exists simply no cluster-global switch which selects either
\begin_inset Quotes eld
\end_inset
manual mode
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
automatic mode
\begin_inset Quotes erd
\end_inset
(except when you start to hack the code and/or write new plugins; then
you might notice that there is no sufficient architectural layering / sufficien
t separation between mechanism and strategy).
Even when such a switch is present and is triggered by somebody (whether
this is good or bad), this does
\emph on
not
\emph default
imply that network outages cannot occur (e.g.
concurrently by accident), and/or that it will work under any
\emph on
unpredictable
\emph default
incident / disaster scenario and/or its variants like
\emph on
partial failures
\emph default
/ rolling disasters / etc (c.f.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
Being forced to permanently use an automatic mode for
\series bold
triggering
\series default
several hundreds or even thousands of clusters is not only boring, but
bears a
\series bold
considerable risk
\series default
when automatics make a bad or even wrong decision at hundreds of instances
in parallel.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Granularity and Layering Hierarchy for Long Distances
\begin_inset CommandInset label
LatexCommand label
name "subsec:Granularity-and-Layering"
\end_inset
\end_layout
\begin_layout Standard
Many existing clustermanager solutions are dealing with a single cluster
instance, as the term
\begin_inset Quotes eld
\end_inset
\emph on
cluster
\emph default
manager
\begin_inset Quotes erd
\end_inset
suggests.
However, when running several hundreds or thousands of cluster instances,
you likely will not want to manage each of them individually.
In addition, failover should
\emph on
not only
\emph default
be
\emph on
triggered
\emph default
(not to be confused with
\emph on
executed
\emph default
) individually at cluster level, but likely
\emph on
also
\emph default
at a higher granularity such as a room, or a whole datacenter.
Otherwise, some chaos is likely to happen.
\end_layout
\begin_layout Standard
This is not enough: the CAP theorem and its sisters will also apply.
Avoid SPOF = Single Points of Failure also at higher layers.
\end_layout
\begin_layout Standard
Here is what you probably will
\series bold
need
\series default
, possibly in contrast to what you may find on the market (whether OpenSource
or not).
For simplicity, the following diagram shows only two levels of granularity,
but can be easily extended to multiple layers of granularity, or to some
concept of various
\emph on
subsets of clusters
\emph default
:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/clustermanager-hierarchy.fig
width 70col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
Notice that many existing clustermanager solutions are not addressing the
datacenter granularity at all.
Typically, they use concepts like
\series bold
quorums
\series default
for determining failures
\emph on
at cluster level
\emph default
solely, and then immediately executing failover of the cluster, sometimes
without clean architectural distinction between trigger and execution (similar
to the
\begin_inset Quotes eld
\end_inset
separation of concerns
\begin_inset Quotes erd
\end_inset
between
\series bold
mechanism
\series default
and
\series bold
strategy
\series default
in Operating Systems).
Sometimes there is even no internal software layering / modularization
according to this separation of concerns at all.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
When there is no distinction between different levels of granularity, you
are hopelessly bound to a non-extensible and thus non-adaptable system
when you need to operate masses of clusters.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Minimum requirements for larger installations
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
A lacking distinction between automatic mode and manual mode in a cluster
management solution, and/or lack of corresponding
\series bold
architectural software layers
\series default
should be viewed as disregard of well-established best practices from
\series bold
software engineering
\series default
.
It will likely bind you to an
\series bold
inflexible system
\series default
, producing direct and indirect
\series bold
long-term follow-up cost
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Terminology: for practical reasons, we use the general term
\begin_inset Quotes eld
\end_inset
clustermanager
\begin_inset Quotes erd
\end_inset
also for speaking about layers dealing with higher granularity, such as
datacenter layers, and also for long-distance replication scenarios, although
some terminology from grid computing would be more appropriate in a scientific
background.
\end_layout
\begin_layout Standard
Please consider the following: when it comes to long-distance HA, the above
layering architecture is also motivated by vastly different numbers of
instances for each layer.
Ideally, the topmost automatics layer should be able to overview several
datacenters in parallel, in order to cope with (almost) global network
problems such as network partitions.
Additionally, it should also detect single cluster failures, or intermediate
problems like
\begin_inset Quotes eld
\end_inset
rack failure
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
room failure
\begin_inset Quotes erd
\end_inset
, as well as various types of (partial / intermediate) (replication) network
failures.
Incompatible decisions at each of the different granularities would be
a no-go in practice.
Somewhere and somehow, you need one single
\begin_inset Foot
status open
\begin_layout Plain Layout
If you have
\emph on
logical pairs of datacenters
\emph default
which are firmly bound together, you could also have several topmost automatics
instances, e.g.
for each
\emph on
pair
\emph default
of datacenters.
However, that would be very
\series bold
inflexible
\series default
, because then you cannot easily mix locations or migrate your servers between
datacenters.
Using
\begin_inset Formula $k>2$
\end_inset
replicas with MARS would also become a nightmare.
In your own interest, please don't create any concepts where masses of
hardware are firmly bound to fixed constants at some software layers.
\end_layout
\end_inset
top-most
\emph on
logical
\emph default
problem detection / ranking instance, which should be
\emph on
internally distributed
\emph default
of course, typically using some
\series bold
distributed consensus protocol
\series default
; but in contrast to many published distributed consensus algorithms it
should be able to work with
\emph on
multiple
\emph default
granularities at the same time.
\end_layout
\begin_layout Subsection
Discussion of Handover / Failover Methods
\end_layout
\begin_layout Subsubsection
Failover Methods
\begin_inset CommandInset label
LatexCommand label
name "subsec:Failover-Methods"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
Failover methods are only needed in case of an incident.
They should not be used for regular handover, because preconditions are
different.
Inappropriate merges of both method classes will cause unnecessary
\series bold
indirect cost
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Paragraph
STONITH-like Methods
\end_layout
\begin_layout Standard
STONITH = Shoot The Other Node In The Head
\end_layout
\begin_layout Standard
These methods are widely known, although they have several serious drawbacks.
Some people even believe that
\emph on
any
\emph default
clustermanager must
\emph on
always
\emph default
have some STONITH-like functionality.
This is wrong.
There
\emph on
exist
\emph default
alternatives, as shown in the next paragraph.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A historical motivation for STONITH was prevention of illegal modifications
of the
\emph on
shared disk
\emph default
by amok-running defective clients.
In those ancient times, disks were
\emph on
passive
\emph default
mechanical components, while their disk controller often belonged to
the server.
In modern shared-nothing scenarios, this motivation no longer exists.
Anyway, you can achieve
\series bold
disk fencing
\series default
by various software means nowadays.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
The most obvious drawback is that STONITH will always cause
\series bold
damage
\series default
, by definition.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
Typical contemporary STONITH implementations are using IPMI and relatives
for automatically powering off your server, or at least pushing the (virtual)
reset button.
This will
\emph on
always
\emph default
create a certain type of damage: the affected systems will definitely not
be available, at least for some time until they have (manually) rebooted.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The STONITH damage leads to a
\emph on
conceptual
\emph default
contradiction: the reason for starting failover is that you want to restore
availability as soon as possible, but in order to do so you will first
\emph on
destroy
\emph default
the availability of a particular
\emph on
component
\emph default
.
This may be counter-productive.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
When your hot standby node B does not work as expected, or if it works even
\emph on
worse
\emph default
than A before, you will
\emph on
at least
\emph default
lose some time until you
\emph on
can
\emph default
become operational again at the old side A.
In addition, pushing the reset button bears the
\series bold
risk of unnecessary data loss
\series default
from RAM buffers not yet written to disk, and in turn the
\series bold
risk of data inconsistencies
\series default
, such as the need for a filesystem check.
When some of the hardware is defective, like for example the boot disk
or the boot sector, the system may not come up at all after reset.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
STONITH variant for shared-nothing
\end_layout
\end_inset
Here is an example method for handling a failure scenario.
The old active side A is assumed to be no longer healthy.
The method uses a sequential state transition chain with a STONITH-like
step:
\end_layout
\begin_layout Description
Phase1 Check whether the hot standby B is currently usable.
If this is violated (which may happen during certain types of disasters),
abort the failover for any affected resources.
\end_layout
\begin_layout Description
Phase2
\emph on
Try
\emph default
to shut down the damaged side A (in the
\emph on
hope
\emph default
that there is no
\emph on
serious
\emph default
damage).
\end_layout
\begin_layout Description
Phase3 In case phase2 did not work during a grace period / after a timeout,
assume that A is badly damaged and therefore STONITH it.
\end_layout
\begin_layout Description
Phase4 Start the application at the hot standby B.
\end_layout
\begin_layout Plain Layout
Notice: any cleanup actions, such as
\series bold
repair
\series default
of defective hard- or software etc, are outside the scope of failover processes.
Typically, they are executed much later when restoring redundancy.
\end_layout
\begin_layout Plain Layout
Also notice: this method is a
\emph on
heavily
\emph default
distributed one, in the sense that sequential actions are alternated multiple
times on different hosts.
This is known to be cumbersome in distributed systems, in particular in
presence of network problems.
\end_layout
\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "Phase4-in-more"
\end_inset
Phase4 in more detail for DRBD, augmented with some pseudo code for application
control:
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
drbdadm disconnect all
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
drbdadm primary --force all
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
applicationmanager start all
\end_layout
\begin_layout Plain Layout
The same phase4 using MARS:
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
marsadm pause-fetch all
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
marsadm primary --force all
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
applicationmanager start all
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
This sequential 4-phase method is far from optimal, for the following reasons:
\end_layout
\begin_layout Itemize
The method tries to handle both failover and handover scenarios with one
single sequential recipe.
In case of a true failover scenario where it is
\emph on
already known for sure
\emph default
that side A is badly damaged, this method will unnecessarily waste time
for phase 2.
This could be fixed by introduction of a conceptual distinction between
handover and failover, but it would not fix the following problems.
\end_layout
\begin_layout Itemize
Before phase4 is started (which will re-establish the service from a user's
perspective), a lot of time is wasted by
\emph on
both
\emph default
phases 2
\emph on
and
\emph default
3.
Even if phase 2 would be skipped, phase 3 would unnecessarily cost some
time.
In the next paragraph, an alternative method is explained which eliminates
any unnecessary waiting time at all.
\end_layout
\begin_layout Itemize
The above method is adapted from the shared-disk model.
It does not take advantage of the shared-nothing model, where further possibili
ties for better solutions exist.
\end_layout
\begin_layout Itemize
In case of long-distance network partitions and/or sysadmin / system management
subnetwork outages, you may not even be able to (remotely) execute STONITH
at all.
Thus the above method misses an important failure scenario.
\end_layout
\begin_layout Standard
Some people seem to have a
\emph on
binary
\emph default
view of the healthiness of a system: in their view, a system is either
operational, or it is damaged.
This kind of view is ignoring the fact that some systems may be half-alive,
showing only
\emph on
minor
\emph default
problems, or problems occurring only from time to time.
\end_layout
\begin_layout Standard
It is obvious that damaging a healthy system is a bad idea by itself.
Even
\emph on
generally
\emph default
damaging a half-alive system in order to
\begin_inset Quotes eld
\end_inset
fix
\begin_inset Quotes erd
\end_inset
problems is not generally a good idea, because it may increase the damage
when you don't know the
\emph on
real
\emph default
reason
\begin_inset Foot
status open
\begin_layout Plain Layout
Example, occurring in masses: an incorrectly installed bootloader, or a
wrong BIOS boot priority order which unexpectedly lead to hangs or infinite
reboot cycles once the DHCP or BOOTP servers are no longer available /
reachable.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
Even worse: in a distributed system
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice: the STONITH concept is more or less associated with short-distance
scenarios where
\series bold
crossover cables
\series default
or similar equipment is used.
The assumption is that crossover cables can't go defective, or at least
it would be an extremely unlikely scenario.
For long-distance replication, this assumption is simply not true.
\end_layout
\end_inset
you sometimes
\emph on
cannot(!)
\emph default
know whether a system is healthy, or to what degree it is healthy.
Typical STONITH methods as used in some contemporary clustermanagers are
\series bold
assuming a worst case
\series default
, even if that worst case is currently not for real.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Advice
\end_layout
\end_inset
Avoid the following
\series bold
fundamental flaws
\series default
in failover concepts and healthiness models, which apply to implementors
/ configurators of clustermanagers:
\end_layout
\begin_layout Itemize
Don't mix up knowledge with conclusions about a (sub)system, and also don't
mix this up with the real state of that (sub)system.
In reality, you don't have any knowledge about a complex distributed system.
You only may have
\emph on
some
\emph default
knowledge about
\emph on
some
\emph default
parts of the system, but you cannot
\begin_inset Quotes eld
\end_inset
see
\begin_inset Quotes erd
\end_inset
a complex distributed system as a whole.
What you think is your knowledge, isn't knowledge in reality: in many cases,
it is
\emph on
conclusion
\emph default
, not knowledge.
Don't mix this up!
\end_layout
\begin_layout Itemize
Some systems are more complex than your model of them.
Don't neglect important parts (such as networks, routers, switches, cables,
plugs) which may lead you to wrong conclusions!
\end_layout
\begin_layout Itemize
Don't restrict your mind to boolean models of healthiness.
Doing so can easily create unnecessary damage by construction, and even
at concept level.
You should know from software engineering that defects in concepts or models
are much more serious than simple bugs in implementations.
Choosing the wrong model cannot be fixed as easily as a typical bug or
a typo.
\end_layout
\begin_layout Itemize
Try to deduce the state of a system as
\series bold
reliably
\series default
as possible.
If you don't know something for sure, don't generally assume that it has
gone wrong.
Don't confuse missing knowledge with the conclusion that something is bad.
Boolean algebra restricts your mind to either
\begin_inset Quotes eld
\end_inset
good
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
bad
\begin_inset Quotes erd
\end_inset
.
Use at least
\series bold
tri-state algebra
\series default
which has a means for expressing
\series bold
\begin_inset Quotes eld
\end_inset
unknown
\begin_inset Quotes erd
\end_inset
\series default
.
Even better: attach a probability to anything you (believe to) know.
Errare humanum est: nothing is absolutely for sure.
\end_layout
\begin_layout Itemize
Oversimplification: don't report an
\begin_inset Quotes eld
\end_inset
unknown
\begin_inset Quotes erd
\end_inset
or even a
\begin_inset Quotes eld
\end_inset
broken
\begin_inset Quotes erd
\end_inset
state for a complex system whenever a smaller subsystem exists for which
you have some knowledge (or you can conclude something about it with reasonable
evidence).
Otherwise, your users / sysadmins may draw wrong conclusions, and assume
that the whole system is broken, while in reality only some minor part
has some minor problem.
Users could then likely make wrong decisions, which may then easily lead
to bigger damages.
\end_layout
\begin_layout Itemize
Murphy's law:
\series bold
never assume that something can't go wrong!
\series default
Doing so is a blatant misconception at topmost level: the
\emph on
purpose
\emph default
of a clustermanager is creating High Availability (HA) out of more or less
\begin_inset Quotes eld
\end_inset
unreliable
\begin_inset Quotes erd
\end_inset
components.
It is the damn duty of both a clustermanager and its configurator to try
to compensate
\emph on
any
\emph default
failures,
\emph on
regardless of their probability
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Never claim that something has only low probability (and therefore it were
not relevant).
In the HA area, you simply
\series bold
cannot know
\series default
that, because you typically have
\emph on
sporadic
\emph default
incidents.
In extreme cases, the
\emph on
purpose
\emph default
of your HA solution is protection against 1 failure per 10 years.
You simply don't have the time to wait for creating incident statistics
about that!
\end_layout
\end_inset
, as best as possible.
\end_layout
\begin_layout Itemize
Never confuse
\series bold
probability
\series default
with
\series bold
expected value!
\series default
If you don't know the mathematical term
\begin_inset Quotes eld
\end_inset
expected value
\begin_inset Quotes erd
\end_inset
, or if you don't know what this means
\emph on
in practice
\emph default
, don't take responsibility for millions of € or $.
\end_layout
\begin_layout Itemize
When operating masses of hard- and software: never assume that a particular
failure can occur only at a low number of instances.
There are
\series bold
\emph on
unknown(!)
\emph default
systematic errors
\series default
which may pop up at the wrong time and in huge masses when you don't expect
them.
\end_layout
\begin_layout Itemize
Multiple layers of fallback:
\emph on
any
\emph default
action can fail.
Be prepared to have a plan B, and even a plan C, and even better a plan
D, wherever possible.
\end_layout
\begin_layout Itemize
Never increase any damage anywhere, unnecessarily! Always try to
\emph on
minimize
\emph default
any damage! It can be mathematically proven that in deterministic probabilistic
systems having finite state, increases of a damage level
\emph on
at the wrong place
\emph default
will
\emph on
introduce
\emph default
an
\emph on
additional
\emph default
\emph on
risk
\emph default
of getting into an
\series bold
endless loop
\series default
.
This is also true for nondeterministic systems, as known from formal language
theory
\begin_inset Foot
status open
\begin_layout Plain Layout
Finite automata are known to be transformable to deterministic ones, usually
by an exponential increase in the number of states.
\end_layout
\end_inset
.
\end_layout
\begin_layout Itemize
Apply the
\series bold
best effort principle
\series default
.
You should be aware of the following fact: in general, it is impossible
to create an
\emph on
absolutely reliable system
\emph default
out of unreliable components.
You can
\emph on
lower
\emph default
the risk of failures to any
\begin_inset Formula $\epsilon>0$
\end_inset
by investing a lot of resources and of money, but whatever you do:
\begin_inset Formula $\epsilon=0$
\end_inset
is impossible.
Therefore, be careful with boolean algebra.
Prefer approximation methods / optimizing methods instead.
Always do
\emph on
your
\emph default
best, instead of trying to reach a
\emph on
global
\emph default
optimum which likely does not exist at all (because the
\begin_inset Formula $\epsilon$
\end_inset
can only
\emph on
converge
\emph default
to an optimum, but will never actually reach it).
\begin_inset Newline newline
\end_inset
The best effort principle means the following: if you discover a method
for improving your operating state by reduction of a (potential) damage
in a reasonable time and with reasonable effort, then
\series bold
simply do it
\series default
.
Don't argue that a particular step is not a 100% solution for all of your
problems.
\emph on
Any
\emph default
\emph on
improvement
\emph default
is valuable.
\series bold
Don't miss any valuable step
\series default
having reasonable cost with respect to your budget.
Missing valuable measures which have low cost is certainly a violation
of the best effort principle, because you are not doing
\emph on
your
\emph default
best.
Keep that in mind.
\begin_inset Newline newline
\end_inset
If you have
\emph on
understood
\emph default
this (e.g.
deeply think at least one day about it), you will no longer advocate STONITH
methods
\emph on
in general
\emph default
, when there are alternatives.
STONITH methods are only valuable when you
\emph on
know in advance
\emph default
that the final outcome (after reboot) will most likely be better, and that
waiting for reboot will most likely
\emph on
pay off
\emph default
.
In general, this condition is
\emph on
not true
\emph default
if you have a healthy hot standby system.
This should be easy to see.
But there exist well-known clustermanager solutions / configurations blatantly
ignoring
\begin_inset Foot
status open
\begin_layout Plain Layout
For some
\emph on
special(!)
\emph default
cases of the shared-disk model, there exist some justifications for doing
STONITH
\emph on
before
\emph default
starting the application at the hot standby.
Under certain circumstances, it can happen that system A running amok could
destroy the data on your single shared disk (example: a filesystem doubly
mounted
\emph on
in parallel
\emph default
, which will certainly destroy your data, unless you are using
\family typewriter
ocfs2
\family default
or suchalike).
This argument is only valid for
\emph on
passive
\emph default
disks which are
\emph on
directly
\emph default
attached to
\emph on
both
\emph default
systems A and B, such that there is no
\emph on
external
\emph default
means for fencing the disk.
In case of iSCSI running over ordinary network equipment such as routers
or switches, the argument
\begin_inset Quotes eld
\end_inset
fencing the disk is otherwise not possible
\begin_inset Quotes erd
\end_inset
does not apply.
You can interrupt iSCSI connections at the network gear, or you can often
do it at cluster A or at the iSCSI target.
Even commercial storage appliances speaking iSCSI can be remotely controlled
for forcefully aborting iSCSI sessions.
In modern times, the STONITH method no longer has such a justification.
The justification stems from ancient times when a disk was a purely passive
mechanical device, and its disk controller was part of the server system.
\end_layout
\end_inset
this.
Only when the former standby system does not work as expected (this means
that
\emph on
all
\emph default
of your redundant systems are not healthy enough for your application),
\emph on
only then
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice that STONITH may be needed for (manual or partially automatic)
\emph on
repair
\emph default
in some cases, e.g.
when you know that a system has a kernel crash.
Don't mix up the repair phase with failover or handover phases.
Typically, they are executed at different times.
The repair phase is outside the scope of this section.
\end_layout
\end_inset
\emph default
STONITH is inevitable as a
\emph on
last resort
\emph default
option.
\begin_inset Newline newline
\end_inset
In short: blindly using STONITH without true need during failover is a violation
of the best effort principle.
You are simply not doing your best.
\end_layout
\begin_layout Itemize
When your budget is limited, carefully select those improvements which make
your system
\series bold
as reliable as possible
\series default
, given your fixed budget.
\end_layout
\begin_layout Itemize
Create statistics on the duration of your actions.
Based on this, try to get a
\emph on
balanced
\emph default
optimum between time and cost.
\end_layout
\begin_layout Itemize
Whatever actions you can
\series bold
start in parallel
\series default
for saving time, do it.
Otherwise you are disregarding the best effort principle, and your solution
will be sub-optimal.
You will require deep knowledge of parallel systems, as well as experience
with dealing with problems like (distributed) races.
Notice that
\emph on
any
\emph default
distributed system is
\emph on
inherently parallel
\emph default
.
Don't believe that sequential methods can deliver an optimum solution in
such a difficult area.
\end_layout
\begin_layout Itemize
If you don't have the
\series bold
necessary skills
\series default
for (a) recognizing already existing parallelism, (b) dealing with parallelism
at concept level, (c) programming and/or configuring parallelism race-free
and deadlock-free (or if you even don't know what a race condition is and
where it may occur in practice), then don't take responsibility for millions
of € or $.
\end_layout
\begin_layout Itemize
Avoid hard timeouts wherever possible.
Use
\series bold
adaptive timeouts
\series default
instead.
Reason: depending on hardware or workload, the same action A may take a
very short time on cluster 1, but take a very long time on cluster 2.
If you need to guard action A from hanging (which is almost always the
case because of Murphy's law), don't configure any fixed timeout for it.
When having several hundreds of clusters, you would need to use the
\emph on
worst case value
\emph default
, which is the longest time occurring somewhere at the very slow clusters
/ slow parts of the network.
This wastes a lot of time in case one of the fast clusters is hanging.
Adaptive timeouts work differently: they use a kind of
\begin_inset Quotes eld
\end_inset
progress bar
\begin_inset Quotes erd
\end_inset
to monitor the
\emph on
progress
\emph default
of an action.
They will abort only if there is
\emph on
no progress
\emph default
for a certain amount of time.
Hint: among others,
\family typewriter
marsadm view-*-rest
\family default
commands or macros are your friend.
\end_layout
\end_inset
\end_layout
\begin_layout Paragraph
ITON = Ignore The Other Node
\end_layout
\begin_layout Standard
This strategy means
\series bold
fencing from application traffic
\series default
, and can be used as an alternative to STONITH when done properly.
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/fencing-hierarchy.fig
width 60col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
Fencing from application traffic is best suited for the shared-nothing model,
but can also be adapted to the shared-disk model with some quirks.
\end_layout
\begin_layout Standard
The idea is simple: always route your application network traffic to the
current (logically) active side, whether it is currently A or B.
Just don't route any application requests to the current (logically) passive
side at all.
\end_layout
\begin_layout Standard
For failover (and
\emph on
only
\emph default
for that), you
\emph on
should not care about
\emph default
any split brain occurring at the low-level generic block device:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/split-brain-history.fig
width 50col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
Although having a split brain at the generic low-level block device, you
now define the
\begin_inset Quotes eld
\end_inset
logically active
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
logically passive
\begin_inset Quotes erd
\end_inset
side by yourself by
\emph on
logically ignoring
\emph default
the
\begin_inset Quotes eld
\end_inset
wrong
\begin_inset Quotes erd
\end_inset
side as defined by yourself:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/split-brain-resolved.fig
width 50col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
This is possible because the generic block devices provided by DRBD or MARS
are completely
\series bold
agnostic
\series default
of the
\begin_inset Quotes eld
\end_inset
meaning
\begin_inset Quotes erd
\end_inset
of either version A or B.
Higher levels such as clustermanagers (or humans like sysadmins) can assign
them a meaning like
\begin_inset Quotes eld
\end_inset
relevant
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
not relevant
\begin_inset Quotes erd
\end_inset
, or
\begin_inset Quotes eld
\end_inset
logically active
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
logically passive
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Standard
As a result of fencing from application traffic, the
\begin_inset Quotes eld
\end_inset
logically passive
\begin_inset Quotes erd
\end_inset
side will
\emph on
logically
\emph default
cease any actions such as updating user data, even if it is
\begin_inset Quotes eld
\end_inset
physically active
\begin_inset Quotes erd
\end_inset
during split-brain (when two primaries exist in DRBD or MARS sense
\begin_inset Foot
status open
\begin_layout Plain Layout
Hint: some clustermanagers and/or some people seem to define the term
\begin_inset Quotes eld
\end_inset
split-brain
\begin_inset Quotes erd
\end_inset
differently from DRBD or MARS.
In the context of generic block devices, split brain means that the
\emph on
history
\emph default
of both versions has been split to a Y-like
\series bold
fork
\series default
(for whatever reason), such that re-joining them
\emph on
incrementally
\emph default
by ordinary write operations is no longer guaranteed to be possible.
As a slightly simplified definition, you might alternatively use the definition
\begin_inset Quotes eld
\end_inset
two incompatible primaries are existing in parallel
\begin_inset Quotes erd
\end_inset
, which means almost the same in practice.
Details of formal semantics are outside the scope of this treatment.
\end_layout
\end_inset
).
\end_layout
\begin_layout Standard
If you already have some load balancing at the network, or BGP, or another
\emph on
mechanism
\emph default
for dynamic routing, you already have an important part for the ITON method.
Additionally, ensure by an appropriate
\emph on
strategy
\emph default
that your balancer status / BGP announcement etc always coincides with
the
\begin_inset Quotes eld
\end_inset
logically active
\begin_inset Quotes erd
\end_inset
side (recall that even during split-brain
\emph on
you
\emph default
must define
\begin_inset Quotes eld
\end_inset
logically active
\begin_inset Quotes erd
\end_inset
\series bold
uniquely
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
A possible strategy is to use a Lamport clock for route changes: the change
with the most recent Lamport timestamp will always win over previous changes.
\end_layout
\end_inset
by yourself).
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Application fencing
\end_layout
\end_inset
\end_layout
\begin_layout Description
Phase1 Check whether the hot standby B is currently usable.
If this is violated (which may happen during certain types of disasters),
do not start failover for any affected resources.
\end_layout
\begin_layout Description
Phase2 Do the following
\emph on
in parallel
\begin_inset Foot
status open
\begin_layout Plain Layout
For database applications where no transactions should get lost, you should
slightly modify the order of operations: first fence the old side A, then
start the application at standby side B.
However, be warned that even this cannot guarantee that no transaction
is lost.
When the network between A and B is interrupted
\emph on
before
\emph default
the incident happens, DRBD will automatically disconnect, and MARS will
show a lagbehind.
In order to fully eliminate this possibility, you can either use DRBD and
configure it to hang forever during network outages (such that users will
be unable to commit any transactions at all), or you can use the shared-disk
model instead.
But in the latter case, you are introducing a SPOF at the single shared
disk.
The former case is logically almost equivalent to shared-disk, but avoiding
some parts of the physical SPOF.
In a truly distributed system, the famous CAP theorem is limiting your
possibilities.
Therefore, no general solution exists fulfilling all requirements at the
same time.
\end_layout
\end_inset
:
\begin_inset Separator latexpar
\end_inset
\end_layout
\begin_deeper
\begin_layout Itemize
Start all affected applications at the hot standby B.
This can be done with the same DRBD or MARS procedure as described in
\begin_inset CommandInset ref
LatexCommand nameref
reference "Phase4-in-more"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Itemize
Fence A by fixedly routing all affected application traffic to B.
\end_layout
\end_deeper
\begin_layout Plain Layout
That's all that has to be done for a shared-nothing model.
Of course, this will likely produce a split-brain (even when using DRBD
in place of MARS), but that will not matter from a user's perspective,
because the users will no longer
\begin_inset Quotes eld
\end_inset
see
\begin_inset Quotes erd
\end_inset
the
\begin_inset Quotes eld
\end_inset
logically passive
\begin_inset Quotes erd
\end_inset
side A through their network.
Only during the relatively small time period where application traffic
was going to the old side A while not replicated to B due to the incident,
a very small number of updates
\emph on
could
\emph default
have been lost.
In fields like webhosting, this can be taken into account.
Users will usually not complain when some (smaller amount of) data is lost
due to split-brain.
They will complain when the service is unavailable.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
This method is the
\series bold
fastest
\series default
for restoring HA, because it doesn't try to execute any (remote) action
at side A.
Only from a sysadmin's perspective, there remain some cleanup tasks to
be done during the following repair phase, such as split-brain resolution,
which are outside the scope of this treatment.
\end_layout
\begin_layout Standard
By running the application fencing step
\emph on
sequentially
\emph default
(including waiting for its partial success, such that the old side A
can no longer be reached by any users) in front of the failover step, you
may minimize the amount of lost data, but at the cost of total duration.
Your service will take longer to be available again, while the amount of
lost data could be
\emph on
theoretically
\emph default
somewhat smaller.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A few people might clamour when some data is lost.
In long-distance replication scenarios with high update traffic, there
is
\emph on
simply no way at all
\emph default
for guaranteeing that no data can be lost ever.
According to the laws of Einstein and the laws of Distributed Systems like
the famous CAP theorem (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Explanation-via-CAP"
plural "false"
caps "false"
noprefix "false"
\end_inset
), this isn't the fault of DRBD+proxy or MARS, but simply the
\emph on
consequence
\emph default
of having long distances.
If you want to protect against data loss as best as possible, and when
you can afford it financially, then don't use
\begin_inset Formula $k=2$
\end_inset
replicas.
Use
\begin_inset Formula $k\geq3$
\end_inset
, and spread them over different distances, such as mixed small + medium
+ long distances.
Future versions of MARS are planned to support adaptive pseudo-synchronous
modes, which will allow individual adaptation to network latencies / distances.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The ITON method can be adapted to shared-disk by additionally fencing the
common disk from the (presumably) failed cluster node A.
\end_layout
\begin_layout Subsubsection
Handover Methods
\end_layout
\begin_layout Standard
Planned handover is conceptually simpler, because both sides must be (almost)
healthy as a
\emph on
precondition
\emph default
.
There are simply no pre-existing failures to deal with.
\end_layout
\begin_layout Standard
Here is an example using DRBD, some application commands denoted as pseudo
code:
\end_layout
\begin_layout Enumerate
at side A:
\family typewriter
applicationmanager stop all
\end_layout
\begin_layout Enumerate
at side A:
\family typewriter
drbdadm secondary all
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
drbdadm primary all
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
applicationmanager start all
\end_layout
\begin_layout Standard
MARS already has a conceptual distinction between handover and failover.
With MARS, it becomes even simpler, because a generic handover procedure
is already built in:
\end_layout
\begin_layout Enumerate
at side A:
\family typewriter
applicationmanager stop all
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
marsadm primary all
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
applicationmanager start all
\end_layout
\begin_layout Standard
When using the
\family typewriter
systemd
\family default
interface of
\family typewriter
marsadm
\family default
(see
\family typewriter
mars-user-manual.pdf
\family default
), this can be shortened into only one command:
\end_layout
\begin_layout Enumerate
at side B:
\family typewriter
marsadm primary all
\end_layout
\begin_layout Subsubsection
Hybrid Methods
\end_layout
\begin_layout Standard
In general, a planned handover may fail at any stage.
Notice that such a failure is also a failure, but (partially) caused by
the planned handover.
You have the following alternatives for automatically dealing with such
cases:
\end_layout
\begin_layout Enumerate
In case of a failure, switch back to the old side A.
\end_layout
\begin_layout Enumerate
Instead, forcefully switch to the new side B, similar to the methods described
in section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Failover-Methods"
\end_inset
.
\end_layout
\begin_layout Standard
Similar options exist for a failed failover (at least in theory), but chances
are lower for actually recovering if you have only
\begin_inset Formula $k=2$
\end_inset
replicas in total.
\end_layout
\begin_layout Standard
Whatever you decide to do in what case in whatever priority order, whether
you decide it in advance or during the course of a failing action: it simply
means that according to the best effort principle, you should
\series bold
never leave your system in a broken state
\series default
when there exists a chance to recover availability with any method.
\end_layout
\begin_layout Standard
Therefore, you should
\emph on
implement
\emph default
neither handover nor failover in their pure forms.
Always implement hybrid forms following the best effort principle.
\end_layout
\begin_layout Subsection
Special Requirements for Long Distances
\begin_inset CommandInset label
LatexCommand label
name "subsec:Special-Requirements-for"
\end_inset
\end_layout
\begin_layout Standard
Most contemporary clustermanagers have been constructed for short distance
shared-nothing clusters, or even for
\emph on
local
\emph default
shared-nothing clusters (c.f.
DRBD over crossover cables), or even for shared-disk clusters (
\emph on
originally
\emph default
, when their
\emph on
concepts
\emph default
were developed).
Blindly using them for long-distance replication without modification /
adaptation bears some additional risks.
\end_layout
\begin_layout Itemize
Notice that long-distance replication always
\emph on
requires
\emph default
a
\series bold
shared-nothing
\series default
model.
\end_layout
\begin_layout Itemize
As a consequence,
\series bold
split brain
\series default
can appear
\emph on
regularly
\emph default
during failover.
There is no way for preventing it! This is an
\emph on
inherent property
\emph default
of distributed systems, not limited to MARS (e.g.
also occurring with DRBD if you try to use it over long distances).
Therefore, you
\emph on
must
\emph default
deal with occurrences of split-brain as a
\emph on
requirement
\emph default
.
\end_layout
\begin_layout Itemize
The probability of
\series bold
network partitions
\series default
is much higher: although you should have been required by Murphy's law
to deal with network partitions already in short-distance scenarios, it
now becomes
\emph on
mandatory
\emph default
.
\end_layout
\begin_layout Itemize
Be prepared that in case of certain types of (more or less global) internet
partitions, you may not be able to trigger STONITH actions
\emph on
at all
\emph default
.
Therefore,
\series bold
fencing of application traffic
\series default
is
\emph on
mandatory
\emph default
.
\end_layout
\begin_layout Itemize
When considering algorithms like
\series bold
leader election
\series default
or
\series bold
master selection
\series default
or similar, or even highly sophisticated ones like
\emph on
consensus on state machine replication
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Some families of protocols like PAXOS (see
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://en.wikipedia.org/wiki/Paxos_(computer_science)
\end_layout
\end_inset
) are currently hyped.
Certainly, suchalike agreement algorithms and their promises
\emph on
look
\emph default
appealing for humans.
Practical replication of masses of so-called
\begin_inset Quotes eld
\end_inset
machines
\begin_inset Quotes erd
\end_inset
over long distances is not as easy as laymen in Theoretical Computer Science
may conclude, e.g.
from a
\emph on
variety
\emph default
of
\emph on
misunderstandings
\emph default
of terms and descriptions.
This guide is on
\emph on
geo-redundancy
\emph default
of
\emph on
datacenters
\emph default
(cf. section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Geo-Redundancy"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and this does
\emph on
not
\emph default
imply that the runtime state of VMs & co needs to be replicated.
Discussions are outside the scope of this guide.
\end_layout
\end_inset
, please think twice.
The CAP theorem will hold at
\emph on
any
\emph default
layer, and may produce
\emph on
different
\emph default
results at
\emph on
each
\emph default
of the layers.
Chaos may occur.
For example, independent split brain may occur at the layer of so-called
\emph on
orchestrations
\emph default
, and at different points in time.
\begin_inset Newline newline
\end_inset
Example of an analogy: when a big classical orchestra is forcefully split
into multiple sub-orchestras by some unexpected external force during their
performance, the surviving players will not improve their music, in particular
when they cannot hear each other anymore.
Having a common conductor will also not help if he/she breaks down, or
cannot be seen anymore by some of the surviving players, or when suddenly
two independent conductors are entering the scene, e.g.
because each of them believes that the other one would be already dead.
\end_layout
\begin_layout Chapter
Advice for Managers and Architects
\begin_inset CommandInset label
LatexCommand label
name "chap:Advice-for-Managers"
\end_inset
\end_layout
\begin_layout Section
Maturity Considerations for Managers
\begin_inset CommandInset label
LatexCommand label
name "sec:Maturity-Considerations"
\end_inset
\end_layout
\begin_layout Subsection
Maturity of Architectures
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
Instances of storage system
\emph on
architectures
\emph default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
) typically have a
\series bold
lifetime
\series default
of
\series bold
decades
\series default
.
\end_layout
\begin_layout Plain Layout
While implementations / components / storage vendors etc can often be exchanged
or updated more frequently (typically lifecycles of 3 to 5 years for CAPEX
reasons),
\series bold
fundamental architectures
\series default
are much less flexible to change, and thus are
\emph on
forcing
\emph default
you into a
\series bold
long-term strategy
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
In contrast, certain hardware technologies have a much lower lifetime, typically
between 1 and 2 years.
New server hardware / new disks / SSDs etc are hitting their market all
the time, like waves in the ocean.
\end_layout
\begin_layout Standard
\emph on
System software
\emph default
technologies (OS layer) typically have a lifetime in between hardware and
architecture lifetimes.
Although their update cycles / minor release cycles are typically even
faster than hardware releases, their
\emph on
fundamental product appearance points
\emph default
are rather stable
\begin_inset Foot
status open
\begin_layout Plain Layout
Appearance of certain technologies may occur in
\series bold
hype cycles
\series default
, caused by
\emph on
social
\emph default
effects.
While there are founding waves for (sometimes similar) product classes,
other solution appearances are more evenly spread over the decades.
For example, appearance of many Unix clones / descendants appears to be rather
smoothly distributed over half a century.
\end_layout
\end_inset
.
For example, the Linux kernel is now more than 20 years old, while its
\emph on
fundamental architecture
\emph default
has been copied from Unix and is now almost 50 years old.
\end_layout
\begin_layout Standard
Certain advocates are arguing with the
\emph on
current
\emph default
status of maturity of
\emph on
components
\emph default
.
In a long-term business operated by professionals, there is an observable
long-term trend:
\end_layout
\begin_layout Quote
\series bold
\size large
Maturity of components is (almost) always improving over the years.
\end_layout
\begin_layout Standard
Of course, maturity is important.
In sensitive areas, so-called
\begin_inset Quotes eld
\end_inset
banana software
\begin_inset Quotes erd
\end_inset
may even kill you.
In such a situation, the
\emph on
current
\emph default
maturity status is important.
However, once an implementation is
\emph on
mature enough
\emph default
, and/or once only some nice-to-have features are desirable, the long-term
maturity trend / forecast of implementations / components is more important
than the current status.
You can influence this with your
\series bold
long-term investment decisions
\series default
.
\end_layout
\begin_layout Standard
There exists something which is even more important:
\end_layout
\begin_layout Quote
\series bold
\size large
Maturity of fundamental architectures is most important, because they
\emph on
cannot
\emph default
improve.
Architectures need to be
\uuline on
right from scratch
\uuline default
.
\end_layout
\begin_layout Standard
This is similar to mathematics: Pythagoras' theorem or Einstein's laws cannot
be improved.
They will last forever.
At most, they can get old-fashioned or otherwise
\series bold
outdated
\series default
/ obsoleted.
However, there are other chances and
\series bold
opportunities
\series default
:
\end_layout
\begin_layout Itemize
New / better architectures may appear (rarely).
\end_layout
\begin_layout Itemize
Implementations of architectures should evolve slowly over time.
\end_layout
\begin_layout Itemize
Implementations may slowly migrate to other architectures, or even support
multiple architectures at the same time (convergence properties).
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
General advice
\end_layout
\end_inset
\end_layout
\begin_layout Quote
\series bold
\size large
Pay more attention to fundamental architectures.
Develop a
\uuline on
long-term strategy
\uuline default
for maturity of components and implementations.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Maturity of MARS
\begin_inset CommandInset label
LatexCommand label
name "subsec:Maturity-of-MARS"
\end_inset
\end_layout
\begin_layout Standard
Notice that MARS itself is just a component.
For a fully functional system, you will need some more infrastructure at
several layers.
\end_layout
\begin_layout Itemize
\series bold
MARS
\series default
itself is in production since 2013, and on mass data (several petabytes)
since 2014.
MARS itself is
\emph on
generic
\emph default
, and can be used for a multitude of Linux application stacks.
\end_layout
\begin_layout Itemize
A
\series bold
cluster manager
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
1&1 Ionos ShaHoLin uses a self-built proprietary cluster manager called
\family typewriter
cm3
\family default
.
It works only with the internal 1&1 database infrastructure, and is not
generic.
\end_layout
\end_inset
is typically also needed for mass installations.
You can use the
\family typewriter
systemd
\family default
template engine of
\family typewriter
marsadm
\family default
, see
\family typewriter
mars-user-manual.pdf
\family default
, which is easily configurable by Linux sysadmins.
\end_layout
\begin_layout Itemize
Typically,
\series bold
monitoring
\series default
is anyway specific for each application stack.
Adding some simple Icinga scripts or similar should be no problem for professio
nal Linux admins.
\end_layout
\begin_layout Itemize
Automatic
\series bold
mass deployment
\series default
: this is anyway specific for the deployment system used for your system
plus application stack.
At the moment, plugins for generic solutions like OpenStack etc are missing.
This is an opportunity for other OpenSource projects!
\end_layout
\begin_layout Itemize
The
\series bold
Football framework
\series default
is in mass production at 1&1 Ionos ShaHoLin since 2018.
It has some plugin for driving the
\family typewriter
systemd
\family default
cluster manager.
Its plugin architecture should allow easy adaptation to other system and
application stacks.
\end_layout
\begin_layout Itemize
Another opportunity for OpenSource projects: some web-based point-and-click
\series bold
dashboard
\series default
similar to the Ceph Dashboard, but displaying and controlling sharded LVM
pools which are replicated via MARS, and also controlling Football, would
be a highly appreciated addendum.
\end_layout
\begin_layout Section
Recommendations for Hard- and Software Project Setup
\begin_inset CommandInset label
LatexCommand label
name "sec:Recommendations-for-Project"
\end_inset
\end_layout
\begin_layout Standard
Big enterprises are often binding their technical projects (whether developmenta
l or operational ones) to
\emph on
specific
\emph default
products, or to
\emph on
specific
\emph default
platforms.
In addition, inter-team organisational structures are tending to
\emph on
fragmentation
\emph default
.
This can easily produce lots of
\series bold
missed opportunities
\series default
for
\series bold
synergy effects
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
In the storage field, missed synergy effects from projects are often creating
considerable
\series bold
\emph on
direct
\emph default
cost
\series default
.
For a total of petabytes, this can easily sum up to some millions.
\series bold
\emph on
Indirect
\emph default
long-term cost
\series default
, including
\series bold
insufficient flexibility
\series default
for the market, can be even higher.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
This section hints you at some countermeasures.
\end_layout
\begin_layout Subsection
Hardware Projects and Virtualization
\begin_inset CommandInset label
LatexCommand label
name "subsec:Hardware-Projects-and-Virtualization"
\end_inset
\end_layout
\begin_layout Standard
This section hints you at several pitfalls, which may result from misconceptions.
\end_layout
\begin_layout Subsubsection
Physical Hardware vs Virtual Hardware
\begin_inset CommandInset label
LatexCommand label
name "subsec:Physical-Hardware-vs-Virtual"
\end_inset
\end_layout
\begin_layout Standard
In theory, server hardware is independent from system software.
For example, you may install both Windows and Linux onto the same server
iron.
In practice, however, each software application stack may have
\emph on
different
\emph default
requirements for ...
\end_layout
\begin_layout Itemize
CPU power
\end_layout
\begin_layout Itemize
RAM size
\end_layout
\begin_layout Itemize
IOPS demands
\end_layout
\begin_layout Standard
...
independently from storage, whether it would be local one, or remote storage
over network.
In order to save cost, several companies are using
\series bold
virtualization
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Capabilities of virtualization
\end_layout
\end_inset
Several people are believing that virtualization will
\emph on
generally
\emph default
improve things.
While this is often true, there are
\emph on
exceptions
\emph default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
For large applications requiring a lot of CPU and RAM, such as big databases,
or masses of smaller databases, or webhosting with PHP as a primary consumer
of resources, virtualization will
\emph on
not
\emph default
magically give you more resources.
It can just
\emph on
dynamically re-distribute
\emph default
existing hardware resources across the same hypervisor iron, without magically
creating new resources out of thin air.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not try to virtualize a system which is
\emph on
already virtualized
\emph default
.
This can only be counter-productive.
Many people do not know that
\series bold
classical UNIX processes
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
Originally, processes have been invented at the beginning of the 1960s for
better exploitation of expensive physical resources, originally by providing
multiple
\begin_inset Quotes eld
\end_inset
virtual computers
\begin_inset Quotes erd
\end_inset
to
\emph on
different
\emph default
users.
Later, the concept of
\begin_inset Quotes eld
\end_inset
communicating sequential processes
\begin_inset Quotes erd
\end_inset
(Hoare) became popular as a structuring aid for the
\emph on
same
\emph default
user, which is now standard, and has been extended in various ways.
\end_layout
\end_inset
are also a form of virtualization.
When your system is already at its limit when carrying masses of conventional
processes (e.g.
by dynamically scaling the number of daemons / server processes), an additional
KVM layer or
\emph on
masses
\emph default
of docker instances (less so with an LXC layer or a
\emph on
low
\emph default
number of docker instances allowing resource sharing in the kernel) will
\emph on
not
\emph default
speed up your existing processes, but on the contrary, will likely lead to
\series bold
density regressions
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not neglect the
\series bold
overhead of virtualization
\series default
.
Running several dozens to hundreds of KVM instances on one iron will consume
a lot of RAM overhead, while the same amount of LXC containers is typically
cheaper.
For CPU overhead, the picture is similar, but typically less pronounced,
provided that CPU overbooking is
\emph on
very moderate
\emph default
.
When overbooking CPU too much with KVM / qemu (or commercial alternatives
like vmware), so-called
\series bold
steal overhead
\series default
can grow considerably, depending on various influences.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not expect linear behaviour: steal overhead can
\emph on
amplify itself
\emph default
in various situations, and hardware-based SMP systems can also go into
\series bold
RAM thrashing
\series default
/ multilayer
\series bold
CPU cache thrashing
\series default
when overloaded with too big workingsets (cf.
section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The so-called
\series bold
noisy neighbour problem
\series default
has been publicly advocated a few years ago, thus it is known by more people.
However, it is only a special sub-problem of more general workingset problems.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Capabilities of virtualization
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Avoid the above detail problems, which can lead to
\series bold
serious cost increase
\series default
(both direct and indirect cost), by careful checking in advance.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Let the check be done by skilled experts who know what a workingset is, and
how to measure it, and how to work around corresponding problems.
\end_layout
\end_inset
\end_layout
\begin_layout Subsubsection
Storage Hardware
\begin_inset CommandInset label
LatexCommand label
name "subsec:Storage-Hardware"
\end_inset
\end_layout
\begin_layout Standard
It is easy to miss opportunities for cost savings, or even to produce
\emph on
massive regressions
\emph default
by
\emph on
factors
\emph default
, by
\series bold
unexpected side effects
\series default
of management decisions.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\series bold
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Missed: architecture had to follow organization
\end_layout
\end_inset
\series default
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
A frequent mistake is to organize teams or departments by introduction
of a border between
\begin_inset Quotes eld
\end_inset
storage admins
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
sysadmins
\begin_inset Quotes erd
\end_inset
, and assigning them more or less complementary technical responsibilities.
Typical arguments can be heard that each could then better
\emph on
concentrate
\emph default
on his speciality.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
What looks like a
\begin_inset Quotes eld
\end_inset
good idea
\begin_inset Quotes erd
\end_inset
at first glance, will likely prevent several cost-saving models like
\family typewriter
FlexibleSharding
\family default
, see sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Variants-of-Sharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:FlexibleSharding"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
As explained there, this can
\series bold
increase cost
\series default
by factors, and
\series bold
reduce reliability
\series default
considerably (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Optimum-Reliability-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Similarly: creating a department (or a team) which is
\series bold
responsible for the whole storage of a division or of the company
\series default
is a very bad idea.
Besides the NOF risks explained in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
, it can easily
\series bold
bind you for decades
\series default
, likely to either cost-intensive commercial storage appliances (depending
on the gusto of involved people, see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Local-vs-Centralized"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and/or to some
\family typewriter
BigCluster
\family default
architecture.
It simply means that only
\series bold
network-centric storage hardware
\series default
can be used in practice, and that an expensive storage network becomes
mandatory in practice (otherwise capacity planning etc could become difficult).
Other types of storage will become almost impossible.
Changing such an architecture for some petabytes of data will be very cumbersom
e and time-consuming.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\series bold
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Better: organization follows architecture
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Always consider alternatives, and determine / estimate their TCO for at
least 5 years, better 10 years.
You need to include
\series bold
migration cost
\series default
when both EOL storage hardware and EOL server hardware has to be replaced
by newer one (hardware lifecycle).
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice: the
\family typewriter
FlexibleSharding
\family default
model is naturally well-suited for VMs of various types.
If you want to split the overall IT responsibility, then the
\series bold
VM layer
\series default
is
\emph on
typically
\emph default
a much better
\emph on
candidate
\emph default
than introduction of a dedicated network-centric storage layer.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Software Project Recommendations
\begin_inset CommandInset label
LatexCommand label
name "subsec:Software-Project-Recommendations"
\end_inset
\end_layout
\begin_layout Standard
On one hand, software
\emph on
appears
\emph default
to be easier exchangeable than masses of hardware.
However, this only applies to
\emph on
components
\emph default
in practice.
More complex software stacks or networks are typically too complex, and
are often containing lots of
\series bold
hidden dependencies
\series default
.
\end_layout
\begin_layout Standard
In this section, we will look at various obstacles where software, and in
particular the
\series bold
fundamental architecture
\series default
of software, is
\series bold
limiting flexibility
\series default
and producing
\series bold
unnecessary cost
\series default
.
\end_layout
\begin_layout Standard
The scope of this section is exceeding the storage area.
Most of the given advice will also apply to more general enterprise software.
\end_layout
\begin_layout Subsubsection
Usefulness Scope of Software
\begin_inset CommandInset label
LatexCommand label
name "subsec:Usefulness-Scope-of-Software"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
A very important property of software: after it is
\series bold
written once
\series default
,
\emph on
in general
\emph default
it can be
\series bold
instantiated many times
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
While creation of copies of tangible goods typically costs a lot of effort
and money, software copies
\emph on
as such
\emph default
are costing
\emph on
almost nothing
\emph default
.
This is a major source of
\series bold
cost saving potential
\series default
, while at the same time
\series bold
improving quality
\series default
as explained below.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Observations from the whole industry, not very specific for a single company:
in practice there exists
\emph on
lots
\emph default
of software which actually is installed only
\emph on
once
\emph default
.
Most of it is constructed in such a way that it
\emph on
cannot
\emph default
be easily installed another time, or suchalike would not be useful, because
it is
\series bold
firmly bound
\series default
to a
\series bold
singleton instance
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Singletons
\end_layout
\end_inset
So-called
\begin_inset Quotes eld
\end_inset
enterprise databases
\begin_inset Quotes erd
\end_inset
which often have their own enterprise-specific database schema, or even
their own
\series bold
product-specific schema
\series default
.
Much of the software / scripts around them makes only sense for this particular
schema.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Workarounds for incompatibilities
\end_layout
\end_inset
So-called
\begin_inset Quotes eld
\end_inset
middleware
\begin_inset Foot
status open
\begin_layout Plain Layout
This usage of the term
\begin_inset Quotes eld
\end_inset
middleware
\begin_inset Quotes erd
\end_inset
is
\emph on
incorrect
\emph default
in a strict sense.
The original goal of middleware was providing
\series bold
universally generic
\series default
marshalling and translation of data formats between
\begin_inset Quotes eld
\end_inset
incompatible
\begin_inset Quotes erd
\end_inset
\begin_inset Quotes eld
\end_inset
platforms
\begin_inset Quotes erd
\end_inset
(where nowadays the latter term often is also used incorrectly, because
a
\begin_inset Quotes eld
\end_inset
platform
\begin_inset Quotes erd
\end_inset
is a
\series bold
stable interface
\series default
/ foundation for a
\series bold
\emph on
multitude
\emph default
of application
\emph on
classes
\series default
\emph default
).
\end_layout
\end_inset
\begin_inset Quotes erd
\end_inset
is often translating and adapting between multiple singletons.
It makes no sense to instantiate this type of
\begin_inset Quotes eld
\end_inset
middleware
\begin_inset Quotes erd
\end_inset
somewhere else.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Another frequent ill-design is placement of
\series bold
business logic
\series default
in so-called
\begin_inset Quotes eld
\end_inset
middleware
\begin_inset Quotes erd
\end_inset
.
According to Dijkstra's layering rules (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
), business logic should get its own layer, independently from cross-platform
concerns (aka
\series bold
separation of concerns
\series default
).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
From the discipline of software engineering
\begin_inset Foot
status open
\begin_layout Plain Layout
Explanation: software engineering as a discipline has the
\emph on
opposite
\emph default
goal of
\emph on
maximizing
\emph default
several important KPIs of software.
\end_layout
\end_inset
:
\series bold
non-instantiable singletons
\series default
are an
\series bold
indicator
\series default
of
\series bold
\emph on
poor software design
\series default
and practice
\emph default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Likely, your competitors will have similar problems, often without noticing
them.
If you are the first to
\series bold
overcome them in the long term
\series default
, you will get an
\series bold
advantage
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The usefulness of software and/or of its components can be roughly classified
as follows:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/usefulness.fig
width 60col%
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Globally useful software
\end_layout
\end_inset
The Linux kernel is installed at several
\emph on
billions
\emph default
of instances.
From the biggest server, on supercomputers, down to
\emph on
billions
\emph default
of smartphones, and on tiny IoT gadgets.
In order to support such a wide variety of hardware, it is
\series bold
highly customizable
\series default
through thousands of compile-time config options, and lots of runtime options.
Additionally, it has a high degree of automatic adaptation to hardware
components,
\series bold
automatic self-configuring
\series default
, etc.
Its userspace API does not only support classical libc-based Unix software,
but also the completely different execution engines of smartphones, and
much more.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Domain-specific generic software
\end_layout
\end_inset
Football (see
\family typewriter
football-user-manual.pdf
\family default
) is domain-specific in the sense that it is only useful for sharded storage,
but not for BigCluster storage.
Its main part is generic, since it is
\series bold
extensible via plugins
\series default
.
For usage in other application areas than currently in production, some
new plugins might be necessary.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Instance-specific software
\end_layout
\end_inset
Tetris is the 1&1-internal name for the instance-specific customization
\emph on
plugin
\emph default
of Football.
Its size is about 1/3 of the whole Football system.
The Tetris plugin is only useful at the 1&1 Ionos ShaHoLin software instance,
while the 2/3 generic parts are intended to be useful for
\emph on
any
\emph default
MARS installation.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In general, most non-modularized instance-specific software is
\emph on
not
\emph default
based on higher usability levels.
Then the
\emph on
whole
\emph default
investment is practically not re-usable.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Tetris is an example of how divergent requirements from broader usefulness
desires can be combined with instance-specific requirements.
The basic idea is an
\series bold
extensible plugin software architecture
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Maximizing the usefulness KPI
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In general, all three usefulness classes (globally useful / domain-specific
generic / instance-specific) are needed for a healthy enterprise.
It is not possible to operate your business purely with
\begin_inset Quotes eld
\end_inset
globally useful software
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Plain Layout
\noindent
On the other hand, operating your whole business with instance-specific
software would be
\emph on
theoretically
\emph default
possible, but extremely expensive, and likely un-economical / non-competitive.
\end_layout
\begin_layout Plain Layout
You can
\series bold
maximize the overall usefulness
\series default
by using
\emph on
as much
\emph default
from the upper classes
\emph on
as reasonably possible
\emph default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
For example, you can make an inventory of all your software assets, including(!
) free ones from OpenSource your people are just downloading and installing,
and
\series bold
evaluate the usefulness
\series default
according to the above classification, then determine the
\series bold
number of instances
\series default
for each asset, and finally
\series bold
create a weighted
\series default
\begin_inset Foot
status open
\begin_layout Plain Layout
The
\begin_inset Quotes eld
\end_inset
size
\begin_inset Quotes erd
\end_inset
or
\begin_inset Quotes eld
\end_inset
development effort
\begin_inset Quotes erd
\end_inset
for software components needs to be taken into account.
They can vary by several orders of magnitude.
Treating them as
\begin_inset Quotes eld
\end_inset
equal-sized bricks
\begin_inset Quotes erd
\end_inset
would massively over-emphasize tiny helper scripts.
Since there is often some binary-only proprietary software, a possible
weighting method could be the
\emph on
installed binary size
\emph default
in bytes.
This will also lead to distortions, but typically less significant than
\begin_inset Quotes eld
\end_inset
uniform bricks
\begin_inset Quotes erd
\end_inset
.
Theoretically, you could discriminate between code and data (e.g.
images), but this might lead to a high effort for inventory.
Simple solutions are better in practice.
Exceptional corrections can be applied when distortions are getting too
high in certain places.
\end_layout
\end_inset
\series bold
KPI
\series default
out of it.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
It is
\emph on
critical
\emph default
to
\series bold
not forget
\emph on
external
\emph default
OpenSource
\series default
assets which cost
\emph on
nothing
\emph default
, but heavily contribute to your business value, and/or contribute by risk
reduction, etc.
Beware of SAP & relatives, typically there exists no inventory for them.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
You might derive further sub-KPIs, such as per-asset TCO, or business value,
or risk indicators, etc.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
As a side effect, you will likely find many more opportunities for long-term
improvements of your enterprise than you can implement in the short term.
Evaluate their
\series bold
potential
\series default
, and
\series bold
prioritize
\series default
accordingly.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
As a manager, the big question is:
\emph on
how
\emph default
can you achieve better usefulness in
\series bold
long term
\series default
? Just use a KPI, or are there further aspects not modeled by KPIs?
\end_layout
\begin_layout Standard
For a better background, have a brief look at the following classification
of
\series bold
architectural potentials
\series default
.
\end_layout
\begin_layout Subsubsection
Architectural Levels of Genericity
\begin_inset CommandInset label
LatexCommand label
name "subsec:Architectural-Levels-of-Genericity"
\end_inset
\end_layout
\begin_layout Standard
Managers only interested in an overview may skip the rest after the first
graphics, showing 3 different levels of genericity.
Architects should
\emph on
not
\emph default
skip the examples.
\end_layout
\begin_layout Standard
Here is a classification of
\series bold
genericity
\series default
according to its
\series bold
re-use potential
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\series bold
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Genericity and re-use
\end_layout
\end_inset
Re-use
\series default
means that each time something needs to be implemented, or each time some
requirements are changing, some new software components need not be implemented
from scratch, but already existing components / parts are just
\series bold
recycled
\series default
and used in a different way or in a different context.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In general, components / parts
\series bold
need to be constructed for re-use
\series default
.
When not prepared for re-use, artefacts will be less useful, or even not
useful for re-use at all.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A good way for re-use preparation is
\series bold
genericity
\series default
.
It means that something is only
\begin_inset Quotes eld
\end_inset
prepared for use
\begin_inset Quotes erd
\end_inset
, by providing some
\emph on
concrete interface
\emph default
for both use and re-use, such that any concrete usage is
\emph on
relatively easy
\emph default
.
\end_layout
\begin_layout Plain Layout
In other words: although the
\emph on
first use
\emph default
is slightly more expensive because of intermediate introduction of genericity
and its documented or
\emph on
self-documenting(!)
\emph default
interfaces,
\emph on
any
\emph default
later
\emph on
re-use
\emph default
will then be
\series bold
cheaper
\series default
than making everything from scratch again.
When re-use is executed frequently enough,
\series bold
investments into genericity
\series default
will
\series bold
pay off rapidly
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
If you are
\series bold
unsatisfied
\series default
with
\series bold
software development productivity
\series default
in your company, consider the following.
You need to
\series bold
explicitly request
\series default
a certain level of genericity as a
\series bold
preparation for long-term re-use
\series default
.
Otherwise, you likely won't get it.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Reason: people want to finish their current projects
\emph on
as fast as possible
\emph default
, typically
\emph on
missing
\emph default
important opportunities for preparation of re-use (provided they have the
necessary skills).
This behaviour is often heavily amplified by
\series bold
deadlines
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
The following can be used to classify not only the genericity of software
itself or of programming styles, but also of
\series bold
software architectures
\series default
(see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
The biggest potential of genericity is when applied at architectural level:
\end_layout
\begin_layout Standard
\noindent
\align center
\begin_inset Graphics
filename images/levels-of-genericity.fig
width 60col%
\end_inset
\end_layout
\begin_layout Enumerate
\series bold
Universal genericity
\series default
means that potentially an
\series bold
infinite
\series default
number of re-usage variants (potential:
\begin_inset Formula $\infty$
\end_inset
) can be derived
\series bold
easily
\series default
, by
\series bold
configuration
\series default
and/or by
\series bold
convention
\series default
.
A few examples:
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Unix files
\end_layout
\end_inset
Invented in the 1970s, Unix files are extremely universal.
They can hold
\emph on
anything
\emph default
, from simple ASCII text to executables, and to complex database containers.
This is possible by a
\series bold
universally generic representation
\series default
: a file is nothing but a sequence of bytes
\begin_inset Foot
status open
\begin_layout Plain Layout
Predecessor filesystems were typically more complex, e.g.
a file was a sequence of
\emph on
records
\emph default
.
There was a variety of variants, like fixed-length records, variable-length
records, indexed records, etc.
These had further problems, because the
\emph on
byte
\emph default
was not yet standardized as exactly 8 bit.
There were 6-bit bytes, or 12-bit bytes, etc.
\end_layout
\end_inset
, with an arbitrary length, which can change dynamically at runtime.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The genericity of Unix files is a striking example that sometimes
\series bold
less code is more value
\series default
! Unix files are
\series bold
simpler
\series default
than the
\series bold
unnecessary complexity
\series default
of historical record-based predecessor file concepts.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The
\series bold
only investment
\series default
for exploitation of fruitful generic simplification:
\series bold
careful thinking
\series default
before starting an implementation, best from experienced software architects
/ experts.
This can save you up to
\emph on
factors
\emph default
!
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Business process languages
\end_layout
\end_inset
Business process languages like BPML and their execution engines are modern
universally generic systems, but typically used for domain-specific purposes.
There you can see that both concepts usefulness vs genericity are
\emph on
orthogonal
\emph default
to each other to some degree.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Universal compilers / interpreters
\end_layout
\end_inset
LISP is one of the oldest programming languages in the world, invented in 1959.
It can be used to express
\series bold
any mathematical problem
\series default
, while only a
\emph on
subset
\emph default
of them is actually
\series bold
computable
\series default
, even by modern supercomputers.
While the Unix file as such is just a
\emph on
passive
\emph default
item and thus not prone to computability problems, active items like LISP
programs are
\series bold
Turing Complete
\series default
, which is a double-edged sword in practice.
Although
\emph on
extremely capable
\emph default
, it is not easy to understand and to control.
Many modern
\series bold
IT risks
\series default
(e.g.
security risks) can be deduced from Turing Completeness.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
There are universally generic compilers and interpreters, for example parser
generators, which are
\emph on
not
\emph default
Turing Complete by their basic configuration language.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Macro mechanisms and C++ templates
\end_layout
\end_inset
Parameterization can be done via C preprocessor macros, or C++ templates,
or other macro processors.
Macro substitution can not only be applied to programming languages, but
also to configuration data.
An example is the
\family typewriter
systemd
\family default
interface of
\family typewriter
marsadm
\family default
, see
\family typewriter
mars-user-manual.pdf
\family default
.
It suffices to define a certain
\family typewriter
systemd
\family default
unit template only once, and then let it automatically instantiate for
hundreds or thousands of LVs and their application stacks.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Recommendation
\end_layout
\end_inset
Universal genericity has the
\series bold
highest potential
\series default
, and should be always considered for
\emph on
passive
\emph default
use cases.
Several
\emph on
active
\emph default
systems, however, bear a relatively high risk when Turing Complete, when
not developed and maintained and operated by
\series bold
highly skilled staff
\series default
which can
\emph on
really
\emph default
deal with their complexity, and who are
\series bold
really knowing what they are doing
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
\series bold
Compositorical genericity
\series default
is similar to the composability of LEGO bricks: via a more or less
\series bold
uniform standard interface
\series default
, numerous re-combinations / compositions can be easily created.
Its potential is similar to
\series bold
permutations
\series default
, thus factorial:
\begin_inset Formula $O(n!).$
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Pipe and filters style
\end_layout
\end_inset
A good example is an architectural style called
\series bold
pipe and filters style
\series default
, which is the heart of the Unix Philosophy.
In the original Unix concept, a relatively
\emph on
low
\emph default
number of simple
\begin_inset Foot
status open
\begin_layout Plain Layout
Modern Unix-like systems including GNU/Linux have much more complex operators,
some with hundreds of options.
Nevertheless, they can also be used for compositorical genericity.
\end_layout
\end_inset
basic operators were used for creation of an extremely wide variety of
complex data processing pipelines.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
There is a programming language which directly supports this style, called
Bash Script in its modern version.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Stacked block devices
\end_layout
\end_inset
Linux has inherited the concept from Unix.
In Unix,
\begin_inset Quotes eld
\end_inset
everything is a file
\begin_inset Quotes erd
\end_inset
, and thus Unix devices are also
\emph on
represented
\emph default
as a file.
Block devices are a special case, where only certain access granularities
like multiples of
\emph on
sectors
\emph default
are possible.
Modern Linux has augmented the concept with several special operations,
such as
\family typewriter
BLKDISCARD
\family default
and other
\family typewriter
ioctl()
\family default
syscalls.
Nevertheless, block devices are stackable, for example for creation of
software RAID.
Stacks are very flexible, for example you may place MARS on top of LVM
on top of software RAID, or in a different order, or you may insert SSD
caches at various positions, etc.
The number of
\emph on
potential
\emph default
combinations is very high.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
For
\emph on
usage
\emph default
of stacked block devices, you don't need to be a programmer.
Exploiting compositorical genericity is possible from sysadmin space.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
However,
\emph on
creation
\emph default
of a new stackable component is a completely different story.
Linux
\series bold
kernel programming
\series default
requires completely different skills, and even among kernel hackers a junior
level is anything but sufficient
\begin_inset Foot
status open
\begin_layout Plain Layout
C programming is
\emph on
one
\emph default
of many
\emph on
preconditions
\emph default
for kernel hacking.
It is however not sufficient.
The Linux kernel is a technical universe in itself.
While many userspace C programmers need not deal with
\series bold
concurrency
\series default
, or only with harmless standard cases, kernel programmers need to know
and have experience with about a
\emph on
dozen
\emph default
different concurrency models and their concrete implementations.
This is required for SMP scalability, weak memory semantics / memory barrier
hardware operations, RCU, and much more, in addition to classical interrupt-dri
ven concurrency models.
\end_layout
\end_inset
.
As a manager, do not confuse these HR requirements!
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Electrical engineering
\end_layout
\end_inset
Electrical engineers have used compositorical genericity even before the
digital computer had been invented.
Their
\series bold
wiring diagrams
\series default
are connecting basic
\series bold
functional units
\series default
, for example transistors or resistors, or whole sub-circuits.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Established use case for compositorical genericity
\end_layout
\end_inset
By using Linux, you automatically get it via
\family typewriter
ssh
\family default
commandlines used by sysadmins.
Experienced Linux seniors will confirm that its
\series bold
automation potential
\series default
is beyond anything having a graphical point-and-click interface.
System administration for several hundreds or thousands of servers would
be an extreme effort, or almost impossible otherwise.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
New use cases for compositorical genericity
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
There are many more use cases where compositorical genericity would be
extremely beneficial.
Its potential is
\begin_inset Formula $O(n!)$
\end_inset
where
\begin_inset Formula $n$
\end_inset
is not the number of developers, but the number of functor instances
\begin_inset Foot
status open
\begin_layout Plain Layout
In general, a functor of a certain type can be instantiated several times,
even in the same pipeline.
\end_layout
\end_inset
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
The biggest practical obstacle is that too few people know of its enormous
potential, and even fewer people have practical experience with it in larger
scale systems, such as Distributed Systems.
When you have a few excellent people with the necessary skills, don't force
them to use so-called
\begin_inset Quotes eld
\end_inset
standard paradigms
\begin_inset Quotes erd
\end_inset
like OO, but let them exploit the much higher potential of compositorical
genericity.
Often, they won't be able to do so unless you help them by creating a special
friendly working environment.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Don't be surprised when a single developer shows a
\series bold
productivity
\series default
roughly equivalent to 10 conventional OO developers, or even more.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not confuse the roles of sysadmins with the roles of developers.
Just because sysadmins usually are more used to pipe and filters style,
this does not magically convert them into developers.
A developer for compositorical genericity at large scale needs to know
much more, at least at a
\series bold
master's level in computer science
\series default
, if not at a PhD level.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not populate a team with OO addicts or with people who don't have the
necessary skills, if you want to exploit the potential of compositorical
genericity.
Ask the inevitable
\emph on
experienced technical leader
\emph default
, who else may have the necessary skills, in order to qualify as additional
team member.
There exists practically no standard hiring profile on the job market.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
\series bold
Extensional genericity
\series default
means that an existing component needs to be re-used by
\emph on
extending
\emph default
it.
Its potential is only
\begin_inset Formula $O(k)$
\end_inset
where
\begin_inset Formula $k$
\end_inset
is a constant depending on your development resources.
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Classical OO = Object Orientation
\end_layout
\end_inset
No detailed explanation necessary, because many people already know what
\series bold
OO inheritance
\series default
is, and have some experiences with it.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Typically, programmer skills are required for non-trivial large-scale systems.
Pure sysadmin skills are often not sufficient.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
There are lots of programmers on the job market, qualifying for OO.
However, many of them are often lacking some sysadmin skills when HA operations
are required.
Thus a
\emph on
mixed team
\emph default
with both skill sets is something you should consider for
\series bold
enterprise-critical
\series default
application stacks.
In addition, automated testing is highly recommended.
\end_layout
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Classical OO = Object Orientation
\end_layout
\end_inset
Probably you are surprised that classical OO inheritance has the
\emph on
least
\emph default
potential, only
\begin_inset Formula $O(k)$
\end_inset
, while alternatives are much better, e.g.
\begin_inset Formula $O(n!)$
\end_inset
or
\begin_inset Formula $O(\infty)$
\end_inset
.
\end_layout
\begin_layout Plain Layout
Reason: for any new OO functionality, some skilled programmer has to write
some program code, which needs to be tested and made production-ready.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Thus real-life OO productivity is often lower than promised by advocates.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In general, programming language paradigms are
\emph on
orthogonal
\emph default
to levels of genericity.
For example, compositorical genericity can be implemented with OO languages.
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Genericity in the Linux kernel
\end_layout
\end_inset
The Linux kernel has more than 20 million lines of code written in C.
Many people are regarding C as an imperative language, some even condemning
it as
\begin_inset Quotes eld
\end_inset
high-level assembler
\begin_inset Quotes erd
\end_inset
.
However, the kernel has many parts like stackable filesystems where OO
techniques are used.
Several parts, like the dm = device mapper infrastructure, are more or
less following many principles from compositorical genericity.
Universal genericity is also present, for example in firewall rules execution
engines.
Few people seem to know that even FP = Functional Programming style is
possible in C, if you know how to do it.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Good C programming requires some skills.
People who
\emph on
really
\emph default
have those skills are reaching a similar productivity as with other programmi
ng languages.
Notice that C has some unique application areas where other languages are
practically out of the game
\begin_inset Foot
status open
\begin_layout Plain Layout
Several years ago, some Java advocates were claiming that operating systems
would be better written in Java, thus C will vanish in the long term.
This has not come true.
Reason: it is not
\emph on
reasonably
\emph default
possible to write a JVM = Java Virtual Machine in Java, while all major
JVMs are written in C.
\end_layout
\end_inset
, such as kernel and deep system programming.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
A good programmer is treating programming languages as
\series bold
tools
\series default
, which have
\emph on
no global
\emph default
pros and cons, but each of them is more or less well-suited for each specific
\series bold
application area
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Section
From OpenSource Consumers to Contributors to Leaders
\begin_inset CommandInset label
LatexCommand label
name "sec:From-OpenSource-Consumers"
\end_inset
\end_layout
\begin_layout Standard
The basic idea of OpenSource is very simple:
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Fundamental idea of OpenSource
\end_layout
\end_inset
Several competitors and enthusiasts are meeting together in a common neutral
playground, also called
\series bold
commons
\series default
or
\series bold
common land
\series default
.
Each is contributing something useful to the commons.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
As a
\emph on
result
\emph default
of collaboration,
\emph on
each
\emph default
of them is
\series bold
getting back more value
\series default
than
\emph on
each
\emph default
of them has contributed.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
OpenSource is much more than a particular component.
In fact, it is a
\series bold
whole ecosystem
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
This means: by definition, only globally useful software (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Usefulness-Scope-of-Software"
plural "false"
caps "false"
noprefix "false"
\end_inset
) can qualify as OpenSource commons.
In some cases, domain-specific generic software may qualify also, but this
needs to be checked.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
This
\emph on
usability gap
\emph default
leaves you an opportunity for
\series bold
company-individual or product-specific customization
\series default
even of your own OpenSource components, provided you manage to get an appropria
te degree of genericity (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Architectural-Levels-of-Genericity"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Best areas for OpenSource
\end_layout
\end_inset
Some people seem to fear that OpenSource might help their competitors too
much.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
When OpenSource is used for
\series bold
basic infrastructure
\series default
instead of finished competitive products, then a
\series bold
win-win
\series default
situation is always
\series bold
improving competitiveness
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
There are a few
\emph on
examples
\emph default
where even giving away a
\emph on
full product
\emph default
can improve competitiveness.
An example is Google Android.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In addition, each competitor may make better business by
\series bold
strengthening a whole ecosystem
\series default
, e.g.
attracting more customers in total, etc.
\end_layout
\begin_layout Plain Layout
In particular, this can make sense when competition is more between
\emph on
whole ecosystems
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Example: today there is an increasing competition between the webhosting
ecosystem (including public blog software like WordPress), and account-based
social media (e.g.
Facebook & co).
\end_layout
\end_inset
, than between individual companies.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
The vast majority of companies will profit from the Linux kernel, because
selling OS software is
\emph on
not
\emph default
their core business.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
Several ecosystems are already
\series bold
dominated by OpenSource
\series default
.
Commercial competitors would not have a chance anyway
\begin_inset Foot
status open
\begin_layout Plain Layout
Exceptions are confirming the rule: only certain near-by niches which aren't
covered by OpenSource may have a chance for commercial products.
When the niche is big enough, or when its importance increases, some existing
nearby OpenSource projects may extend their coverage, or some new projects
may jump in.
The history of Linux has shown that even pre-existing older commercial
Unix systems were marginalized, and several disappeared eventually.
\end_layout
\end_inset
, because the
\series bold
world-wide total productivity
\series default
(e.g.
\series bold
scaling effects
\series default
) of OpenSource is unbeatable in such areas.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
MARS would not have a chance for long-term survival if it weren't OpenSource.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
How to take advantage of OpenSource? The OSAMM = Open Source Adoption Maturity
Model is explained in simplified form at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://baloise.github.io/open-source/docs/md/goals/uplift.html
\end_layout
\end_inset
.
More context can be found in Lofi Dewanto's presentation
\begin_inset Flex URL
status open
\begin_layout Plain Layout
https://drive.google.com/file/d/1GHLogE3ibdyjPaYfK_O4ELVtvUcE051R/view
\end_layout
\end_inset
, in particular slide 24.
There are 3 levels of OpenSource adoption which are interesting for most
companies:
\end_layout
\begin_layout Enumerate
\series bold
Use
\series default
.
Typically, OpenSource software is just downloaded, possibly compiled (depending
on development model), and installed.
\end_layout
\begin_layout Enumerate
\series bold
Contribute
\series default
.
Some code / documentation / feedback is flowing from in-house users back
to the public project.
\end_layout
\begin_layout Enumerate
\series bold
Champion
\series default
.
Somebody in the company has a leading role in the public project, and is
thus leading a
\series bold
movement
\series default
.
\end_layout
\begin_layout Standard
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Important OpenSource specialities
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
There are certain misconceptions about OpenSource, which can lead to
\series bold
fatal failures
\series default
.
Here are some extremely important explanations:
\end_layout
\begin_layout Itemize
OpenSource is more about a
\emph on
movement
\emph default
than about the
\begin_inset Quotes eld
\end_inset
software as such
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Itemize
Several important OpenSource projects, like the Linux kernel, have been
\series bold
founded by individuals
\series default
and
\emph on
not
\emph default
by companies.
Such projects are following
\series bold
different rules than company projects
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Rules in personal OpenSource projects
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresToxiques.png
lyxscale 50
scale 17
\end_inset
Managers who don't know the written and un-written rules of
\series bold
personally led OpenSource projects
\series default
can easily create
\series bold
substantial damage
\series default
, up to the destruction of a (sub-)project.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
The principles behind OpenSource movement rules can be found in Eric Raymond's
articles from the 1990s and early 2000s.
You need to understand that OpenSource communities are a
\series bold
gift culture
\series default
, aka
\series bold
meritocracy
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Personal leadership in Linux
\end_layout
\end_inset
Practically everybody knows that Linus Torvalds has founded the Linux kernel.
His name is even encoded into the project name.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
As you can read at
\begin_inset Flex URL
status open
\begin_layout Plain Layout
http://www.kernel.org
\end_layout
\end_inset
, there is no chance to submit a patch originating from a company.
Linus and the kernel hackers will simply ignore it.
Only patches submitted by
\emph on
individuals
\emph default
are acceptable at all.
It would be bad style to argue
\begin_inset Quotes eld
\end_inset
you must accept this patch because I am from company XYZ, and I am paid
by my company to create this patch
\begin_inset Quotes erd
\end_inset
.
Even if the company name had three capital letters, it wouldn't help.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Kernel modules like MARS
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
As mentioned above, MARS would have no long-term chance for survival unless
OpenSource.
Since it is a Linux kernel module, it
\series bold
\emph on
cannot exist
\emph default
independently from Linux
\series default
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Consequence: anyone who wants to work at the core of the MARS project
\series bold
\emph on
must accept
\emph default
the same rules
\series default
as for the Linux kernel
\begin_inset Foot
status open
\begin_layout Plain Layout
Example:
\family typewriter
grsecurity
\family default
was
\emph on
technically
\emph default
a sub-project of Linux, but did not comply with the rules of the Linux community.
Therefore it failed in 2017, after more than a decade of OpenSource activity.
Some of its remains are now migrated into mainstream, but not by the original
founder of the technical sub-project.
\end_layout
\end_inset
.
\end_layout
\begin_layout Plain Layout
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Eric Raymond's famous articles need to be obeyed, too.
For example, as a company you
\emph on
cannot decide
\emph default
to replace the founder of the project (which started only upon personal
initiative and
\emph on
not
\emph default
as a company project), with another person.
Otherwise, the public OpenSource project would be either dead, or it would
necessarily lead to a project fork.
Only one of the forks could survive in the long term, and be included into
mainstream Linux.
Which one will become clear to you, once you have read Eric Raymond's articles
(if you cannot guess it anyway from terms like
\series bold
meritocracy
\series default
).
\end_layout
\end_inset
\end_layout
\begin_layout Section
Recommendations for Design and Operation of Storage Systems
\begin_inset CommandInset label
LatexCommand label
name "sec:Recommendations-for-Designing"
\end_inset
\end_layout
\begin_layout Subsection
Recommendations for Managers
\begin_inset CommandInset label
LatexCommand label
name "subsec:Recommendations-for-Managers"
\end_inset
\end_layout
\begin_layout Standard
When you are responsible for
\series bold
masses of enterprise-critical data
\series default
, the most important point is to get people with
\series bold
the right skills
\series default
, in
\emph on
addition(!) to
\emph default
the
\emph on
right mindset
\emph default
, and to assign the right roles to them.
\end_layout
\begin_layout Standard
Practical observation from many groups in many companies: which storage
systems / architectures are in use, and how much they are
\emph on
really
\emph default
\series bold
failure resistant
\series default
and
\series bold
reliable
\series default
, and how much they are
\emph on
really
\emph default
\series bold
scalable
\series default
for their workload, and what is their
\series bold
TCO = Total Cost of Ownership
\series default
, does often
\emph on
not
\emph default
depend on real knowledge and on facts.
It often depends
\series bold
randomly
\series default
on
\series bold
personal habits
\series default
and
\series bold
pre-judgement
\series default
of staff
\begin_inset Foot
status open
\begin_layout Plain Layout
\noindent
This can be seen in a bigger company (e.g.
after mergers etc) when very different architectures have been built by
different teams for very similar usecases, although they are sometimes
even roughly comparable in size and workload.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In essence, this results in a
\series bold
gambling game
\series default
how safe / cost-effective etc your critical data
\emph on
really
\emph default
is.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In particular after company mergers, suchalike varieties need not remain
a permanent disadvantage.
You may turn it into an advantage.
Once you have enough reliable and validated KPIs about each of the systems,
and after you have checked that they are
\emph on
really
\emph default
comparable, you can derive a detailed comparison of competing architectures
and/or of their actual implementations.
Then you may start
\series bold
merging
\series default
some of the technical platforms, provided there is a business case for
it.
Or, you may
\series bold
bleed out
\series default
some old / obsolete technology.
\end_layout
\begin_layout Standard
When the game is about building up
\series bold
new functionality
\series default
from scratch, it is much different.
There are two main possibilities:
\end_layout
\begin_layout Enumerate
check whether your
\emph on
best
\emph default
platform can be extended with the new functionality.
Good architectures are also
\series bold
easily extensible
\series default
.
\end_layout
\begin_layout Enumerate
build a new platform.
\end_layout
\begin_layout Standard
The rest of this section focusses on architecture of
\emph on
new
\emph default
platforms.
Always check whether existing
\emph on
experience
\emph default
can be re-used.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
As explained throughout section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Scalability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
, there are many pitfalls, and there are only few people who know them,
because more people are working in small-scale systems than in large-scale
enterprise ones.
There are lots of people on the market who
\emph on
claim
\emph default
to have some experience, but in reality they don't know what they don't
know (
\series bold
second-order ignorance
\series default
).
\end_layout
\begin_layout Standard
Second-order ignorance is very dangerous, even for affected people themselves,
because they are in good faith about their own skills, and that they would
be able to control everything (sometimes they really want to control literally
\emph on
everything
\emph default
, even other people who have more real experience and knowledge).
See for example wrong assumptions and
\begin_inset Quotes eld
\end_inset
false proofs
\begin_inset Quotes erd
\end_inset
about scalability, derived from different use cases (or even from workstation
workloads).
See the failed scalability scenario in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
where some freelancers were consulted as
\begin_inset Quotes eld
\end_inset
external experts
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Pitfall
\begin_inset Quotes eld
\end_inset
false experts
\begin_inset Quotes erd
\end_inset
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Check your information sources! There is a
\emph on
systematic reason
\emph default
for ill-informed
\begin_inset Quotes eld
\end_inset
experts
\begin_inset Quotes erd
\end_inset
: the internet.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
On the internet, you can find a lot of so-called
\begin_inset Quotes eld
\end_inset
best practices
\begin_inset Quotes erd
\end_inset
.
Many of them are propagating badly scaling storage architectures for enterprise
workloads, sometimes even
\emph on
generally
\emph default
claiming they would
\begin_inset Quotes eld
\end_inset
scale very well
\begin_inset Quotes erd
\end_inset
, which is however often based on
\emph on
assumptions
\emph default
instead of knowledge (and rarely based on
\emph on
measurements
\emph default
at the right measurement points for deriving substantial knowledge about
your
\emph on
real
\emph default
application behaviour).
Literally
\emph on
anyone
\emph default
can post incorrectly generalized
\begin_inset Quotes eld
\end_inset
best practices
\begin_inset Quotes erd
\end_inset
to the internet.
Together with second-order ignorance about the non-transferability of
\begin_inset Quotes eld
\end_inset
success stories
\begin_inset Quotes erd
\end_inset
from usecase A to usecase B (resulting in
\emph on
false
\begin_inset Quotes eld
\end_inset
proofs
\emph default
\begin_inset Quotes erd
\end_inset
), the internet is creating
\series bold
information bubbles
\series default
.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 1
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Superfluous load balancers
\end_layout
\end_inset
Good examples are HTTP or other IP-based load balancers placed in front
of VMs.
Almost always, this is an
\series bold
expensive ill-design
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Notice: as long as
\emph on
multiple
\emph default
VM instances are hosted on
\emph on
one
\emph default
hypervisor iron, load balancers are most likely completely useless
\begin_inset Foot
status open
\begin_layout Plain Layout
Reason: on SMP servers, there
\emph on
already exists
\emph default
a
\begin_inset Quotes eld
\end_inset
load balancer
\begin_inset Quotes erd
\end_inset
.
The kernel and its
\series bold
process scheduler
\series default
can do even better than any external load balancer, by better distribution
of physical CPUs to processes, and by exploitation of
\series bold
shared memory
\series default
, for example shared filesystem kernel caches, such as the Dentry Cache,
and the fscache / Page Cache.
Exceptions would only occur when there were per-VM global bottlenecks,
such as interdependent processes.
For instance, it is easy to
\emph on
misconfigure
\emph default
Apache logfiles to become such a bottleneck.
Just fix such misconfigurations, before claiming that SMP scalability would
be limited.
\end_layout
\end_inset
.
Instead, just assign more physical resources to a single VM.
Only when the application load is
\emph on
really
\emph default
so high that 1 VM would fill up a hypervisor
\emph on
completely
\emph default
, only then a load balancer
\emph on
might
\emph default
be potentially useful.
However,
\emph on
first
\emph default
check that there are enough RAM and SMP hardware threads.
Only when state-of-the-art multi-socket CPUs with
\begin_inset Formula $\approx128$
\end_inset
or more CPU threads would be insufficient for a very high connection rate,
and after tuning measures like PHP OpCache were not sufficient, a load
balancer or another means for load distribution
\emph on
could
\emph default
become necessary.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Even then, there are often more intelligent alternative solutions, like
wide-area
\emph on
distributed
\emph default
\series bold
input traffic partitioning
\series default
to geo-distributed servers, in place of a central load balancer acting
as a SPOF in a single datacenter.
For example, source-IP based routing can partition global traffic into
per-continent datacenters, drastically reducing application traffic latencies.
In essence, this is coarse granularity sharding at global level.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In a nutshell: compared to the scalability of sharding, load balancers
would be
\series bold
only suitable for small-scale scalability
\series default
.
However, small-scale scalability is much easier to achieve via hardware-based
SMP = Symmetric MultiProcessing, at least in
\emph on
most
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
Personally, I have never seen a situation where a load balancer was really
necessary.
In all example cases, they were superfluous.
In a few cases, they were even counter-productive.
\end_layout
\end_inset
cases.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Never start a design with a load balancer
\emph on
by default
\emph default
.
Only use load balancers when there is
\emph on
well-founded strong evidence
\emph default
that other scalability measures won't suffice.
In particular, it needs to be very clear that sharding is really impossible,
which in turn implies that there exists only 1 big customer, and that its
data cannot be partitioned at all.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Cost explosion by superfluous load balancers
\end_layout
\end_inset
Unnecessary load balancers are causing
\series bold
follow-up cost by increased complexity
\series default
.
In addition to the load balancer hardware and its setup / administration,
\emph on
multiple
\emph default
servers and/or VMs need to be set up and administered.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
If you just need a redirection mechanism, read sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Location-transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Where-implement-Location-Transparency"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
For example, the traffic from BGP = Border Gateway Protocol is executed
by your
\series bold
ordinary network routers
\series default
, without additional hardware, and they can distribute sharded traffic to
wide-area geo-locations.
In comparison, load balancers are just restricted
\series bold
overkill
\series default
.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Never accept a system design with a
\emph on
mandatory
\emph default
load balancer.
It will likely imply a BigCluster-like
\emph on
architecture
\emph default
, though typically only
\emph on
implemented
\emph default
as a SmallCluster.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 2
status open
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Mandatory load balancers are often
\begin_inset Foot
status open
\begin_layout Plain Layout
There are some rare potential exceptions, like
\series bold
game servers
\series default
rendering scenes in
\series bold
realtime
\series default
, consuming
\emph on
massive
\emph default
CPU and/or GPU power in relation to network bandwidth.
Even there, sharding is often a better alternative.
In contrast, ordinary video streaming typically consumes very low CPU power,
because file streaming is executed by kernel
\family typewriter
sendpage()
\family default
and partly offloaded to DMA hardware acceleration.
\end_layout
\end_inset
creating some
\begin_inset Formula $O(n^{2})$
\end_inset
behaviour, showing up somewhere, often unexpectedly.
Even when reduced to
\begin_inset Formula $O(n)$
\end_inset
, load balancers are close to the
\series bold
opposite of sharding
\series default
at
\emph on
concept level
\emph default
, because they try to
\emph on
distribute
\emph default
an
\emph on
unpartitioned load
\emph default
to servers needing
\series bold
shared data
\series default
similar to DSM (see section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Explanations-from-DSM"
plural "false"
caps "false"
noprefix "false"
\end_inset
), instead of first
\emph on
partitioning the data
\emph default
and thus also partitioning the corresponding traffic.
Read section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Error-Propagation-to"
plural "false"
caps "false"
noprefix "false"
\end_inset
about typical
\emph on
real
\emph default
scalability and reliability.
When this doesn't help, read section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
where the load balancer was a major
\emph on
source(!)
\emph default
of massive scalability problems.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Do not mis-use load balancer hardware for achieving location transparency.
Suchalike would need to be called
\begin_inset Quotes eld
\end_inset
load
\emph on
redirector
\emph default
\begin_inset Quotes erd
\end_inset
in place of
\begin_inset Quotes eld
\end_inset
load
\emph on
balancer
\emph default
\begin_inset Quotes erd
\end_inset
.
You pay a lot of money for functionality you don't need, see also section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Layering-Rules"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Traffic redirection is both cheaper and more performant when executed by
your ordinary network routers.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
\series bold
Sharding
\series default
architectures typically don't need any load balancers, although they are
\series bold
massively scalable
\emph on
horizontally
\series default
\emph default
.
Typically, they rely on the scalability of DNS, and of IP routing.
Notice: when DNS would reach its scalability limit, then the internet as
such would not scale anymore.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In comparison, a load balancer is a SPOB = Single Point Of
\series bold
Bottleneck
\series default
, where the traffic must physically
\series bold
flow through
\series default
(thereby increasing hops and latencies), instead of dynamic wide-area routing.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Load balancers vs sharding
\end_layout
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
As a manager, if you
\begin_inset Quotes eld
\end_inset
buy
\begin_inset Quotes erd
\end_inset
a
\emph on
mandatory
\emph default
load balancer, there is a high risk for
\series bold
architecturally hindering long-term scalability
\series default
by sharding.
\end_layout
\begin_layout Plain Layout
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Check whether people are
\emph on
really
\emph default
experts, when they want to solve suspected(!) scalability problems via
mandatory load balancers.
It is just poor system design, often inducing DSM problems, and producing
unnecessary follow-up cost.
Unfortunately, load balancers are systematically promoted by
\series bold
internet information bubbles
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Real knowledge originates from evaluated sources, such as
\series bold
scientific publications
\series default
which have undergone at least some minimum
\emph on
quality check
\emph default
, and which are trying to describe their preconditions and operating environment
s as precisely
\begin_inset Foot
status open
\begin_layout Plain Layout
\noindent
Therefore, chances are better to get a real expert when he has some (higher)
academic degrees, and was working in the area for a longer time.
\end_layout
\end_inset
as possible.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Real experts will tell you when they don't know something.
In addition, they will tell you
\emph on
multiple
\emph default
ways for obtaining such information, such as measurements, simulation,
etc.
In addition, real experts are able to do well-founded measurements and
to derive forecasts from them.
Later, when it works, their forecasts were roughly correct.
Check the quality of forecasts afterwards!
\end_layout
\begin_layout Standard
If you don't have anyone in your teams who knows how
\series bold
caching
\series default
\emph on
really
\emph default
works, or if it is a single guy who cannot withstand the pressure from
a whole group of
\begin_inset Quotes eld
\end_inset
alpha animals
\begin_inset Quotes erd
\end_inset
, you are running an
\series bold
increased risk
\series default
of unnecessary expenses
\begin_inset Foot
status open
\begin_layout Plain Layout
I know of cases which have produced unnecessary
\emph on
direct
\emph default
cost of at least € 20 million, not counting further indirect cost such
as power and rackspace consumption.
\end_layout
\end_inset
, worse services (indirect cost), failed projects, and sometimes even resulting
in loss of market share and/or of stock exchange value.
\end_layout
\begin_layout Standard
The problem is that it
\emph on
looks so easy
\emph default
, as if everyone could build a
\emph on
large(!)
\emph default
storage and/or application system, with ease.
It looks easy once a small prototype is running at a workstation.
Some people believe that
\begin_inset Quotes eld
\end_inset
just spend some more money
\begin_inset Quotes erd
\end_inset
would be all that is needed.
Unfortunately, both
\begin_inset Quotes eld
\end_inset
marketing drones
\begin_inset Quotes erd
\end_inset
from commercial storage vendors, and even a few OpenSource advocates, are
propagating this
\series bold
dangerous mindset
\series default
.
\end_layout
\begin_layout Standard
As a responsible manager,
\series bold
how can you detect
\series default
dangerous partial knowledge?
\end_layout
\begin_layout Standard
Good indicators are wrong usage of the term
\begin_inset Quotes eld
\end_inset
architecture
\begin_inset Quotes erd
\end_inset
(see definition in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
), and/or
\series bold
confusion of architecture with implementation
\series default
.
When somebody confuses
\begin_inset Foot
status open
\begin_layout Plain Layout
Notice that there exist people who use the term
\begin_inset Quotes eld
\end_inset
architecture
\begin_inset Quotes erd
\end_inset
inadvertently.
They don't even know that they are confusing architecture with implementat
ion.
Pure usage of a certain term is no clear indicator that somebody is really
an expert.
\end_layout
\end_inset
this, he does not really have an overview of different architectural solution
classes.
Instead, such people tend to propagate their random
\begin_inset Quotes eld
\end_inset
favourite solution
\begin_inset Quotes erd
\end_inset
or their random
\begin_inset Quotes eld
\end_inset
favourite product
\begin_inset Quotes erd
\end_inset
.
For you as a responsible manager, this increases the
\series bold
risk
\series default
of getting a non-optimum, or possibly even a bad / dangerous solution.
\end_layout
\begin_layout Standard
Another good indicator is advocacy of load balancers.
See above boxes about the size of their real application area and their
real value.
Do not confuse people's belief with deep knowledge about Operating Systems
and Distributed Systems.
The latter also requires substantial theoretical background, in addition
to practical experience.
\end_layout
\begin_layout Standard
Not everything which works in a garage, or in a student pool, or in the
testlab (whether it's yours or from a commercial storage vendor), or in
a PoC with so-called
\begin_inset Quotes eld
\end_inset
friendly customers
\begin_inset Quotes erd
\end_inset
, is well-suited for large enterprises and their critical data (measured
in petabytes / billions of files / etc), or is the optimum solution for
TCO.
Some rules of thumb, out of experience and observation:
\end_layout
\begin_layout Itemize
For each 1 or 2 orders of magnitude of the
\series bold
size
\series default
of your data, you will need
\series bold
better methods
\series default
for safe construction and operation than would be sufficient for lower demands.
\end_layout
\begin_layout Itemize
For each 3 to 4 orders of magnitude (sometimes even for less), you will
need
\series bold
better architectures
\series default
, and people who can deal with them.
\end_layout
\begin_layout Itemize
For each 1 or 2 orders of magnitude of
\series bold
criticality
\series default
of your data (measured by
\emph on
losses
\emph default
in case of certain incidents), you will also need better architecture,
not just better components.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Custom Color Box 3
status open
\begin_layout Plain Layout
\noindent
\begin_inset Argument 1
status open
\begin_layout Plain Layout
\series bold
Important advice
\end_layout
\end_inset
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
If you start a new platform from scratch, always
\series bold
start with a
\emph on
good
\emph default
architecture
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
Once a platform is in production, even with a small number of customers,
it becomes increasingly difficult to change its fundamental architecture.
While bugs can be relatively easily fixed, and while single components
can be exchanged with some effort, changing an architecture may turn out
\emph on
close to impossible
\emph default
, or at least very expensive.
\end_layout
\begin_layout Subsection
Recommendations for Architects
\begin_inset CommandInset label
LatexCommand label
name "subsec:Recommendations-for-Architects"
\end_inset
\end_layout
\begin_layout Standard
In order of precedence, do the following:
\end_layout
\begin_layout Enumerate
\series bold
Fix and/or limit and/or tune the
\emph on
application
\series default
\emph default
.
\begin_inset Newline newline
\end_inset
Some extreme examples:
\end_layout
\begin_deeper
\begin_layout Itemize
When you encounter a classical Unix
\series bold
fork bomb
\series default
, you have no chance against it.
Even the
\begin_inset Quotes eld
\end_inset
best and the most expensive hardware
\begin_inset Foot
status open
\begin_layout Plain Layout
There is an old joke from the 1980s: a Cray is a computer capable of running
an endless loop in 10 seconds.
\end_layout
\end_inset
\begin_inset Quotes erd
\end_inset
is unable to successfully run a fork bomb.
The only countermeasure is
\emph on
limitation of resources
\emph default
.
Reason: unlimited resources do not exist on earth.
\end_layout
\begin_layout Itemize
If you think that this is only of academic interest: several types of
internet
\series bold
DDOS attacks
\series default
are acting like a fork bomb, and
\series bold
Apache
\series default
is also acting similar to a fork bomb when not configured properly.
This is not about academics, it is about
\emph on
your survival
\emph default
(in the sense of Darwin).
\end_layout
\begin_layout Itemize
If you think it cannot hurt you because you are running
\family typewriter
fast-cgi
\family default
or another application scheme where forks are not part of the game (e.g.
databases and many others): please notice that
\series bold
network queues
\series default
are often acting as a replacement for processes.
Overflow of queues can have a similar effect to fork bombs from the viewpoint
of customers: they simply don't get the service they are expecting.
\end_layout
\begin_layout Itemize
If you think this cannot hurt you, because you are working in a completely
different area from Apache:
\emph on
any
\emph default
type of IP-based network traffic can show queueing behaviour.
Complex queuing systems can show
\begin_inset Quotes eld
\end_inset
unexpected
\begin_inset Quotes erd
\end_inset
behaviour, and sometimes even a dangerous one.
\end_layout
\begin_layout Itemize
Real-life example for application-level problems: some percentage of
\family typewriter
WordPress
\family default
customers are typically and
\emph on
systematically
\emph default
\series bold
misconfiguring
\series default
their
\family typewriter
wp-cron
\family default
cron jobs.
They create backups of their website, which
\emph on
include
\emph default
their old backups.
Result: in each generation of the backups, the needed disk space will roughly
\emph on
double
\emph default
.
Even if you had
\begin_inset Quotes eld
\end_inset
unlimited storage
\begin_inset Quotes erd
\end_inset
on top of the
\begin_inset Quotes eld
\end_inset
best and the most expensive storage system
\begin_inset Quotes erd
\end_inset
, and even if you would like to give
\begin_inset Quotes eld
\end_inset
unlimited storage
\begin_inset Quotes erd
\end_inset
to your customers, it simply cannot work at all.
Exponential growth is exponential growth.
After a few months of this kind of daily backup, you would need more storage
than atoms exist in the whole universe.
You
\emph on
must
\emph default
introduce some quota limits somewhere.
And you
\emph on
must
\emph default
ensure that the
\family typewriter
wp-cron
\family default
misconfiguration is fixed, whoever is responsible for fixing it.
\end_layout
\begin_layout Itemize
Another
\family typewriter
WordPress
\family default
example: the
\family typewriter
wp-cron
\family default
configuration syntax is not easily understandable by laymen.
It is easy to
\series bold
misconfigure
\series default
such that a backup is created
\emph on
once per minute
\emph default
.
As long as the website is very small, this will not even be noticed by
sysadmins.
However, for bigger websites (and they are typically growing over time),
the IO load may increase to a point where even asynchronous replication
over 10Gig interfaces cannot catch up.
Even worse: the next run of
\family typewriter
wp-cron
\family default
may start before the old one has finished within a minute.
Again, there is no chance except fixing the
\emph on
root cause
\emph default
at application level.
\end_layout
\end_deeper
\begin_layout Enumerate
\series bold
Choose the right
\emph on
overall
\emph default
architecture
\series default
(not limited to storage).
\begin_inset Newline newline
\end_inset
An impressive example for architectural (cf section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
) ill-design can be found in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Important explanations are in section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Properties-Scalability"
\end_inset
, in particular subsection
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Influence-Factors-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
, and section
\begin_inset CommandInset ref
LatexCommand vref
reference "subsec:Filesystem-Layer-vs"
\end_inset
.
A strategic example is in subsection
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Scalability-Scenario"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
It is absolutely necessary to know the standard cache hierarchy of Unix
(similarly also found in Windows) from section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Performance-Risk-Arguments-from-Layer"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
More explanations are in this manual at many places.
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
In general, major ill-designs of overall architectures (end-to-end) cannot
be fixed at component level.
Even the
\begin_inset Quotes eld
\end_inset
best tuning of the world
\begin_inset Quotes erd
\end_inset
executed by the
\begin_inset Quotes eld
\end_inset
best tuning expert
\begin_inset Quotes erd
\end_inset
on top of the
\begin_inset Quotes eld
\end_inset
best and most expensive storage
\emph on
components
\emph default
over the best storage
\emph on
network
\emph default
of the world
\begin_inset Quotes erd
\end_inset
cannot compensate for major ill-designs, such as
\begin_inset Formula $O(n^{2})$
\end_inset
behaviour, or disregarding of Kirchhoff's laws (see section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Kirchhoff-Suitability-of-Storage-Networks"
plural "false"
caps "false"
noprefix "false"
\end_inset
).
\begin_inset Newline newline
\end_inset
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
Similarly for reliability: if you have problems with too many and/or too
large incidents affecting too many customers, read sections
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:Reliability-Arguments-from"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Reliability-Differences-CentralStorage"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Enumerate
\series bold
Choice and tuning of components
\series default
.
\begin_inset Newline newline
\end_inset
No further explanations necessary, because most people already know this.
In case you think this is the
\emph on
only
\emph default
way: no, it is typically the
\emph on
worst
\emph default
and typically only the
\emph on
last resort
\emph default
when compared to the previous enumeration items.
See example in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "subsec:Example-Failures-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\begin_inset Newline newline
\end_inset
Exception: choice of wrong components with insufficient properties for your
particular application / use case, or even hard restrictions as mentioned
in section
\begin_inset CommandInset ref
LatexCommand nameref
reference "sec:What-is-Architecture"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
But this is an
\emph on
architectural
\emph default
problem in reality, and belongs to the previous item, not to this one.
\end_layout
\begin_layout Part
\start_of_appendix
Appendices
\end_layout
\begin_layout Chapter
Mathematical Model of Architectural Reliability
\begin_inset CommandInset label
LatexCommand label
name "chap:Mathematical-Model-of"
\end_inset
\end_layout
\begin_layout Standard
The assumptions used in the model are explained in detail in section
\begin_inset CommandInset ref
LatexCommand vref
reference "sub:Detailed-explanation"
\end_inset
.
Here is a quick recap of the main parameters:
\end_layout
\begin_layout Itemize
\begin_inset Formula $n$
\end_inset
is the number of basic storage units.
It is also used for the number of application units, assumed to be the
same.
\end_layout
\begin_layout Itemize
\begin_inset Formula $k$
\end_inset
is the replication degree, or number of replicas.
In general, you will have to deploy
\begin_inset Formula $N=k*n$
\end_inset
storage servers for getting
\begin_inset Formula $n$
\end_inset
basic storage units.
This applies to any of the competing architectures.
\end_layout
\begin_layout Itemize
\begin_inset Formula $s$
\end_inset
is the architecture-dependent spread exponent: it tells whether a storage
incident will spread to the application units.
Examples:
\begin_inset Formula $s=0$
\end_inset
means that there is no spread between storage unit failures and application
unit failures, other than a local 1:1 one.
\begin_inset Formula $s=1$
\end_inset
means that an uncompensated storage node incident will cause
\begin_inset Formula $n$
\end_inset
application incidents.
\end_layout
\begin_layout Itemize
\begin_inset Formula $p$
\end_inset
is the probability of a storage server incident.
In the examples at section
\begin_inset CommandInset ref
LatexCommand vref
reference "sec:Reliability-Arguments-from"
\end_inset
, a fixed
\begin_inset Formula $p=0.0001$
\end_inset
was used for easy understanding, but the following formulae should also
hold for any other
\begin_inset Formula $p\in(0,1)$
\end_inset
.
\end_layout
\begin_layout Itemize
\begin_inset Formula $T$
\end_inset
is the observational period, introduced for convenience of understanding.
The following can also be computed independently from any
\begin_inset Formula $T$
\end_inset
, as long as the probability
\begin_inset Formula $p$
\end_inset
does not change over time, which is assumed.
Because
\begin_inset Formula $T$
\end_inset
is only here for convenience of understanding, we set it to
\begin_inset Formula $T=1/p$
\end_inset
.
In the examples from section
\begin_inset CommandInset ref
LatexCommand vref
reference "sub:Detailed-explanation"
\end_inset
, a fixed
\begin_inset Formula $T=10{,}000$
\end_inset
hours was used.
\end_layout
\begin_layout Section
Formula for DRBD / MARS
\end_layout
\begin_layout Standard
We need not discriminate between a storage failure probability S and an applicati
on failure probability A because applications are run locally at the storage
servers 1:1.
The probability for failure of a single shard consisting of
\begin_inset Formula $k$
\end_inset
nodes is
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
A_{p}(k)=p^{k}
\]
\end_inset
because all
\begin_inset Formula $k$
\end_inset
shard members have to be down all at the same time.
In section
\begin_inset CommandInset ref
LatexCommand vref
reference "sub:Detailed-explanation"
\end_inset
we assumed that there is no cross-communication between shards.
Therefore they are completely independent from each other, and the total
downtime of
\begin_inset Formula $n$
\end_inset
shards during the observational period
\begin_inset Formula $T$
\end_inset
is
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
A_{p,T}(k,n)=T*n*p^{k}
\]
\end_inset
\end_layout
\begin_layout Standard
\noindent
When introducing the spread exponent
\begin_inset Formula $s$
\end_inset
, the formula turns into
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
A_{s,p,T}(k,n)=T*n^{s+1}*p^{k}
\]
\end_inset
\end_layout
\begin_layout Section
Formula for Unweighted BigCluster
\end_layout
\begin_layout Standard
This is based on the Bernoulli formula.
The probability that exactly
\begin_inset Formula $\bar{k}$
\end_inset
storage nodes out of
\begin_inset Formula $N=k*n$
\end_inset
total storage nodes are down is
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
\bar{S}_{p}(\bar{k},N)=\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}}
\]
\end_inset
\end_layout
\begin_layout Standard
\noindent
Similarly, the probability for getting
\begin_inset Formula $k$
\end_inset
or more storage node failures (up to
\begin_inset Formula $N$
\end_inset
) at the same time is
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
S_{p}(k,N)=\sum_{\bar{k}=k}^{N}\bar{S}_{p}(\bar{k},N)=\sum_{\bar{k}=k}^{N}\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}}
\]
\end_inset
\end_layout
\begin_layout Standard
\noindent
By replacing
\begin_inset Formula $N$
\end_inset
with
\begin_inset Formula $k*n$
\end_inset
(for conversion of the x axis into basic storage units) and by introducing
\begin_inset Formula $T$
\end_inset
we get
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
S_{p,T}(k,n)=T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
\]
\end_inset
\end_layout
\begin_layout Standard
\noindent
For comparability with DRBDorMARS, we have to compute the application downtime
A instead of the storage downtime S, which depends on the spread exponent
\begin_inset Formula $s$
\end_inset
as follows:
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
A_{s,p,T}(k,n)=n^{s+1}*S_{p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
\]
\end_inset
\end_layout
\begin_layout Standard
\noindent
Notice that at
\begin_inset Formula $s=0$
\end_inset
we have introduced a factor of
\begin_inset Formula $n$
\end_inset
, which corresponds to the hashing effect (teardown of
\begin_inset Formula $n$
\end_inset
application instances by a single uncompensated storage incident) as described
in section
\begin_inset CommandInset ref
LatexCommand vref
reference "sub:Detailed-explanation"
\end_inset
.
\end_layout
\begin_layout Section
Formula for SizeWeighted BigCluster
\end_layout
\begin_layout Standard
In contrast to the above, we need to introduce a correction factor by the
fraction of affected objects, relative to basic storage units.
Otherwise the y axis would not stay comparable due to different units.
\end_layout
\begin_layout Standard
For the special case of
\begin_inset Formula $k=1$
\end_inset
, there is no difference to above.
\end_layout
\begin_layout Standard
For the special case of
\begin_inset Formula $k=2$
\end_inset
replicas, the correction factor is
\begin_inset Formula $1/(N-1)$
\end_inset
, because we assume that all the replicas of the affected first node are
uniformly spread to all other nodes, which is
\begin_inset Formula $N-1$
\end_inset
.
The probability for hitting the intersection of the first node with the
second node is thus
\begin_inset Formula $1/(N-1)$
\end_inset
.
\end_layout
\begin_layout Standard
For higher values of
\begin_inset Formula $k$
\end_inset
, and with a similar argument (never put another replica of the same object
onto the same storage node) we get the correction factor as
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
C(k,N)=\prod_{l=1}^{k-1}\frac{1}{N-l}
\]
\end_inset
\end_layout
\begin_layout Standard
\noindent
Hint: there are at most
\begin_inset Formula $k$
\end_inset
physical replicas on the disks.
For higher values of
\begin_inset Formula $\bar{k}\geq k$
\end_inset
, there are
\begin_inset Formula $\binom{\bar{k}}{k}$
\end_inset
combinations of object intersections (when assuming that the number of
objects on a node is very large such that no further object repetition can
occur except for the
\begin_inset Formula $k$
\end_inset
-fold replica placement).
Thus the generalization to
\begin_inset Formula $\bar{k}\geq k$
\end_inset
is
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
C(k,\bar{k},N)=\binom{\bar{k}}{k}\prod_{l=1}^{k-1}\frac{1}{N-l}
\]
\end_inset
\end_layout
\begin_layout Standard
\noindent
By inserting this into the above formula, we get
\end_layout
\begin_layout Standard
\begin_inset Formula
\[
A_{s,p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}C(k,\bar{k},k*n)*\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}}
\]
\end_inset
\end_layout
\begin_layout Chapter
Draft Definition of
\begin_inset Quotes eld
\end_inset
*Scalability
\begin_inset Quotes erd
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "chap:Definition-of-Scalability"
\end_inset
\end_layout
\begin_layout Standard
\noindent
Here is a
\emph on
proposal
\emph default
of a
\emph on
hopefully
\emph default
more valuable definition of multiple variants of
\begin_inset Quotes eld
\end_inset
*scalability
\begin_inset Quotes erd
\end_inset
(as a DRAFT denoted in quotes) for
\series bold
\emph on
computer science
\series default
\emph default
.
Certainly, this draft needs some improvements.
However, the author does not know a better definition, without the mentioned
\series bold
academic deficiencies
\series default
from section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:What-is-Scalability"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
This draft tries to compensate for many of them by introduction of some necessary
properties for
\series bold
enterprise-grade real-life systems
\series default
, which need to satisfy the management body of a company, which in turn
needs to satisfy their stock holders.
\end_layout
\begin_layout Standard
\begin_inset VSpace defskip
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Color Box
status open
\begin_layout Plain Layout
For some index
\begin_inset Formula $i$
\end_inset
the following has already been determined (otherwise a definition of
\begin_inset Quotes eld
\end_inset
*scalability
\begin_inset Quotes erd
\end_inset
in the following sense is
\emph on
not possible
\emph default
):
\end_layout
\begin_layout Description
(1) An existing real-life computer hardware or software system or sub-system
\begin_inset Formula $S_{i}$
\end_inset
\end_layout
\begin_layout Description
(2) An existing and measurable real-life workload
\begin_inset Formula $W_{i}$
\end_inset
\end_layout
\begin_layout Description
(3) From
\begin_inset Formula $S_{i}$
\end_inset
and
\begin_inset Formula $W_{i}$
\end_inset
an actual SLA conformance value
\begin_inset Formula $C(S_{i},W_{i})$
\end_inset
between 0% and 100% has been determined by measurement, and checked for
validity.
\end_layout
\begin_layout Description
(4) An SLA conformance target
\begin_inset Formula $T_{0}$
\end_inset
between 0% and 100% has been given by an external product management, where
the index
\begin_inset Formula $0$
\end_inset
does not depend on
\begin_inset Formula $i$
\end_inset
.
\end_layout
\begin_layout Plain Layout
Now for another
\begin_inset Formula $j>i$
\end_inset
the following can be determined (or at least
\emph on
predicted
\emph default
) for a
\emph on
future
\emph default
point in time, and checked for validity (or at least for
\emph on
sufficient prediction accuracy
\emph default
):
\end_layout
\begin_layout Description
(1b) it is
\emph on
possible
\emph default
to augment
\begin_inset Formula $S_{i}$
\end_inset
to a truly bigger system
\begin_inset Formula $S_{j}$
\end_inset
with
\begin_inset Formula $S_{j}\supset S_{i}$
\end_inset
\end_layout
\begin_layout Description
(2b) it is
\emph on
possible
\emph default
to increase the old workload
\begin_inset Formula $W_{i}$
\end_inset
to a greater workload
\begin_inset Formula $W_{j}$
\end_inset
with
\begin_inset Formula $W_{j}\supset W_{i}$
\end_inset
\end_layout
\begin_layout Description
(3b) it is
\emph on
possible
\emph default
to determine (or at least
\emph on
predict
\emph default
) the new SLA conformance value
\begin_inset Formula $C(S_{j},W_{j})$
\end_inset
\end_layout
\begin_layout Plain Layout
The new system
\begin_inset Formula $S_{j}$
\end_inset
is called a
\series bold
\begin_inset Quotes eld
\end_inset
better scaling system
\begin_inset Quotes erd
\end_inset
\series default
than
\begin_inset Formula $S_{i}$
\end_inset
if and only if
\begin_inset Formula $C(S_{j},W_{j})\geq C(S_{i},W_{i})$
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
This draft definition just tries to obey some
\emph on
bare minimum
\emph default
as needed by today's enterprises.
In practice, the workloads
\begin_inset Formula $W_{i}$
\end_inset
and
\begin_inset Formula $W_{j}$
\end_inset
must be somewhat harder: they must
\series bold
conform to the needs
\series default
of both the company and
\emph on
enough
\emph default
of its customers (defined by SLA) who are using the current system
\begin_inset Formula $S_{i},$
\end_inset
and will use the future system
\begin_inset Formula $S_{j}$
\end_inset
.
\end_layout
\begin_layout Standard
Important: the current and future systems must obey some more business-relevant
conditions, such as
\series bold
cost
\series default
/
\series bold
risk
\series default
/ etc.
Details would be out of scope of this guide.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Notice: we have just defined the term
\begin_inset Quotes eld
\end_inset
better scaling
\begin_inset Quotes erd
\end_inset
(or its potential contrary
\begin_inset Quotes eld
\end_inset
worse scaling
\begin_inset Quotes erd
\end_inset
) on a given system.
But this is
\emph on
not yet
\emph default
a definition of
\begin_inset Quotes eld
\end_inset
scalability
\begin_inset Quotes erd
\end_inset
as many people would like.
Now comes the key in mathematical terms:
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Color Box
status open
\begin_layout Plain Layout
\series bold
A.
\series default
an existing system
\begin_inset Formula $S_{i}$
\end_inset
is called
\series bold
\begin_inset Quotes eld
\end_inset
currently scalable
\begin_inset Quotes erd
\end_inset
\series default
, if and only if some index
\begin_inset Formula $j>i$
\end_inset
exists, which satisfies
\end_layout
\begin_layout Description
(1)
\begin_inset Formula $S_{j}\supset S_{i}$
\end_inset
\end_layout
\begin_layout Description
(2)
\begin_inset Formula $W_{j}\supset W_{i}$
\end_inset
\end_layout
\begin_layout Description
(3)
\begin_inset Formula $C(S_{j},W_{j})\geq C(S_{i},W_{i})\geq T_{0}$
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
In management speak: when the workload is increasing from the current
\begin_inset Formula $W_{i}$
\end_inset
to some future
\begin_inset Formula $W_{j}$
\end_inset
, it
\emph on
must
\emph default
be possible to
\emph on
upgrade
\emph default
the current system
\begin_inset Formula $S_{i}$
\end_inset
to some future
\begin_inset Formula $S_{j}$
\end_inset
without violation of the SLA target
\begin_inset Formula $T_{0}$
\end_inset
, and without even
\emph on
worsening
\emph default
the current service level (which means that the SLA
\emph on
could
\emph default
be
\emph on
potentially
\emph default
strengthened).
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/MatieresCorrosives.png
lyxscale 50
scale 17
\end_inset
This does not include the
\series bold
cost
\series default
of the upgrade, and some other business-relevant side effects like non-linear
impact on personnel cost or datacenter cost or some external limitations
etc.
This should be added to a future improved version of this definition, which
is just an
\emph on
attempt
\emph default
to convert an academic view of the problem space to a more practical view
onto it.
\end_layout
\begin_layout Standard
\noindent
\begin_inset Flex Color Box
status open
\begin_layout Plain Layout
\series bold
B.
\series default
an existing system
\begin_inset Formula $S_{i}$
\end_inset
is called
\series bold
\begin_inset Quotes eld
\end_inset
unscalable
\begin_inset Quotes erd
\end_inset
\series default
, if and only if
\series bold
no
\series default
index
\begin_inset Formula $j>i$
\end_inset
exists, which satisfies
\end_layout
\begin_layout Description
(1)
\begin_inset Formula $S_{j}\supseteq S_{i}$
\end_inset
(notice that the hardware and/or software may stay
\emph on
unmodified
\emph default
)
\end_layout
\begin_layout Description
(2)
\begin_inset Formula $W_{j}\supset W_{i}$
\end_inset
\end_layout
\begin_layout Description
(3)
\begin_inset Formula $C(S_{j},W_{j})\geq T_{0}$
\end_inset
(notice that the current
\begin_inset Formula $S_{i}$
\end_inset
may be already
\emph on
overloaded
\emph default
)
\end_layout
\begin_layout Plain Layout
\series bold
C.
\series default
an existing system
\begin_inset Formula $S_{i}$
\end_inset
is called
\series bold
\begin_inset Quotes eld
\end_inset
limited scalable
\begin_inset Quotes erd
\end_inset
\series default
, if and only if it was
\begin_inset Quotes eld
\end_inset
currently scalable
\begin_inset Quotes erd
\end_inset
\emph on
at some point in the past
\emph default
, and now it is neither
\begin_inset Quotes eld
\end_inset
currently scalable
\begin_inset Quotes erd
\end_inset
(see A) nor
\begin_inset Quotes eld
\end_inset
unscalable
\begin_inset Quotes erd
\end_inset
(see B).
\end_layout
\begin_layout Plain Layout
\series bold
D.
\series default
the pure term
\series bold
\begin_inset Quotes eld
\end_inset
scalable
\begin_inset Quotes erd
\end_inset
\series default
without any prefix is deliberately either
\series bold
undefined
\series default
, or defined as
\series bold
\backslash
error
\series default
.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\noindent
\begin_inset Graphics
filename images/lightbulb_brightlit_benj_.png
lyxscale 9
scale 5
\end_inset
Important for managers: when a
\emph on
relevant
\emph default
future system
\begin_inset Formula $S_{j}$
\end_inset
exists which will be scalable from the current
\begin_inset Formula $S_{i}$
\end_inset
, but there exists no even greater
\begin_inset Formula $k>j$
\end_inset
which would be
\begin_inset Quotes eld
\end_inset
future scalable
\begin_inset Quotes erd
\end_inset
:=
\begin_inset Quotes eld
\end_inset
currently scalable
\begin_inset Quotes erd
\end_inset
in
\emph on
future
\emph default
, then you
\emph on
will
\emph default
reach
\emph on
some
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
In general, the mathematical lattice theory tells us that a Scalability
Limit does
\emph on
not
\emph default
need to be unique, e.g.
when following
\emph on
different
\emph default
hardware upgrade paths.
\end_layout
\end_inset
\series bold
Scalability Limit
\series default
of your concrete system
\begin_inset Formula $S_{i}$
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset CommandInset include
LatexCommand input
preview true
filename "common-back-matter.lyx"
\end_inset
\end_layout
\end_body
\end_document