#LyX 2.3 created this file. For more info see http://www.lyx.org/ \lyxformat 544 \begin_document \begin_header \save_transient_properties true \origin unavailable \textclass scrreprt \begin_preamble \usepackage{listings} \end_preamble \options abstracton,most,usenames,dvipsnames \use_default_options true \begin_modules customHeadersFooters enumitem fixltx2e tcolorbox \end_modules \maintain_unincluded_children false \language english \language_package default \inputencoding auto \fontencoding global \font_roman "default" "default" \font_sans "default" "default" \font_typewriter "default" "default" \font_math "auto" "auto" \font_default_family rmdefault \use_non_tex_fonts false \font_sc false \font_osf false \font_sf_scale 100 100 \font_tt_scale 100 100 \use_microtype false \use_dash_ligatures false \graphics default \default_output_format default \output_sync 1 \bibtex_command default \index_command default \paperfontsize 10 \spacing single \use_hyperref true \pdf_title "MARS Architecture Guide" \pdf_author "Thomas Schöbel-Theuer" \pdf_bookmarks true \pdf_bookmarksnumbered false \pdf_bookmarksopen true \pdf_bookmarksopenlevel 2 \pdf_breaklinks true \pdf_pdfborder true \pdf_colorlinks true \pdf_backref section \pdf_pdfusetitle true \papersize a4paper \use_geometry true \use_package amsmath 1 \use_package amssymb 1 \use_package cancel 1 \use_package esint 1 \use_package mathdots 1 \use_package mathtools 1 \use_package mhchem 1 \use_package stackrel 1 \use_package stmaryrd 1 \use_package undertilde 1 \cite_engine basic \cite_engine_type default \biblio_style plain \use_bibtopic false \use_indices false \paperorientation portrait \suppress_date false \justification true \use_refstyle 1 \use_minted 0 \index Index \shortcut idx \color #008000 \end_index \leftmargin 3.7cm \topmargin 2.7cm \rightmargin 2.8cm \bottommargin 2.3cm \secnumdepth 3 \tocdepth 3 \paragraph_separation indent \paragraph_indentation default \is_math_indent 0 \math_numbering_side default \quotes_style 
english \dynamic_quotes 0 \papercolumns 1 \papersides 2 \paperpagestyle headings \tracking_changes false \output_changes false \html_math_output 0 \html_css_as_file 0 \html_be_strict false \end_header \begin_body \begin_layout Standard \begin_inset ERT status open \begin_layout Plain Layout \backslash title{MARS Architecture Guide} \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset CommandInset include LatexCommand input preview true filename "common-front-matter.lyx" \end_inset \end_layout \begin_layout Chapter* Preface \end_layout \begin_layout Section* Introduction \end_layout \begin_layout Standard \begin_inset CommandInset include LatexCommand input preview true filename "common-introduction.lyx" \end_inset \end_layout \begin_layout Section* Purpose \end_layout \begin_layout Standard This document explains and discusses how to select the right storage architecture for typical use cases in big enterprises. Besides general storage architectures, pitfalls of geo-redundancy and long-distance replication are highlighted. \end_layout \begin_layout Standard In addition to technical discussion, \series bold cost and risks \series default are treated as well, addressing some \series bold management needs \series default up to CTO level. \end_layout \begin_layout Standard In contrast to several other publications, it is \emph on not \emph default an enumeration of sheer endless possibilities and components on the market. It provides \series bold guidance \series default about the \series bold structures and ideas \series default \emph on behind \emph default storage architectures and their connection to application processing. Particular attention is on \series bold avoidance of pitfalls \series default . 
\end_layout \begin_layout Standard It provides both \emph on technical \emph default and \emph on management \emph default guidance about selection of architectures as well as their implementation \emph on classes \emph default , and also about selection of suitable component \emph on classes \emph default . \end_layout \begin_layout Standard Finally, it helps checking for use cases where MARS will be a good solution, and where other solutions will be better suited. It also addresses some unexpected problems when inappropriate types of cluster managers are selected for long-distance replication. \end_layout \begin_layout Section* Scope \end_layout \begin_layout Standard The following topics are covered within this document: \end_layout \begin_layout Itemize Management Summary \end_layout \begin_layout Itemize Architectures of Cloud Storage, and \end_layout \begin_deeper \begin_layout Itemize their application area \end_layout \begin_layout Itemize their reliability / risks / pitfalls \end_layout \begin_layout Itemize their cost \end_layout \begin_layout Itemize scalability and performance of architectures \end_layout \begin_layout Itemize recommendations for managers and architects \end_layout \end_deeper \begin_layout Itemize Selection of components \end_layout \begin_deeper \begin_layout Itemize MARS vs DRBD \end_layout \end_deeper \begin_layout Itemize Architecture and pitfalls of Cluster Managers \end_layout \begin_layout Section* Audience \end_layout \begin_layout Standard This document is mainly written for system architects. Technical decision makers / managers with technical background, up to CTO level, should also benefit from \series bold risk reduction \series default and \series bold cost saving \series default , when making clever investment and consolidation decisions. 
\end_layout \begin_layout Standard Researchers in the field of storage systems are also addressed in the section about \series bold reliability \series default and the appendix, by providing mathematical models of reliability. \end_layout \begin_layout Section* How to use this document \end_layout \begin_layout Standard Managers should start with chapter \begin_inset CommandInset ref LatexCommand nameref reference "chap:Management-Summary" plural "false" caps "false" noprefix "false" \end_inset . Then read the short chapter \begin_inset CommandInset ref LatexCommand nameref reference "chap:Important-Concepts" plural "false" caps "false" noprefix "false" \end_inset . For details, just follow the internal links within this document. In any case, the last chapter \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Recommendations-for-Managers" plural "false" caps "false" noprefix "false" \end_inset is highly recommended. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout These boxes are something you definitely should read as a manager. They explain \series bold important key items \series default in a nutshell. \end_layout \end_inset \end_layout \begin_layout Standard \noindent All others should read chapters 1 and 2 sequentially, and proceed to the other chapters when interested. \end_layout \begin_layout Standard When MARS is already in use (or planned to be used), reading all of the chapters may pay off for \series bold avoidance of pitfalls \series default . \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout Examples are marked with boxes like this. They can be skipped if you don't have much time. Examples will however help for understanding of complex material. 
\end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout Detail explanations are marked like this. They are recommended for system architects for more elaborate methodology, and for deeper understanding of fundamentals. \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 4 status open \begin_layout Plain Layout This document is no scientific work in a strong sense. However, it is based on scientific background. In a few places, hints like this could be fruitful for spawning research activity. \end_layout \end_inset \end_layout \begin_layout Section* Related documents \end_layout \begin_layout Itemize \family typewriter mars-user-manual.pdf \family default : for sysadmins who want to install and run MARS. \end_layout \begin_layout Itemize \family typewriter football-user-manual.pdf \family default : for sysadmins and userspace developers who want to use Football. \end_layout \begin_layout Itemize \family typewriter mars-for-kernel-developers.pdf \family default : some infos for kernel developers. \end_layout \begin_layout Standard \begin_inset CommandInset toc LatexCommand tableofcontents \end_inset \end_layout \begin_layout Chapter Management Summary \begin_inset CommandInset label LatexCommand label name "chap:Management-Summary" \end_inset \end_layout \begin_layout Standard TBD \end_layout \begin_layout Chapter Important Concepts \begin_inset CommandInset label LatexCommand label name "chap:Important-Concepts" \end_inset \end_layout \begin_layout Standard This chapter is \emph on very short \emph default . Recommended reading for \emph on everyone \emph default is \emph on each \emph default of the definitions in \emph on each \emph default section, even if you think that you already know what each concept means. 
\end_layout \begin_layout Standard In case you \series bold notice a difference \series default between your former opinion about a concept and what you are reading here, then \series bold don't skip the rest \series default of the corresponding section. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Skipping anything in this chapter exposes you to serious risks: \end_layout \begin_layout Itemize \series bold Misunderstanding \series default of following important parts. This may become \series bold expensive \series default . This guide is about investments and follow-up cost in the range of \series bold millions \series default of €. \end_layout \begin_layout Itemize \series bold Second-order ignorance \series default : you probably don't know what you don't know. This is not only risky in \series bold enterprise-critical \series default areas. You can also risk your \series bold career \series default . \end_layout \begin_layout Section What is Architecture \begin_inset CommandInset label LatexCommand label name "sec:What-is-Architecture" \end_inset \end_layout \begin_layout Standard From \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Software_architecture \end_layout \end_inset : \end_layout \begin_layout Quote Software architecture refers to the \series bold high level structures \series default of a software system and the \series bold discipline \series default of creating such structures and systems. 
\end_layout \begin_layout Standard Throughout this document, the term \begin_inset Quotes eld \end_inset \series bold architecture \series default \begin_inset Quotes erd \end_inset (without preceding \begin_inset Quotes eld \end_inset software \begin_inset Quotes erd \end_inset ) is strictly separated from \begin_inset Quotes eld \end_inset \series bold implementation \series default \begin_inset Quotes erd \end_inset (without preceding \begin_inset Quotes eld \end_inset software \begin_inset Quotes erd \end_inset ). Any of \begin_inset Quotes eld \end_inset architecture \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset implementation \begin_inset Quotes erd \end_inset can relate to both hard- and software. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset A certain architecture may have multiple implementations. An implementation is based on a \emph on set \emph default of \series bold technologies \series default \begin_inset Foot status open \begin_layout Plain Layout Architectures are serving as aids for \series bold classification of solutions \series default . An implementation is a solution which has \emph on materialized \emph default (in contrast to solutions which exist only on paper). Complex solutions / implementations are typically matching only one architecture. Thus the relationship between architectures and solutions / implementations is typically \begin_inset Formula $1:n$ \end_inset , while the relationship between solutions / implementations and technologies is \begin_inset Formula $n:m$ \end_inset in general. In case of a very simple solution, it may \emph on exceptionally \emph default match multiple architectures, but this is not typical for classification schemes. \end_layout \end_inset . 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Unfortunately, certain technologies are not suitable for certain architectures. There may be \series bold restrictions \series default . \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Because of \series bold hidden restrictions \series default which may show up later, you should not start with implementations or technologies. Always start top-down with architectural considerations, while trying to identify potential restrictions \emph on as early as possible \emph default . \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The biggest \series bold potential for good solutions \series default is at architectural level. Exchanging a single component or a technology is typically much easier than changing a whole architecture, once it has been implemented. Often, changing an architecture is close to impossible. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Starting with a particular implementation and/or with a particular technology in mind, and not sufficiently reasoning about its fundamental architecture, and/or \series bold not seriously considering alternative architectures \series default , is a major source of \series bold costly ill-designs \series default . An example may be found in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset . 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Confusion of \begin_inset Quotes eld \end_inset architecture \begin_inset Quotes erd \end_inset with \begin_inset Quotes eld \end_inset implementation \begin_inset Quotes erd \end_inset and/or \begin_inset Quotes eld \end_inset technology \begin_inset Quotes erd \end_inset is another major source of ill-designs, which then often cause major product flaws and/or operational problems. Be sure to understand the differences. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Recommended best practice is to (1) look at the \series bold problem space \series default , then (2) consider a \emph on set \emph default of \series bold architectural solution classes \series default , and (3) look at each of the \series bold mappings \series default between problem space and solution space. The \emph on complexity \emph default of such a mapping is a first hint. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In somewhat more detail: start with \series bold architectural requirements \series default for a particular \series bold application area \series default (typically covering \emph on multiple \emph default use cases), then look at \series bold multiple solution architectures \series default , and finally go down to a \series bold \emph on set \series default \emph default of potential implementations, but only \emph on after \emph default the former has been understood. Selection of components and technologies should be the \emph on last \emph default step during the first iteration of this method. 
Then do a \series bold quality check \series default at \emph on concept \emph default level. Often, this review will reveal some problems / limitations etc, which should be treated by further iterations, restarting top-down again. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset During this iterative concept work, you should \series bold validate \series default your solution(s) several times, e.g. for \series bold compatibility \series default (no conflicts caused by restrictions, etc). \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Starting at the bottom with a particular single solution in mind, and/or presuming a certain technology, is almost a \emph on guarantee \emph default for a non-optimum solution, or even a failed project, or even a disaster at company level when \series bold enterprise-critical mass data \series default is involved. Always consider a \emph on set of \emph default candidate architectures, and for each of them, a \emph on set of \emph default solutions / technologies. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Nevertheless, don't think in waterfall models. Always work \series bold iteratively \series default and \series bold evolutionary \series default by \emph on re-considering architecture \emph default whenever you find problems / contradictions induced by restrictions, similar to the \series bold spiral model \series default \begin_inset Foot status open \begin_layout Plain Layout See \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Spiral_model \end_layout \end_inset . \end_layout \end_inset . 
\end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Be cautious when transferring \emph on software \emph default development methods to storage architectures, where operations involve masses of hardware. You need to find a balance between extreme waterfall-like and agile \begin_inset Foot status open \begin_layout Plain Layout Purely agile methods are less suited for quality assurance of storage architectures, because they are tempting people to start with simple approaches before the problem domain has been fully understood, increasing the \series bold risk of architectural ill-designs \series default . Starting an implementation \emph on too early \emph default on basis of an ill-design can easily \series bold lead into a dead end \series default . Agile methods are typically encouraging \series bold early deliverables \series default , which can be counter-productive. Example: it is clearly a bad idea to plan for an early deliverable for some petabytes of storage. Thus architects and managers are tempted to \emph on start small \emph default , e.g. a BigCluster architecture with only 3 storage servers. This type of \begin_inset Quotes eld \end_inset early deliverable \begin_inset Quotes erd \end_inset cannot detect any \series bold scalability problems \series default early enough, see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset . So you are in a \series bold dilemma \series default , whether you like it or not. Although you probably dislike it, the planning phase of big storage systems is unfortunately more like a waterfall process, by its very nature. Thus \emph on workarounds \emph default for the shortcomings of a pure waterfall model are needed. 
German readers may also check the V-model XT, as described in \begin_inset Flex URL status open \begin_layout Plain Layout https://de.wikipedia.org/wiki/V-Modell_(Entwicklungsstandard) \end_layout \end_inset . Unfortunately, the newer XT variant of the V-model is missing in the corresponding English Wikipedia article (retrieved autumn 2019), misleading readers with unfortunate opinions like the V-model being too similar to a waterfall model. Notice that the newer XT variant of the V-model, as well as some other variants (e.g. lecture notes from Professor Jochen Ludewig / University of Stuttgart), have adopted many ideas from the agile community, such as rework in loops and cycles, and thus should not be classified as \begin_inset Quotes eld \end_inset linear waterfall \begin_inset Quotes erd \end_inset models. In particular, \series bold early quality assurance of concepts and architectures \series default and \series bold rework of architecture as early as possible \series default is something you definitely should borrow from the V-model and its modern variants, even if you dislike V-models otherwise. \end_layout \end_inset methods. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Serious bugs in an \emph on architectural \emph default ill-design (examples see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset ) are typically very severe, causing serious limitations and/or impact, and cannot be fixed by the best implementation, or by the best technology of the world. Be sure to understand the fundamental difference between architecture and its (multiple / alternative) implementations, as well as multiple technologies, and their respective restrictions, as well as their \series bold reach \series default . 
\end_layout \begin_layout Section What is HA = High Availability \begin_inset CommandInset label LatexCommand label name "sec:What-is-HA" \end_inset \end_layout \begin_layout Standard HA is defined by a single number, denoting the \emph on minimum percentage of uptime \emph default of a certain system from a user's perspective. Some examples: \end_layout \begin_layout Itemize 99% availability: a total downtime of more than 87.6 hours per year is not acceptable. \end_layout \begin_layout Itemize 99.9% availability: a total downtime of more than 8.76 hours per year is not acceptable. \end_layout \begin_layout Itemize 99.99% availability: a total downtime of more than 52.56 minutes per year is not acceptable. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset HA is a \series bold requirement \series default . Requirements are characterizations of the \series bold problem space \series default . In software engineering, requirements are \emph on strictly separated \emph default from any measures, how a requirement can be met (solution space). In general, there may be \emph on several \emph default solutions for achieving a certain HA percentage. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Some of the potential solutions for the same HA percentage may be much more \series bold expensive \series default than others, sometimes by \emph on factors \emph default . We will see some examples later. 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Some people are arguing \emph on incorrectly \emph default , by claiming that \emph on any \emph default HA solution would \emph on need \emph default to be built up by \emph on hardware redundancy. \emph default Some people even believe that redundancy would be needed at \emph on each and every single hardware component \emph default , otherwise it would not be HA. This confuses requirements with solutions. It is wrong in general, because even a certain degree of hardware redundancy cannot guarantee a certain overall hard+software HA percentage in general, for example when certain components such as failover software are not reliable enough. See also section \begin_inset CommandInset ref LatexCommand nameref reference "sub:Detailed-explanation" plural "false" caps "false" noprefix "false" \end_inset for a counter-example, where addition of more redundancy \begin_inset Formula $>k$ \end_inset does not help. Of course, higher degrees of HA are \emph on typically(!) \emph default built using certain types and degrees of redundancy, including variants like geo-redundancy. In general, however, there might be other means for achieving HA, like extremely quick automatic repair methods, self-healing \begin_inset Foot status open \begin_layout Plain Layout This is no joke. For example, certain spacecrafts need to run for years or even for decades, without any maintenance. Thus it helps enormously when some of their components are self-healing, for example certain surfaces or shields after a hit by micro meteorites. \end_layout \end_inset systems, etc. 
\end_layout \begin_layout Section What is Geo-Redundancy \begin_inset CommandInset label LatexCommand label name "sec:What-is-Geo-Redundancy" \end_inset \end_layout \begin_layout Standard From the technical viewpoint of HA, geo-redundancy belongs to the \emph on solution space \emph default . From the viewpoint of \series bold government authorities \series default , and/or from \series bold owners \series default of a company / rating agencies determining the \series bold business risk \series default and the \series bold stock exchange value \series default of a company, it is also a \emph on requirement \emph default . \end_layout \begin_layout Standard Geo-redundancy means that the \series bold risk \series default of certain types of geo-localized \series bold physical impacts \series default , such as earthquakes, floods, terrorist attacks, cascading mass power blackouts , etc, must be \series bold compensated \series default by being able to run at least the \series bold core business \series default from another geo-location within some reasonable timeframe. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice that the same family of requirements can be solved \emph on very \emph default differently. This guide explains ways for both \series bold cost reduction \series default and \series bold risk reduction \series default at the same time, by \emph on combining \emph default HA requirements with geo-redundancy requirements in a clever way, such that the combined solution will meet both at the same time. A resulting combined solution is called \series bold Football on top of MARS \series default . 
It provides additional operational value, such as load balancing via the \series bold ability for butterfly \series default , see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \end_inset \end_layout \begin_layout Standard \noindent There are some ongoing political discussions about detail requirements for geo-redundancy. The minimum distance requirement between suitable geo-locations is seen differently by different interest groups, and even differently in different countries. The background is the \series bold enormous cost \series default for setup of a datacenter. \end_layout \begin_layout Standard \begin_inset ERT status open \begin_layout Plain Layout \backslash sloppy \end_layout \end_inset While some NGOs = Non-Governmental Organizations are fighting for a minimum distance of only 5 km, the German government authority BSI recommends a minimum distance of 200 km between datacenters for \series bold critical infrastructures. \series default See \begin_inset Flex URL status open \begin_layout Plain Layout https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Sicherheitsberatung/Standort-Kriterien_HV-RZ/Standort-Kriterien_HV-RZ.pdf?__blob=publicationFile&v=5 \end_layout \end_inset . Although this is only a \begin_inset Quotes eld \end_inset recommendation \begin_inset Quotes erd \end_inset officially, certain sectors like \series bold banking \series default are actually forced to treat this more or less like a requirement. \end_layout \begin_layout Standard For an observer, it could be interesting how \emph on international requirements \emph default will evolve, and how rating agencies will change their rules during the course of the next decades. 
\end_layout \begin_layout Section What is \emph on Cloud Storage \begin_inset CommandInset label LatexCommand label name "sec:What-is-Cloud-Storage" \end_inset \end_layout \begin_layout Standard According to a popular definition from \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Cloud_storage \end_layout \end_inset (retrieved June 2018), cloud storage is \end_layout \begin_layout Description (1) Made up of many \series bold distributed resources \series default , but still \series bold act as one \series default . \end_layout \begin_layout Description (2) Highly \series bold fault tolerant \series default through redundancy and distribution of data. \end_layout \begin_layout Description (3) Highly \series bold durable \series default through the creation of versioned copies. \end_layout \begin_layout Description (4) Typically \series bold eventually consistent \series default with regard to data replicas. \end_layout \begin_layout Standard A detailed analysis of consequences from this definition is in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Requirements-for-Cloud" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Section What is SDS = Software Defined Storage \begin_inset CommandInset label LatexCommand label name "sec:What-is-Software-defined-Storage" \end_inset \end_layout \begin_layout Standard As explained in \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Software-defined_storage \end_layout \end_inset , SDS is a \series bold marketing term \series default , subsuming a wide variety of offerings from several \emph on vendors \emph default . \end_layout \begin_layout Standard In essence, it can be \emph on almost anything \emph default from the storage area, where hardware can be treated independently from software, or at least some software configuration is available. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \noindent Even a \begin_inset Quotes eld \end_inset simple \begin_inset Quotes erd \end_inset HDD = Hard Disk Drive device has not only some \series bold network interface \series default (typically SATA or SAS in place of Ethernet), but also contains some software called firmware, which \emph on could \emph default (at least potentially) be exchanged independently. Believe it or not: even such a \begin_inset Quotes eld \end_inset simple hardware \begin_inset Quotes erd \end_inset device is providing \series bold storage virtualization \series default , although a rather primitive one. For example, it maps logical sector numbers (LBNs) to physical coordinates like CHS = Cylinder / Head / Sector, or similar. Newer 4k sector disks can emulate old 512 byte sector formats, etc. Thus such devices would match the fuzzy Wikipedia description of SDS. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In practice, the term SDS is a \series bold tautology \series default because it can mean almost anything from the storage area, thus the term is not really useful. \end_layout \begin_layout Standard In order to talk about SDS in technical terms of architecture, here is an \emph on attempt \emph default to somehow narrow it down, and to somehow relate it to Cloud Storage: \end_layout \begin_layout Quote SDS (in the sense of this guide) is a Cloud Storage system. \end_layout \begin_layout Standard \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Treating SDS as equivalent to Cloud Storage makes it more useful, but neglects the opportunity for defining something useful in between Cloud Storage and \begin_inset Quotes eld \end_inset anything \begin_inset Quotes erd \end_inset . 
\end_layout \begin_layout Standard Notice that a Wikipedia search \begin_inset Quotes eld \end_inset storage as a service \begin_inset Quotes erd \end_inset (which could be abbreviated StaaS) is delivering a redirection to \begin_inset Quotes eld \end_inset Cloud Storage \begin_inset Quotes erd \end_inset . Another missed opportunity for getting some useful structure into the \series bold wild-growing jungle \series default , and for clearly explaining differences, and for a fruitful discussion of pro and cons. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Remark \end_layout \end_inset This is an indicator that the storage area is not really mature. There are more short-sighted hypes than fundamental concepts. This architecture guide is an attempt to guide \begin_inset Foot status open \begin_layout Plain Layout German saying, semantically translated to English: \begin_inset Quotes eld \end_inset You cannot see the forest because there are too many trees in front of it. \begin_inset Quotes erd \end_inset \end_layout \end_inset you through the hype jungle in a structured way. \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Indirect cost of hypes \end_layout \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Beware of hyped buzzwords like \begin_inset Quotes eld \end_inset storage as a service \begin_inset Quotes erd \end_inset . 
It narrows your attention to network-centric architectures, and distracts your attention from major cost saving opportunities like \family typewriter LocalSharding \family default (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Variants-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \end_inset \end_layout \begin_layout Chapter Architectures of Cloud Storage / Software Defined Storage \begin_inset CommandInset label LatexCommand label name "chap:Cloud-Storage" \end_inset \end_layout \begin_layout Standard Datacenter architects have no easy job. Building up some petabytes of data in the wrong way can easily endanger a company, as will be shown later. There are some architectural laws to know and some rules to follow. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout As a responsible manager, you will make architectural decisions, even if you are \emph on not aware \emph default of them. Bad decisions, even if you are not aware of their consequences, can endanger major products, and increase cost by \emph on factors \emph default . Once you have committed to a certain architecture, it will be \emph on extremely cumbersome \emph default to modify it later. Thus you need to get an architecture right from the start. Typically, you will have \series bold only one shot \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent First, we need to take a look at the most general possibilities how storage can be architecturally designed: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/storage-classification.fig width 80col% \end_inset \end_layout \begin_layout Standard \noindent The topmost question is: do we always need to access bigger masses of (typically unstructured) data over a network? 
\end_layout \begin_layout Standard There is a common belief that both reliability and scalability could be only achieved this way. In the past, local storage has often been viewed as \begin_inset Quotes eld \end_inset too simple \begin_inset Quotes erd \end_inset to provide enterprise grade reliability, and scalability, and maintainability. In the past, this was sometimes true. \end_layout \begin_layout Standard However, this picture has changed with the advent of a new \series bold load balancing \series default method called \series bold LV Football \series default , see \family typewriter football-user-manual.pdf \family default . \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout When Football is combined with a \family typewriter FlexibleSharding \family default architecture (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:FlexibleSharding" plural "false" caps "false" noprefix "false" \end_inset ), practically the same flexibility as promised by \family typewriter BigCluster \family default is possible. \end_layout \end_inset \end_layout \begin_layout Section Architectural Properties of Cloud Storage \emph on \begin_inset CommandInset label LatexCommand label name "sec:Requirements-for-Cloud" \end_inset \end_layout \begin_layout Standard Brief recall from section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Cloud-Storage" plural "false" caps "false" noprefix "false" \end_inset . Cloud storage is \end_layout \begin_layout Description (1) Made up of many \series bold distributed resources \series default , but still \series bold act as one \series default . \end_layout \begin_layout Description (2) Highly \series bold fault tolerant \series default through redundancy and distribution of data. \end_layout \begin_layout Description (3) Highly \series bold durable \series default through the creation of versioned copies. 
\end_layout \begin_layout Description (4) Typically \series bold eventually consistent \series default with regard to data replicas. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The requirement (1) \begin_inset Quotes eld \end_inset act as one \begin_inset Quotes erd \end_inset implies some appropriate type of \series bold location transparency \series default (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Location-transparency" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The definition says nothing about the \series bold granularity \series default / sizes of the distributed resources. See section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Granularity-at-Architecture" plural "false" caps "false" noprefix "false" \end_inset for a more detailed discussion of opportunities arising from better informed decisions about this. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice that the term \begin_inset Quotes eld \end_inset network \begin_inset Quotes erd \end_inset does not occur in this definition. However, the term \begin_inset Quotes eld \end_inset distributed resources \begin_inset Quotes erd \end_inset is implying \emph on some(!) \emph default kind of network. 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The definition does \emph on not \emph default imply some \emph on specific \emph default type of network, such as a costly \series bold storage network \series default which must be capable of transporting masses of IO operations in \series bold realtime \series default . In general, we are free to use other types of networks, such as cheaper \series bold replication networks \series default , which need not be dimensioned for realtime IO traffic, but are sufficient for \series bold background data migration \series default , and even over long distances, where \emph on any \emph default network has some bottlenecks. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Often, there are \series bold restrictions from technology \series default . Not every architecture as discussed in this guide can be easily implemented via a certain technology. Example: when a so-called \series bold Vendor Lock-In \series default is binding you to a certain brand of commercial storage boxes, certain opportunities will be missed. By going to self-built and self-administered RAID storage, typically an invest factor between 3 and 10 can be saved (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Technology" plural "false" caps "false" noprefix "false" \end_inset ). On top of this, about another factor of 2 is possible, about \emph on halving your total hardware invest \emph default , by use of Linux-based local storage + Football in place of network-based commercial storage, provided it is possible for your use case. 
See sections \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Proprietary-vs-OpenSource" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "sec:Local-vs-Centralized" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice that the definition says nothing about the \series bold time scale \series default of operations \begin_inset Foot status open \begin_layout Plain Layout Go down to a time scale of microseconds. You will then notice that typical IO operations will require several hundreds of machine instructions between IO request \emph on submission \emph default and the corresponding IO request \emph on completion \emph default . This is not only true for local IO. In network clusters like Ceph, it will involve much more work, like creation of network packets, and lead to additional IO latencies implied by the network packet transfer latencies. \end_layout \end_inset . In general, we are free to implement certain operations, such as \series bold background data migration \series default , in a rather long timescale (from a human point of view). This bears an opportunity for \series bold major cost reduction \series default , as well as \series bold improving reliability \series default by decreasing dependencies from (hidden) SPOFs \begin_inset Foot status open \begin_layout Plain Layout Several people appear to work with the \emph on assumption \emph default that networks are available all the time. Although minor network outages can be compensated very well, there remains a \series bold residual risk \series default for a major outage, similar to what happened in Fukushima. Thus such an attitude can endanger both companies and careers. \end_layout \end_inset = Single Points Of Failure. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Replication \begin_inset space ~ \end_inset network \begin_inset space ~ \end_inset failures \end_layout \end_inset Football on top of MARS for background LV migration over both short and geo-distances. When the replication network is down, it will just pause for a while, and MARS will automatically resume once the network is up again. Football can be configured to also resume the higher-level migration process, when necessary. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Storage \begin_inset space ~ \end_inset network \begin_inset space ~ \end_inset failures \end_layout \end_inset It is clear that a failure of a classical storage network will halt all services depending on it. Some people believe that realtime storage networks cannot be avoided, in order to react at varying load situations, and are running much faster due to load distribution. This is not the full picture: \end_layout \begin_layout Enumerate Football plus FlexibleSharding can achieve a similar level of elasticity. \end_layout \begin_layout Enumerate Load distribution is essentially nothing else but a variant of \series bold data striping \series default . If you really need it for performance reasons, you can often do similarly with certain types of local RAID, such as RAID-10 or RAID-60, and with a variety of RAID parameters. 
Notice that \emph on any \emph default kind of data striping, whether at block level or at object level, is coming with some cost \begin_inset Foot status open \begin_layout Plain Layout For a given redundancy degree \begin_inset Formula $k$ \end_inset , \series bold reliability is reduced \series default by striping. In case of RAID, this is well-known since decades. Unfortunately, in case of BigCluster some misleading \begin_inset Quotes eld \end_inset propaganda \begin_inset Quotes erd \end_inset was blurring the public opinion for many years. Notice that the BigCluster analysis in section \begin_inset CommandInset ref LatexCommand nameref reference "sub:Detailed-explanation" plural "false" caps "false" noprefix "false" \end_inset is showing up some parallels to the well-known reliability loss caused by RAID striping, when some granularity differences (block vs object level etc) are ignored. \end_layout \end_inset . \end_layout \begin_layout Enumerate LocalStorage is even faster (when using a comparable technology yielding the same size), because IO does not involve \emph on any dedicated storage network \emph default at all. Therefore, it is also more reliable (when using comparable technology). \end_layout \begin_layout Enumerate \noindent Reorg tasks: these can occur in all top-level architectures. In general, not all operations can run in realtime, by construction. For example, increasing the number of replicas in an operational Ceph cluster, already containing a few hundreds of terabytes of data, will not only require additional storage hardware, but will also take a rather long time, implied by the very nature of bigger reorganisational tasks. 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset When \series bold geo-redundancy \series default = some minimum distance between datacenters for \series bold survival of geo-disasters \series default like earthquakes or floods is added to (2) as an additional requirement (see also section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Geo-Redundancy" plural "false" caps "false" noprefix "false" \end_inset ), some \emph on further consequences \emph default will arise. For example, the German government authority BSI recommends a minimum distance of 200 km between datacenters for \series bold critical infrastructures \series default \begin_inset Foot status open \begin_layout Plain Layout See \begin_inset Flex URL status collapsed \begin_layout Plain Layout https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Sicherheitsberatung/Standort-Kriterien_HV-RZ/Standort-Kriterien_HV-RZ.pdf?__blob=publicationFile&v=5 \end_layout \end_inset \end_layout \begin_layout Plain Layout Some press comments on this: \begin_inset Flex URL status open \begin_layout Plain Layout https://www.it-finanzmagazin.de/bsi-rechenzentren-entfernung-bafin-84078/ \end_layout \end_inset \end_layout \end_inset . Over suchalike distances, realtime storage networks cannot be used anymore in general. Thus some kind of \begin_inset Quotes eld \end_inset migration \begin_inset Quotes erd \end_inset of data over long distances will be needed anyway. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Since data migration is needed \emph on anyway \emph default over long distances, there is an opportunity for \series bold saving cost \emph on and \emph default increasing reliability + flexibility \series default all at the same time. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Basic idea behind Football on top of a Sharding model: \series bold minimize the \emph on distances \series default \emph default between your storage spindles and the corresponding data processing. When background data migration is automated properly, real-time storage networks can become superfluous, or at least the corresponding realtime IO traffic can be drastically reduced. When minimization is well dimensioned, a pair of storage + application server residing in the same geo-location can be \series bold collapsed into a single box \series default . This is not only a \series bold major cost reducer \series default , it also \series bold improves reliability \series default because there are fewer components which can fail. 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Unfortunately, this opportunity is easily \emph on missed \emph default if both system architects and responsible managers are requiring only DR = Disaster Recovery over long distances, instead of requiring the ability for butterfly (see section \begin_inset CommandInset ref LatexCommand ref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent Essentially, suchalike minimum requirements can be easily interpreted as \begin_inset Quotes eld \end_inset everything has to be doubled \begin_inset Quotes erd \end_inset in order to survive any geo-disaster \begin_inset Foot status open \begin_layout Plain Layout A geo-disaster like an earthquake will typically last for weeks, if not months, until it is fully repaired. During such a period, a single surviving datacenter must be capable of providing \begin_inset Quotes eld \end_inset good enough \begin_inset Quotes erd \end_inset SLAs. These disaster-SLAs can be lower than usual. For example, in place of an ordinary 99.98% availability, 98% may be a sufficient target \emph on during \emph default such a geo-disaster. By unnecessarily requiring much more during a very rare corner case, you can easily explode the cost, even beyond doubling, without reasonable benefit during ordinary operations. \end_layout \end_inset . 
This would double cost in comparison to certain kinds of fully locally redundant architectures, missing the opportunity for \series bold \emph on splitting \series default \emph default much of the overall redundancy into two geo-locations, \series bold instead of doubling \series default virtually everything. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Some people are arguing that doubling were unavoidable, which is \emph on incorrect in general \emph default , as Football can demonstrate as a positive counter-example. See section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Architecture" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Counter-productive cost arguments are sometimes heard when geo-redundancy is discussed about, without considering newer possibilities such as Football. As explained in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset , the \series bold granularity of failover \series default should not be required as a \series bold coarse failover of a full datacenter \series default , but explicitly be required as \series bold fine-grained cross-geo failover + handover at VM level \series default , or at a similar granularity (c.f. section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Granularity-at-Architecture" plural "false" caps "false" noprefix "false" \end_inset ). 
This will force people to think about \series bold wide-area distribution \series default of resources instead of plainly doubling them (once again \begin_inset Foot status open \begin_layout Plain Layout Example: commercial storage boxes from NetApp, IBM, etc already have some \emph on local redundancy \emph default , typically doubling the amount of physical disks you are actually buying when you buy a single storage box. Typically, the amount of \emph on physical \emph default disks is not directly reported as a KPI, although it is a major cost producer. When introducing geo-replication, you will likely need to buy double the number of boxes, resulting in a total of about 4x the capacity at the physical layer. In contrast, MARS + Football can often be built on top of local RAID-6. As pointed out in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Architecture" plural "false" caps "false" noprefix "false" \end_inset , this leads to only about 2.2x the physical capacity you will need to buy. In addition, the rackspace is much lower when using local storage, reducing the number of servers to deploy and administer, and reducing networking cost by omission of dedicated storage networks. \end_layout \end_inset ). \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Important keyword for flexible cross-geo distribution: \series bold ability for butterfly \series default , see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset . 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset There is a \series bold tradeoff \series default between the effort for implementation of per-VM flexibility, and hardware cost savings. Sometimes arguments are heard that a high level of flexibility would be too costly. Although this might be true in some relatively small corner cases, the picture can rapidly change when thousands of servers and/or petabytes of storage are involved. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent Doubling the overall cost for big datacenters instead of intelligently geo-distributing resources, is likely much more cost intensive in the long term than investing once into \series bold intelligent abilities \series default of the company like Football, which can then \series bold scale up \series default (more details see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Scalability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset As a consequence from sufficiently fine-grained handover + failover, the above definition of cloud storage can be \series bold met at geo-datacenter level \series default , i.e. the distributed resources according to (1) will be distributed over \emph on multiple geo-redundant \emph default locations / datacenters. As pointed out in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Cost-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset , sometimes this may be even cheaper than building certain types of local redundancy inside the same datacenter. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The famous CAP theorem (see section \begin_inset CommandInset ref LatexCommand vref reference "sec:Explanation-via-CAP" plural "false" caps "false" noprefix "false" \end_inset ) is one of the motivations behind requirement (4) \begin_inset Quotes eld \end_inset eventually consistent \begin_inset Quotes erd \end_inset . This is not an accident. There is a \emph on reason \emph default for it, although it is not a \emph on hard \emph default requirement. Strict consistency is not needed for many applications running on top of cloud storage. In addition, the CAP theorem and some other theorems cited at \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/CAP_theorem \end_layout \end_inset are telling us that Strict Consistency would be \series bold difficult and expensive \series default to achieve at global level in a bigger Distributed System, and at the cost of other properties. More detailed explanations are in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Explanation-via-CAP" plural "false" caps "false" noprefix "false" \end_inset . 
\end_layout \end_inset \end_layout \begin_layout Section Suitability of Architectures for Cloud Storage \begin_inset CommandInset label LatexCommand label name "subsec:Suitability-of-Architectures" \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout There are some consequences from the above definition of Cloud Storage (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Requirements-for-Cloud" plural "false" caps "false" noprefix "false" \end_inset ), for each of our high-level storage architectures: \end_layout \begin_layout Description Distributed \begin_inset space ~ \end_inset Storage, in particular \family typewriter BigCluster \family default architectures (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Distributed-vs-Local:" plural "false" caps "false" noprefix "false" \end_inset ): many of them (with few exceptions) are conforming to all of these requirement s. Typical granularity are objects, or chunks, or other relatively small units of data. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Distributed Storage is the growground where Cloud Storage was invented. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Many contemporary \family typewriter BigCluster \family default implementations are \emph on not really \emph default supporting \series bold geo-distribution \series default of masses of objects over long distances, in the sense of well-proven use cases (maturity). Small object granularity and/or strict consistency on top of unreliable objects are worsening the effects of the CAP theorem and its sister theorems. Thus object-based architectures are typically only suited for local (non-geo) operations. 
\begin_inset Newline newline \end_inset Example: at the moment (mid 2019), Amazon AWS is offering object replication only over campus distances, which cannot meet the requirements from BSI. \end_layout \begin_layout Description Centralized \begin_inset space ~ \end_inset Storage: does not conform to (1) and to (4) by definition \begin_inset Foot status open \begin_layout Plain Layout Notice that sharding on top of CentralStorage is no longer a CentralStorage model by definition, but a RemoteSharding model according to section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Variants-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \end_inset . By introduction of synchronous or asynchronous replication, it can be made to \emph on almost \emph default conform, except for (1) where some concept mismatches remain (probably resolvable by going to a RemoteSharding model on top of CentralStorage, where CentralStorage is only a \emph on sub-component \emph default ). Typical granularity is replication of whole internal storage pools, or of filesystem instances. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In general, \family typewriter CentralStorage \family default architectures are a \series bold mismatch \series default to Cloud Storage, by their very nature. Healing suchalike \series bold concept \series default mismatches may be close to impossible, or at least very tricky and costly. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Adding asynchronous replication to commercial storage boxes will not only double the cost, which are anyway at a very high starting level (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Technology" plural "false" caps "false" noprefix "false" \end_inset ). 
In addition, the \series bold handover granularity \series default (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset ) may not meet the optimum. \end_layout \begin_layout Description LocalStorage, and some further models like \family typewriter RemoteSharding \family default (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Variants-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset ) \begin_inset Newline newline \end_inset There is some historical belief that cloud storage cannot be reasonably built on top of them. When newer developments and opportunities are taken into account, this has changed. Here are some examples, mentioning some example components: \end_layout \begin_deeper \begin_layout Description (1) can be achieved at LV granularity with Football (see \family typewriter football-user-manual.pdf \family default ), which creates a \series bold Big Virtual LVM Pool \series default . Football has been in mass production at 1&1 Ionos since August 2018. \end_layout \begin_layout Description (2) can be achieved at disk granularity with local RAID, and at LV granularity with DRBD or MARS. Both have been in mass production for several years. \end_layout \begin_layout Description (3) can be achieved at LV granularity with LVM snapshots, and/or ZFS (or other filesystem) snapshots, and/or above filesystem layer by addition of classical backup. 
\end_layout \begin_layout Description (4) at least \family typewriter Eventually Consistent \family default or better can be alternatively achieved by one of the components \end_layout \begin_deeper \begin_layout Description (4a) \series bold DRBD \series default , which provides \family typewriter Strict Consistency \family default during \family typewriter connected \family default state, but works only reliably with passive crossover cables over \series bold short distances \series default (see CAP theorem in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Explanation-via-CAP" plural "false" caps "false" noprefix "false" \end_inset ). \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset DRBD violates any type of consistency within your \emph on replicas \emph default during (automatic) re-sync, and thus does not \emph on fully \emph default comply with the above definition of cloud storage in a \emph on strong \emph default sense. You may argue at a coarse time granularity scale in order to \begin_inset Quotes eld \end_inset fix \begin_inset Quotes erd \end_inset this. \end_layout \begin_layout Description (4b) \series bold MARS \series default , which works over \series bold long distances \series default and provides two different consistency guarantees at different levels, \emph on both at the same time \emph default : \end_layout \begin_deeper \begin_layout Description locally: \family typewriter Strict Consistency \family default at local LV granularity, also \emph on within \emph default each of the LV replicas. \end_layout \begin_layout Description globally: \family typewriter Eventually Consistent \family default \emph on between \emph default different LV replicas (global level). 
\begin_inset Newline newline \end_inset The CAP theorem (see section \begin_inset CommandInset ref LatexCommand ref reference "sec:Explanation-via-CAP" \end_inset ) says that \family typewriter Strict Consistency \family default is \series bold not possible \series default in general at \emph on unplanned failover \emph default during long-distance network outages (P = Partitioning Tolerance), when A = Availability is also a requirement. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset However, in case of a \emph on planned handover \emph default , MARS is also \family typewriter Strictly Consistent \family default at a global level, but may need some extra time for catching up. \begin_inset Newline newline \end_inset Notice: global \family typewriter Strict Consistency \family default is also possible at a \emph on coarse timescale \emph default , in accordance with the CAP theorem, if you decide to sacrifice A = Availability during such a network incident by simply \emph on not \emph default doing a failover action. Just wait until the network outage is gone, and MARS will automatically resume \begin_inset Foot status open \begin_layout Plain Layout This automatic MARS behaviour is similar to the behaviour of DRBD in such situations, when DRBD can automatically go to \family typewriter disconnected \family default -like state, and you are later manually or automatically resuming the DRBD connection for an incremental re-sync. MARS does everything automatically because it has no firmly built-in assumptions about the actual duration of any network communication. \end_layout \end_inset everything ASAP, and thus you are using MARS \emph on only \emph default as a protection against \series bold fatal \series default storage failures / unplanned \series bold disasters \series default .
\begin_inset Newline newline \end_inset Notice: A = Availability is \emph on not generally \emph default required by the above definition of cloud storage, because from a user's perspective it would not generally make sense in the global internet where connection loss may anyway occur at any time. Thus it is a valid operational strategy to \emph on not \emph default fail-over your LVs during certain minor, or even during certain types of major network outages (e.g. when failover would not improve much). \begin_inset Newline newline \end_inset Notice: long-term \series bold disaster tolerance \series default (e.g. perpetual loss of some storage nodes during an earthquake) is \emph on not \emph default modeled by the CAP theorem, but is more or less required by (2) and (3) from the above definition of cloud storage. \end_layout \end_deeper \end_deeper \end_deeper \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset \family typewriter BigCluster \family default architectures are creating \emph on virtual \emph default storage pools out of physically distributed storage servers. For fairness reasons, creation of a big virtual LVM pool, must be considered as \emph on another \emph default valid Cloud Storage \emph on model \emph default , matching the above definition of Cloud Storage. The main architectural difference is granularity, as explained in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Granularity-at-Architecture" plural "false" caps "false" noprefix "false" \end_inset , and the stacking order of sub-components. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Football is creating \series bold location transparency \series default inside of the distributed virtual LVM pool. 
This is an important (though not always required) basic property of \emph on any \emph default type of clusters and/or grids. \end_layout \begin_layout Section Location Transparency \begin_inset CommandInset label LatexCommand label name "sec:Location-transparency" \end_inset \end_layout \begin_layout Standard Location Transparency is an extremely important and well-known \series bold fundamental principle \series default in Distributed Systems, and has attracted research for decades. \end_layout \begin_layout Standard Simply stated, it means that the location of an object or of a service is never (part of) a primary key, but any access is via a \emph on logical name \emph default not depending on the location. Thus the location may (relatively easily) change at runtime. \end_layout \begin_layout Standard There are numerous examples where this fundamental principle is obeyed. Unfortunately, there are also many examples where it is violated. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \series bold \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Phone numbers \end_layout \end_inset \series default Phone numbers are \emph on not \emph default location transparent in general. For stationary phones, they contain a location-dependent prefix. In general, it is not possible to move to a different city while keeping the old stationary phone number. In case of mobile phones / cellphones, numbers are \begin_inset Quotes eld \end_inset more location transparent \begin_inset Quotes erd \end_inset , but even there they are \emph on not fully \emph default location transparent: for international calls, they contain prefixes referring to the country, e.g. +1 for the US or +49 for Germany. In practice, it is not easily possible to permanently move from Germany to the US, without giving up the old number after a while.
In addition, often the \emph on service provider \emph default and/or the network technology (D-net vs E-net etc) may also be encoded in cellphone numbers, e.g. somewhere as an infix, so changing the provider may have some restrictions. However, for \emph on most practical purposes \emph default , such as Europeans spending their holidays in the US, mobile phone numbers are \emph on sufficiently \emph default location transparent. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In practice, location transparency is not just a boolean property. As explained by the cellphone example, it may have various \series bold degrees \series default . In addition, it can refer to different sub-systems at different architectural layers. Some layers / some components may be (more or less) location transparent, but others not at all. Thus it is important to mention the \series bold layer or the component \series default when talking about location transparency. \end_layout \begin_layout Standard Interestingly, the Wikipedia article \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Location_transparency \end_layout \end_inset is an incomplete stub when this section was written (Autumn 2019). It seems that people are actually paying less attention to it. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Major violations of location transparency are almost always carrying some \series bold technical debt \series default , likely causing future problems and impediments.
\end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Therefore, establishment of location transparency needs to be seen as \series bold best practice \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset It may happen that somebody thinks there would be not enough time and/or resources for implementing certain kinds of location transparency. Although in many cases this is not really true, there might be some corner cases where it sometimes is true, or close to true. Notice that \series bold investments into location transparency \series default are often \emph on long-term \emph default investments. Not doing it will likely \series bold decrease your business opportunities \series default and \series bold increase your risks \series default in the long term. If anyone is arguing that location transparency were \emph on not needed \emph default as a major feature, you should check whether such a person is really an expert. There needs to be a clear and valid justification for such an opinion. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Where location transparency makes sense or not \end_layout \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In general, it is not necessary to implement location transparency \emph on everywhere \emph default , for each and every single component / subsystem.
The art of system architecture consists of knowing \end_layout \begin_layout Enumerate \noindent where it is \emph on needed \emph default , \end_layout \begin_layout Enumerate \noindent where it is \emph on beneficial \emph default for future growth / future requirements in multiple dimensions, \end_layout \begin_layout Enumerate \noindent where it is (or will be) too expensive to pay off in the mid-term future, using current technology, but nevertheless \emph on cheap provisions for its later introduction \emph default can be prepared, and \end_layout \begin_layout Enumerate \noindent where its lack can be easily (or even \emph on trivially \emph default ) compensated by location transparency at another layer, such that a particular component does not need to be constructed with location transparency, but nevertheless the \emph on overall system \emph default is sufficiently location transparent, and \end_layout \begin_layout Enumerate when there are multiple choices \emph on where \emph default to implement it, knowing which will be the best one for a family of use cases, and finally \end_layout \begin_layout Enumerate \emph on how \emph default to implement it. For example, a common misconception is to believe that storage must always reside at a storage network. Football (see \family typewriter football-user-manual.pdf \family default ) demonstrates that sufficient \begin_inset Foot status open \begin_layout Plain Layout There could be arguments that Football's background migrations might be too slow or might take too long for certain use cases. Notice that \family typewriter BigCluster \family default also needs data migration during operations, e.g. upon replacement of physical disks.
When the \family typewriter FlexibleSharding \family default model (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:FlexibleSharding" plural "false" caps "false" noprefix "false" \end_inset ) is combined with Football, it provides practically the same timescale and flexibility as \family typewriter BigCluster \family default . \end_layout \end_inset location transparency can be achieved on top of local storage, while expensive and performance-eating dedicated storage networks \begin_inset Foot status open \begin_layout Plain Layout Anyway, realtime storage networks cannot span long distances. Thus they are not suitable for achieving location transparency in a geo-redundant setup. \end_layout \end_inset are not generally necessary for achieving location transparency. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In the definition of Cloud Storage in section \begin_inset CommandInset ref LatexCommand vref reference "sec:Requirements-for-Cloud" plural "false" caps "false" noprefix "false" \end_inset , the requirement \begin_inset Quotes eld \end_inset act as one \begin_inset Quotes erd \end_inset is \emph on implying \emph default some appropriate type of location transparency of the resources. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Consequence: any system not sufficiently implementing location transparency of the customer's resources (visible layer from outside) should not be called \begin_inset Quotes eld \end_inset Cloud Storage \begin_inset Quotes erd \end_inset or a \begin_inset Quotes eld \end_inset Cloud Product \begin_inset Quotes erd \end_inset when location transparency is not sufficient at the layer of the customer.
\end_layout \begin_layout Section Layering Rules and their Importance \begin_inset CommandInset label LatexCommand label name "subsec:Layering-Rules" \end_inset \end_layout \begin_layout Standard Complex systems are composed of several layers. In this section, we will learn how to organize them (close to) \series bold optimally \series default . \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Non-optimal layering is a major cause of \series bold financial losses \series default , decreased reliability / \series bold increased risk \series default , \series bold worse scalability \series default , etc. \end_layout \end_inset \end_layout \begin_layout Standard \noindent Well-designed systems can be recognized as roughly following Dijkstra's famous \series bold layering rules, \series default originating from his pioneer THE project. Wikipedia article \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/THE_multiprogramming_system \end_layout \end_inset is mentioning an important principle behind Dijkstra's layers, in section \begin_inset Quotes eld \end_inset Design \begin_inset Quotes erd \end_inset : \end_layout \begin_layout Quotation \series bold higher layers only depend on lower layers \end_layout \begin_layout Standard The original article \begin_inset Flex URL status open \begin_layout Plain Layout http://www.cs.utexas.edu/users/EWD/ewd01xx/EWD196.PDF \end_layout \end_inset resp \begin_inset Flex URL status open \begin_layout Plain Layout https://dl.acm.org/citation.cfm?doid=363095.363143 \end_layout \end_inset contains very interesting information, and is a highly recommended reading. 
The introduction and the progress report are relevant for today's managers, optionally the \begin_inset Quotes eld \end_inset design experience \begin_inset Quotes erd \end_inset , and certainly the conclusions. The section \begin_inset Quotes eld \end_inset System hierarchy \begin_inset Quotes erd \end_inset is relevant for today's system architects, while the rest is mostly of historical interest for OS and kernel specialists. Reading the relevant parts after more than 50 years is extremely well-invested time. Dijkstra provides solutions for \series bold invariant problems \series default which are facing us today with the same boring ignorance, even after 50 years. The heart of his conclusions is \series bold timeless \series default . \end_layout \begin_layout Standard Dijkstra's methodology has been intensively discussed \begin_inset Foot status open \begin_layout Plain Layout An important contribution is from Habermann, by clarifying that there exist several types of hierarchies. \end_layout \end_inset by the scientific OS community, and has been generalized in various ways to what folklore calls \begin_inset Quotes eld \end_inset Dijkstra's layering rules \begin_inset Quotes erd \end_inset . Here is a condensed summary of its essence: \end_layout \begin_layout Itemize Layers should be viewed as \series bold abstractions \series default . \end_layout \begin_layout Itemize Higher layers should only depend on lower layers. \end_layout \begin_layout Itemize Each layer should \series bold add \series default some \series bold new \series default functionality. \end_layout \begin_layout Itemize Trivial conclusion by reversing this: \series bold Regressions \series default should be avoided. A regression is when some functionality is \emph on lost \emph default at a higher layer, although it was present at a lower layer.
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset This sounds very simple. However, on a closer look, there are numerous violations of these rules in modern system designs. Some examples will follow. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The term \begin_inset Quotes eld \end_inset \series bold functionality \series default \begin_inset Quotes erd \end_inset is very abstract, and deliberately not very specific \begin_inset Foot status open \begin_layout Plain Layout Elder schools of software engineering know that \series bold design processes \series default must \emph on necessarily \emph default start with unspecific terms, in order to start to bridge the so-called \series bold semantic gap \series default . \end_layout \end_inset . It is \series bold independent \series default from any implementations, programming languages, or programming / user interfaces, or other matters of \series bold representation \series default . \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The same functionality may be accessible via \emph on multiple \emph default different \series bold interfaces \series default . Thus a different interface does \emph on not \emph default imply that functionality is (fundamentally) different. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Nevertheless, people are often confusing functionality with interfaces. They think that a different interface must provide a different functionality. As explained, this is not correct in general. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Confusion of interfaces with functionality is exploited by so-called marketing drones and other types of advertising (e.g. acquisition of \series bold venture capital \series default ), in order to \series bold open your money pocket \series default . As a responsible manager, you should always check the \emph on functionality \emph default behind a certain product and its interfaces: what is \emph on really \emph default behind the scenes? \end_layout \end_inset \end_layout \begin_layout Subsection Negative Example: object store implementations mis-used as backend for block devices / POSIX filesystems \begin_inset CommandInset label LatexCommand label name "par:Negative-Example:-object" \end_inset \end_layout \begin_layout Standard Several object store implementations are following the client-server paradigm, where servers and clients are interconnected via some \begin_inset Formula $O(n^{2})$ \end_inset storage network (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Distributed-vs-Local:" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Standard We start by looking at the \emph on internal \emph default architecture of certain OSD = Object Storage Device (see \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Object_storage \end_layout \end_inset ) implementations. Some publications are treating them more or less as black boxes (e.g. as abstract interfaces). Certain people are selling this as an advantage. \end_layout \begin_layout Standard However, we will check this here.
Thus we need to take a closer look at the \emph on internal \emph default sub-architecture of certain OSD implementations: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/ceph-layering-server.fig scale 50 \end_inset \end_layout \begin_layout Standard \noindent The crucial point is: several OSD implementations are internally using \series bold filesystems \series default for creating the object abstraction. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold OSD implementation strategies \end_layout \end_inset For implementors, this seems to be a very tempting \begin_inset Foot status open \begin_layout Plain Layout Linux kernel implementations of filesystems need typically at least 10 years, if not 20 years to be considered \begin_inset Quotes eld \end_inset mature \begin_inset Quotes erd \end_inset enough for mass production on billions of inodes. \end_layout \end_inset shortcut strategy. Implementing their own object store functionality on top of block devices could easily take some years or decades until mature enough for production use. Linus Torvalds, for example, is measuring the maturity cycles of filesystem implementations in units of \emph on decades \emph default , not in years. Pure object stores would need to solve similar \emph on fundamental problems \emph default , like \series bold fragmentation problems \series default , which is a science in itself. Thus existing kernel-level filesystem implementations are often just re-used for OSDs. They seem to be already there, \begin_inset Quotes eld \end_inset for free \begin_inset Quotes erd \end_inset . \end_layout \begin_layout Plain Layout However, at architectural level, they are \emph on not \emph default for free.
They are violating Dijkstra's layering rules by causing \emph on regressions \emph default . \end_layout \begin_layout Plain Layout At abstract functionality level: passive objects, and even some associated \emph on rich metadata \emph default , are more or less nothing else but \series bold restricted files \series default , optionally augmented with POSIX EAs = Extended Attributes \begin_inset Foot status open \begin_layout Plain Layout Posix EAs = Extended Attributes implementations as provided by classical filesystems are providing roughly the same functionalities as \emph on passive \emph default augmented object metadata. Even active metadata is possible, e.g. by separate processes present in \family typewriter Akonadi \family default or \family typewriter miner \family default . With such a standard addendum, classical filesystems can also be used for providing active functionality. \end_layout \end_inset . \end_layout \begin_layout Itemize Object IDs can be \series bold trivially mapped \series default \begin_inset Foot status open \begin_layout Plain Layout Example: random hex key \family typewriter 0123456789ABCDEF \family default can be trivially mapped to a path \family typewriter /objectstore/0123/4567/89ABCDEF \family default in an easily reversible way (bijective mapping) \end_layout \end_inset to filenames / pathnames. At \emph on abstract functionality \emph default level, there is almost no difference between pathnames and object IDs, with the exception that pathnames are \emph on more general \emph default , e.g. by allowing deep nesting into subfolders. \end_layout \begin_layout Itemize Newer versions of certain Linux-based filesystems can even automatically generate random object keys, and even atomically (= free of race conditions when executed concurrently). Example: supply the option \family typewriter O_TMPFILE \family default to \family typewriter open() \family default , followed by \family typewriter linkat() \family default . 
\end_layout \begin_layout Itemize While filesystems are translating file IDs = pathnames into \series bold file handles \series default before further operations can be carried out, object stores are typically skipping this intermediate step from a user's viewpoint. The user needs to supply the object ID for \emph on any \emph default operation. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In the implementation, this can lead to considerable \series bold runtime overhead \series default , because ID lookup functionality similar to \family typewriter open() \family default has to be re-executed for each operation. In contrast, valid file handles are \emph on directly \emph default referring to the relevant kernel objects, without need to search for a filename again. Extreme example: consider the total runtime overhead by repeatedly appending 1 byte to an object in a loop. \end_layout \begin_layout Itemize Consequently, certain file operations associated with file handles are missing in pure object stores, such as \family typewriter lseek() \family default , as well as many other operations. \end_layout \begin_layout Itemize \series bold Concurrency \series default functionality of a POSIX-compliant \begin_inset Foot status open \begin_layout Plain Layout POSIX requires \series bold strict consistency \series default for many operations, while weaker consistency models are often \emph on sufficient \emph default (but not required) for object stores. \end_layout \end_inset filesystem is much more elaborated than actually needed by an object store. Examples: fine-grained locking operations like \family typewriter flock() \family default are typically not needed in pure object stores. The \family typewriter rename() \family default operation, including its side effects onto concurrency, would even \emph on contradict \emph default to the fundamental idea of immutable object IDs. 
\end_layout \begin_layout Itemize \series bold Shared memory \series default functionality. Filesystems need to support \family typewriter mmap() \family default and relatives. This is \emph on inevitable \emph default in modern kernels like Linux, for hardware MMU-supported \series bold execution of processes \series default , employing the COW = Copy On Write strategy. See \family typewriter fork() \family default and \family typewriter execve() \family default syscalls, and their relatives. In general, shared memory can be used by several processes concurrently, and on \series bold sparse files \series default . Filesystem implementors need to spend a considerable fraction of their total effort on this. Concurrency on shared memory, together with SMP scalability to a contemporary degree, is what makes implementation really hard, and why there are only relatively few people in the world mastering this art. As a manager, compare with Dijkstra's remarks on required \series bold skill levels \series default for serious OS work... \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Object stores are typically lacking shared memory functionalities completely. Thus they are not suited as a \emph on core component \emph default \begin_inset Foot status open \begin_layout Plain Layout Years ago, certain advocates of object stores have claimed that filesystems would be superseded by object stores / OSDs in future. This is unrealistic, due to the lack of mentioned basic functionalities. When missing functionality would be added to object stores, they would turn into filesystems, or into so-called \begin_inset Quotes eld \end_inset hybrid systems \begin_inset Quotes erd \end_inset . Consequently, there is no clue in claiming that object stores are forming a fundamental base for operating systems. 
They are essentially just a special case, optionally augmented with some active functionality, which in turn should be attributed to a \emph on separate \emph default layer, independently from filesystems or object stores. \end_layout \end_inset of a modern OS. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In comparison, creating a different interface for an already existing sub-funct ionality, and optionally adding some metadata harvesters and filters, is requiring much lower \begin_inset Foot status open \begin_layout Plain Layout Roughly, computer science students should be able to do that after a 1 semester OS course. \end_layout \end_inset skills and effort. \end_layout \begin_layout Itemize Several less-used functionalities, like \series bold hardlinks \series default etc. \end_layout \end_inset \end_layout \begin_layout Standard \noindent Obviously, these functionalities are \emph on lost \emph default at the object layer and/or latest at the exports interface. Thus we have identified a Dijkstra regression. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset As explained in the detail box: \series bold trivial differences \series default in an interface, such as usage of intermediate file handles / or not, or near-trivial \series bold representation \series default variants like pathnames vs object IDs, are no valid \emph on \begin_inset Foot status open \begin_layout Plain Layout Arguing with (trivial) syscall combinations or trivial parameter passing can be observed sometimes. 
As a responsible manager, you should draw another conclusion: someone arguing this way is either fighting for a particular \series bold political interest \series default in an \series bold unfair \series default manner, and/or in reality he demonstrates nothing but an extremely \series bold poor skill level \series default . \end_layout \end_inset \emph default arguments for claiming differences in the \emph on abstract functionality \emph default in the sense of Dijkstra. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold \emph on Real \emph default functionality behind object stores \end_layout \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Conclusion: \emph on passive \emph default object stores are approximately nothing else but a \series bold special case \series default of filesystems. \end_layout \end_inset \end_layout \begin_layout Standard \noindent Now let us look at some \emph on active \emph default functionality of some object stores, such as automatic collection of \series bold rich metadata \series default , or filtering functionality on top of them: are suchalike functionalities really specific for object stores? \end_layout \begin_layout Standard There is a clear answer: NO. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset For example, \family typewriter Akonadi \family default , \family typewriter miner \family default , and similar standard Linux tools are indexing the EXIF metadata of images, or metadata of mp3 songs, videos, etc, residing in a classical filesystem.
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Do not draw wrong conclusions from the fact that the classical Unix Philosophy (see \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Unix_philosophy \end_layout \end_inset ) has a long tradition of \series bold decomposing \series default functionality into \series bold separate layers \series default , such as the distinction between passive filesystems and active metadata indexing. When some object advocates are merging these separate layers into one, this is \emph on not \emph default an advantage. On the contrary, there are disadvantages like \emph on hidden cartesian products \emph default occurring at architecture level, and possibly also in implementations. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold \emph on Real \emph default implementation value of OSDs \begin_inset Formula $\Longrightarrow$ \end_inset business value \end_layout \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset For responsibles: when certain advocates are claiming that functionality mergers, such as more or less \series bold trivial combinations \series default of filesystem sub-functionality with some metadata harvesters, are constituting some new product, be \series bold cautious \series default . It is about \series bold \emph on your \emph default money \series default , or about your company's money.
\end_layout \begin_layout Plain Layout While it might be a \begin_inset Quotes eld \end_inset new \begin_inset Quotes erd \end_inset product from the perspective of end customers, you should \series bold check \series default the \series bold technical effort \series default for \begin_inset Quotes eld \end_inset implementing \begin_inset Quotes erd \end_inset the \begin_inset Quotes eld \end_inset new \begin_inset Quotes erd \end_inset functionality. There are cases where more than 90% functionality is already there. When it is from OpenSource, do not pay a lot of money for some more or less trivial adaptors. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset When more than 95% of functionality is already there \emph on for free \emph default , beware of costly blown-up architectural ill-designs, such as \begin_inset Formula $O(n^{2})$ \end_inset client-server BigCluster architectures. \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Dijkstra's layering rules can be used as tools for analyzing this, and for discovery of \series bold technical debt \series default by unfortunate layering, causing further cost and trouble in the long term. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset When augmented metadata functionality is present (whether actively or passively ), it should \emph on not \emph default be viewed as an integral part of object stores, but as an \emph on optional addendum \emph default . 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Reason: \series bold rich metadata is \emph on conceptually independent \series default \emph default from both filesystems and object stores. \end_layout \begin_layout Standard You may wonder what is the \emph on damage \emph default caused by Dijkstra regressions at object stores. \end_layout \begin_layout Standard We now look at a \emph on mis-use \emph default of object stores, which was unfortunately advocated by object store advocates several years ago. Some advocates appear to have learned from bad experiences with suchalike setups (see examples in section \begin_inset CommandInset ref LatexCommand ref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset ), and no longer propagate suchalike mis-uses, but focus on more appropriate use cases for \emph on native \emph default object stores instead. \end_layout \begin_layout Standard We continue by looking at the client part of distributed block devices / distributed filesystems on top of OSDs. The following example requires POSIX compliance \begin_inset Foot status open \begin_layout Plain Layout 1&1 Ionos has made the experience that a near POSIX-compliant filesystem called \family typewriter nfs \family default did not work correctly, causing customer complaints, because it is \emph on not fully \emph default POSIX-compliant. \end_layout \end_inset for toplevel application Apache webhosting with \family typewriter ssh \family default access: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/ceph-layering-client.fig scale 50 \end_inset \end_layout \begin_layout Standard \noindent It should catch your eye that both block-device and filesystem functionality is re-appearing once again, although it had been already implemented at OSD level. 
Obviously, there are two more Dijkstra regressions. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Do not over-stress the fact that now we are creating \emph on distributed \emph default block-devices, or \emph on distributed \emph default filesystems in place of local ones. This does \emph on not \emph default imply that a \family typewriter BigCluster \family default architecture is needed on top of an \begin_inset Formula $O(n^{2})$ \end_inset storage network, or that \series bold random replication \series default inducing further problems and serious reliability problems (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset ) is needed. There are near-trivial alternatives at architecture level, see \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Variants-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset There is another (fourth) Dijkstra regression. Distributed block devices are typically storing 4k sectors or similar \begin_inset Foot status open \begin_layout Plain Layout Mapping of multiple 4k sectors onto a smaller number of bigger objects (e.g. 128k) opens up another \series bold tradeoff \series default , called \series bold false sharing \series default . This can lead to serious performance degradation of highly random workloads. \end_layout \end_inset \series bold fixed-size \series default entities in the object store, although objects are capable of \series bold varying sizes \series default . 
Thus objects and their \emph on dynamic key indirection mechanisms \emph default are \begin_inset Quotes eld \end_inset misused \begin_inset Quotes erd \end_inset for a restricted use case where array-like virtual data structures would be sufficient. When some petabytes of block device data are created in such a way, a \series bold massive overhead \begin_inset Foot status open \begin_layout Plain Layout For example, an \family typewriter xfs \family default inode has a typical size of 256 bytes. When each 4k sector of a distributed block device is stored as 1 object in an \family typewriter xfs \family default filesystem consuming 1 inode, there is not only noticeable space overhead. In addition, random access by large application workingsets will need at least two seeks in total (inode + sector content). Without caching, this just doubles the needed worst-case IOPS. When taking the lookup functionality into account, the picture will worsen once again. \end_layout \end_inset \series default is induced. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout Some damages caused (or at least \emph on supported \emph default ) by Dijkstra regressions: \end_layout \begin_layout Itemize \series bold Increased investment \series default . Further reasons like doubled effort are explained in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Architecture" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Itemize \series bold Increased operational cost \series default , both manpower and electrical power. Example: certain Ceph OSD implementations have been estimated as roughly consuming 1 GHz CPU power and 1 GB RAM per spindle. Even when newer versions are implemented somewhat more efficiently, there remains architectural Dijkstra overhead as explained above. 
\end_layout \begin_layout Itemize \series bold Decreased reliability \series default / \series bold increased risk \series default , simply caused by \series bold additional complexity \series default introduced by Dijkstra regressions. Further reasons are explained in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Itemize \series bold Decreased total performance \series default , simply induced by regression overhead. Some more reasons can be found in sections \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Itemize \series bold Limited scalability \series default as explained in sections \begin_inset CommandInset ref LatexCommand nameref reference "sec:Scalability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset is further worsened by Dijkstra regressions. \end_layout \end_inset \end_layout \begin_layout Subsection Positive Example: ShaHoLin storage + application stack \begin_inset CommandInset label LatexCommand label name "par:Positive-Example:-ShaHoLin" \end_inset \end_layout \begin_layout Standard ShaHoLin = Shared Hosting Linux at 1&1 Ionos. It is a \series bold managed product \series default , i.e. the sysadmins can login anywhere as \family typewriter root \family default . Notice that this has some influence at the architecture. In general, unmanaged products need to be constructed somewhat differently. 
\end_layout \begin_layout Standard ShaHoLin's architecture does not suffer from Dijkstra regressions, since each layer is adding new functionality, which is also available at, or at least functionally influences, any of the higher layers. \end_layout \begin_layout Standard Because of this, and by using a scalability principle called Sharding (see sections \begin_inset CommandInset ref LatexCommand nameref reference "par:Definition-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Variants-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset ), architectural properties are \series bold close to optimal \series default . \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout ShaHoLin Layering \begin_inset CommandInset label LatexCommand label name "ShaHoLin-Layering" \end_inset \end_layout \end_inset The following bottom-up description explains some granularity considerations at each layer: \end_layout \begin_layout Enumerate Hardware-based RAID-6, with an internal sub-architecture based on SAS networking \begin_inset Foot status open \begin_layout Plain Layout Certain advocates are overlooking the fact that SAS busses are a small network, just using the SAS protocol in place of TCP/IP. When necessary, the SAS network can be dynamically extended, e.g. by addition of external enclosures. \end_layout \end_inset . The newest LSI-based chip generation supports 8 GB fast BBU cache, which has RAM speed. Depending on the number of disks, this creates one big block device per RAID set. Current dimensioning (2019) is between \begin_inset Formula $\approx$ \end_inset 15 TB on 10 fast spindles in a small pizza box, and 48 large-capacity slower spindles with a total capacity of \begin_inset Formula $\approx$ \end_inset 300 TB, spread over 3 RAID sets. 
This is somewhat conservative; with current technology higher capacity would be possible, at the cost of lower IOPS. \end_layout \begin_layout Enumerate LVM = Logical Volume Management. This is provided by the dm = device mapper infrastructure of the Linux kernel, and by the standard LVM2 userspace tools. It is sub-divided into the following sub-layers: \end_layout \begin_deeper \begin_layout Enumerate PV = Physical Volumes, one per RAID set, with practically the same size / granularity. \end_layout \begin_layout Enumerate VG = Volume Group. All PVs \begin_inset Formula $\cong$ \end_inset RAID sets are merged into one local storage pool. Typical sizes are between 15 and 300 TB, depending on hardware class. Very old hardware may have only \begin_inset Formula $\approx$ \end_inset 3 TB, but these machines should go EOL soon. \end_layout \begin_layout Enumerate LV = Logical Volumes, one per VM \begin_inset Formula $\cong$ \end_inset LXC container instance. Typical sizes are between \begin_inset Formula $\approx$ \end_inset 300 GB and \begin_inset Formula $\approx$ \end_inset 40 TB. When necessary, the size can be dynamically increased during runtime. Typical number of LVs per physical machine (also called \series bold hypervisor \series default ) is between 3 and 14 (or exceptionally only 1 on very small old hardware). \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The number of LVs per hypervisor can change during operations by moving around some LVs \begin_inset Formula $\cong$ \end_inset VMs \begin_inset Formula $\cong$ \end_inset LXC containers via Football (see \family typewriter football-user-manual.pdf \family default ). This is used for multiple purposes, such as decommissioning of old hardware, or load balancing, or for physical reorganizations, e.g. defragmentation of racks in some of the datacenters. 
\end_layout \end_deeper \begin_layout Enumerate Replication layer, using MARS. Each LV can be switched over individually (ability for butterfly, see \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset ). In addition to geo-redundancy, MARS provides the base for Football. LV sizes / granularities are not modified by MARS. \end_layout \begin_layout Enumerate Filesystem layer, typically \family typewriter xfs \family default mounted locally \begin_inset Foot status open \begin_layout Plain Layout Only on a few old machines, which are shortly before EOL, \family typewriter /dev/mars/vm_name \family default is exported via iSCSI and imported into some near-diskless clients. This is an old architectural model, showing worse reliability (more components which can fail), and higher cost (more hardware, more power, more rackspace, etc). Due to iSCSI, IOPS are much worse than with pure \family typewriter LocalStorage \family default . Contrary to some old belief, it is \emph on not \emph default much more flexible. The ability for butterfly is already sufficient for rare exceptional overload situations, or for sporadic hardware failures. Since Football also works on the old iSCSI-based architecture, load balancing etc does not need to be done via iSCSI. \end_layout \end_inset . This layer is extremely important for getting the granularities right: typically, each xfs instance contains several millions of customer inodes and/or files. In some cases, the number can climb up to several tens of millions. Reason: shared webhosting has to deal with myriads of extremely small customer files, intermixed with a lower number of bigger files, up to terabytes in a handful of scarce corner cases. \end_layout \begin_layout Enumerate LXC containers \begin_inset Formula $\cong$ \end_inset VMs. 
Each of them has a publicly visible customer IP address, which is shared by all of its customers (typically a few hundreds up to several tens of thousands per container). Upon primary handover / failover, this IP is handed over to the sister datacenter via BGP = Border Gateway Protocol. Upon Football migrations, this IP is also retained, but just automatically routed to a different physical network segment. \end_layout \begin_layout Enumerate Application layer. Here are only some important highlights: \end_layout \begin_deeper \begin_layout Enumerate Apache, spawning PHP via suexec. One Apache instance per LXC container is typically sufficient for serving thousands or tens of thousands of customers. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Some surprising detail: \family typewriter fastcgi \family default is deliberately \emph on not \emph default used at the moment, because security / \series bold user isolation \series default is considered much more important than a few \emph on permille(!) \emph default of performance gain by saving a few \family typewriter fork() \family default + \family typewriter execve() \family default system calls. While the Linux kernel is highly optimized for them, typical PHP applications like Wordpress are poorly optimized, for example by clueless runtime inclusion of \begin_inset Formula $\approx$ \end_inset 120 PHP include files, cluelessly repeated for each and every PHP request. Even when \family typewriter OpCache \family default is enabled, this costs much more than any potential savings by \family typewriter fastcgi \family default . \end_layout \begin_layout Enumerate EhB = Enhanced Backup. This is a 1&1-specific proprietary solution, supporting a grand total of \begin_inset Formula $\approx$ \end_inset 10 billion inodes. It is also organized via the Sharding principle, but based on a different granularity. 
In order to parallelize daily incremental-forever backups, several measures are taken. Among others, customer homedirectories are grouped into 49 subdirectories called \emph on hashes \emph default in 1&1-slang. Both backups and restores may run in parallel, independently for each hash, and distributed over multiple shards. Hashes are thus forming an \series bold intermediate granularity \series default between xfs instances, and a grand total of \begin_inset Formula $\approx$ \end_inset 9 millions of customer home directories. \end_layout \end_deeper \end_inset \end_layout \begin_layout Section Granularity at Architecture \begin_inset CommandInset label LatexCommand label name "sec:Granularity-at-Architecture" \end_inset \end_layout \begin_layout Standard There are several alternative implementation technologies for (cloud) storage systems. They can be classified according to the granularity of their basic transfer units. \end_layout \begin_layout Subsection Granularities for Achieving Strict Consistency \begin_inset CommandInset label LatexCommand label name "subsec:Granularities-for-Strict" \end_inset \end_layout \begin_layout Standard End users are \emph on always \emph default expecting \series bold strict consistency \series default \begin_inset Foot status open \begin_layout Plain Layout For an overview of consistency models, see \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Consistency_model \end_layout \end_inset . While strict consistency is the most \begin_inset Quotes eld \end_inset natural \begin_inset Quotes erd \end_inset one as expected by humans, most other models are only of academic interest. \end_layout \end_inset from a storage system. 
Whenever they are \begin_inset Quotes eld \end_inset saving \begin_inset Quotes erd \end_inset several \begin_inset Quotes eld \end_inset things \begin_inset Quotes erd \end_inset to a (cloud) storage system in a particular order, they are expecting to always retrieve the \emph on newest \emph default version of each of them, afterwards. \end_layout \begin_layout Standard Here are the most important architectural differences between object-based storages and LV-based (Logical Volume) storages, provided that you \emph on want to cover comparable use cases \emph default : \end_layout \begin_layout Standard \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout \series bold Strict Consistency required \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Objects \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout LVs \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Granularity \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout small (typically KiB) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout huge (several TiB) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Number of instances \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout very high \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout low to medium \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \emph on Native \emph default consistency model \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout weak \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout strict \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Typical access \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout random keys \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout named \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Update in place \end_layout 
\end_inset \begin_inset Text \begin_layout Plain Layout no / yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Resize during operation \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no / yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Object support \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout native \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout on top of \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout LV support \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout on top of \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout native \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Filesystem support \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout on top of \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout on top of \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Scalable \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout at cluster \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout both cluster and grid \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Location distances \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout per datacenter / on campus \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout long distances possible \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Centralized pool management \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout per cluster \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Football uniting clusters \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Easy sharding support \end_layout \end_inset 
\begin_inset Text \begin_layout Plain Layout cumbersome \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \end_inset \end_layout \begin_layout Standard \noindent As indicated in sections \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset , there are problems with object storage's \series bold consistency model \series default when higher aggregates like LVs or filesystems are \emph on requiring \emph default \series bold strict consistency \series default , but are built on top of objects which are only \emph on eventually consistent \emph default due to their inherent nature. \end_layout \begin_layout Subsection Granularity for Achieving Eventually Consistent \begin_inset CommandInset label LatexCommand label name "subsec:Granularity-for-Eventually" \end_inset \end_layout \begin_layout Standard This section is \emph on not \emph default about expectations from users. It is about implementation-specific \series bold weak consistency models \series default , such as \series bold eventually consistent \series default , see \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Consistency_model#Eventual_consistency \end_layout \end_inset , or several other weak consistency models and their variants. 
\end_layout \begin_layout Standard The following table reflects use cases for \begin_inset Quotes eld \end_inset native \begin_inset Quotes erd \end_inset object storage, where eventually consistent (or similar) is sufficient, or at least claimed to be sufficient: \end_layout \begin_layout Standard \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout \series bold \size small Eventually Consistent sufficient \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Objects \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout LVs \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Granularity \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout medium (1 object = 1 file) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout huge (several TiB) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Number of instances \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout medium \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout low to medium \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Typical access \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout random keys \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout named + random \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Update in place \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Resize during operation \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Object support \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout native \end_layout \end_inset 
\begin_inset Text \begin_layout Plain Layout on top of \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Scalable \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout at cluster \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout both cluster and grid \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Location distances \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout per datacenter / on campus \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout long distances possible \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Centralized pool management \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout per cluster \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Football uniting clusters \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Easy sharding support \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout possible but expensive \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \end_inset \end_layout \begin_layout Section Replication vs Backup \begin_inset CommandInset label LatexCommand label name "sec:Replication-vs-Backup" \end_inset \end_layout \begin_layout Standard Intuitively, data backup and data replication are two different solution classes, addressing different problems. \end_layout \begin_layout Standard However, there exist descriptions where both solution classes are overlapping, as well as their corresponding problem classes. For example, backup as explained in \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Backup \end_layout \end_inset could be seen as also encompassing some types of storage replications explained in \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Replication_(computing) \end_layout \end_inset . 
\end_layout \begin_layout Standard For a rough comparison of \emph on typical \emph default implementations, see the following \emph on typical \emph default differences: \end_layout \begin_layout Standard \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Backup \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Replication \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Fast handover (planned) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no, or cumbersome \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Fast failover (unplanned) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no, or cumbersome \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Protect for physical failures \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Protect for logical data corruption \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes (partly) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout typically no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Disaster Recovery Time (MTTR) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout typically (very) slow \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout fast \end_layout \end_inset \end_inset \end_layout \begin_layout Standard \noindent Because of these typical differences, enterprise-critical data typically deserves \emph on both \emph default solution classes. 
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout Confusion of solution classes and/or their corresponding problem classes / properties can be harmful to enterprises and to the careers of responsible persons. \end_layout \end_inset \end_layout \begin_layout Subsection Flexibility of Handover / Failover Granularities \begin_inset CommandInset label LatexCommand label name "subsec:Flexibility-of-Failover" \end_inset \end_layout \begin_layout Standard Typical management buzzwords like DR = Disaster Recovery or CDP = Continuous Data Protection are neglecting the \emph on granularity \emph default of the data units to be protected by replication, and the ability for quick service \begin_inset Foot status open \begin_layout Plain Layout In the table, \begin_inset Quotes eld \end_inset Backup \begin_inset Quotes erd \end_inset means that only the data is replicated into a different datacenter. In contrast, \begin_inset Quotes eld \end_inset Replication \begin_inset Quotes erd \end_inset means that both the data and the necessary compute resources are available in two datacenters. \end_layout \end_inset handover due to \series bold maintenance \series default reasons such as power supply maintenance. 
The following table explains some differences when granularity aspects like replication at physical volume (PV) aka physical disk level versus logical volume (LV) resp filesystem level are taken into account: \end_layout \begin_layout Standard \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout Method \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Disadvantages \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Advantages \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Backup at FS level \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no real data consistency \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout logical copy \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no handover / failover \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no load balancing \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no CDP / high MTTR \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Backup via FS snapshots \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout handover cumbersome \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout some point-in-time consistency \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no real load balancing \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout 
Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout medium to high MTTR \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout logical copy \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout delayed consistency \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Replication at PV granularity \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout whole clusters switch \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout easier to setup \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no load balancing \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout physical copy \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout medium MTTR \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Replication at LV granularity \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout physical copy \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout load balancing between LVs \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout easy migration / Football \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset 
\begin_inset Text \begin_layout Plain Layout full handover consistency \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout low MTTR \end_layout \end_inset \end_inset \end_layout \begin_layout Standard \noindent In order to implement flexibility of handover / failover, the network infrastruc ture (as well as other infrastructures) must support it. Here are \series bold levels of flexibility \series default , in ascending order: \end_layout \begin_layout Enumerate \begin_inset Argument 1 status open \begin_layout Plain Layout start=0 \end_layout \end_inset (completely inflexible) Statically assigned IP addresses at \emph on each \emph default server and at \emph on both \emph default of 2 datacenters, and in particular for \series bold customer traffic \series default . This is typical for contemporary backup solutions. As a consequence, any handover / failover attempt would need massive sysadmin work, even if there were enough CPU and RAM power at the target datacenter. Switching whole datacenters or bigger server farms would take days, if not weeks, to manually reconfigure. Consequence: sysadmins will heavily dislike such type of work (acceptance problem of geo-redundancy). \end_layout \begin_layout Standard \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Some people think this can be easily done at DNS level. Just update all of your publicly visible DNS records to point to the new IP addresses. However, DNS updates have serious drawbacks for public internet traffic. Although there exists a field TTL = Time To Live for limiting the caching period of DNS clients, this field is \emph on ignored \emph default by many clients / DNS caches throughout the world. 
In practice it will take days, if not weeks, until the last client has got the new IP address, even if you try to speed this up by setting a TTL of 1 minute. It simply does not work as expected. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Dynamic routing protocols at AS = Autonomous Systems level are your friend, such as BGP = Border Gateway Protocol. For any \emph on serious \emph default geo-redundancy, it is a \series bold must \series default . If you don't have the ability for \series bold dynamic routing at the appropriate granularity \series default , you should better not claim that you are geo-redundant. If handover / failover takes far longer than acceptable by customer expectation s / SLAs (typically minutes), you are \emph on not really \emph default geo-redundant from the viewpoint of your customers. \end_layout \begin_layout Enumerate (inflexible) Manual or semi-automated routing at datacenter uplink level. Here the customer traffic is always routed to the \emph on same \emph default IP visible from outside, while there is a \emph on separate \emph default static IP per server for sysadmin \family typewriter ssh \family default access. The customer traffic routing needs to be changed \emph on globally \emph default for the \emph on complete \emph default traffic to \emph on any \emph default of two datacenters, and thus is very inflexible. This model protects \emph on only \emph default against a full datacenter loss, but almost nothing else. Unfortunately, this model appears very simple to implement, so both staff and chief executive managers are sometimes preferring this \begin_inset Quotes eld \end_inset simple \begin_inset Quotes erd \end_inset model, although it causes headaches at operational level when really needed. 
\end_layout \begin_layout Enumerate (medium flexibility) Dynamic routing of customer traffic at the granularity of building blocks, or even per hypervisor / physical server. When automated appropriately, switchover is a matter of minutes, or even seconds. \end_layout \begin_layout Enumerate (flexible) Dynamic routing of each VM / LV / resource, individually. This has massive advantages: in case of overload, DDOS attacks, etc, you can quickly load-balance into a so-called \series bold butterfly runtime configuration \series default : half of your VMs belonging to the same hypervisor is running in datacenter A, while the other half is running in datacenter B. In the following illustration showing one hypervisor per datacenter, green color denotes the active (primary) side, while white means passive (secondary): \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/replication-butterfly.fig width 100line% \end_inset \begin_inset Newline newline \end_inset During butterfly, each of your hypervisor iron has to carry only \emph on half \emph default of the ordinary workload. For comparison, here is the normal situation where only datacenter A would be active: \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/replication-normal.fig width 100line% \end_inset \begin_inset Newline newline \end_inset In the above butterfly configuration, you have essentially \series bold doubled the available CPU and RAM power \series default , when compared to the ordinary situation where side B does not carry any application workload. This is a \emph on tremendous \emph default aid for \series bold survival \series default of certain types of incidents, such as (unhandled \begin_inset Foot status open \begin_layout Plain Layout There is no 100% DDOS protection. Attackers are continuously improving their methods. Catching all types of novel patterns is not possible in general. \end_layout \end_inset ) DDOS attacks. 
\end_layout \begin_layout Enumerate (most flexible) In addition to dynamic routing at VM level, the VMs are \series bold location transparent \series default (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Location-transparency" plural "false" caps "false" noprefix "false" \end_inset ). They may transparently migrate to another hypervisor, possibly residing in another building block, or even residing in a different datacenter. In its most general form, the number of replicas may be different for each VM, and may change dynamically, adapting to any needs. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The \series bold ability for butterfly \series default is relevant at CTO level. It is a massive \series bold risk reducer \series default , even at company and at stock exchange value level. \end_layout \begin_layout Plain Layout In order to really get it implemented in its best form, CTOs should clearly require \end_layout \begin_layout Plain Layout \noindent \align center \series bold Location Transparency at Application Level \end_layout \begin_layout Plain Layout \noindent It means that not only your servers, but also your \series bold services \series default can run in any of more than 1 datacenter, without notice by your customers. \end_layout \end_inset \end_layout \begin_layout Standard \noindent The location of your services is no longer a primary key, but a dependent runtime attribute which may change at runtime. Of course, your databases, your dashboards, your monitoring, and other surrounding tools, must also be able to properly deal with location transparenc y. \end_layout \begin_layout Standard Example: 1&1 Ionos ShaHoLin = Shared Hosting Linux has implemented it on thousands of servers, and on several petabytes of data. 
See \begin_inset CommandInset ref LatexCommand nameref reference "ShaHoLin-Layering" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Subsection Example: Point-in-time Replication via ZFS Snapshots \begin_inset CommandInset label LatexCommand label name "subsec:Example:-ZFS-Replication" \end_inset \end_layout \begin_layout Standard Some ZFS advocates believe that ZFS snapshots, which were originally designed for backup-like use cases, are also appropriate solutions for achieving geo-redundancy (cf section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Geo-Redundancy" plural "false" caps "false" noprefix "false" \end_inset ). The basic idea is to run incremental ZFS snapshots in an endless loop, e.g. via some simple scripts, and expediting to another host where the snapshots are then applied to another ZFS instance. When there is less data to be expedited, loop cycle times can go down to a few seconds. When much data is written at the primary site, loop cycle times will rise up. \end_layout \begin_layout Standard The following table tries to explain why geo-redundancy is not as simple to achieve as believed, at least without addition of sophisticated additional means \begin_inset Foot status open \begin_layout Plain Layout ZFS advocates often argue with many features which aren't present at other filesystem types. The above table shows some dimensions not dealing with properties of local filesystems, but with \emph on problems / tasks \emph default arising in long-distance distributed systems involving masses of enterprise-cri tical storage. 
\end_layout \end_inset : \end_layout \begin_layout Standard \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout OpenSource Component \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout DRBD \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout MARS \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout ZFS \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Synchronity (in average) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout delay \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout delay * 1.5 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Generic solution \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout FS-specific \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Granularity \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout LVs \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout LVs \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout subvolumes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Built-in snapshots \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Long distances \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Replication 
parallelism (per gran.) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $1$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\geq2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $1$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Built-in primary/secondary roles \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Built-in handover (planned) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout mostly \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Built-in failover (unplanned) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Built-in data overflow handling \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout unnecessary \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no, missing \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Unnoticed data loss due to overflow \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout possible \end_layout \end_inset \begin_inset Text \begin_layout Plain 
Layout Split-brain awareness \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Execute split-brain resolution \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Protect against illegal data modification \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout no \end_layout \end_inset \end_inset \end_layout \begin_layout Standard \noindent The last item means that ZFS by itself does not protect against amok-running applications modifying the secondary (backup) side in parallel to the replication process (at least not by default). Workarounds may be possible, but are not easy to create and to test for enterprise-critical applications. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In simple words: the \series bold ability for butterfly \series default is non-trivial to achieve. It can easily turn into a nightmare, if you try to establish it on top of larger \family typewriter zfs \family default installations. Although termed \begin_inset Quotes eld \end_inset replication \begin_inset Quotes erd \end_inset , it is more similar to \begin_inset Quotes eld \end_inset backup \begin_inset Quotes erd \end_inset . 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Known \family typewriter zfs \family default replication setups at sisters of 1&1 Ionos are lacking the butterfly ability, likely due to these difficulties. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice that \family typewriter zfs \family default \series bold snapshots \series default (without adding replication on top of it) can be \series bold easily combined \series default with DRBD or MARS replication, because \family typewriter zfs \family default snapshots are residing at \emph on filesystem \emph default layer, while DRBD / MARS replicas are located at the lower \emph on block \emph default layer. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Combination of zfs with MARS \end_layout \end_inset Just create your zpools at the \emph on top \emph default of DRBD or MARS virtual devices, and use \family typewriter zpool import \family default / \family typewriter export \family default \emph on individually \emph default at handover / failover of each LV. A relatively easy way of implementation is the \family typewriter systemd \family default interface of MARS (see the corresponding section in \family typewriter mars-user-manual.pdf \family default ). You just need to write \emph on once \emph default a small unit template file, containing a few \family typewriter zpool \family default commands. 
This small template will then be automatically instantiated for each resource by the \family typewriter marsadm \family default macro processor, as often as needed. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset There is a \series bold \emph on fundamental \emph default difference \series default between zpools and classical RAID / LVM stacked architectures. Some zfs advocates are propagating zpools as a replacement for both RAID and LVM. However, there is a massive difference in architecture, as illustrated in the following example (10 logical resources over 48 physical spindles), achieving practically the \series bold \emph on same \series default zfs snapshot functionality \emph default from a user's perspective, but in a different way: \end_layout \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename images/raid-lvm-architecture.fig height 6cm \end_inset \begin_inset Graphics filename images/zpool-architecture.fig height 6cm \end_inset \end_layout \begin_layout Plain Layout \noindent When RAID functionality is executed by zfs, it will be located at the \emph on top \emph default of the hierarchy. On one hand, this easily allows for different RAID levels for each of the 10 different logical resources. On the other hand, this \emph on exposes \emph default the \series bold physical spindle configuration \series default to the topmost filesystem layer (48 spindles in this example). There is no easy way for replication of these \emph on physical properties \emph default in a larger / heterogeneous distributed system, e.g. when some hardware components are replaced over a longer period of time (hardware lifecycle, or LV Football as explained in \family typewriter football-user-guide.pdf \family default ). 
Essentially, only replication of \emph on logical \emph default structures like snapshots remains as the only reasonable option, with its drawbacks as explained above. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset There is another argument: zfs tries to \emph on hide \emph default its internal structures and interfaces from the sysadmins, forming a more or less \series bold monolithic \begin_inset Foot status open \begin_layout Plain Layout Some sysadmins acting as \family typewriter zfs \family default advocates are reclaiming this as an advantage, because they need to understand only a single tool for managing \begin_inset Quotes eld \end_inset everything \begin_inset Quotes erd \end_inset . However, this is a short-sighted argument when it comes to \emph on true \emph default flexibility as offered by a component-based system, where multiple types of hardware / software RAID, multiple types of LVM functionality, and much more can be almost orthogonally combined in a very flexible way. \end_layout \end_inset architecture \series default as seen from outside. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset This violates the classical \emph on layering rules \emph default from Dijkstra (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Layering-Rules" plural "false" caps "false" noprefix "false" \end_inset ). In contrast, classical LVM-based configurations (see section \begin_inset CommandInset ref LatexCommand nameref reference "par:Positive-Example:-ShaHoLin" plural "false" caps "false" noprefix "false" \end_inset or the example setup in \family typewriter mars-user-manual.pdf \family default ) are \series bold component oriented \series default , according to the \series bold Unix Philosophy \series default . 
\end_layout \end_inset \end_layout \begin_layout Section Local vs Centralized Storage \begin_inset CommandInset label LatexCommand label name "sec:Local-vs-Centralized" \end_inset \end_layout \begin_layout Standard There is some historical belief that only centralized storage systems, as typically sold by commercial storage vendors, could achieve a high degree of reliability, while local storage were inferior by far. In the following, we will see that this is only true for an \series bold \emph on unfair \series default \emph default comparison involving different classes of storage systems. \end_layout \begin_layout Subsection Internal Redundancy Degree \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout Centralized commercial storage systems are typically built up from highly redundant \emph on internal \emph default components: \end_layout \begin_layout Enumerate Redundant power supplies with UPS. \end_layout \begin_layout Enumerate Redundancy at the storage HDDs / SSDs. \end_layout \begin_layout Enumerate Redundancy at internal transport busses. \end_layout \begin_layout Enumerate Redundant RAM / SSD caches. \end_layout \begin_layout Enumerate Redundant network interfaces. \end_layout \begin_layout Enumerate Redundant compute heads. \end_layout \begin_layout Enumerate Redundancy at control heads / management interfaces. \end_layout \begin_layout Plain Layout What about local hardware RAID controllers? Some people think that these relatively cheap units were massively inferior at practically each of these points. Please take a \emph on really deep \emph default look at what classical RAID chip manufacturers like LSI / Avago / Broadcom and their competitors are offering as configuration variants of their top notch models. 
The following enumeration is in the same order as above (item by item): \end_layout \begin_layout Enumerate Redundant hardware RAID cards with BBU caches, each with local goldcaps surviving power outages, their BBU caches cross-coupled via high-speed interconnects. \end_layout \begin_layout Enumerate HDD / SSD redundancy: almost any RAID level you can think of. \end_layout \begin_layout Enumerate Redundant SAS cross-cabling: any head can access any device. \end_layout \begin_layout Enumerate BBU caches are redundant and cross-coupled, similarly to RDMA. When SSD caches are added to both cards, you also get redundancy there. \end_layout \begin_layout Enumerate When using cross-coupled redundant cards, you automatically get redundant host bus interfaces (HBAs). \end_layout \begin_layout Enumerate The same story: you also get two independent RAID controller instances which can do RAID computations independently from each other. Some implementations do this even in hardware (ASICs). \end_layout \begin_layout Enumerate Ditto: both cards may be plugged into two different servers, thereby creating redundancy at control level. As a side effect, you may also get functionality similar to DRBD. \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Redundancy degree of RAID vs commercial appliances \end_layout \end_inset When dimensioned appropriately, real architectural and functional differences at block layer are smaller than certain people are claiming. For many block layer use cases, redundancy is \series bold roughly comparable \series default . \end_layout \begin_layout Plain Layout If you compare typical prices for both competing systems, you will notice a \emph on huge \emph default difference in favour of RAID. 
See also section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Technology" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \end_inset \end_layout \begin_layout Subsection Capacity Differences \end_layout \begin_layout Standard There is another hard-to-die myth: commercial storage would provide higher capacity. Please read the data sheets. It is \emph on possible \emph default (but not generally recommended) to put several hundreds of spindles into several external HDD enclosures, and then connect them to a redundant cross-cou pled pair of RAID controllers via several types of SAS busses. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Maximum possible RAID capacity \end_layout \end_inset By filling a rack this way, RAID can easily reach similar, if not higher capacities than commercial storage boxes, for a \emph on fraction \emph default of the price. \end_layout \end_inset \end_layout \begin_layout Standard \noindent However, this is not the recommended way for \emph on general \emph default use cases, but could be an option for low demands like archiving. The big advantage of RAID-based local storage is \series bold massive scale-out by sharding, \series default as explained in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Distributed-vs-Local:" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Subsection Caching Differences \end_layout \begin_layout Standard A frequent argument is that centralized storage systems had bigger caches than local RAID systems. 
While this argument is often true, it neglects an important point: \end_layout \begin_layout Standard Local RAID systems often \emph on don't need \emph default bigger caches, because they are typically located at the \emph on bottom \emph default of a cache hierarchy, playing only a \emph on particular \emph default role in that hierarchy. There exist \emph on further \emph default caches which are \series bold erroneously not considered \series default by such an argument! \end_layout \begin_layout Standard Example, see also section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset for more details: At 1&1 Shared Hosting Linux (ShaHoLin), a typical LXC container containing several thousands to tens of thousands of customer home directories, creates a long-term \emph on average(!) \emph default IOPS load at block layer of about 70 IOPS. No, this isn't a typo. It is not 70,000 IOPS. It is only 70 IOPS. \end_layout \begin_layout Standard Reason: the standard Linux kernel has two main caches, the Page Cache for file content, and the Dentry Cache (plus Inode slave cache) for metadata. Both caches are residing in \series bold RAM \series default , which is the \emph on fastest \emph default type of cache you can get. Some more details are in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard Nowadays, typical servers have several hundreds of gigabytes of RAM, sometimes even up to terabytes, resulting in an incredible caching behaviour which can be measured \begin_inset Foot status open \begin_layout Plain Layout Caution: this requires \emph on extremely solid \emph default expert knowledge and experience. It can be easily done wrongly. 
When managers are believing \series bold fake results \series default , whether produced by accident from people stuck to \series bold second-order ignorance \series default , or whether produced for some \series bold political reasons \series default : This can be \series bold dangerous for companies \series default . \end_layout \end_inset . \end_layout \begin_layout Standard Many people appear to neglect these caches, sometimes not knowing of their existence, and erroneously assuming that 1 application \family typewriter read() \family default or \family typewriter write() \family default operation will also lead to 1 IOPS at block layer. As a consequence, they are demanding 50,000 IOPS or 100,000 or even 1,000,000 IOPS. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold IOPS over-engineering \end_layout \end_inset IOPS over-engineering by some orders of magnitude can cause \emph on considerable \emph default unnecessary expenses. Be sure to carefully \series bold check real demands \series default ! \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent Some (but not all) commercial storage systems can deliver similar IOPS rates, because they have \series bold internal RAM caches \series default in the same order of magnitude. Notice that persistent RAM is the \series bold most expensive \series default type of scalable storage you can buy. \end_layout \begin_layout Plain Layout People who are demanding such systems are typically falling into some of the following classes (list is probably incomplete): \end_layout \begin_layout Itemize some people know this, but price does not matter - the more caches, the better. Wasted money for doubled caches does not count for them, or is even viewed as an advantage to them (personally). 
Original citation of an anonymous person: \begin_inset Quotes eld \end_inset only the best and the most expensive storage is good enough for us \begin_inset Quotes erd \end_inset . \end_layout \begin_layout Itemize using NFS, which has extremely poor filesystem caching behaviour because the Linux nfs client implementation does not take full advantage of the dentry cache. Sometimes people know this, sometimes not. Please read an important paper on the Linux implementation of nfs. Please search the internet for \begin_inset Quotes eld \end_inset Why nfs sucks \begin_inset Quotes erd \end_inset from Olaf Kirch (who is one of the original Linux nfs implementors), and \emph on read \emph default it. Your opinion about nfs might change. \end_layout \begin_layout Itemize have transactional databases, where high IOPS may be \emph on really \emph default needed, but \series bold \emph on exceptionally \series default \emph default (!) for this class of application. For very big enterprise databases like big SAP installations, there may be a very valid justification for big RAM caches at storage layers. However: smaller transactional loads, as in webhosting, are \emph on often \emph default (not always) hammering a \emph on low \emph default number of \series bold hot spots \series default , where \emph on big \emph default caches are not really needed. Relatively small BBU caches of RAID cards will do it also. Often people don't notice this because they don't measure the \series bold workingset behaviour \series default of their application, as could be done for example with \family typewriter blkreplay \family default (see \begin_inset Flex URL status open \begin_layout Plain Layout https://blkreplay.org \end_layout \end_inset ). 
\end_layout \begin_layout Itemize do not notice that \emph on well-tuned \emph default filesystem caches over iSCSI are typically demanding much less IOPS, sometimes by several orders of magnitude, and are wasting money with caches at commercial boxes they don't need (classical \series bold over-engineering \series default ). \end_layout \begin_layout Itemize \series bold political interest \series default , often supported by storage vendors. \end_layout \begin_layout Plain Layout Anyway, local storage can be augmented with various types of local caches with various dimensioning. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset There is no point in accessing the fastest possible type of RAM cache remotely over a network. RAM is best \series bold invested money \series default when installed \series bold locally \series default , \emph on directly \emph default for your applications / services / compute nodes. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Even expensive hardware-based RDMA (e.g. over Infiniband) cannot deliver the same performance as \series bold directly caching \series default your data in the \series bold \emph on same \emph default RAM \series default where your application is running. The Dentry Cache in the Linux kernel provides highly optimized \series bold shared metadata \series default in SMP and NUMA systems (nowadays scaling to more than 100 processor cores), while the Page Cache provides \series bold shared memory \series default via hardware MMU. This is crucial for the performance of classical local filesystems. 
\end_layout \begin_layout Standard The physical laws of Einstein and others are telling us that neither this type of caching, nor its shared memory behaviour, can be transported over whatever type of network without causing \series bold performance degradation \series default . \end_layout \begin_layout Subsection Latencies and Throughput \begin_inset CommandInset label LatexCommand label name "subsec:Latencies-and-Throughput" \end_inset \end_layout \begin_layout Standard First of all: today there exists only a small number of HDD manufacturers in the world. The number of SSD manufacturers will likely decline in the long run. Essentially, commercial storage vendors are more or less selling you the same HDDs or SSDs as you could buy and deploy yourself. If at all, there are only some minor technical differences. \end_layout \begin_layout Standard In the meantime, many people agree with a Google paper that the \emph on ratio \emph default of market prices (price per terabyte) between HDDs and SSDs is unlikely to change in a fundamental \begin_inset Foot status open \begin_layout Plain Layout In folklore, there exists a \series bold fundamental empirical law \series default , fuzzily called \begin_inset Quotes eld \end_inset Storage Pyramid \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset Memory Hierarchy Law \begin_inset Quotes erd \end_inset or similar, which is well-known at least in German OS academic circles. The empirical law (extrapolated from \series bold observations \series default , similarly to Moore's law) tells us that faster storage technology is always \series bold more expensive \series default than slower storage technology, and that capacities of faster storage are typically smaller than capacities of slower storage. This observation has been roughly valid for more than 50 years now. You can find it in several German lecture scripts. 
Unfortunately, the Wikipedia article \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Memory_hierarchy \end_layout \end_inset (retrieved in June 2018) does not cite this very important fundamental law about \series bold costs \series default . In contrast, the German article \begin_inset Flex URL status open \begin_layout Plain Layout https://de.wikipedia.org/wiki/Speicherhierarchie \end_layout \end_inset about roughly the same subject is mentioning \begin_inset Quotes eld \end_inset Kosten \begin_inset Quotes erd \end_inset which means \begin_inset Quotes eld \end_inset cost \begin_inset Quotes erd \end_inset , and \begin_inset Quotes eld \end_inset teuer \begin_inset Quotes erd \end_inset which means \begin_inset Quotes eld \end_inset expensive \begin_inset Quotes erd \end_inset . \end_layout \end_inset way during the next 10 years. Thus, most large-capacity enterprise storage systems are built on top of HDDs. \end_layout \begin_layout Standard Typically, HDDs and their mechanics are forming the overall bottleneck. \end_layout \begin_layout Itemize by construction, a \emph on local \emph default HDD attached via HBAs or a hardware RAID controller will show the least \emph on additional \emph default overhead in terms of \emph on additional \emph default latencies and throughput degradation caused by the attachment. \end_layout \begin_layout Itemize When the \emph on same \emph default HDD is \emph on indirectly \emph default attached via Ethernet or Infiniband or another rack-to-rack transport, both latencies and throughput will become worse. Depending on further factors and influences, the overall bottleneck may shift to the network. 
\end_layout \begin_layout Standard The laws of information transfer are telling us: \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout With \series bold increasing distance \series default , both latencies (laws of Einstein) and throughput (laws of energy needed for compensation of SNR = signal to noise ratio) are becoming worse. Distance matters. \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Because of this fundamental law, Football+MARS is \series bold minimizing IO distances \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent The number of intermediate components, like routers / switches and their \series bold queuing \series default , matters too. \end_layout \begin_layout Standard Consequently, local storage has \emph on always \emph default an architectural \begin_inset Foot status open \begin_layout Plain Layout In order to be fair, an architectural comparison must be made under the assumption of comparable low-level technologies. \end_layout \end_inset advantage over any attachment via network. Centralized storages are bound to some network, and thus suffer from disadvantages in terms of latencies and throughput. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout What is the expected long-term future? Will additional latencies and throughput of centralized storages become better over time? \end_layout \begin_layout Plain Layout It is difficult to predict the future. Let us first look at the past evolution. 
The following graphics has taken its numbers from Wikipedia articles \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/List_of_device_bit_rates \end_layout \end_inset and \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/History_of_hard_disk_drives \end_layout \end_inset , showing that HDD capacities have grown \series bold over-proportionally \series default by about 2 orders of magnitude over about 30 years, when compared to the relative growth of network bandwidth. \end_layout \begin_layout Plain Layout In the following graphics, effects caused by decreasing form factors have been neglected, which would even \emph on amplify \emph default the trend. For fairness, bundling of parallel disks or parallel communication channels \begin_inset Foot status open \begin_layout Plain Layout It is easy to see that the slopes of \family typewriter HDD.capacity \family default vs \family typewriter Infiniband.rates \family default are different. Parallelizing by bundling of Infiniband wires will only lift the line a little upwards, but will not alter its slope in logarithmic scale. For extrapolated time \begin_inset Formula $t\rightarrow\infty$ \end_inset , the extrapolated empirical long-term behaviour is rather striking. \end_layout \end_inset have been ignored. All comparisons are in logarithmic y axis scale: \end_layout \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename BitRates/Capacity-BitRate-Comparison.pdf width 100col% \end_inset \end_layout \begin_layout Plain Layout \noindent What does this mean when extrapolated into the future? \end_layout \begin_layout Plain Layout It means that concentrating more and more capacity into a single rack due to increasing data density will likely lead to more problems in future. 
Accessing more and more data over the network will become increasingly difficult when concentrating high-capacity HDDs or SSDs \begin_inset Foot status open \begin_layout Plain Layout It is difficult to compare the space density of contemporary SSDs in a fair way. There are too many different form factors. For example, M2 cards are typically consuming even less \begin_inset Formula $cm^{3}/TB$ \end_inset than classical 2.5 inch form factors. This trend is likely to continue in future. \end_layout \end_inset into the same space volume as before. \end_layout \begin_layout Plain Layout In other words: centralized storages are already not a good idea today, and will likely become an even worse idea in the future. \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout Risky central storage architecture \end_layout \end_inset There was a major incident at a German web hosting company at the beginning of the 2000s. Their entire webhosting main business was running on a single proprietary highly redundant CentralStorage solution, which failed. Restore from backup took way too long from the viewpoint of a huge number of customers, leading to major press attention. Before this incident, they were the #1 webhoster in Germany. A few years later, 1&1 was the #1 instead. You can speculate whether this has to do with the incident. But anyway, the later geo-redundancy strategy of 1&1 based on a sharding model (originally using DRBD, later MARS) was motivated by conclusions drawn from this incident. 
\end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout Non-competing scalability of central storage \end_layout \end_inset In the 1980s, a CentralStorage \begin_inset Quotes eld \end_inset dinosaur \begin_inset Foot status open \begin_layout Plain Layout With the advent of NVME, SSDs are almost directly driven by DMA. Accessing any high-speed DMA devices by default via network is a foolish idea, as foolish as playing games via an expensive high-end gamer graphics card which is then \emph on indirectly \emph default attached via RDMA, or even via Ethernet. Probably no serious gamer would ever \emph on try \emph default to do that. But some storage vendors do, for strategic reasons. Probably for their own survival, their customers are to be misguided to overlook the blinking red indicators that centralized SSD storage is likely nothing but an expensive dead end in the history of dinosaur architectures. \end_layout \end_inset \begin_inset Quotes erd \end_inset architecture called SLED = Single Large Expensive Disk was propagated with huge marketing noise and effort, but its historic fate was predictable for neutral experts not bound to particular interests: SLED finally lost against its contemporary RAID competition. Nowadays, many people don't even remember the term SLED. 
\end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Strategic advice \end_layout \end_inset Today's \series bold future \series default is likely dominated by \series bold scaling-out architectures \series default like \series bold sharding \series default , as explained in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Distributed-vs-Local:" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \end_inset \end_layout \begin_layout Subsection Reliability Differences CentralStorage vs Sharding \begin_inset CommandInset label LatexCommand label name "subsec:Reliability-Differences-CentralStorage" \end_inset \end_layout \begin_layout Standard In this section, we look at \emph on fatal \emph default failures only, ignoring temporary failures. A fatal failure of a storage is an incident which needs to be corrected by \series bold restore from backup \series default . \end_layout \begin_layout Standard By definition, even a \emph on highly redundant \emph default CentralStorage is \emph on nevertheless \emph default a SPOF = Single Point of Failure. This also applies to fatal failures. \end_layout \begin_layout Standard Some people are incorrectly arguing with redundancy. The problem is that \emph on any \emph default system, even a highly redundant one, can fail fatally. There exists no perfect system on earth. One of the biggest known sources of fatal failure is \series bold human error \series default . \end_layout \begin_layout Standard In contrast, sharded storage (for example the LocalSharding model, see also section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Variants-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset ) has MPOF = Multiple Points Of Failure. 
It is unlikely that many shards are failing fatally at the same time, because shards are \emph on independent \emph default \begin_inset Foot status open \begin_layout Plain Layout When all shards are residing in the same datacenter, there exists a SPOF by power loss or other impacts onto the whole datacenter. However, this applies to both the CentralStorage and to the LocalSharding model. In contrast to CentralStorage, LocalSharding can be more easily distributed over multiple datacenters. \end_layout \end_inset from each other by definition (see section \begin_inset CommandInset ref LatexCommand nameref reference "par:Definition-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset for disambiguation of terms \begin_inset Quotes eld \end_inset sharding \begin_inset Quotes erd \end_inset and \begin_inset Quotes eld \end_inset shared-nothing \begin_inset Quotes erd \end_inset ). \end_layout \begin_layout Standard What is the difference from the viewpoint of customers of the services? \end_layout \begin_layout Standard When a CentralStorage is failing fatally, a \emph on huge \emph default number of customers will be affected for a \emph on long \emph default time (see the example German webhoster mentioned in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Latencies-and-Throughput" plural "false" caps "false" noprefix "false" \end_inset ). Reason: restore from backup will take extremely long because huge masses of data have to be restored = \series bold copied \series default over a network. MTBF = Mean Time Between Failures is (hopefully) longer thanks to redundancy, but MTTR = Mean Time To Repair is also very long. 
\end_layout \begin_layout Standard With (Local)Sharding, the risk of \emph on some \emph default fatal incident \emph on somewhere \emph default in the sharding pool is higher, but the \series bold \emph on size \series default \emph default of such an incident is smaller in three dimensions at the same time: \end_layout \begin_layout Enumerate There are far \series bold fewer customers affected \series default (typically only \begin_inset Formula $1$ \end_inset shard out of \begin_inset Formula $n$ \end_inset shards). \end_layout \begin_layout Enumerate \series bold MTTR \series default = Mean Time To Repair is typically much better because there is much less data to be restored. \end_layout \begin_layout Enumerate \series bold Residual risk \series default plus resulting fatal damage by \series bold un-repairable problems \series default is thus lower. \end_layout \begin_layout Standard What does this mean from the viewpoint of an investor of a big \begin_inset Quotes eld \end_inset global player \begin_inset Quotes erd \end_inset company? \end_layout \begin_layout Standard As promised by the vendors, let us assume that failure of CentralStorage might be occurring less frequently. But \emph on when \emph default it happens on \series bold enterprise-critical mass data \series default , the stock exchange value of the affected company will be exposed to a \series bold hazard \series default . This is not bearable from the viewpoint of an investor. \end_layout \begin_layout Standard In contrast, the (Local)Sharding model is \emph on distributing \emph default the \series bold inevitable incidents \series default (because \series bold perfect systems do not exist \series default , and \series bold perfect humans do not exist \series default ) to a lower number of customers with higher frequency, such that the \series bold total impact onto the business \series default becomes bearable. 
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Risk analysis of CentralStorage \end_layout \end_inset Risk analysis for \series bold enterprise-critical \series default use cases is summarized in the following table: \end_layout \begin_layout Plain Layout \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout CentralStorage \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout (Local)Sharding \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Probability of \emph on some \emph default fatal incident \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout lower \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout higher \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout # Customers affected \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout very high \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout very low \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout MTBF per storage \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout higher \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout lower \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout MTTR per storage \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout higher \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout lower \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Unrepairable residual risk \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout higher \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout lower \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Total impact \end_layout \end_inset 
\begin_inset Text \begin_layout Plain Layout higher \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout lower \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Investor's risk \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \series bold unbearable \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout stock exchange compatible \end_layout \end_inset \end_inset \end_layout \end_inset \end_layout \begin_layout Standard \noindent Conclusions: CentralStorage is something for \end_layout \begin_layout Itemize \noindent Small to medium-sized companies which don't have the \series bold manpower \series default and the \series bold skills \series default for professionally building and operating a (Local)Sharding (or similar) system for their enterprise-critical mass data their business is relying upon. \end_layout \begin_layout Itemize \series bold \emph on Monolithic \emph default enterprise applications \series default like classical SAP which are anyway bound to a specific vendor, where you cannot select a different solution (so-called \series bold Vendor Lock-In \series default ). \end_layout \begin_layout Itemize When your application \series bold is neither shardable \series default by construction (c.f. section \begin_inset CommandInset ref LatexCommand ref reference "sec:Distributed-vs-Local:" \end_inset ), or when doing so would be a too high effort, \series bold nor going to BigCluster \begin_inset Foot status open \begin_layout Plain Layout Theoretically, BigCluster can be used to create 1 single huge remote LV (or 1 single huge remote FS instance) out of a pool of storage machines. Double-check, better triple-check that such a \series bold big \emph on logical \emph default SPOF \series default is \emph on really \emph default needed, and cannot be circumvented by any means. 
Only in such a case, the current version of MARS cannot help (yet), because its \emph on current \emph default \emph on focus \emph default is on a big number of machines each having relatively small LVs. At 1&1 ShaHoLin, the biggest LVs are 40TiB at the moment, running for years now, and bigger ones are certainly possible. Only when current local RAID technology with external enclosures cannot easily create a single LV in the petabyte scale, BigCluster is probably the better solution (cf. section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \end_inset \series default (e.g. Ceph / Swift / etc, see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset ) is an option. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset If you have an \emph on already sharded \emph default system, e.g. independent VMs or webhosting, don't convert it to a non-shardable one, and don't introduce SPOFs needlessly. You will introduce \series bold technical debt \series default which is likely to hurt you at some point in the future! \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout As a really big \begin_inset Quotes eld \end_inset global player \begin_inset Quotes erd \end_inset , or as a company being part of such a structure, you should be careful when listening to \begin_inset Quotes eld \end_inset marketing drones \begin_inset Quotes erd \end_inset of proprietary CentralStorage vendors. Always check your \emph on concrete \emph default use case. 
Never believe in wrongly generalized claims, which are only valid in some specific context, but do not really apply to your use case. It could be about your \emph on life \emph default . \end_layout \end_inset \end_layout \begin_layout Subsection Proprietary vs OpenSource \begin_inset CommandInset label LatexCommand label name "subsec:Proprietary-vs-OpenSource" \end_inset \end_layout \begin_layout Standard In theory, the following dimensions are orthogonal to each other: \end_layout \begin_layout Description Architecture: LocalStorage vs CentralStorage vs DistributedStorage \end_layout \begin_layout Description Licensing: Proprietary vs OpenSource \end_layout \begin_layout Standard In practice, however, many vendors of proprietary storage systems are selecting the CentralStorage model. This way, they can avoid inter-operability with their competitors. This opens the door for the so-called \series bold Vendor Lock-In \series default . \end_layout \begin_layout Standard In contrast, the OpenSource community is based on \emph on cooperation \emph default . Opting for OpenSource means that you can \series bold combine and exchange \series default numerous \series bold components \series default with each other. \end_layout \begin_layout Standard Key OpenSource players are \emph on basing \emph default their business on the \series bold usefulness \series default of their software components for you, their customer. Please search the internet for further explanations from Eric S. Raymond. \end_layout \begin_layout Standard Therefore \series bold interoperability \series default is a \emph on must \emph default in the opensource business. For example, you can relatively easily migrate between DRBD and MARS, forth and backwards, see \family typewriter mars-user-manual.pdf \family default . 
The \emph on generic \emph default block devices provided by both DRBD and MARS (and by the kernel LVM2 implementation, and many others \begin_inset Formula $\ldots$ \end_inset ) can interact with zillions of filesystems, VMs, applications, and so forth. \end_layout \begin_layout Standard Summary: \series bold genericity \series default is a highly desired property in OpenSource communities, while proprietary products often try to control their usage by limiting either technical interoperability at certain layers, and/or legally by contracts. Trying to do so with OpenSource would make no sense, because \emph on you \emph default , the customer, are the \emph on real \emph default king who can \emph on really \emph default select and combine components. You can form a \series bold really customized system \series default to your \series bold \emph on real needs \series default \emph default , not as just promised but not always actually delivered by so-called \begin_inset Quotes eld \end_inset marketing drones \begin_inset Quotes erd \end_inset from commercial vendors who are actually preferring the needs of their employer over yours. \end_layout \begin_layout Standard There is another fundamental difference between proprietary software and OpenSource: the former is bound to some company, which may \emph on vanish \emph default from the market. Commercial storage systems may be \series bold discontinued \series default . \end_layout \begin_layout Standard This can be a serious threat to your business relying on the value of your data. In particular, buying storage systems from \emph on small \emph default vendors may increase this risk \begin_inset Foot status open \begin_layout Plain Layout There is a risk of a \emph on domino effect \emph default : once there is a critical incident on highly redundant CentralStorage boxes from a particular (smaller) vendor, this may lead to major public media attention. 
This may form the \emph on root cause \emph default for such a vendor to vanish from the market. Thus you may be left alone with a buggy system, even if you aren't the victim of the concrete incident. \end_layout \begin_layout Plain Layout In contrast, bugs in an OpenSource component can be fixed by a larger community of interested people, or by yourself if you hire somebody for this. \end_layout \end_inset . \end_layout \begin_layout Standard OpenSource is different: it cannot die, even if the individual, or the (small) company which produced it, no longer exists. The sourcecode is in the \series bold public \series default . It just could get \emph on outdated \emph default over time. However, as long as there is enough public interest, you will always find somebody who is willing to adapt and to \emph on maintain \emph default it. Even if you would be the only one having such an interest, you can \emph on hire \emph default a maintainer for it, specifically for your needs. You aren't \series bold helpless \series default . \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Long-term strategy \end_layout \end_inset When some appropriate OpenSource solution or some OpenSource components are available, the long-term TCO will typically be better than with proprietary vendors. \end_layout \end_inset \end_layout \begin_layout Section Distributed vs Local: Scalability Arguments from Architecture \begin_inset CommandInset label LatexCommand label name "sec:Distributed-vs-Local:" \end_inset \end_layout \begin_layout Standard Datacenters aren't usually operated for fun or for hobby. Scalability of an \emph on architecture \emph default (cf section \begin_inset CommandInset ref LatexCommand ref reference "sec:What-is-Architecture" \end_inset ) is very important, because it can seriously limit your business. 
Overcoming architectural ill-designs can grow extremely cumbersome and costly. \end_layout \begin_layout Standard Many enterprise system architects are starting with a particular architecture in mind, called \begin_inset Quotes eld \end_inset Big Cluster \begin_inset Quotes erd \end_inset . There is a common belief that otherwise \series bold scalability \series default could not be achieved: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/Architecure_Big_Cluster.pdf width 100col% \end_inset \end_layout \begin_layout Standard \noindent The crucial point is the \series bold storage network \series default here: \begin_inset Formula $n$ \end_inset storage servers are interconnected with \begin_inset Formula $m=O(n)$ \end_inset frontend servers, in order to achieve properties like scalability, failure tolerance, etc. \end_layout \begin_layout Standard Since \emph on any \emph default of the \begin_inset Formula $m$ \end_inset frontends must be able to access \emph on any \emph default of the \begin_inset Formula $n$ \end_inset storages in realtime, the storage network must be dimensioned for \begin_inset Formula $O(n\cdot m)=O(n^{2})$ \end_inset network connections running in parallel. Even if the total network throughput is scaling only with \begin_inset Formula $O(n)$ \end_inset , nevertheless \begin_inset Formula $O(n^{2})$ \end_inset network connections have to be maintained at connection oriented protocols and at various layers of the operating software. The network has to \emph on switch \emph default the packets from \begin_inset Formula $n$ \end_inset sources to \begin_inset Formula $m$ \end_inset destinations (and their opposite way back) in \series bold realtime \series default . 
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout The \begin_inset Formula $O(n^{2})$ \end_inset \series bold cross-bar functionality \series default in \series bold realtime \series default makes the storage network complicated and \series bold expensive \series default , while decreasing grand-total reliability and thus \series bold increasing risk \series default . \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout Some further factors are increasing the cost of storage networks: \end_layout \begin_layout Itemize In order to limit error propagation from other networks, the storage network is often built as a \emph on physically separate \emph default = \emph on dedicated \emph default network. \end_layout \begin_layout Itemize Because storage networks are heavily reacting to high latencies and packet loss, they often need to be dimensioned for the \series bold worst case \series default (load peaks, packet storms, etc), needing one of the best = typically most expensive components for reducing latency and increasing throughput. Dimensioning to the worst case instead of an average case plus some safety margins is nothing but an expensive \series bold overdimensioning \series default / \series bold over-engineering \series default . \end_layout \begin_layout Itemize When \series bold multipathing \series default is required for improving fault tolerance of the storage network itself, (parts of) these efforts may easily \emph on double \emph default . 
\end_layout \begin_layout Itemize When \series bold geo-redundancy \series default is required (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Geo-Redundancy" plural "false" caps "false" noprefix "false" \end_inset ), the total effort may easily double another time because in cases of disasters like terrorist attacks the backup datacenter must be prepared for taking over for multiple days or weeks. \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In general, storage networks won't work over long distances. Even if it were possible, \series bold asymmetry problems \series default would be introduced into an architecture which is \emph on conceptually symmetric \emph default by its very nature. Thus, and generally in \begin_inset Formula $n:m$ \end_inset relationships, failover granularities are tending to \series bold stick to coarse \series default . Finer granularities as discussed in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset are much more difficult to achieve. \end_layout \begin_layout Standard Fortunately, there is an alternative called \begin_inset Quotes eld \end_inset \series bold Sharding Architecture \series default \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset \series bold Shared-nothing Architecture \series default \begin_inset Quotes erd \end_inset .
\end_layout \begin_layout Paragraph Definition of Sharding \begin_inset CommandInset label LatexCommand label name "par:Definition-of-Sharding" \end_inset \end_layout \begin_layout Standard Notice that the term \begin_inset Quotes eld \end_inset Sharding \begin_inset Quotes erd \end_inset originates from database architecture \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Shard_(database_architecture) \end_layout \end_inset where it has a slightly different meaning than used here. Our usage of the term \begin_inset Quotes eld \end_inset sharding \begin_inset Quotes erd \end_inset reflects slightly different situations in some webhosting companies \begin_inset Foot status open \begin_layout Plain Layout According to \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Shared-nothing_architecture \end_layout \end_inset , Google also uses the term \begin_inset Quotes eld \end_inset sharding \begin_inset Quotes erd \end_inset for a particular \begin_inset Quotes eld \end_inset shared-nothing architecture \begin_inset Quotes erd \end_inset . Although our above definition of \begin_inset Quotes eld \end_inset sharding \begin_inset Quotes erd \end_inset does not fully comply with its original meaning, a similar usage by Google probably means that our usage of the term is not completely uncommon. \end_layout \end_inset , and can be certainly transferred to some more application areas. Our more specific use of the term \begin_inset Quotes eld \end_inset sharding \begin_inset Quotes erd \end_inset has the following properties, \emph on all at the same time: \end_layout \begin_layout Enumerate User / customer data is \series bold partitioned \series default . This is very similar to database sharding. However, the original database term also allows \emph on some \emph default data to remain unpartitioned. 
In webhosting, such data may also exist, but typically only for \emph on system data, \emph default like OS images, including large parts of their configuration data. Such system data is typically \emph on replicated \emph default from a central \begin_inset Quotes eld \end_inset golden image \begin_inset Quotes erd \end_inset in an \emph on offline \emph default fashion, e.g. via regular \family typewriter rsync \family default cron jobs, etc. Typically, it comprises only a few gigabytes per instance and is mostly read-only with a slow change rate, while total customer data is typically in the range of some petabytes with a higher total change rate. \end_layout \begin_layout Enumerate The system has (almost \begin_inset Foot status open \begin_layout Plain Layout In general, there are some more natural single points of contention, such as the physical space of a datacenter, which might be destroyed by an explosion, for example. \end_layout \end_inset ) \series bold no single point of contention \series default , and thus the partitions are \series bold completely independent \series default from each other, like in \series bold shared-nothing \series default architectures \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/Shared-nothing_architecture \end_layout \end_inset . However, the original term \begin_inset Quotes eld \end_inset shared-nothing \begin_inset Quotes erd \end_inset has also been used for describing \emph on replicas \emph default , e.g. DRBD mirrors.
In our context of \begin_inset Quotes eld \end_inset sharding \begin_inset Quotes erd \end_inset , the shared-nothing principle \emph on only \emph default refers to the \begin_inset Quotes eld \end_inset \series bold no single point of contention \series default \begin_inset Quotes erd \end_inset principle at \emph on partitioning \emph default level, which means it \emph on only \emph default refers to the \emph on partitioning \emph default of the user data, but \emph on not \emph default to their replicas. \end_layout \begin_layout Enumerate Shared-nothing replicas (e.g. in the sense of some DRBD descriptions) may be also present (and in fact they are at 1&1 Shared Hosting Linux), but these \series bold replicas \series default are considered \series bold orthogonal to sharding \series default . Customer data replicas form an \emph on independent \emph default dimension called \begin_inset Quotes eld \end_inset replication layer \begin_inset Quotes erd \end_inset . The replication layer also obeys the shared-nothing principle in its original sense, but it is \emph on not \emph default meant by our term \begin_inset Quotes eld \end_inset sharding \begin_inset Quotes erd \end_inset in order to avoid confusion \begin_inset Foot status open \begin_layout Plain Layout Notice that typically \family typewriter BigCluster \family default architectures are also abstracting away their replicas when talking about their architecture. \end_layout \end_inset between these two independent dimensions. \end_layout \begin_layout Standard \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Conceptual separation of replication from sharding has some advantages. For example, control over the replication degree \begin_inset Formula $k$ \end_inset can be more fine-grained than at physical shard level. For example, both DRBD and MARS are supporting this, by allowing a different number of replicas for each logical resource.
\end_layout \begin_layout Standard Our sharding model does not need a dedicated storage network in general, at least when built and dimensioned properly. Instead, it \emph on should have \emph default (but not always needs) a so-called \series bold replication network \series default which can, when present, be dimensioned much smaller because it needs neither realtime operations nor scalability to \begin_inset Formula $O(n^{2})$ \end_inset : \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/Architecure_Sharding.pdf width 100col% \end_inset \end_layout \begin_layout Standard \noindent Sharding architectures are extremely well suited when both the input traffic and the data are \series bold already partitioned \series default . For example, when several thousands or even millions of customers are operating on disjoint data sets, like in web hosting where each webspace is residing in its own home directory, or when each of millions of MySQL database instances has to be isolated from its neighbour. Masses of customers are also appearing at cloud storage applications like Cloud Filesystems (e.g. Dropbox or similar). \end_layout \begin_layout Standard Even in cases when any customer may potentially access any of the data items residing in the whole storage pool (e.g. like in a search engine), sharding can be often applied. The trick is to create some relatively simple content-based dynamic switching or redirect mechanism in the input network traffic, similar to HTTP load balancers or redirectors. \end_layout \begin_layout Standard Only when partitioning of input traffic plus data is not possible in a reasonable way, big cluster architectures as implemented for example in Ceph or Swift (and partly even possible with MARS when restricted to the block layer) may have a use case.
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout When sharding is possible, it is the preferred model due to reliability and cost and performance reasons. \end_layout \end_inset \end_layout \begin_layout Standard Another good explanation can be found at \begin_inset Flex URL status open \begin_layout Plain Layout http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/ \end_layout \end_inset . \end_layout \begin_layout Subsection Variants of Sharding \begin_inset CommandInset label LatexCommand label name "subsec:Variants-of-Sharding" \end_inset \end_layout \begin_layout Description LocalSharding The simplest possible sharding architecture is simply putting both the storage and the compute CPU power onto the same iron. \begin_inset Newline newline \end_inset \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Dimensioning of 1&1 Shared Hosting Linux (ShaHoLin) \end_layout \end_inset We have dimensioned several variants of this. \end_layout \begin_layout Enumerate We are using 1U pizza boxes with local hardware RAID controllers with fast hardware BBU cache and ~ 10 local disks for the majority of LXC container instances where the \begin_inset Quotes eld \end_inset small-sized \begin_inset Quotes erd \end_inset customers (up to ~100 GB webspace per customer) are residing. Since most customers have very small home directories with extremely many but small files, this is a very cost-efficient model. \end_layout \begin_layout Enumerate Less than 1 per mille of all customers have > 250 GB (up to 2TB) per home directory. For these few customers we are using another dimensioning variant of the same architecture: 4U servers with 48 high-capacity spindles on 3 RAID sets, delivering a total PV capacity of ~300 TB, which are then cut down to ~10 LXC containers of ~30 TB each.
\end_layout \begin_layout Enumerate (currently in planning stage) An intermediate dimensioning between both extremes could save some more cost, and hopefully improve reliability even more, due to better pre-distribution of customer behaviour. The so-called midclass could be dimensioned as 90 TB per 2U pizza box, roughly on 12 spindles. It would carry the customers between ~50 and ~250 GB webspace each. \end_layout \end_inset \begin_inset Newline newline \end_inset In order to operate this model at a bigger scale, you should consider the \begin_inset Quotes eld \end_inset container football \begin_inset Quotes erd \end_inset method as described in section \begin_inset CommandInset ref LatexCommand ref reference "subsec:Principle-of-Background" \end_inset and in \family typewriter football-user-manual.pdf \family default . \end_layout \begin_layout Description RemoteSharding This variant needs a (possibly dedicated) storage network, which is however only \begin_inset Formula $O(n)$ \end_inset in total. Each storage server exports a block device over iSCSI (or over another transport like MARS' prosumer device) to at most \begin_inset Formula $O(k)$ \end_inset dedicated compute nodes where \begin_inset Formula $k$ \end_inset is some \series bold constant \series default . \begin_inset Newline newline \end_inset \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Hint 1 \end_layout \end_inset It is advisable to build this type of storage network with \series bold local switches \series default and no routers inbetween, in order to avoid \begin_inset Formula $O(n^{2})$ \end_inset -style network architectures and traffic. This reduces error propagation upon network failures. Keep the storage and the compute nodes locally close to each other, e.g. in the same datacenter room, or even in the same rack. 
\end_layout \end_inset \begin_inset Newline newline \end_inset \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Hint 2 \end_layout \end_inset Additionally, you can provide some (low-dimensioned) backbone for \series bold exceptional(!) \series default cross-traffic between the local storage switches. Don't plan to use any realtime cross-traffic \emph on regularly \emph default , but only for clear cases of emergency! \end_layout \end_inset \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In this model, a shard typically consists of one storage node plus \begin_inset Formula $k+1$ \end_inset or \begin_inset Formula $k+2$ \end_inset compute servers, introducing some additional failure redundancy \emph on within \emph default such a shard, while retaining the \begin_inset Quotes eld \end_inset no single point of contention \begin_inset Quotes erd \end_inset property \emph on between \emph default the shards (according to section \begin_inset CommandInset ref LatexCommand nameref reference "par:Definition-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Description FlexibleSharding This is a dynamic combination of LocalSharding and RemoteSharding, dynamically re-configurable, as explained below. \end_layout \begin_layout Description BigClusterSharding The sharding model can also be placed \series bold on top of \series default a BigCluster model, or possibly \begin_inset Quotes eld \end_inset internally \begin_inset Quotes erd \end_inset in such a model, leading to a similar effect. Whether this makes sense needs some discussion.
It can be used to reduce the \emph on logical \emph default BigCluster size from \begin_inset Formula $O(n)$ \end_inset to some \begin_inset Formula $O(k)$ \end_inset , such that it is no longer a \begin_inset Quotes eld \end_inset big cluster \begin_inset Quotes erd \end_inset but a \begin_inset Quotes eld \end_inset small cluster \begin_inset Quotes erd \end_inset , and thus reducing the serious problems described in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset to some degree. \begin_inset Newline newline \end_inset \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Some use cases for BigClusterSharding \end_layout \end_inset This could make sense in the following use cases: \end_layout \begin_layout Itemize When you \series bold already have \series default invested into a big cluster, e.g. Ceph or Swift, which does not really scale and/or does not really deliver the expected reliability. Some possible reasons for this are explained in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset and subsection \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Itemize When you really need a \emph on single \emph default LV which is necessarily \series bold bigger \series default than can be reasonably built on top of local LVM. This means, you are likely claiming that you really need \series bold strict consistency \series default as provided by a block device on more than 1 PB with current technology (2018). Examples are very \series bold big enterprise databases \series default like classical SAP (c.f. 
section \begin_inset CommandInset ref LatexCommand ref reference "sec:Local-vs-Centralized" \end_inset ), or if you really need \series bold POSIX-compliance \series default on a single big filesystem instance. Be cautious when you think this is the only solution to your problem. Double-check or triple-check whether there is \emph on really \emph default no other solution than creating such a huge block device and/or such a huge filesystem instance. Such huge SPOFs are tending to create similar problems \begin_inset Foot status open \begin_layout Plain Layout Running \family typewriter fsck \family default or its Windows equivalents on huge filesystems is certainly no fun. \end_layout \end_inset as described in section \begin_inset CommandInset ref LatexCommand ref reference "sec:Reliability-Arguments-from" \end_inset for similar reasons. \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout When building a \series bold new \series default storage system, be sure to check the following use cases. You should seriously consider a LocalSharding / RemoteSharding / FlexibleSharding model instead of BigClusterSharding when ... \end_layout \begin_layout Itemize ... when more than 1 LV instance would be placed onto your \begin_inset Quotes eld \end_inset small cluster \begin_inset Quotes erd \end_inset shards. Then a \series bold {Local,Remote,Flexible}Sharding \series default model could likely be used instead. Then the total overhead ( \series bold total cost of ownership \series default ) introduced by a BigCluster \emph on model \emph default but actually stripped down to a \begin_inset Quotes eld \end_inset SmallCluster \begin_inset Quotes erd \end_inset \emph on implementation / configuration \emph default should be examined separately. Does it really pay off? \end_layout \begin_layout Itemize ...
when there are \series bold legal requirements \series default that you can tell at any time where your data is. Typically, this is anything but easy on a BigCluster model, even when stripped down to SmallCluster size. \end_layout \end_inset \end_layout \begin_layout Subsection FlexibleSharding \begin_inset CommandInset label LatexCommand label name "subsec:FlexibleSharding" \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice that MARS' new prosumer device feature (formerly called \emph on remote device \emph default , like a kind of replacement for iSCSI) can not only be used for a \family typewriter RemoteSharding \family default model, but \emph on could \emph default also be used for implementing some sort of \begin_inset Quotes eld \end_inset big cluster \begin_inset Quotes erd \end_inset model at block layer. However, consider the warnings for certain use cases from section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset . If you desire a very similar level of flexibility as promised by \family typewriter BigCluster \family default , read on. \end_layout \begin_layout Standard Models re-introducing some kind of \begin_inset Formula $O(n^{2})$ \end_inset \begin_inset Quotes eld \end_inset big dedicated storage network \begin_inset Quotes erd \end_inset , considering the \emph on potential \emph default connections, and \begin_inset Formula $O(n)$ \end_inset considering the \emph on actual \emph default realtime connections during runtime, are \series bold not \series default the preferred model for MARS operations in large scale. Following is a compromise.
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The basic idea is that each server \emph on can \emph default (as far as necessary) operate \emph on both \emph default in server \emph on and \emph default in client role, both at the same time, and individually for each resource. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout Following is a \series bold super-model \series default which combines both the \begin_inset Quotes eld \end_inset big cluster \begin_inset Quotes erd \end_inset and sharding models at block layer in a very flexible way, without fully depending on \begin_inset Formula $O(n)$ \end_inset realtime network connections. The result is a flexibility similar to that promised by BigCluster. \end_layout \end_inset \end_layout \begin_layout Standard \noindent The following example shows only two servers from a pool consisting of hundreds or thousands of servers: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/MARS_Cluster_on_Demand.pdf width 100col% \end_inset \end_layout \begin_layout Standard \noindent The main difference to \family typewriter BigCluster \family default is to use iSCSI or the MARS prosumer device \emph on only where necessary \emph default . Preferably, local storage is divided into multiple Logical Volumes (LVs) via LVM, which should be \emph on directly \emph default used \emph on locally \emph default by Virtual Machines (VMs), whenever possible. At abstract architectural level, detail technologies KVM/qemu vs filesystem-based local LXC containers make no real difference \begin_inset Foot status open \begin_layout Plain Layout A way for abstracting many details between KVM and LXC is for example provided by \family typewriter libvirt \family default . \end_layout \end_inset .
\end_layout \begin_layout Standard In the above example, the left machine has relatively less CPU power or RAM than storage capacity. Therefore, not \emph on all \emph default LVs could be instantiated locally at the same time without causing operational problems, but \emph on some \emph default of them can be run locally. The example solution is to \emph on exceptionally(!) \emph default export LV3 to the right server, which has some otherwise unused CPU and RAM capacity. \end_layout \begin_layout Standard Notice that local operation of VMs doesn't produce any storage network traffic at all. Therefore, this is the preferred runtime configuration. \end_layout \begin_layout Standard Only in cases of resource imbalance, such as (transient) CPU or RAM peaks (e.g. caused by DDoS attacks), and only when the \series bold ability for butterfly \series default (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset ) is not available \begin_inset Foot status open \begin_layout Plain Layout This may happen when a disaster has already destroyed one of your datacenters, and thus you are forced to run in the surviving datacenter. \end_layout \end_inset or is not sufficient, only then the following \series bold fallback strategy \series default is used: \emph on Some \emph default VMs or containers may then be run somewhere else over the network. In a well-balanced and well-dimensioned system, this will be the \series bold vast minority \series default , and should be only used for dealing with temporary load peaks, unforeseeable customer behaviour, etc. \end_layout \begin_layout Standard \series bold \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout Running (geo-)redundant VMs directly on the same servers as their storage devices is a major cost reducer.
\end_layout \end_inset \end_layout \begin_layout Standard You simply don't need to buy and operate \begin_inset Formula $2\cdot(n+m)$ \end_inset servers, but only about \begin_inset Formula $2\cdot(\max(n,m)+m\cdot\epsilon)$ \end_inset servers, where \begin_inset Formula $\epsilon$ \end_inset corresponds to some relatively small extra resources needed by MARS. \end_layout \begin_layout Standard In addition, \series bold shared memory \series default can be exploited more efficiently. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In addition to this and to reduced networking cost, there are further cost savings at power consumption, air conditioning, Height Units (HUs), number of HDDs, operating cost, etc., as explained in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Cost-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Subsection Principle of Background Migration \begin_inset CommandInset label LatexCommand label name "subsec:Principle-of-Background" \end_inset \end_layout \begin_layout Standard The sharding model needs a different approach to load balancing of storage space than the big cluster model. There are several possibilities at different layers, each addressing different \series bold granularities \series default , starting from finest to coarsest: \end_layout \begin_layout Itemize Moving per-customer data, typically at filesystem or database level via \family typewriter rsync \family default or \family typewriter mysqldump \family default or similar.
\begin_inset Newline newline \end_inset \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Fine-grained migration of customer home directories \end_layout \end_inset At 1&1 Shared Hosting Linux, we have about 9 million customer home directories. We also have a script \family typewriter movespace.pl \family default using incremental \family typewriter tar \family default or \family typewriter rsync \family default for their moves. Now, if we tried to move around \emph on all \emph default of them this way, it could easily take years or even decades for millions of extremely small home directories, due to overhead like DNS updates etc. However, there exists a small handful of large customer home directories in the terabyte range. For these, and only for these, it is a clever idea to use \family typewriter movespace.pl \family default because thereby the size of a LV can be regulated in a more fine-grained way than at LV level. \end_layout \end_inset \end_layout \begin_layout Itemize Dynamically growing the sizes of LVs during operations. \begin_inset Newline newline \end_inset \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Medium-grained extension of LVs \end_layout \end_inset Football's \family typewriter expand \family default operation roughly does the following: \family typewriter lvresize \family default followed by \family typewriter marsadm resize \family default followed by \family typewriter xfs_growfs \family default or some equivalent filesystem-specific operation.
\end_layout \end_inset \end_layout \begin_layout Itemize Moving whole LVs via MARS + Football, as shown in the following example: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/MARS_Background_Migration.pdf width 100col% \end_inset \end_layout \begin_layout Standard \noindent The idea of Football's \family typewriter migrate \family default operation is to dynamically create \emph on additional \emph default LV replicas for the sake of \series bold background migration \series default . \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold using MARS as replication engine \end_layout \end_inset \end_layout \begin_layout Itemize If not yet done, you should transparently introduce MARS \begin_inset Foot status open \begin_layout Plain Layout When necessary, create the first MARS replica with \family typewriter marsadm create-resource \family default on your already-existing LV data, which will be retained unmodified, and restart your application again. \end_layout \end_inset into your LVM-based stack. If you don't want more than \begin_inset Formula $k=1$ \end_inset replicas in general, you can use the so-called \begin_inset Quotes eld \end_inset standalone mode \begin_inset Quotes erd \end_inset of MARS. \end_layout \begin_layout Itemize Optionally: once you have MARS in place, you may use iSCSI or the MARS prosumer device or another means for exporting \family typewriter /dev/mars/lv3 \family default to another hypervisor. This might be the same hypervisor you want to migrate the data to, or it could be another machine. This is not generally needed, but it helps to achieve an elasticity similar to that promised by \family typewriter BigCluster \family default .
\end_layout \begin_layout Itemize Now, for the sake of migration, you just create an additional replica at your target server via \family typewriter marsadm join-resource \family default . Optionally, this may be the same server where the remote VM is already running at the moment. Wait until the additional mirror has been fully \series bold synced \series default in background, while your application is continuously running and while the content of the LV is modified \emph on in parallel \emph default by your ordinary applications running inside the VM. \end_layout \begin_layout Itemize Then you do a \series bold primary handover \series default to your mirror (or to \emph on any \emph default of multiple mirrors). This is usually a matter of seconds. Newer versions of the prosumer device will allow this without shutdown of your VM. With standard \begin_inset Foot status open \begin_layout Plain Layout There are some iSCSI features like ALUA which \emph on should \emph default be able to hand over an active session to another storage box without interruption. However, the corresponding Linux documentation looks very sparse, and the maturity status for Linux initiators / targets is unclear at the moment. \end_layout \end_inset iSCSI, you will typically have to briefly shut down the VM and to restart it a few seconds later. \end_layout \begin_layout Itemize Once the application is running again at the old location or at another location, you may delete the old replica via \family typewriter marsadm leave-resource \family default and \family typewriter lvremove \family default . \end_layout \begin_layout Itemize Finally, you may re-use the freed-up space for something else (e.g. \family typewriter lvresize \family default of \emph on another \emph default LV followed by \family typewriter marsadm resize \family default followed by \family typewriter xfs_growfs \family default or similar).
Or, you may later migrate \emph on another \emph default (smaller) LV to this server, in order to re-use the free space, or similar. \end_layout \begin_layout Itemize For the sake of \series bold hardware lifecycle \series default , you may run a slightly different strategy: evacuate the original source server completely via Football, and eventually decommission it. \end_layout \begin_layout Itemize In case you already have a redundant LV copy somewhere else, you may run a similar procedure, but starting with \begin_inset Formula $k=2$ \end_inset replicas, and temporarily increasing the number of replicas to either \begin_inset Formula $k'=3$ \end_inset when moving each replica step-by-step, or you may even directly go up to \begin_inset Formula $k'=4$ \end_inset in one step, thereby moving \emph on pairs \emph default at once. Example: the latter variant is the default in the ShaHoLin configuration variant of Football, internally called Tetris. \begin_inset Newline newline \end_inset Technical details: see \family typewriter football.sh \family default in the \family typewriter football/ \family default directory of MARS, which is a checkout of the Football sub-project, and \family typewriter football-user-manual.pdf \family default . \end_layout \begin_layout Itemize When already starting with \begin_inset Formula $k\geq3$ \end_inset LV replicas in the starting position, you may have the luxury of using a lesser variant. For example, we have some mission-critical servers at 1&1 Ionos which are running \begin_inset Formula $k=4$ \end_inset replicas all the time on relatively small but important LVs for extremely increased safety. Only in such a case, you may have the freedom to temporarily decrease from \begin_inset Formula $k=4$ \end_inset to \begin_inset Formula $k'=3$ \end_inset and then going up to \begin_inset Formula $k''=4$ \end_inset again, before starting primary handover. 
This has the advantage of requiring less temporary storage space for \emph on swapping \emph default some LV replicas. \end_layout \end_inset \end_layout \begin_layout Section Cost Arguments \begin_inset CommandInset label LatexCommand label name "sec:Cost-Arguments-from" \end_inset \end_layout \begin_layout Standard A common pre-judgement is that \begin_inset Quotes eld \end_inset big cluster \begin_inset Quotes erd \end_inset is the cheapest scaling storage technology when built on so-called \begin_inset Quotes eld \end_inset commodity hardware \begin_inset Quotes erd \end_inset . \end_layout \begin_layout Standard While this is very often true for the \begin_inset Quotes eld \end_inset commodity hardware \begin_inset Quotes erd \end_inset part, it is often \emph on not \emph default true for the \begin_inset Quotes eld \end_inset big cluster \begin_inset Quotes erd \end_inset part. Let us first look at the \begin_inset Quotes eld \end_inset commodity \begin_inset Quotes erd \end_inset part. 
\end_layout \begin_layout Subsection Cost Arguments from Technology \begin_inset CommandInset label LatexCommand label name "subsec:Cost-Arguments-from-Technology" \end_inset \end_layout \begin_layout Standard Here are some rough market prices for basic storage as determined around end of 2016 / start of 2017: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout \size small Technology \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small Enterprise-Grade \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small Price in € / TB \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small Consumer SATA disks via on-board SATA controllers \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small no (small-scale) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small < 30 possible \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small SAS disks via SAS HBAs (e.g. 
in external 14 \begin_inset Quotes erd \end_inset shelves) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small halfways \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small < 80 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small SAS disks via hardware RAID + LVM (+DRBD/MARS) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small 80 to 150 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small Commercial storage appliances via iSCSI \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small around 1000 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small Cloud storage, S3 over 5 years lifetime \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small yes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \size small 3000 to 8000 \end_layout \end_inset \end_inset \end_layout \begin_layout Standard \noindent You can see that any self-built and self-administered storage (whose price varies with slower high-capacity disks versus faster low-capacity disks) is much cheaper than any commercial offering by about a factor of 10 or even more. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent If you need to operate several petabytes of data, self-built storage is \emph on always \emph default cheaper than a commercial one, even if some more manpower is needed for commissioning and operating, than for communications with the storage provider. You don't have to pay the shareholders of the storage provider. Instead, the savings will benefit your \emph on own \emph default shareholders. 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Here we just assume that the storage is needed permanently for at least 5 years, as is the case in web hosting, databases, backup / archival systems, and many other application areas. \end_layout \begin_layout Standard Commercial offerings of cloud storage are way too much hyped. Apparently some people don't seem to know that the generic term \begin_inset Quotes eld \end_inset Cloud Storage \begin_inset Quotes erd \end_inset refers to a \emph on storage class \emph default (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Requirements-for-Cloud" plural "false" caps "false" noprefix "false" \end_inset ), not to a particular \emph on instance \emph default like original Amazon S3, and that it is possible to build and operate almost any instance of any storage class yourself. \end_layout \begin_layout Standard From a commercial perspective, \series bold outsourcing \series default of \emph on huge masses \emph default of enterprise-critical storage (to whatever class of storage) usually pays off \series bold only when \series default your storage demands are either \emph on relatively moderate \emph default , or are \emph on extremely \emph default varying over time, and/or when you need some \emph on extra \emph default capacity only \emph on temporarily \emph default for a \emph on very \emph default short time. \end_layout \begin_layout Subsection Cost Arguments from Architecture \begin_inset CommandInset label LatexCommand label name "subsec:Cost-Arguments-from-Architecture" \end_inset \end_layout \begin_layout Standard In addition to basic storage prices, many further factors come into play when roughly comparing big cluster architectures versus sharding. 
The following table bears the \emph on unrealistic assumption \emph default that BigCluster can be reliably operated with 2 replicas ( \family roman \series medium \shape up \size normal \emph off \bar no \strikeout off \uuline off \uwave off \noun off \color none the suffix \begin_inset Formula $\times2$ \end_inset \family default \series default \shape default \size default \emph default \bar default \strikeout default \uuline default \uwave default \noun default \color inherit means with additional geo-redundancy): \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout BC \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout SHA \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout BC \begin_inset Formula $\times2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout SHA \begin_inset Formula $\times2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout # of Disks \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout >200% \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout <120% \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout >400% \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout <240% \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout # of Servers \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times1.1$ \end_inset possible \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times4$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2.2$ \end_inset 
\end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Power Consumption \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times1.1$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times4$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2.2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout HU Consumption \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times1.1$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times4$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2.2$ \end_inset \end_layout \end_inset \end_inset \end_layout \begin_layout Standard \noindent As shown in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset , and as recommended by several advocates, two replicas are typically not sufficient for BigCluster. 
Even addicts of BigCluster are typically recommending 3 replicas in so-called \begin_inset Quotes eld \end_inset best practices \begin_inset Quotes erd \end_inset , leading to the following more realistic table: \end_layout \begin_layout Standard \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout BC \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout SHA \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout BC \begin_inset Formula $\times2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout SHA \begin_inset Formula $\times2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout # of Disks \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout >300% \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout <120% \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout >600% \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout <240% \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout # of Servers \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times3$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times1.1$ \end_inset possible \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times6$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2.2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Power Consumption \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times3$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times1.1$ \end_inset \end_layout \end_inset 
\begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times6$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2.2$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout HU Consumption \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times3$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times1.1$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times6$ \end_inset \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \begin_inset Formula $\approx\times2.2$ \end_inset \end_layout \end_inset \end_inset \end_layout \begin_layout Standard \noindent The crucial point is not only the number of extra servers needed for dedicated storage boxes, but also the total number of HDDs. While big cluster implementations like Ceph or Swift can \emph on theoretically \emph default use some erasure encoding \begin_inset Foot status open \begin_layout Plain Layout There is a reason why erasure encoding is not practical for many \family typewriter BigCluster \family default use cases. The number of total IO requests sent to the internal disks is much higher than the number of IO requests sent to the storage by your application, in order to update additional redundancy information. Like RAID-6, this is typically by \emph on factors \emph default . While RAID-6 is \series bold offloading \series default this additional workload to a small \emph on specialized \emph default and realtime-capable network called SAS bus, \family typewriter BigCluster \family default is typically spreading this workload over an unreliable IP network with packet loss, spanning much larger distances, and involving more switches / routers. 
\end_layout \end_inset for avoiding full object replicas, their \emph on practice \emph default as seen in internal 1&1 Ceph clusters is similar to RAID-10, but just on objects instead of block-based sectors. \end_layout \begin_layout Standard Therefore a big cluster typically needs >300% disks to reach the same net capacity as a simple sharded cluster. The latter can typically take advantage of hardware RAID-60 with a significantly smaller disk overhead, while providing sufficient failure tolerance at disk level. \end_layout \begin_layout Standard There is a surprising consequence from this: geo-redundancy is not as expensive as many people believe. It just needs to be built with the proper architecture. A sharded geo-redundant pool based on hardware RAID-60 (last column \begin_inset Quotes eld \end_inset SHA \begin_inset Formula $\times2$ \end_inset \begin_inset Quotes erd \end_inset ) costs typically \emph on less \emph default than a non-georedundant big cluster with typically needed / recommended number of replicas (column \begin_inset Quotes eld \end_inset BC \begin_inset Quotes erd \end_inset ). A geo-redundant sharded pool provides even better failure compensation (see sections \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset ), and comparable flexibility when combined with Football (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Principle-of-Background" plural "false" caps "false" noprefix "false" \end_inset ). 
\end_layout \begin_layout Standard Notice that geo-redundancy implies by definition (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Geo-Redundancy" plural "false" caps "false" noprefix "false" \end_inset ) that an unforeseeable \series bold full datacenter loss \series default (e.g. caused by \series bold disasters \series default like a terrorist attack or an earthquake) must be compensated for \series bold several days or weeks \series default . Therefore it is \emph on not \emph default sufficient to take a big cluster and just spread it to two different locations. \end_layout \begin_layout Standard In any case, a MARS-based geo-redundant sharding pool with a reasonable size is cheaper than using commercial storage appliances, which are much more expensive by their nature. \end_layout \begin_layout Section Reliability Arguments from Architecture \begin_inset CommandInset label LatexCommand label name "sec:Reliability-Arguments-from" \end_inset \end_layout \begin_layout Standard A contemporary common belief is that big clusters and their \series bold random replication \series default methods would provide better reliability than anything else. There are some practical observations at 1&1 and its daughter companies which cannot confirm this. \end_layout \begin_layout Standard Similar experiences are part of a USENIX paper about copysets, see \begin_inset Flex URL status open \begin_layout Plain Layout https://www.usenix.org/system/files/conference/atc13/atc13-cidon.pdf \end_layout \end_inset . Their proposed solution is different from the solution proposed here, but interestingly their \emph on problem analysis \emph default part contains not only similar observations, but also comes to similar conclusions about random replication. 
Citation from the abstract: \end_layout \begin_layout Quote However, random replication is \series bold almost guaranteed \series default to lose data in the common scenario of simultaneous node failures due to cluster-wide power outages. \size footnotesize [emphasis added by me] \end_layout \begin_layout Standard Stimulated by practical experiences from truly less disastrous scenarios than mass power outage, theoretical explanations were sought. Surprisingly, they clearly show by mathematical arguments that \family typewriter LocalSharding \family default is superior to \family typewriter BigCluster \family default under practically important preconditions. \end_layout \begin_layout Standard We start with an intuitive explanation. A detailed mathematical description of the model can be found in appendix \begin_inset CommandInset ref LatexCommand vref reference "chap:Mathematical-Model-of" \end_inset . \end_layout \begin_layout Subsection Storage Server Node Failures \end_layout \begin_layout Subsubsection Simple Intuitive Explanation in a Nutshell \begin_inset CommandInset label LatexCommand label name "subsec:Simple-intuitive-explanation" \end_inset \end_layout \begin_layout Standard Block-level replication systems like DRBD are constructed for LV or disk failover in local redundancy scenarios. Or, when using MARS, even for geo-redundant failover scenarios. They are traditionally dealing with \series bold pairs \series default of servers, or with triples, etc. In order to get a storage incident with them, \emph on both \emph default sides of a DRBD or MARS small-cluster (also called \series bold shard \series default in section \begin_inset CommandInset ref LatexCommand nameref reference "par:Definition-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset ) must have an incident \emph on at the same time \emph default . 
\end_layout \begin_layout Standard In contrast, the \series bold random replication \series default concept of big clusters is spreading huge masses of objects over a huge number of nodes \begin_inset Formula $O(n)$ \end_inset , with some redundancy degree \begin_inset Formula $k$ \end_inset denoting the number of object replicas. As a consequence, \emph on any \emph default \begin_inset Formula $k$ \end_inset node failures out of \begin_inset Formula $O(n)$ \end_inset will make \emph on some \emph default objects inaccessible, and thus produce an incident. For example, when \begin_inset Formula $k=2$ \end_inset and \begin_inset Formula $n$ \end_inset is equal for both models, then \emph on any \emph default combination of two node failures occurring at the same time will lead to an incident: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/Incident_Probabilities.pdf width 100col% \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent Intuitively, it is easy to see that hitting both members of the \emph on same \emph default sharding pair at the same time is less likely than hitting \emph on any \emph default two nodes of a big cluster. Therefore, \series bold sharding provides better reliability \series default , when built on top of comparable technology. \end_layout \end_inset \end_layout \begin_layout Standard \noindent In addition: even when \begin_inset Formula $1$ \end_inset shard out of \begin_inset Formula $n$ \end_inset shards has an incident, the other \begin_inset Formula $n-1$ \end_inset shards will continue to run. In contrast, when a \family typewriter BigCluster \family default has an incident, \emph on all \emph default application instances are affected, due to \emph on uniform \emph default object distribution. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent Another advantage of sharded pairs is \series bold smaller incident size \series default . \end_layout \end_inset \end_layout \begin_layout Standard If you are curious about some more details and more concrete behaviour, read on. \end_layout \begin_layout Subsubsection Detailed Explanation of \family typewriter BigCluster \family default Reliability \begin_inset CommandInset label LatexCommand label name "sub:Detailed-explanation" \end_inset \end_layout \begin_layout Standard \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The following analysis shows up some parallels to the well-known reliability loss caused by RAID striping. The main difference is granularity: variable-sized objects are used in place of fixed-size blocks. Therefore, this section is in reality about a \series bold fundamental property of data distribution / striping \series default . \end_layout \begin_layout Standard It is only formulated in terms of \family typewriter BigCluster \family default and random replication for didactic reasons, because in the context of this architecture guide we need to compare with \family typewriter LocalSharding \family default . \end_layout \begin_layout Standard For the sake of simplicity, the following more detailed model is based on the following assumptions: \end_layout \begin_layout Itemize We are looking at \series bold storage node \series default failures only. As observed from practice, this is the most important failure granularity for causing incidents. \end_layout \begin_layout Itemize Disk failures are regarded as already solved (e.g. by local RAID-6 or by the well-known compensation mechanisms of big clusters). Only in case they don't work, they are mapped to node failures, and are already included in the probability of storage node failures. 
\end_layout \begin_layout Itemize We only look at \series bold data replication \series default with a redundancy degree of a relatively small \begin_inset Formula $k$ \end_inset . CRC methods are not modeled across storage nodes, but may be present \emph on internally \emph default at some storage nodes, e.g. RAID-5 or RAID-6 or similar methods, or may be present internally in some hardware devices, like SSDs or HDDs. Notice that \emph on distributed \emph default CRC methods generally involve very high overhead, and won't work in realtime across long distances (geo-redundancy). \end_layout \begin_layout Itemize We restrict ourselves to temporary / \series bold transient \series default failures, without regarding permanent data loss. Otherwise, the following differences between local-storage sharding architectures and big clusters would become even worse. When losing some physical storage nodes forever in a big cluster, it is typically anything but easy to determine which data of which application instances / customers have been affected, and which will need a restore from backup. \end_layout \begin_layout Itemize Storage network failures (parts, or as a whole) are ignored. Otherwise a fair comparison between the architectures would become difficult. If they were taken into account, the advantages of \family typewriter LocalSharding \family default would become even bigger. \end_layout \begin_layout Itemize We assume that the storage network (when present) forms no bottleneck. Network implementations like TCP/IP versus Infiniband or similar are thus ignored. \end_layout \begin_layout Itemize Software failures / bugs are also ignored \begin_inset Foot status open \begin_layout Plain Layout When assuming that the probability of bugs is increased by increased architectural complexity, a \family typewriter LocalSharding \family default model would likely win here also. 
However, such an assumption is difficult to justify, and might be wrong, depending on many (unknown) factors. \end_layout \end_inset . We are only comparing \emph on architectures \emph default here, not their various implementations (see \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Itemize The x axis shows the number of basic storage units \begin_inset Formula $n=x$ \end_inset from an \emph on application \emph default perspective, meaning \begin_inset Quotes eld \end_inset usable storage \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset net amount of storage \begin_inset Quotes erd \end_inset . For simplicity of the model, one basic application storage unit equals the total disk space provided by one physical storage node in the special case of \begin_inset Formula $k=1$ \end_inset replicas. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Stated simply, this means that there is exactly 1 LV = 1 PV per each application unit present at the x axis. So we have a total of exactly \begin_inset Formula $x$ \end_inset LVs. Of course, you might create a more elaborate model by introduction of some constant \begin_inset Formula $l\geq1$ \end_inset for a grand total of \begin_inset Formula $l\cdot x$ \end_inset LVs on top of \begin_inset Formula $x=n$ \end_inset PVs, but we don't want to complexify our model unnecessarily. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Attention! when increasing the number of replicas \begin_inset Formula $k$ \end_inset , the total number of storage nodes needs to be \series bold increased accordingly \series default . 
Typically, you will need to deploy \begin_inset Formula $k\cdot n$ \end_inset physical storage nodes in order to get \begin_inset Formula $n$ \end_inset net storage units from a user's perspective. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Attention! \begin_inset space ~ \end_inset \begin_inset Formula $k$ \end_inset has a strong influence at the \series bold price tag \series default of any of the competing architectures. You cannot assume an \begin_inset Quotes eld \end_inset infinite amount of money \begin_inset Quotes erd \end_inset . Therefore, only relatively small \begin_inset Formula $k$ \end_inset are bearable for business cases. \end_layout \begin_layout Itemize As already stated, we assume that the number of application instances is linearly scaling with \begin_inset Formula $n$ \end_inset . For simplicity, we assume that the number of applications running on the whole pool is \emph on exactly \emph default \begin_inset Formula $n$ \end_inset . Of course, you might also introduce some \emph on coupling constant \emph default here, but don't complexify the model unnecessarily. \end_layout \begin_layout Itemize We assume that the storage nodes are (almost completely) filled with data (sectors with RAID, and/or objects with \family typewriter BigCluster \family default ). Otherwise, the game would be pointless on empty clusters / shards. \end_layout \begin_layout Itemize We assume that the number of sectors / objects per storage node is \begin_inset Quotes eld \end_inset very large \begin_inset Quotes erd \end_inset . Some examples: a logical volume of 4 TB has 1,000,000,000 sectors or objects, each 4 KB in size. A physical storage node providing 40 TB of storage will then provide 10 billion sectors / objects. 
\end_layout \begin_layout Itemize For the \family typewriter BigCluster \family default architecture, we assume that all objects are always distributed to \begin_inset Formula $O(n)$ \end_inset nodes. We will later discuss some variants where it is distributed to \emph on fewer \emph default nodes. This assumption is only for explaining the \series bold principal behaviour of data distribution / striping \series default , and also for one of its variants called \series bold random replication \series default . For simplicity of the model, we assume a distribution via a \emph on uniform \emph default hash function. In general, the principal behaviour would also work for many other distribution functions, such as RAID striping, or even certain non-uniform hash functions over \begin_inset Formula $O(n)$ \end_inset nodes. As discussed later, totally different hash functions (e.g. distributing only to a constant number of nodes) would no longer model a \family typewriter BigCluster \family default architecture in our sense. \begin_inset Newline newline \end_inset In the below example, we assume a uniform object distribution to \emph on exactly \emph default \begin_inset Formula $n$ \end_inset nodes. 
Notice that any other \begin_inset Formula $n'=O(n)$ \end_inset with \begin_inset Formula $n' \begin_inset Text \begin_layout Plain Layout LocalSharding \size tiny (DRBDorMARS) \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout A up \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout A down \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout B up \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 0 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 1 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout B down \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 1 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 2 \end_layout \end_inset \end_inset \begin_inset ERT status open \begin_layout Plain Layout \backslash hfill \end_layout \end_inset \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout BigCluster \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout A up \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout A down \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout B up \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 0 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 2 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout B down \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 2 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 2 \end_layout \end_inset \end_inset \begin_inset ERT status open \begin_layout Plain Layout \backslash hfill \end_layout \end_inset \begin_inset space ~ \end_inset \end_layout \begin_layout Standard \noindent What is the heart of the difference? 
While a single node failure at LocalSharding (DRBDorMARS) will tear down only the local application, the teardown produced at BigCluster will spread to \emph on all \emph default of the \begin_inset Formula $n=2$ \end_inset application units, because of the uniform hashing and because we have only \begin_inset Formula $k=1$ \end_inset replica. \end_layout \begin_layout Standard Would it help to increase both \begin_inset Formula $n$ \end_inset and \begin_inset Formula $k$ \end_inset to larger values? \end_layout \begin_layout Standard Let us first stay at \begin_inset Formula $k=1$ \end_inset , looking at the behaviour when \begin_inset Formula $n\rightarrow\infty$ \end_inset . The generalization to bigger redundancy degrees \begin_inset Formula $k$ \end_inset will follow later. \end_layout \begin_layout Standard In the following graphics, the thick red line shows the behaviour for \begin_inset Formula $k=1$ \end_inset PlainServers (which is the same as \begin_inset Formula $k=1$ \end_inset DRBDorMARS) with increasing number of storage units \begin_inset Formula $n,$ \end_inset ranging from 1 to 10,000 storage units = number of servers for \begin_inset Formula $k=1$ \end_inset . Higher values of \begin_inset Formula $k\in[1,4]$ \end_inset are also displayed in different colors, but we will discuss them later. All lines corresponding to the same \begin_inset Formula $k$ \end_inset are drawn in the same color. Notice that both the x and y axes are in logscale: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/SERVICE_Comparison_of_Reversible_StorageNode_Failures.pdf lyxscale 200 width 100col% \end_inset \end_layout \begin_layout Standard \noindent First, we look at the red lines, corresponding to \begin_inset Formula $k=1$ \end_inset . The behaviour of the thick red line should be rather clear in double logscale: with increasing number of servers at the x axis, the total downtime y is also increasing. 
This forms a straight line in double logscale, where the slope is 1 (proportional to \begin_inset Formula $n$ \end_inset ), and the distances between the start of the other colored lines are multiples of \begin_inset Formula $1/p$ \end_inset for the given incident probability \begin_inset Formula $p$ \end_inset . \end_layout \begin_layout Standard Next, we are looking at the thin solid red line for \family typewriter BigCluster \family default \begin_inset Formula $k=1$ \end_inset . Why is it converging against the dotted grey line around \begin_inset Formula $n=10000$ \end_inset ? \end_layout \begin_layout Standard \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset At \begin_inset Formula $n\geq10000$ \end_inset servers, there is a \begin_inset Quotes eld \end_inset permanent incident \begin_inset Quotes erd \end_inset . In statistical average, there is approximately \emph on always \emph default some server down. Due to \begin_inset Formula $k=1$ \end_inset replica, the whole cluster will then be down from a user's perspective. The thin dotted grey line denotes the total number of operation hours to be executed for each \begin_inset Formula $n$ \end_inset , so this is the limes line we are converging against for big enough \begin_inset Formula $n$ \end_inset . \end_layout \begin_layout Standard This does not look nice from a user's perspective. Can we heal the problem by deploying more replicas \begin_inset Formula $k$ \end_inset ? \end_layout \begin_layout Standard Let us look at the green solid lines, corresponding to \begin_inset Formula $k=2$ \end_inset replicas. Why is the thin green BigCluster line also converging against the same dotted limes? And why is this happening around the same point, around \begin_inset Formula $n\approx10000$ \end_inset ? 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset When you want to operate \begin_inset Formula $n=10000$ \end_inset application instances with a replication degree of \begin_inset Formula $k=2$ \end_inset replicas, then you will need to deploy \begin_inset Formula $k\cdot n=20000$ \end_inset storage servers. When you have 20000 storage servers, in statistical average about \begin_inset Formula $2$ \end_inset of them will be down at the same time. When \begin_inset Formula $k=2$ \end_inset servers are down at the same time, again the whole cluster will be down from a user's perspective. Thus the green line is also converging against the grey dotted limes line, roughly also around \begin_inset Formula $n\approx10000$ \end_inset . \end_layout \begin_layout Standard Why is the green thicker DRBDorMARS line much better? \end_layout \begin_layout Standard In double logscale plot, it forms a \emph on parallel \emph default line to the corresponding red line. The distance is conforming to \begin_inset Formula $1/p$ \end_inset . This means that the incident probability for hitting \emph on both \emph default members of the \emph on same \emph default shard is \emph on improved \emph default by a factor of 10,000. \end_layout \begin_layout Standard Finally, we look at all the other solid lines in any color. All the thin solid \family typewriter BigCluster \family default lines are converging against the same limes line, regardless of replication degree \begin_inset Formula $k$ \end_inset , and around the same \begin_inset Formula $n\approx10000$ \end_inset . Why is this the case? 
\end_layout \begin_layout Standard Because our BigCluster model as defined above will distribute \emph on all \emph default objects to \emph on all \emph default servers uniformly, there will almost always \emph on exist \emph default some objects for which no replica is available at almost any given point in time. This means, you will almost always have a \series bold permanent incident \series default involving the same number of nodes as your replication degree \begin_inset Formula $k$ \end_inset , and in turn \emph on some \emph default of your objects will not be accessible at all. This means, at around \begin_inset Formula $x=10,000$ \end_inset application units you will lose almost any advantage from increasing the number of replicas. Adding more replicas will no longer help at \begin_inset Formula $x\geq10,000$ \end_inset application units. \end_layout \begin_layout Standard Notice that the \emph on solid \emph default lines are showing the probability of \emph on some \emph default incident, disregarding the \series bold size of the incident \series default . \end_layout \begin_layout Standard What about the \emph on dashed \emph default lines showing much better behaviour for BigCluster? \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Under some further preconditions, it would be possible to argue with the \emph on size \emph default of incidents. However, now a big fat warning. \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Size-weighted incident probabilities \end_layout \end_inset When you are \series bold responsible \series default for operations of \series bold thousands of servers \series default , you should be very conscious about preconditions for size-weighted downtime probabilities (dashed lines). 
Otherwise you could risk both the health of your business, and your career. \end_layout \end_inset \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Some preconditions for size-weighted incident probabilities \end_layout \end_inset In short: \end_layout \begin_layout Itemize When your application, e.g. a smartphone app, consists of accessing only 1 object at all during a reasonably long timeframe (say once per day), you can safely \series bold assume that there is no interdependency \series default between all of your objects. In addition, you have to assume (and you should check) that your cluster operating software as a whole does not introduce any further \series bold hidden / internal \begin_inset Foot status open \begin_layout Plain Layout Several distributed filesystems are separating their metadata from application data. Advocates are selling this as an advantage. However, in terms of \series bold reliability \series default this is clearly a \series bold disadvantage \series default . It increases the \emph on breakdown surface \emph default . Some distributed filesystems are even \emph on centralizing \emph default their metadata, sometimes via an ordinary database system, creating a SPOF = Single Point Of Failure. In case of inconsistencies between data and metadata, e.g. resulting from an incident or from a software bug, you will need the equivalent of a \series bold distributed \family typewriter fsck \family default \series default . Suchalike can easily turn into \series bold data loss \series default and other nightmares, such as node failures during the consistency check, for example when your hardware is flaky and produces intermittent errors. \end_layout \end_inset interdependencies \series default . 
Only in this case, and only then, you can take the dashed lines arguing with the number of inaccessible objects instead of with the number of distorted application units. \end_layout \begin_layout Itemize Whenever your application uses \series bold bigger structured logical objects \series default , such as filesystems or block devices (cf. section \begin_inset CommandInset ref LatexCommand nameref reference "par:Negative-Example:-object" plural "false" caps "false" noprefix "false" \end_inset ), and/or whole VMs / containers requiring \series bold strict consistency \series default , then you will get \series bold interdependent objects \series default at your big cluster storage layer. \begin_inset Newline newline \end_inset Practical example: experienced sysadmins will confirm that even a data loss rate of only 1/1,000,000 of blocks in a classical Linux filesystem like \family typewriter xfs \family default or \family typewriter ext4 \family default will likely imply the need of an offline filesystem check ( \family typewriter fsck \family default ), which is a major incident for the affected filesystem instance. \begin_inset Newline newline \end_inset Theoretical explanation: servers are running for a very long time, and filesystems are typically also mounted for a long time. Notice that the probability of hitting any vital filesystem data roughly equals the probability of hitting any other data. Sooner or later, any defective sector in the metadata structures or in freespace management etc. will stop your whole filesystem, and in turn will stop your application instance(s) running on top of it. \begin_inset Newline newline \end_inset Similar arguments hold for transient failures: most classical filesystems are not constructed for compensation of hanging IO, typically leading to \series bold system hangs \series default . 
\end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Blindly taking the dashed lines will expose you to a high \series bold risk \series default of error. Practical experience shows that there are often \series bold hidden dependencies \series default in many applications, often also at application level. You cannot necessarily see them when inspecting their data structures! You will only notice some of them by analyzing their \series bold runtime behaviour \series default , e.g. with tools like \family typewriter strace \family default . Notice that in general the runtime behaviour of an arbitrary program is \series bold undecidable \series default . Be cautious when drawing assumptions out of thin air! \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Conversely, the assumption that \emph on any \emph default inaccessible object will halt your application, might be too strong for \emph on some \emph default use cases. Therefore, some practical behaviour may be in between the solid thin lines and the dashed lines of some given color. Be extremely careful when constructing such an intermediate case. Remember that the plot is in logscale, where constant factors will not make a huge difference. The above example of a loss rate of 1/1,000,000 of sectors in a classical filesystem should not be extended to lower values like 1/1,000,000,000 without knowing exactly how the filesystem works, and how it will react \emph on in detail \emph default . The grey zone between the extreme cases thin solid vs dashed is a \series bold dangerous zone \series default ! 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset As a manager, if you want to stay on the \series bold safe side \series default , simply obey the fundamental law as explained in the next section: \end_layout \end_inset \end_layout \begin_layout Subsection Optimum Reliability from Architecture \begin_inset CommandInset label LatexCommand label name "subsec:Optimum-Reliability-from" \end_inset \end_layout \begin_layout Standard Another argument could be: don't distribute the BigCluster objects to exactly \begin_inset Formula $n$ \end_inset nodes, but to fewer nodes. Would the result be better than DRBDorMARS LocalSharding? \end_layout \begin_layout Standard Actually, several BigCluster implementations are doing similar measures, in order to work around the problems analyzed here. There are various terms for suchalike measures, like copysets, spread factors, buckets, etc. \end_layout \begin_layout Standard When distributing to \begin_inset Formula $O(k')$ \end_inset nodes with some constant \begin_inset Formula $k'$ \end_inset , we have no longer a BigCluster architecture, but a mixed BigClusterSharding form in our terminology. \end_layout \begin_layout Standard As can be generalized from the above tables, the reliability of \series bold any \series default BigCluster on \begin_inset Formula $k'>k$ \end_inset nodes is \series bold always \series default worse than that of LocalSharding on exactly \begin_inset Formula $k$ \end_inset nodes, where \begin_inset Formula $k$ \end_inset is also the redundancy degree. 
In general: \end_layout \begin_layout Quote \series bold \size large The LocalSharding model is the optimum model for reliability of operation, compared to any other model truly distributing its data and operations over truly more nodes, like RemoteSharding or BigClusterSharding or BigCluster does. \end_layout \begin_layout Standard There exists no better model because shards consisting of exactly \begin_inset Formula $k$ \end_inset nodes where \begin_inset Formula $k$ \end_inset is the redundancy degree are already the \emph on smallest possible shards \emph default under the assumptions of section \begin_inset CommandInset ref LatexCommand ref reference "sub:Detailed-explanation" \end_inset . Any other model truly involving \begin_inset Formula $k'>k$ \end_inset nodes for distribution of objects at any shard is \series bold always \series default worse in the dimension of reliability. Thus the above sentence follows by induction. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout The above sentence is formulating a \series bold fundamental law of storage systems \series default . An intuitive formulation for humans: \end_layout \begin_layout Quote \series bold \size large Spread your per-application data to as few nodes as possible. \end_layout \begin_layout Plain Layout This includes unnecessary spreading between dedicated client and server machines, in place of local storage. Thus \family typewriter LocalSharding \family default is the best architectural model. \end_layout \end_inset \end_layout \begin_layout Standard \noindent This is intuitive: the more nodes are involved for storing the \emph on same \emph default data belonging to the \emph on same \emph default application instance (i.e. belonging to the same LV), the higher the \series bold risk \series default that \emph on any \emph default of them can fail. 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Consequence: the \series bold \emph on concept \emph default of random replication \series default \begin_inset Foot status open \begin_layout Plain Layout A very picky argument might be: random distribution could be viewed as \emph on orthogonal \emph default to random replication, by separating the concept \begin_inset Quotes eld \end_inset distribution \begin_inset Quotes erd \end_inset from the concept \begin_inset Quotes eld \end_inset replication \begin_inset Quotes erd \end_inset . Then the above sentence should be re-formulated, using \begin_inset Quotes eld \end_inset random distribution \begin_inset Quotes erd \end_inset instead. However notice that \emph on random \emph default replication + distribution on exactly \begin_inset Formula $n\cdot k$ \end_inset nodes would degenerate, since it no longer is really \begin_inset Quotes eld \end_inset random \begin_inset Quotes erd \end_inset , but only has the freedom degree of a \begin_inset Quotes eld \end_inset permutation \begin_inset Quotes erd \end_inset . \end_layout \end_inset tries to do the \emph on opposite \emph default of this, by its very nature. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent Thus the \emph on concept \emph default of \series bold random replication does not work as expected \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset This does not imply that random replication does not generally work at all. Section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset mentions a few use cases where it appears to work in practice. 
However, after \series bold investing a lot \series default of effort / energy / money into a very complicated architecture and several implementations, the outcome is \series bold worse = non-optimal \series default in the dimension of reliability. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset There exist some \emph on workarounds \emph default as discussed in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Similarities-and-differences" plural "false" caps "false" noprefix "false" \end_inset . These can only patch the most urgent architectural problems, such that operation remains \emph on bearable \emph default in practice. They cannot fix the \series bold Dijkstra regression overhead \series default explained in section \begin_inset CommandInset ref LatexCommand nameref reference "par:Negative-Example:-object" plural "false" caps "false" noprefix "false" \end_inset . The above plot explains why even the workarounds are \series bold far from optimal \series default for a given fixed \begin_inset Foot status open \begin_layout Plain Layout As explained in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Architecture" plural "false" caps "false" noprefix "false" \end_inset , several \family typewriter BigCluster \family default best practices are typically requiring \begin_inset Formula $k=3$ \end_inset replicas. Some advocates have taken this for granted. For a \series bold fair comparison \series default with Sharding, they will need to compare with \begin_inset Formula $k=3$ \end_inset LV replicas. \end_layout \end_inset redundancy degree \begin_inset Formula $k$ \end_inset . 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Summary from a management viewpoint \end_layout \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Under comparable conditions for big installations, random replication is requiring \series bold more investment \series default than Sharding (e.g. more client/server hardware and an \begin_inset Formula $O(n^{2})$ \end_inset realtime storage network), in order to get a \series bold \emph on worse result \series default \emph default in the \series bold risk dimension \series default . \end_layout \end_inset \end_layout \begin_layout Subsection Error Propagation to Client Mountpoints \begin_inset CommandInset label LatexCommand label name "subsec:Error-Propagation-to" \end_inset \end_layout \begin_layout Standard This section deals with a \emph on pathological \emph default setup. Best practice is to avoid such pathologies. \end_layout \begin_layout Standard The following is only applicable when \series bold filesystems \series default or whole \series bold object pools \series default (buckets) are exported over a storage network, in order to be \series bold mounted \series default in parallel at \begin_inset Formula $O(n)$ \end_inset mountpoints \emph on each \emph default . \end_layout \begin_layout Standard In other words: somebody is trying to make \emph on all \emph default server data available at \emph on all \emph default clients. In spirit, this is also some BigCluster-like \series bold way of thinking \series default . It just relates to the filesystem layer, cf. section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . 
\end_layout \begin_layout Standard In such a scenario, any problem / incident inside of your storage pool for the filesystem instances will be spread to \begin_inset Formula $O(n)$ \end_inset clients, leading to an increase of the incident size by a factor of \begin_inset Formula $O(n)$ \end_inset when measured in \series bold number of affected mountpoints \series default . Notice that this is different from the number of clients. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Notice the \series bold slopes \series default in the following plot. Some are corresponding to \begin_inset Formula $n^{2},$ \end_inset and thus are even worse than in the previous plot: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/MOUNTPOINTS_Comparison_of_Reversible_StorageNode_Failures.pdf lyxscale 200 width 100col% \end_inset \end_layout \begin_layout Standard \noindent As a result, we now have a total of \begin_inset Formula $O(n^{2})$ \end_inset mountpoints = our new basic application units \begin_inset Foot status open \begin_layout Plain Layout If you like, please create another mathematical model in terms of number of clients, instead of the number of mountpoints. Though the plot curves will be different, and certainly will explain an interesting behaviour, the management conclusions will not change too much. \end_layout \end_inset . 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset The problem is much worse than explained in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset , or in \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset where a disaster already occurred at \begin_inset Formula $n=6$ \end_inset . Suchalike \begin_inset Formula $O(n^{2})$ \end_inset architectures are simply \series bold hazardous \series default . Thus a clear warning: don't try to build systems in such a way. \end_layout \begin_layout Standard Notice: DRBD or MARS are traditionally used for running the application on the same box as the storage. Thus they are not vulnerable to these kinds of failure propagation over network. Even with traditional iSCSI exports over DRBD or MARS, you won't have suchalike problems. Your only chance to increase the error propagation are \begin_inset Formula $O(n)$ \end_inset NFS or \family typewriter glusterfs \family default exports to \begin_inset Formula $O(n)$ \end_inset clients leading to a total number of \begin_inset Formula $O(n^{2})$ \end_inset mountpoints, or similar setups. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Clear advice \end_layout \end_inset Don't use \begin_inset Formula $O(n^{2})$ \end_inset mountpoints in total. It's a very bad idea. \end_layout \end_inset \end_layout \begin_layout Subsection Similarities and Differences to Copysets \begin_inset CommandInset label LatexCommand label name "subsec:Similarities-and-differences" \end_inset \end_layout \begin_layout Standard This section is mostly of academic interest. 
You can skip it when looking for practical advice. \end_layout \begin_layout Standard The USENIX paper about copysets (see \begin_inset Flex URL status open \begin_layout Plain Layout https://www.usenix.org/system/files/conference/atc13/atc13-cidon.pdf \end_layout \end_inset ) relates to our analysis of \family typewriter BigCluster \family default vs \family typewriter Sharding \family default in the following way: \end_layout \begin_layout Paragraph Similarities \end_layout \begin_layout Standard Both are concluding: the concept of Random Replication of the storage data to a large number of machines will reduce reliability. When choosing too big sets of storage machines, then the storage system as a whole will become practically unusable. This is common ground between the USENIX paper and the analysis from section \begin_inset CommandInset ref LatexCommand nameref reference "sub:Detailed-explanation" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Paragraph Differences \end_layout \begin_layout Standard The USENIX paper and many other Cloud Storage approaches are \emph on presuming \emph default that there exists a storage network, allowing real-time distribution of replicas over this kind of network. \end_layout \begin_layout Standard In contrast, the Sharding Approach to Cloud Storage tries to \emph on avoid \emph default real-time storage networks \emph on as much as possible \emph default . Notice that RemoteSharding and further variants (including future improvements) do \emph on not \emph default preclude it, but are trying to \emph on avoid \emph default real-time storage network traffic. Instead, the load-balancing problem is addressed via \series bold background data migration \series default . 
\end_layout \begin_layout Standard This changes the \emph on temporal granularity \emph default of data access: while BigCluster is transferring \emph on each \emph default IO request over the storage network in \emph on realtime \emph default , nothing is transferred over an external network at LocalSharding, provided that no migration is necessary. Typically, migrations are a \series bold rare exception \series default . Normally, the data is already \series bold close to the consumer \series default . Only in rare situations when migration is needed, local IO transfers are \emph on shifted over \emph default to external migration processes. The outcome of a successful migration is that local IO is then sufficient again. \end_layout \begin_layout Standard In essence, Football is an \series bold optimizer for data proximity \series default : always try to keep the data as close \begin_inset Foot status open \begin_layout Plain Layout When the many local SAS busses are also viewed as a network, and when these are logically united with the replication network to a bigger \emph on logical \emph default network which is \emph on heterogeneous \emph default at physical level: Football does nothing else but trying to \series bold offload \series default all IO requests to the local SAS networks, instead of overloading the wide-area IP network. In essence, this is a specialized traffic scheduling strategy for a two-level network. \end_layout \end_inset to the consumers as possible. \end_layout \begin_layout Standard In detail, there are some more differences to the USENIX paper. Some examples: \end_layout \begin_layout Itemize Terminology: the scatter width \begin_inset Formula $S$ \end_inset is defined (see page 39 of the paper) as: each node's data is split \emph on uniformly \emph default across a group of \begin_inset Formula $S$ \end_inset \emph on other \emph default nodes. 
In difference, we neither assume uniformity, nor do we require the data to be distributed to \emph on other \emph default nodes. By using the term \begin_inset Quotes eld \end_inset other \begin_inset Quotes erd \end_inset , the USENIX paper (as well as many other BigCluster approaches) are probably presuming something like a distinction between \begin_inset Quotes eld \end_inset client \begin_inset Quotes erd \end_inset and \begin_inset Quotes eld \end_inset server \begin_inset Quotes erd \end_inset machines: while data processing is done on a \begin_inset Quotes eld \end_inset client machine \begin_inset Quotes erd \end_inset , data storage is on a \begin_inset Quotes eld \end_inset server machine \begin_inset Quotes erd \end_inset . \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In contrast, MARS uses the client-server paradigm at a different granularity: each machine can act in client role and/or in server role \emph on at the same time \emph default , and \emph on individually \emph default for each LV. Thus it is possible to use local storage. \end_layout \begin_layout Itemize We don't disallow conventional network-centric client-server machines in variants like \family typewriter RemoteSharding \family default or \family typewriter FlexibleSharding \family default and so on, but we gave some arguments why we are trying to \emph on avoid \emph default this. \end_layout \begin_layout Itemize It seems that some definitions in the USENIX paper may implicitly relate to \begin_inset Quotes eld \end_inset each chunk \begin_inset Quotes erd \end_inset . In contrast, the Sharding Approach typically relates to LVs = Logical Volumes. Probably, LVs could be viewed as a special case of \begin_inset Quotes eld \end_inset chunk \begin_inset Quotes erd \end_inset , e.g. by minimizing the number of chunks in a system. 
However notice: there exist definitions of \begin_inset Quotes eld \end_inset chunk \begin_inset Quotes erd \end_inset where it is the basic transfer unit. An LV has the fundamental property that small-granularity \series bold updates in place \series default (at any offset inside the LV) can be executed. \end_layout \begin_layout Itemize Notice: we do not preclude further fine-grained distribution of LV data at lower levels, such as at LVM level and/or below, but this is something which should be \emph on avoided \emph default if not absolutely necessary (see \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Optimum-Reliability-from" plural "false" caps "false" noprefix "false" \end_inset ). Preferred method in typical practical use cases: some storage servers may have some spare RAID slots to be populated later, by resizing the PVs = Physical Volumes before resizing LVs. Another alternative is dynamic runtime extension of SAS busses, by addition of external enclosures. \end_layout \begin_layout Itemize Notice that a typical local RAID system \emph on is also \emph default a Distributed System, according to some reasonable definition. Typical RAID implementations just involve SAS cables instead of Ethernet cables or Infiniband cables. Notice that this also applies to many \begin_inset Quotes eld \end_inset Commodity Hardware \begin_inset Quotes erd \end_inset approaches, like Ceph storage nodes driving dozens of local HDDs connected over SAS or SATA. The main difference is just that instead of a hardware RAID controller, a hardware HBA = Host Bus Adapter is used instead. Instead of Ethernet switches, SAS multiplexers in backplanes are used. Anyway, this forms a locally distributed sub-system. 
\end_layout \begin_layout Itemize The USENIX paper needs to treat the following parameters as more or less fixed (or only slowly changeable) \series bold constants \series default , given by the system designer: the replication degree \begin_inset Formula $R$ \end_inset , and the scatter width \begin_inset Formula $S$ \end_inset . In contrast, the replication degree \begin_inset Formula $k$ \end_inset of our Sharding Approach is not necessarily firmly given by the system, but can be \series bold dynamically changed \series default at runtime at per-LV granularity. For example, during background migration via MARS the command \family typewriter marsadm join-resource \family default is used for dynamically creating additional per-LV replicas. However notice: this freedom is limited by the total number of deployed hardware nodes. If you want \begin_inset Formula $k=3$ \end_inset replicas at the \emph on whole \emph default pool, then you will need to (dynamically) deploy at least about \begin_inset Formula $k*x$ \end_inset nodes in general. \end_layout \begin_layout Itemize The USENIX paper defines its copysets on a per-chunk basis. Similarly to before, we might transfer this definition to a Sharding Approach by relating it to a per-LV basis. As a side effect, a copyset can then trivially become identical to \begin_inset Formula $S$ \end_inset when the definition of \begin_inset Formula $S$ \end_inset is also changed to a per-LV basis, analogously. In the Sharding Approach, a distinction is not absolutely necessary, while the USENIX paper has to invest some effort into clarifying the relationship between \begin_inset Formula $S$ \end_inset and copysets as defined on a BigCluster model.
\end_layout \begin_layout Itemize Neglecting the mentioned differences, we see our typical use case (LocalSharding) roughly equivalent to \begin_inset Formula $S=R$ \end_inset in the terminology of the USENIX paper, or to \begin_inset Formula $S=k$ \end_inset (our number of replicas) in our terminology. \end_layout \begin_layout Itemize This means: LocalSharding tries to \emph on minimize \emph default the \emph on size \emph default of \begin_inset Formula $S$ \end_inset for any given per-LV \begin_inset Formula $k$ \end_inset , which will lead to the best possible reliability (under the conditions described in section \begin_inset CommandInset ref LatexCommand nameref reference "sub:Detailed-explanation" plural "false" caps "false" noprefix "false" \end_inset ) as has been shown in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Optimum-Reliability-from" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Another parallel comes to mind: classical RAID striping introduced the concept of \series bold RAID sets \series default decades ago. Similarly to random replication, RAID striping is motivated by \emph on load distribution \emph default . Similarly to our previous discussion, this induces some \series bold cost \series default . This is not only about RAID-0 vs RAID-10 by introduction of some more replicas \begin_inset Foot status open \begin_layout Plain Layout Random replication is more like RAID-01: first \emph on all \emph default the physical disks are striped, then replicas are created \emph on on top \emph default of it. Reversing this order would be more similar to RAID-10, and could lead to an improvement of random replication. However, this would contradict a basic idea of BigCluster, that you can add \emph on any \emph default number of storage nodes at any time.
Instead of adding an \emph on odd \emph default number of OSDs, each potentially of different size, now an \emph on even \emph default number needs to be added for \begin_inset Formula $k=2$ \end_inset replicas, or equal-sized triples for \begin_inset Formula $k=3,$ \end_inset etc. \end_layout \end_inset . It is a general problem caused by too high stripe spreading. When a single striped RAID set would grow too big, reliability would suffer too much. Thus multiple smaller RAID sets are traditionally used in place of a single big one \begin_inset Foot status open \begin_layout Plain Layout Practical example from experience: for RAID-60, a typical RAID-6 sub-set should not exceed 12 to 15 spindles. \end_layout \end_inset . This is somewhat similar to copysets, when taking the spread factor \begin_inset Formula $S$ \end_inset as analog to the RAID set size, by using objects in place of sector stripes, and a few other differences like using some well-known \emph on stripe distribution function \emph default in place of random replication. Compare with section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Optimum-Reliability-from" plural "false" caps "false" noprefix "false" \end_inset : RAID sets are just another example workaround for consequences from the fundamental law of storage systems. \end_layout \begin_layout Subsection Explanations from DSM and WorkingSet Theory \begin_inset CommandInset label LatexCommand label name "subsec:Explanations-from-DSM" \end_inset \end_layout \begin_layout Standard When looking for practical advice, just read the below example use cases, and skip the rest, which is mostly of academic interest. \end_layout \begin_layout Standard This section tries to explain the BigCluster incidents observed at some 1&1 Ionos daughter from a different perspective.
In the OS literature and community, DSM = Distributed Shared Memory and Denning's workingset theory from the 1960s are typically attributed to a different research area. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Example use cases for \family typewriter BigCluster \family default \series default \begin_inset CommandInset label LatexCommand label name "Example-use-cases-Bigcluster" \end_inset \end_layout \end_inset Personal discussions with some prominent promoters of Ceph found some informal agreements about some use cases where BigCluster appears to be well suited: \end_layout \begin_layout Itemize Large collections of audio / video files. These are never modified in place, but written once, and then \series bold \emph on streamed \series default \emph default . Thus it is possible to use relatively large object sizes, or even 1 video file = 1 object. Then streaming involves only a low number of objects at the same time, down to a per-application parallelism degree of typically only 1. \end_layout \begin_layout Itemize Measurement data like in CERN physics experiments, where often some \emph on streaming model \emph default is predominant. \end_layout \begin_layout Itemize Backups and long-term archives, when also accomplished via \emph on streaming \emph default . 
\end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Example problems for \family typewriter BigCluster \family default \series default \begin_inset CommandInset label LatexCommand label name "Example-problems-Bigcluster" \end_inset \end_layout \end_inset In contrast to this, here are some other use cases where BigCluster did not meet expectations of some people at 1&1 Ionos: \end_layout \begin_layout Itemize Virtual block devices involving \series bold strict consistency \series default on top of a very high number of small \begin_inset Quotes eld \end_inset unreliable \begin_inset Quotes erd \end_inset / eventually consistent objects. \end_layout \begin_layout Itemize CephFS with \series bold highly parallel random updates \series default to a huge number of files / inodes, also involving strict consistency in some places (e.g. concurrent metadata updates belonging to the same directory). \end_layout \end_inset \end_layout \begin_layout Standard \noindent Here is a \emph on first attempt \emph default to explain these behavioural observations from a more generalized viewpoint. The author is open for discussion, and will modify this part upon better understanding. \end_layout \begin_layout Standard For the following, you will need profound \begin_inset Foot status open \begin_layout Plain Layout In addition to standard Operating System text books like Silberschatz or Tanenbaum, you may need to consult some of the original work of further authors mentioned above. \end_layout \end_inset knowledge in Operating System Principles (aka Theory of Operating Systems). \end_layout \begin_layout Standard Ceph & co are apparently shining at use cases where the \emph on object paradigm \emph default is naturally well-suited for the \emph on application behaviour \emph default . 
\end_layout \begin_layout Standard Application behaviour has been studied in the 1970s. Theorists know that in general it is \emph on unpredictable \emph default due to Turing Completeness, but practical observations are revealing some frequent \emph on behavioural pattern \emph default s. Otherwise, caching would not be beneficial in practice. \end_layout \begin_layout Standard While Denning had studied and modelled application behaviour for typical drum storage devices of his era, later DSM people stumbled over similar problems: the \emph on frequency of access to needed data \emph default can grow much higher than the channel / transport capacities can \begin_inset Foot status open \begin_layout Plain Layout In general, this is unavoidable. In a \series bold storage pyramid \series default , the CPU is always able to access RAM pages with a much higher frequency than any (R)DMA transport can supply. \end_layout \end_inset provide. Denning and Saltzer coined a term for this: \series bold thrashing \series default . \end_layout \begin_layout Standard Thrashing means that more time is spent by \emph on fetching \emph default data than by \emph on working \emph default with it, because the transports are \emph on overloaded \emph default . As Denning observed, thrashing essentially means that the system becomes \emph on unusable by customers \emph default . Thrashing is a highly non-linear \series bold self-amplifying effect \series default , similar to traffic jams at highways: once it has started, it will worsen itself. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 4 status open \begin_layout Plain Layout Although some historic descriptions of thrashing are mentioning contemporary hardware devices like drum storage, the \emph on concept \emph default is very universal.
Thrashing can be transferred and \series bold generalized \series default to modern instances of \series bold storage pyramids \series default , and/or also to remote access over \series bold network bottlenecks \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent Saltzer found a workaround for his contemporary batch operating systems: limit the parallelism degree of concurrently running batch jobs. In his Multics project, this was also transferred to interactive systems, by limiting the swap-in parallelism degree of his contemporary segment swapping methods. Although this may sound counter-intuitive for modern readers: by introduction of a certain type of \series bold artificial limitation \series default at or around the non-linear regression point, the \series bold user experience was \emph on improved \series default \emph default . \end_layout \begin_layout Standard Now comes a conclusion: when thrashing occurs in a modern BigCluster model for whatever reason, the self-amplification will be likely worse than in a LocalSharding model, due to the following reasons: \end_layout \begin_layout Itemize \series bold Overload propagation \series default : when some parts of the \begin_inset Formula $O(n^{2})$ \end_inset storage network are overloaded, other parts may also become affected in turn, due to sharing of network resources, such as cross-traffic lines. Once queueing has started somewhere, it is likely to worsen, and likely to induce further queueing at other parts of the shared network. The more other parts are affected transitively, the more parts will get overloaded. So the overload, once it has started somewhere, has a higher probability for \emph on spreading out \emph default even to parts which were not overloaded before (self-amplification at BigCluster level).
\end_layout \begin_layout Itemize Random replication of objects adds \emph on artificial randomness \emph default to the \series bold \emph on locality of reference \series default \emph default , as described by Denning. \end_layout \begin_layout Itemize Original DSM was trying to provide a strict or near-strict consistency model for application programmers. Later research then tried some weaker consistency models, without getting a final breakthrough for general use cases. BigCluster is similarly organized to DSM, but on slow \emph on remote storage \emph default instead of logically shared remote RAM over fast RDMA. Thus we can expect similar problems as observed by the DSM community, like \series bold single points of contention \series default , etc. These might become even worse once they have appeared. \end_layout \begin_layout Standard In a nutshell: \series bold system stability \series default under overload conditions, once they have started somewhere, is highly non-linear, and tends to spread \begin_inset Foot status open \begin_layout Plain Layout In the past, advocates of BigCluster have placed the argument that BigCluster can \emph on equally distribute \emph default the total application load onto \begin_inset Formula $O(n)$ \end_inset storage servers, so a single overloaded client will get better performance than in a sharding model. This argument contains the \emph on implicit assumption \emph default that load distribution is behaving \series bold linearly \series default , or close to that. However, Denning and Saltzer found that system reaction due to overload by workingset behaviour is \emph on extremely \emph default non-linear, and may \emph on completely \emph default tear down systems even when only \emph on slightly \emph default overloaded.
Although there may exist some areas where the assumption of linearity is correct and may lead to improvements by better load distribution, \begin_inset Quotes eld \end_inset unpredictable \begin_inset Quotes erd \end_inset behaviour due to self-amplification of overload at BigCluster level may result in the \emph on opposite \emph default . Denning has provided a mathematical model for this, which could probably be transferred to modern application behaviour. \end_layout \end_inset , and to self-amplify. \end_layout \begin_layout Standard In contrast, sharding models are not spreading any overload to other shards by definition. So the total availability from the viewpoint of the \emph on total \emph default set of customers is less vulnerable to impacts. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Risk characterization in a nutshell \end_layout \end_inset While BigCluster increases the risk of spread-out of overload and other stability problems similarly to a \series bold domino effect \series default , Sharding is restricting those risks by \series bold fencing \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In the above use cases where BigCluster is shining, overload is unlikely, since the \emph on parallelism of object access \emph default is limited. This is somewhat similar to Saltzer's historic workaround for thrashing. \emph on Streaming \emph default at application behaviour level will translate into streaming at the network layer. Classical TCP networks dealing with a relatively low number of high-throughput streaming connections are just \emph on constructed \emph default for dealing with packet loss, such as caused by overload, e.g.
by their \series bold congestion control \series default \begin_inset Foot status open \begin_layout Plain Layout Recommended reading: the papers from Sally Floyd. \end_layout \end_inset algorithms. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In contrast, an extremely high number of parallel short connections would be similar to a \begin_inset Quotes eld \end_inset SYN flood attack \begin_inset Quotes erd \end_inset , or similar to a classical UDP packet storm. It would allow for a much higher parallelism degree, but will be more vulnerable to packet loss / packet storm effects / etc, and more vulnerable to self-amplification. These application behaviour types are avoided in the above use case examples for BigCluster. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In addition, storing video files as immutable BLOBs will limit the \series bold randomness \series default of \emph on locality of references \emph default , while splitting into millions of very small objects may easily lead to an explosion of randomness by some orders of magnitude. \end_layout \begin_layout Section Performance Arguments from Architecture \begin_inset CommandInset label LatexCommand label name "sec:Performance-Arguments-from" \end_inset \end_layout \begin_layout Subsection Performance Penalties by Choice of Replication Layer \begin_inset CommandInset label LatexCommand label name "subsec:Performance-Penalties-Layer" \end_inset \end_layout \begin_layout Standard Some people think that replication is easily done at filesystem layer. There exist lots of cluster filesystems and other filesystem-layer solutions which claim to be able to replicate your data, sometimes even over long distances.
\end_layout \begin_layout Standard Trying to replicate several petabytes of data, or some billions of inodes, is however a much bigger challenge than many people can imagine. \end_layout \begin_layout Standard Choosing the wrong \series bold layer \series default (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Layering-Rules" plural "false" caps "false" noprefix "false" \end_inset ) for \series bold mass data replication \series default may get you into trouble. Layer selection is much more important than any load distribution argument as frequently heard from certain advocates. Here is an architectural-level (cf section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset ) explanation why replication at the block layer is more easy and less error prone: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/Layers.pdf width 100col% \end_inset \end_layout \begin_layout Standard \noindent The picture shows the main components of a standalone Unix / Linux system. It conforms to Dijkstra's layering rules explained in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Layering-Rules" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard In the late 1970s / early 1980s, a so-called \emph on Buffer Cache \emph default had been introduced into the architecture of Unix. Today's Linux has refined the concept to various internal caches such as the \series bold Page Cache \series default (for data) and the \series bold Dentry Cache \series default (for metadata lookup). \end_layout \begin_layout Standard All these caches serve one main purpose \begin_inset Foot status open \begin_layout Plain Layout Another important purpose is \series bold providing shared memory \series default . 
\end_layout \end_inset : they are reducing the load onto the storage by exploitation of fast RAM. A well-tuned cache can yield high cache hit ratios, typically 99%. In some cases (as observed in practice) even more than 99.9%. \end_layout \begin_layout Standard Now start distributing the system over long distances. There are potential cut points A and B and C \begin_inset Foot status open \begin_layout Plain Layout In theory, there is another cut point D by implementing a generically distributed cache. There exists some academic research on this, but practically usable enterprise-grade implementations are rare and not wide-spread. \end_layout \end_inset . \end_layout \begin_layout Standard Cut point A is application specific, and can have advantages because it has knowledge of the application. For example, replication of mail queues can be controlled much more fine-grained than at filesystem or block layer. \end_layout \begin_layout Standard Cut points B and C are \emph on generic \emph default , supporting a wide variety of applications, without altering them. Cutting at B means replication at filesystem layer. C means replication at block layer. \end_layout \begin_layout Standard When replicating at B, you will notice that the caches are \emph on below \emph default your cut point. Thus you will have to re-implement \series bold distributed caches \series default , and you will have to \series bold maintain cache coherence \series default . \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Caching can yield several \emph on orders of magnitude \emph default of performance.
\end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In contrast, frequently heard load distribution arguments can only re-distribute the already existing performance of your spindles, but cannot magically \begin_inset Quotes eld \end_inset create \begin_inset Quotes erd \end_inset new sources of performance out of thin air. \end_layout \end_inset \end_layout \begin_layout Standard \noindent On the contrary, load distribution over a storage network is \emph on costing \emph default some performance, by introduction of additional latencies and potential bottlenecks. \end_layout \begin_layout Standard When replicating at C, the Linux caches are \emph on above \emph default your cut point. Thus you will receive much less traffic at C, typically already reduced by a factor of 100, or even more. This is much more easy to cope with. \emph on Local \emph default caches and their SMP scaling properties can be implemented much more efficiently than distributed ones. You will also profit from \series bold journalling filesystems \series default like \family typewriter ext4 \family default or \family typewriter xfs \family default . In contrast, \emph on truly distributed \begin_inset Foot status open \begin_layout Plain Layout In this context, \begin_inset Quotes eld \end_inset truly \begin_inset Quotes erd \end_inset means that the POSIX semantics would be always guaranteed cluster-wide, and even in case of partial failures. In practice, some distributed filesystems like NFS don't even obey the POSIX standard \emph on locally \emph default on 1 standalone client. We know of projects which have \emph on failed \emph default right because of this. \end_layout \end_inset \emph default journalling is typically not available with distributed cluster filesystems.
\end_layout \begin_layout Standard A \emph on potential \emph default drawback of block layer replication is that you are typically limited to active-passive replication. An active-active operation is not impossible at block layer (see combinations of DRBD with \family typewriter ocfs2 \family default ), but less common, and less safe to operate. \end_layout \begin_layout Standard This limitation isn't necessarily caused by the choice of layer. It is simply caused by the \series bold laws of physics \series default : communication is always limited by the speed of light. A distributed filesystem is essentially nothing else but a persistent \series bold DSM = Distributed Shared Memory \series default . \end_layout \begin_layout Standard Some decades of research on DSM have shown that there exist applications / workloads where the DSM model is \emph on inferior \emph default to the direct communication paradigm. Even in short-distance / cluster scenarios. Long-distance DSM is extremely cumbersome. \end_layout \begin_layout Standard Therefore: you simply shouldn't try to solve \series bold long-distance communication needs \series default via communication over shared filesystems. Even simple producer-consumer scenarios (one-way communication) are less performant (e.g. when compared to plain TCP/IP) when it comes to distributed POSIX semantics. There is simply too much \series bold synchronisation overhead at metadata level \series default . \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout If you want mixed operations at different locations in parallel: split your data set into disjoint filesystem instances (or database / VM instances, etc). Then you should achieve the \series bold ability for butterfly \series default , see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset . 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent All you need is careful thought about the \emph on appropriate \emph default \emph on granularity \emph default of your data sets (such as well-chosen \emph on sets \emph default of user homedirectory subtrees, or database sets logically belonging together, etc). An example hierarchy of granularities is described in section \begin_inset CommandInset ref LatexCommand nameref reference "par:Positive-Example:-ShaHoLin" plural "false" caps "false" noprefix "false" \end_inset . Further hints can be found in sections \begin_inset CommandInset ref LatexCommand nameref reference "sec:Granularity-at-Architecture" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Variants-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Sharding (see section \begin_inset CommandInset ref LatexCommand nameref reference "par:Definition-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset ) implementations like ShaHoLin (see section \begin_inset CommandInset ref LatexCommand nameref reference "par:Positive-Example:-ShaHoLin" plural "false" caps "false" noprefix "false" \end_inset ) are essentially exploiting the scalability of SMP = Symmetric MultiProcessing, nowadays typically going into saturation around \begin_inset Formula $\approx100$ \end_inset hardware CPU threads for typical workloads, which is executed by \emph on hardware \emph default inside of your server enclosure. In contrast, DSM-like solutions are trying to distribute your application workload over longer distances, involving relatively slow system software instead of \series bold hardware acceleration \series default . Therefore, SMP is preferable over DSM wherever possible. 
\end_layout \begin_layout Standard Replication at filesystem level is often by single-file granularity. If you have several millions or even billions of inodes, you may easily find yourself in a snakepit. See also \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Conclusion \end_layout \end_inset \series bold Active-passive operation \series default over long distances (such as between continents) at \series bold block layer \series default is an \series bold \emph on advantage \series default \emph default . It keeps your staff from trying bad / almost impossible things, like DSM = Distributed Shared Memory over long distances. \end_layout \end_inset \end_layout \begin_layout Subsection Performance Tradeoffs from Load Distribution \begin_inset CommandInset label LatexCommand label name "subsec:Performance-Tradeoffs-from-Load-Distribution" \end_inset \end_layout \begin_layout Standard A frequent argument from BigCluster advocates is that random replication would provide better performance. This argument isn't wrong, but it does not hit the point. \end_layout \begin_layout Standard As analysed in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Similarities-and-differences" plural "false" caps "false" noprefix "false" \end_inset , load distribution isn't a unique concept bound to BigCluster / random replication. Load distribution has been used for decades at a variety of \series bold RAID striping \series default methods. \end_layout \begin_layout Standard RAID striping levels like RAID-0 or RAID-10 or RAID-60 have been known for decades, forming a mature technology.
Also known since the 1980s is that the size of a single striped RAID set must not grow too big, otherwise reliability will suffer too much. Larger RAID systems are therefore \series bold split \series default into multiple \series bold RAID sets \series default . \end_layout \begin_layout Standard This has some interesting parallels to the BigCluster reliability problems analyzed in section \begin_inset CommandInset ref LatexCommand nameref reference "sub:Detailed-explanation" plural "false" caps "false" noprefix "false" \end_inset , and some workarounds, e.g. as discussed in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Similarities-and-differences" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard Summary: both RAID striping and random replication methods are \series bold limited \series default by the fundamental law of storage systems, see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Optimum-Reliability-from" plural "false" caps "false" noprefix "false" \end_inset , in a similar way. \end_layout \begin_layout Standard A detailed performance comparison at architecture level between random replication of variable-sized objects and striping of block-level sectors is beyond the scope of this architecture guide. However, the following should be intuitively clear from section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Layering-Rules" plural "false" caps "false" noprefix "false" \end_inset and from Einstein's laws of the speed of light: \end_layout \begin_layout Quote Fine-grained load distribution over \series bold short distances \series default and/or at \series bold lower layers \series default has a \series bold bigger performance potential \series default than over longer distances and/or at higher layers.
\end_layout \begin_layout Standard In other words: local SAS busses are capable of realtime IO transfers over very short distances (enclosure-to-enclosure), while an expensive IP storage network isn't realtime (due to packet loss). SAS busses are \emph on constructed \emph default for dealing with requirements arising from RAID, and have been optimized for years / decades. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Advice for performance-critical workloads \end_layout \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Besides \emph on local \emph default SSDs, also consider some appropriate RAID striping at your (Local)Sharding storage boxes for performance-critical workloads. It is not only cheaper than BigCluster load distribution methods, but typically also more performant (on top of comparable technology and comparable dimensioni ng). Tradeoffs of various parameters and measurement methods for system architects are described at \begin_inset Flex URL status open \begin_layout Plain Layout http://blkreplay.org \end_layout \end_inset . \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset RAID-6 is much cheaper \begin_inset Foot status open \begin_layout Plain Layout Several OSDs are also using SAS or similar local IO busses, in order to drive a high number of spindles. Essentially, random replication is involving \emph on two \emph default different types of networks at the same time. This also explains why such a combination must necessarily induce some performance loss. \end_layout \end_inset than RAID-10, and can also provide some striping with respect to (random) reads. However, random writes are much slower. 
For read-intensive workloads, the striping behaviour of RAID-6 is often sufficient. A tool for comparison of different RAID setup alternatives can be found at \begin_inset Flex URL status open \begin_layout Plain Layout http://www.blkreplay.org \end_layout \end_inset . \end_layout \begin_layout Section Scalability Arguments from Architecture \begin_inset CommandInset label LatexCommand label name "sec:Scalability-Arguments-from" \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Importance of scalability \end_layout \end_inset Scalability is important for \series bold mass data \series default / mass production. It determines the technical limits of \series bold scaling effects \series default . Bad scalability can seriously \series bold limit the business \series default , and its resolution can produce high \series bold cost \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent Unfortunately, in some circles, seriously wrong habits have become established. I know of examples causing unnecessary problems and cost in the range of millions of €. \end_layout \begin_layout Standard Some people are talking about scalability by (1) looking at a relatively small \emph on example \emph default cluster \emph on implementation \emph default of their respective (pre-)chosen \emph on architecture \emph default having \begin_inset Formula $n$ \end_inset machines or \begin_inset Formula $n$ \end_inset network components or running \begin_inset Formula $n$ \end_inset application instances, and then (2) extrapolating its behaviour to bigger \begin_inset Formula $n$ \end_inset . They think if it runs with small \begin_inset Formula $n$ \end_inset , it will also run for bigger \begin_inset Formula $n$ \end_inset .
\end_layout \begin_layout Standard This way of thinking and acting is completely broken, and can endanger both companies and careers. \end_layout \begin_layout Standard This is not only because of confusion of \begin_inset Quotes eld \end_inset architecture \begin_inset Quotes erd \end_inset with \begin_inset Quotes eld \end_inset implementation \begin_inset Quotes erd \end_inset , cf section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset . It is also \series bold fundamentally broken \series default because it assumes some \begin_inset Quotes eld \end_inset linearity \begin_inset Quotes erd \end_inset in a field which is \series bold non-linear \emph on by definition \series default \emph default . \end_layout \begin_layout Standard If scalability would be linear, the term would not be useful at all, because there would be \emph on no limit \emph default . However, limits exist in practice, and the term \begin_inset Quotes eld \end_inset scalability \begin_inset Quotes erd \end_inset is a \series bold \emph on means \emph default for describing the behaviour at or around the limit \series default . \end_layout \begin_layout Standard Another \emph on incorrect \emph default way of ill-defining / ill-using the term \begin_inset Quotes eld \end_inset scalability \begin_inset Quotes erd \end_inset is looking at some relatively big \series bold \emph on example \series default \emph default cluster, which is working in practice for some \series bold particular use case \series default , and then concluding that it will also work in \series bold another use case \series default . Arguing with an \emph on example \emph default of a working system is wrong by construction. 
In general, examples can only be used for \series bold \emph on disproving \series default \emph default something, but never as a positive proof of concept \begin_inset Foot status open \begin_layout Plain Layout Unfortunately, the term PoC = Proof Of Concept is used wrongly in large parts of the industry. It should be termed PoI = Proof of Implementation, or VoI = Validation of Implementation or CoI = Check of Implementation instead. A concept can have multiple implementations, but only \emph on one \emph default of them has been actually checked in most cases. \end_layout \end_inset . \end_layout \begin_layout Standard Examples for suchalike examples: section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset mentions some use cases where \family typewriter BigCluster \family default architecture implementations via Ceph are shining. These example use cases are showing some commonalities, like relatively low performance demands at the storage, and relatively low IO parallelism degree \begin_inset Foot status open \begin_layout Plain Layout Example: many people are not aware that Apache is acting like a \series bold fork bomb \series default . When thousands of Apache processes are running in parallel, a parallelism of several thousands of IO requests \emph on may \emph default occasionally occur during \emph on peaks \emph default , although caches will serve them \emph on most \emph default of the time. Certain storage systems are reacting with \series bold instability \series default , sometimes even when \begin_inset Quotes eld \end_inset hammered \begin_inset Quotes erd \end_inset only once with a very short but massive overload peak. \end_layout \end_inset , and streaming-like \series bold access patterns \series default . However, it also mentions some \emph on other \emph default use cases, where it did \emph on not \emph default work as expected. 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Humans are \series bold selective \series default in their perception. Evolution has created this, for our protection against overload in the information flood. Unfortunately, looking only at some positive use case examples, while either not knowing or ignoring other counter-examples, can be dangerous. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset \emph on Every \emph default storage system on this globe has \series bold \emph on always \emph default some scalability limit \series default , somewhere. Even the internet has some limit. Scalability is \emph on always \emph default a \series bold non-linear \series default behaviour. In order to find the practical limit, you must \emph on reach \emph default it. \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/principle-scalability.fig width 80col% \end_inset \end_layout \begin_layout Standard \noindent Typically, the overall scalability behaviour can be divided into several \emph on zones \emph default . Only in the scaling zone, some near-linear behaviour can be expected. Next comes the saturation zone, where the effects of inherent system limits are already retarding progress. After exceeding the scalability limit, no further progress is happening. Upon overload, many systems are even reacting with a \emph on regression \emph default . \end_layout \begin_layout Standard Any serious study should consciously deal with \emph on all \emph default of these zones, possibly except the regression \begin_inset Foot status open \begin_layout Plain Layout Entering the regression zone might possibly lead to destruction of certain systems, or to other damages. Then it is acceptable to not enter it.
It would be \emph on honorable \emph default to mention any \series bold risks \series default associated with suchlike overload behaviour. \end_layout \end_inset zone. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset There exists no excuse for omission \begin_inset Foot status open \begin_layout Plain Layout Several years ago, I attended a presentation at an OpenSource conference. It was about scalability of a Java programming environment for some SMP machines. The presenter showed some nice graphics, however showing only the scaling zone. He publicly claimed \begin_inset Quotes eld \end_inset arbitrary scalability \begin_inset Quotes erd \end_inset . After the talk, I tried to catch him, in order to ask him in private whether he encountered some limit. His very short answer was that there is no limit, completely unwilling to talk with me at all, and quickly leaving the room. \end_layout \begin_layout Plain Layout Unfortunately, this \begin_inset Quotes eld \end_inset methodology \begin_inset Quotes erd \end_inset seemed to have been copied by more and more presenters. The problem is not only wrong claims. The problem is that managers and other decision makers can lose a lot of money when believing suchalike \series bold fake results \series default . \end_layout \end_inset of the limit. When the limit is \series bold unknown \series default , then you simply \series bold must not claim a certain scalability \series default ! \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Example use cases are principally insufficient for proving general scalability behaviour, as well as for comparing the scalability of architectures and/or of certain implementations. Examples can only be used for \emph on disproving \emph default scalability.
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Caution: when a particular \emph on implementation \emph default does not work as expected, this does not generally prove that the corresponding concept / architecture does not work at all. There may be \series bold bugs \series default and other \series bold sources of error \series default in the particular implementation, which just need to be \series bold \emph on fixed \series default \emph default . \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Required skill level for architects \end_layout \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset The \series bold suitability of architectures for certain use cases \series default needs to be checked separately. This is an expert task, requiring high levels of skills and experience. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Cross-checking by several experts may lead into systematical ill-designs by \series bold information bubbles \series default . Well-foundation of arguments, well-founded measurements on basis of solid methodology, etc, are much more important than number of votes! 
\end_layout \end_inset \end_layout \begin_layout Subsection Example Failures of Scalability \begin_inset CommandInset label LatexCommand label name "subsec:Example-Failures-of" \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Recommended reading \end_layout \end_inset The following example is a \series bold must read \series default not only for \series bold those responsible \series default , but also for system architects, and also for sysadmins. \end_layout \end_inset \end_layout \begin_layout Standard The numbers and some details are from my memory, thus they need not be 100% accurate in all places. \end_layout \begin_layout Standard It is about an operation environment for a \emph on new \emph default product, which was a proprietary web page editor running under a very complicated variant of a LAMP \begin_inset Foot status open \begin_layout Plain Layout LAMP = Linux Apache Mysql PHP \end_layout \end_inset stack. \end_layout \begin_layout Standard The setup started with a \family typewriter BigCluster \family default \emph on architecture \emph default , but actually sized as a \family typewriter \begin_inset Quotes eld \end_inset SmallCluster \begin_inset Quotes erd \end_inset \family default implementation. \end_layout \begin_layout Paragraph Setup 1 (NFS) \end_layout \begin_layout Standard The first setup consisted of \begin_inset Formula $n=6$ \end_inset storage servers, each replicated to another datacenter via DRBD. Each server was exporting its filesystems via NFS to about the same number of client servers, where Apache/PHP was supposed to serve the HTTP requests from the customers, which were entering the client cluster via an HTTP load balancer. The load balancer was supposed to spread the HTTP load to the client servers in a \series bold round-robin \series default fashion.
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout At this point, eager readers may notice some similarity with the error propagation problem treated in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Error-Propagation-to" plural "false" caps "false" noprefix "false" \end_inset . Notice that this is about \emph on scalability \emph default instead, but you should compare with that, to find some similarities. \end_layout \end_inset \end_layout \begin_layout Standard \noindent After the complicated system was built up and was working well enough, the new product was launched via a marketing campaign with free trial accounts, limited to some time. \end_layout \begin_layout Standard So the number of customers was ramping up from 0 to about 20,000 within a few weeks. When about 20,000 customers were running on the client machines, system hangs were noticed, from a customer's perspective. When too many customers were pressing the \begin_inset Quotes eld \end_inset save \begin_inset Quotes erd \end_inset button in parallel on reasonably large web page projects, a big number of small files, including a huge bunch of small image files, was generated over a short period of time. A few customers were pressing the \begin_inset Quotes eld \end_inset save \begin_inset Quotes erd \end_inset button several times a minute, each time re-creating all of these files again and again from the proprietary web page generator. Result: the whole system appeared to hang. \end_layout \begin_layout Standard However, all of the servers, including the storage servers, were almost \emph on idle \emph default with respect to CPU consumption. RAM sizes were also no problem. \end_layout \begin_layout Standard After investigating the problem for a while, it was noticed that the \series bold \emph on network \series default \emph default was the bottleneck, but not in terms of throughput.
The internal sockets were forming some \series bold queues \series default which were \emph on delaying \emph default the NFS requests in some \series bold ping-pong \series default like fashion, almost resulting in a \begin_inset Quotes eld \end_inset deadlock \begin_inset Quotes erd \end_inset from a customer's perspective (a better term would be \series bold distributed livelock \series default or \series bold distributed thrashing \series default , c.f. section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Paragraph Setup 2 ( \family typewriter ocfs2 \family default ) \end_layout \begin_layout Standard Due to some external investigations and recommendations, the system was converted from NFS to \family typewriter ocfs2 \family default . Now DRBD was operated in active-active mode. Only one system software component was replaced with another one, without altering the \family typewriter BigCluster \family default architecture, and without changing the number of servers, which remained a stripped-down \family typewriter SmallCluster \family default implementation. \end_layout \begin_layout Standard Result: the problem with the \begin_inset Quotes eld \end_inset hangs \begin_inset Quotes erd \end_inset disappeared. \end_layout \begin_layout Standard However, after the number of customers had exceeded the \series bold next scalability limit \series default of about 30,000 customers, the \begin_inset Quotes eld \end_inset hang \begin_inset Quotes erd \end_inset problem appeared once again, in a similar way. The system showed systematical incidents again.
\end_layout \begin_layout Paragraph Setup 3 ( \family typewriter glusterfs \family default as a substitute for NFS / \family typewriter ocfs2 \family default ) \end_layout \begin_layout Standard After investigating the network queueing behaviour and the lock contention problems of \family typewriter ocfs2 \family default , the next solution was \family typewriter glusterfs \family default . \end_layout \begin_layout Standard However, when the number of customers exceeded the \series bold \emph on next \emph default scalability limit \series default , which was about 50,000 customers, some of them hammering the cluster with their \begin_inset Quotes eld \end_inset save \begin_inset Quotes erd \end_inset button, the \begin_inset Quotes eld \end_inset hangs \begin_inset Quotes erd \end_inset appeared again. \end_layout \begin_layout Paragraph Setup 4 ( \family typewriter glusterfs \family default replication as a substitute for DRBD) \end_layout \begin_layout Standard After analyzing the problem once again, it was discovered by accident that \family typewriter drbdadm disconnect \family default \emph on appeared \emph default to \begin_inset Quotes eld \end_inset solve \begin_inset Quotes erd \end_inset the problem. \end_layout \begin_layout Standard Therefore DRBD was replaced with \family typewriter glusterfs \family default replication. There exists a \family typewriter glusterfs \family default feature allowing replication of files at filesystem level. \end_layout \begin_layout Standard This attempt was \emph on immediately \emph default resulting in an \series bold almost fatal disaster \series default , and thus was stopped immediately: the cluster completely broke down. Almost nothing was working anymore. \end_layout \begin_layout Standard The problem was even worse: switching off the \family typewriter glusterfs \family default replication and rollback to DRBD did not work. The system remained \series bold unusable \series default . 
\end_layout \begin_layout Standard As a temporary workaround, \family typewriter drbdadm disconnect \family default was improving the situation enough for some hobbling operation. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Retrospective explanation: some of the reasons can be found in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Behaviour-of-DRBD" plural "false" caps "false" noprefix "false" \end_inset . \family typewriter glusterfs \family default replication does not scale at all because it stores its replication information at \series bold per-inode granularity \series default in EAs (extended attributes). This must \emph on necessarily \emph default be worse than DRBD, because there were some hundreds of millions of them in total as reported by \family typewriter df -i \family default (see the cut point discussion in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset , and section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Granularity-at-Architecture" plural "false" caps "false" noprefix "false" \end_inset ). Overnight in some cron jobs, these EAs had to be deleted in reasonably sized batches in order to become more or less \begin_inset Quotes eld \end_inset operable \begin_inset Quotes erd \end_inset again. \end_layout \end_inset \end_layout \begin_layout Paragraph Setup 5 (Sharding on top of DRBD) \end_layout \begin_layout Standard After the almost fatal incident had been resolved to a less critical one, the responsibility for setup was taken over by another person.
After the \begin_inset Formula $O(n^{2})$ \end_inset behaviour from section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Distributed-vs-Local:" plural "false" caps "false" noprefix "false" \end_inset had been understood, and after it was clear that sharding is only \begin_inset Formula $O(k)$ \end_inset from a customer's perspective, it was the final solution. Now the problem was resolved at \series bold \emph on architectural level \series default \emph default , no longer by just replacing some components with some others (c.f. section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Standard The system was converted to a variant of a \family typewriter RemoteSharding \family default model (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Variants-of-Sharding" plural "false" caps "false" noprefix "false" \end_inset ), and some \family typewriter migrate \family default scripts were introduced for load balancing of customer home directories and databases between shards. \end_layout \begin_layout Standard As a side effect, the load balancer took on a new role: instead of spreading \emph on all \emph default of the HTTP requests to \emph on all \emph default of the client servers in a round-robin fashion, it now acted as a redirection mechanism at \emph on shard granularity \emph default , e.g. when one of the client servers was handed over to another one for maintenance. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Retrospective explanation: DRBD was definitely \emph on not \emph default the real reason for the critical incident.
The replication traffic per shard is so low on average that until today, no replacement by MARS was absolutely necessary \begin_inset Foot status open \begin_layout Plain Layout Many sysadmins are running a conservative strategy: never touch a running system... \end_layout \end_inset , although the distance is over 50 km. If you wonder why such low write traffic demands can cause such a big incident: look at the \series bold cache reduction \series default graphics in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . Today, the \begin_inset Quotes eld \end_inset save \begin_inset Quotes erd \end_inset buttons of the customers are just triggering some \emph on extra \emph default \series bold writebacks \series default from the Page Cache of the kernel into the block layer, after some \emph on delay \emph default . These writebacks are not performance critical in reality, because the Page Cache is running them \series bold \emph on asynchronously in background \series default \emph default . \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In contrast, distributed filesystems like \family typewriter NFS \family default or \family typewriter ocfs2 \family default or \family typewriter glusterfs \family default are not working asynchronously in many places, but will often schedule their requests \emph on synchronously \emph default into ordinary network queues, which form a \series bold sequential bottleneck \series default , competing with other high-frequent filesystem operations. In addition, the \begin_inset Quotes eld \end_inset save \begin_inset Quotes erd \end_inset button triggers masses of metadata / inode updates in a short time, often residing in the same directory.
Such a directory may thus form a \begin_inset Quotes eld \end_inset global \begin_inset Quotes erd \end_inset bottleneck. When suchalike competing \series bold metadata updates \series default are distributed via a round-robin load balancer, the problem can easily become critical by the \series bold cache coherence problem \series default . While local filesystems can smoothen such application behaviour via the Dentry Cache plus Inode Cache, which also show some asynchronous writeback behaviour, network filesystems are often unable to deal with this performantly. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Although DRBD has a similar sequential bottleneck at the low-frequency block layer by its write-through strategy into its replica, this does not really matter: all other writebacks from the Page Cache are \emph on also \emph default started asynchronously, and triggered low-frequently, and are occurring after some \emph on delay \emph default (which in turn will smoothen the \series bold spikes \series default caused by \series bold mass dirtification \series default of many small files and inodes in a short time as caused by the \begin_inset Quotes eld \end_inset save \begin_inset Quotes erd \end_inset button), and thus are not really performance critical for this particular use case. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset This is a striking example why careful \series bold selection of granularity level \series default (filesystem vs block layer, see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset ) is essential. 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset This is also a striking example why asynchronous operations can form a huge advantage in certain use cases. \end_layout \begin_layout Standard The sharding setup is working until today, scaling up to the current number of customers, which is more than an order of magnitude higher, in the range of about a million customers. Of course, the number of shards had to be increased, but this is just what sharding is about. \end_layout \begin_layout Subsection Properties of Storage Scalability \begin_inset CommandInset label LatexCommand label name "subsec:Properties-Scalability" \end_inset \end_layout \begin_layout Subsubsection Influence Factors at Scalability \begin_inset CommandInset label LatexCommand label name "subsec:Influence-Factors-Scalability" \end_inset \end_layout \begin_layout Standard In general, scalability of storage systems may depend on the following factors (list may be incomplete): \end_layout \begin_layout Enumerate The \series bold application class \series default , in particular its principal \series bold workingset behaviour \series default (in both dimensions: timely and locality). More explanations about workingsets can be found in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset and at \begin_inset Flex URL status open \begin_layout Plain Layout http://blkreplay.org \end_layout \end_inset . \end_layout \begin_layout Enumerate The \series bold size \series default \begin_inset Formula $x$ \end_inset of the application data and/or the \series bold number of application instances \series default (possibly also denoted by \begin_inset Formula $x$ \end_inset ), and the amount of storage needed for it (could be also termed \begin_inset Formula $x$ \end_inset ).
Besides the data itself, the corresponding \series bold metadata \series default (inodes, indexes, etc) can form an important factor, or can even \emph on dominate \emph default the whole story. Typically, critical datacenter application data is tremendously differently sized from workstation data. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset Caution! Some people think erroneously that scalability would be \emph on linearly \emph default depending on \begin_inset Formula $x$ \end_inset . However, as is known at least since the 1960s (read some ancient papers from Saltzer and/or from Denning), scalability is \series bold never linear \series default , but sometimes even \series bold \emph on disruptive \series default \emph default , in particular when RAM size is the bottleneck. IO queues and/or networking queues are often also reacting to overload in a disruptive fashion. This means: after exceeding the \series bold scalability limit \series default of a particular system for its particular class of applications, the system will very likely \series bold break down \series default from a customer's perspective, sometimes almost completely, and sometimes even \series bold \emph on fatally \series default \emph default . \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset On the other hand, some other systems are reacting with \series bold graceful degradation \series default . Whether a particular system reacts to a particular type of (over)load, either with graceful degradation, or with fatal disruption, or with some intermediate behaviour, is some sort of \begin_inset Quotes eld \end_inset quality property \begin_inset Quotes erd \end_inset of the system and/or of the application.
\begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset EVERY SYSTEM, even sharded systems, and even the internet as a whole, has \emph on always \emph default some scalability limit \emph on somewhere \emph default . There exists \series bold no \begin_inset Quotes eld \end_inset infinitely scaling \begin_inset Quotes erd \end_inset system \series default on earth! \end_layout \begin_layout Enumerate The \series bold \emph on distribution \series default \emph default of the application behaviour in both \series bold timely \series default and \series bold locality \series default dimensions. Depending on the application class, this is often an \emph on exponential \emph default distribution according to Zipf's law. By erroneously \emph on assuming \emph default an equal distribution (or a Gaussian distribution) instead of actually measuring the distribution in both dimensions, you can easily induce zillions of costly problems for big \begin_inset Formula $x$ \end_inset , or even fatal failure of the whole system / project. \end_layout \begin_layout Enumerate The \series bold transformation \series default of the application workingset behaviour at architectural level, sometimes caused by certain components resp their specific implementation or parameterization. Examples are intermediate virtualization layers, e.g. vmware \family typewriter *.vmdk \family default or KVM \family typewriter *.qcow2 \family default container formats which can completely change the game, not only in extreme cases. Another example is \series bold random distribution \series default to (or \series bold random replication \series default inside of) object stores, which can turn some uncomplicated sequential workloads into highly problematic \emph on random IO \emph default workloads.
See also section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Similarities-and-differences" plural "false" caps "false" noprefix "false" \end_inset . Don't overlook such potential pitfalls! \end_layout \begin_layout Enumerate The storage \series bold architecture \series default to be chosen, such as \family typewriter CentralStorage \family default vs \family typewriter BigCluster \family default vs \family typewriter *Sharding \family default . Choice of the wrong architecture can be fatal for big \begin_inset Formula $n$ \end_inset and/or for certain timely / spatial application behaviour. Changing an architecture during operations on some petabytes of data and/or some billions of inodes can be almost impossible, and/or can consume a lot of time and money. \end_layout \begin_layout Enumerate The \series bold number \series default of storage \series bold nodes \series default \begin_inset Formula $n$ \end_inset . In some architectures, addition of more nodes can make the system \emph on worse \emph default instead of better, c.f. section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Enumerate In case of architectures relying on a storage network: choice of \series bold layer \series default for cut point, e.g. filesystem layer vs block layer, see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset , and/or introduction of an additional intermediate object storage layer (which can result in major degradation from an architectural view). Due to fundamental differences in distributed vs local \series bold cache coherence \series default , suchalike can have a \emph on tremendous \emph default effect on scalability. 
\end_layout \begin_layout Enumerate The chosen \series bold implementation \series default of the architecture. Be sure to understand the difference between an \emph on architecture \emph default and an \emph on implementation \emph default of that architecture (section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Enumerate The size and types / properties of various \series bold caches \series default at various layers. You need to know the general properties of \series bold inclusive \series default vs \series bold exclusive \series default cache architecture. You absolutely need to know what \series bold thrashing \series default is, and under which conditions it can occur. \begin_inset Newline newline \end_inset It is advantageous for system architects to know \begin_inset Foot status open \begin_layout Plain Layout Reading a few Wikipedia articles does not count as \begin_inset Quotes eld \end_inset knowledge \begin_inset Quotes erd \end_inset . You need to be able to \emph on apply \emph default your knowledge to enterprise level systems (as opposed to workstation-sized systems), \emph on sustainably \emph default and \emph on reproducibly \emph default . Therefore you need to have \emph on actually worked \emph default in the matter and gained some extraordinary experiences, on top of deep understanding of the matter. \end_layout \end_inset pre-loading strategies, as well as replacement strategies. It is advantageous to know what \family typewriter LRU \family default or \family typewriter MFU \family default means, what their induced \emph on overhead \emph default is, and how they \emph on really \emph default work on \emph on actual \emph default data, not just on some artificial lab data.
You also should know what an \series bold anomaly \series default is, and how it can be produced not only by \family typewriter FIFO \family default strategies, but also by certain types of ill-designed multi-layer caching. Beware: there are places where \family typewriter FIFO \family default -like behaviour is almost impossible to avoid, such as networks. All of this is outside the scope of this MARS manual. You should \emph on measure \emph default , when possible, the \series bold overhead \series default of cache implementations. I know of \emph on examples \emph default where caching is \emph on counter-productive \emph default . For example, certain types and implementations of SSD caches are over-hyped. Removing a certain cache will then \emph on improve \emph default the situation. Notice: caches are conceptually based on some type of \series bold associative memory \series default , which is either very fast but costly when directly implemented in hardware, or it can suffer from tremendous performance penalties when implemented inappropriately in software. \end_layout \begin_layout Enumerate \series bold Hardware dimensioning \series default of the implementation: choice of storage hardware, for each storage node. This includes SSDs vs HDDs, their attachment (e.g. SAS multiplexing bottlenecks), RAID level, and controller limitations, etc. \end_layout \begin_layout Enumerate Only for architectures relying on a storage network: network \series bold throughput \series default and network \series bold latencies \series default , and network \series bold bottlenecks \series default , including the \series bold queueing \series default behaviour / congestion control / \series bold packet loss \series default behaviour upon overload.
The latter is often neglected, leading to unexpected behaviour at load peaks, and/or leading to costly over-engineering (examples see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Enumerate \series bold \emph on Hidden \emph default bottlenecks \series default of various types. A complete enumeration is almost impossible, because there are too many \begin_inset Quotes eld \end_inset opportunities \begin_inset Quotes erd \end_inset . To reduce the latter, my general advice is to try to build bigger systems as \emph on simple \emph default as possible. This is why you should involve some \emph on real \emph default experts in storage systems, at least on critical enterprise data. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset \emph on Any \emph default of these factors can be dangerous when not carefully thought about and treated, depending on your use case. \end_layout \begin_layout Subsection Case Study: Example Scalability Scenario \begin_inset CommandInset label LatexCommand label name "subsec:Example-Scalability-Scenario" \end_inset \end_layout \begin_layout Standard To get an impression what \begin_inset Quotes eld \end_inset enterprise critical data \begin_inset Quotes erd \end_inset can mean in a concrete example, here are some characteristic numbers from 1&1 Ionos ShaHoLin (Shared Hosting Linux) around spring 2018. 
\end_layout \begin_layout Standard If the whole system had to be re-constructed from scratch on a green field, the following numbers from the current implementation would be \emph on required input parameters \emph default for \emph on any \emph default potential solution architecture, such as \family typewriter CentralStorage \family default vs \family typewriter BigCluster \family default vs \family typewriter Sharding \family default : \end_layout \begin_layout Itemize Webhosting very close to 24/7/365. \end_layout \begin_layout Itemize Overall customer-visible HA target of 99.98%, including WAN outages. Technically, a much better system-only HA target would be possible, but there are also some \emph on external \emph default incident sources like frequent updates of userspace software and a variety of application software libraries, frequent security updates, etc. Although managed by ITIL processes, these sources are outside of the scope of this \emph on system architecture \emph default guide. \end_layout \begin_layout Itemize About 9 million customer home directories. \end_layout \begin_layout Itemize About 10 billion inodes, with daily incremental backup. \end_layout \begin_layout Itemize More than 4 petabytes of \emph on net \emph default data (total \family typewriter df \family default filling level) in spring 2018, with a growth rate of 21% per year. \end_layout \begin_layout Itemize All of this permanently replicated into a second datacenter. \end_layout \begin_layout Itemize In catastrophic failure scenarios, \emph on all \emph default resources must be switchable within a short time.
\end_layout \begin_layout Standard In order to not bail out too many competing solutions via preconditions, the following is treated as a nice-to-have feature (only for the sake of the following sandbox game, while in reality the sysadmins would vote for a \emph on hard requirement \emph default instead): \end_layout \begin_layout Itemize Ability for butterfly, cf section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard For simplicity of our architectural sandbox game, we assume that all of this is in one campus. In reality, about 30% is residing at another continent. Introducing this as an additional input parameter would not fundamentally change the game. Many other factors, like dependencies from existing infrastructure, are also neglected. \end_layout \begin_layout Subsubsection Theoretical Solution: \family typewriter CentralStorage \end_layout \begin_layout Standard Let us assume somebody would try to operate this on classical \family typewriter CentralStorage \family default , and let us assume that migration of this amount of data including billions of inodes would be no technical problem. What would be the outcome? \end_layout \begin_layout Standard With current technology, finding a single \family typewriter CentralStorage \family default appliance would be all else but easy. Dimensioning would be needed for the \emph on lifetime \emph default of such a solution, which is at least 5 years. In five years, the data would grow by a factor of about \begin_inset Formula $1.21^{5}=2.6$ \end_inset , which is then about \begin_inset Formula $10.5$ \end_inset petabytes. This is only the \emph on net \emph default capacity; at hardware layer much more is needed for spare space and for local redundancy. 
The single \family typewriter CentralStorage \family default instance will need to scale up to at least this number, in each datacenter (under the simplified game assumptions). \end_layout \begin_layout Standard The current number of client LXC containers is about \begin_inset Formula $2600$ \end_inset , independently from location. You will have to support growth in number of them. For maintenance, these need to be switchable to a different geo-datacenter at any time (e.g. risk mitigation of power supply maintenance in a datacenter), at least at hypervisor granularity. As explained in sections \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Flexibility-of-Failover" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "sec:Location-transparency" plural "false" caps "false" noprefix "false" \end_inset , handover \emph on should be \emph default at per-VM granularity, otherwise you would cause a regression in operability. The number of bare metal servers running the total workload can vary with hardware architecture / hardware lifecycle, and with growth, such as already demonstrated during the course of internal \begin_inset Quotes eld \end_inset Efficiency projects \begin_inset Quotes erd \end_inset . You will need to dimension a dedicated storage network for all of this. \end_layout \begin_layout Standard If you find a solution which can do this with current \family typewriter CentralStorage \family default technology for the next 5 years, then you will have to ensure that restore from backup \begin_inset Foot status open \begin_layout Plain Layout Local snapshots, whether LVM or via some COW filesystem, do not count as backups (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Replication-vs-Backup" plural "false" caps "false" noprefix "false" \end_inset ). 
You need a \emph on logical \emph default copy, not a \emph on physical \emph default one, in case your production filesystem instance gets fatally damaged, such that \family typewriter fsck \family default won't help anymore. \end_layout \end_inset can be done in less than 1 day in case of a fatal disaster, see also treatment of \family typewriter CentralStorage \family default reliability in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Reliability-Differences-CentralStorage" plural "false" caps "false" noprefix "false" \end_inset . Notice that the current self-built backup solution for a total of 15 billions of inodes is based on a sharding model; converting this to some more or less centralized solution would turn out as another challenge. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Attention! Buying 10 or 50 or 100 CentralStorage instances does not count as a \family typewriter CentralStorage \family default architecture. By definition, suchalike would be \family typewriter RemoteSharding \family default instead. Notice that the current 1&1 solution is already a mixture of \family typewriter LocalSharding \family default and \family typewriter RemoteSharding \family default , so you would win \emph on nothing \emph default at architectural level. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In case you actually would want to build a RemoteSharding model on top of commercial storage, you need to consider \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Technology" plural "false" caps "false" noprefix "false" \end_inset . 
\end_layout \begin_layout Standard In your business case, you would need to justify the price difference between the current component-based hardware solution (horizontally extensible by \emph on scale-out \emph default ) and \family typewriter CentralStorage \family default / \family typewriter RemoteSharding \family default , which is about a factor of 10 per terabyte according to the table in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Cost-Arguments-from-Technology" plural "false" caps "false" noprefix "false" \end_inset . Even if you manage to find a vendor who is willing to subsidize to a factor of only 3, this is not all you need. You need to add the cost for the dedicated storage network. On top of this, you need to account for the \emph on migration cost \emph default after the lifetime of 5 years has passed, where the full data set needs to be migrated to a successor storage system. \end_layout \begin_layout Standard Notice that classical arguments with \series bold \emph on manpower \series default \emph default will not work. The current operating team is about 10 persons, with no dedicated storage admin. This relatively small team is not only operating a total of more than 6,000 shared boxes in all datacenters, but also some tens of thousands of managed dedicated servers, running essentially the same software stack, with practically fully automated mass deployment. Most of their tasks are related to central software installation, which is then automatically distributed, and to operation / monitoring / troubleshooting of masses of client servers. Storage administration tasks in isolation are costing only a \emph on fraction \emph default of this. Typical claims that \family typewriter CentralStorage \family default would require less manpower will not work here. Almost everything which is needed for \emph on mass automation \emph default is already automated.
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Neglecting the tens of thousands of managed dedicated servers would be a catastrophic ill-design. Their hardware is already given, by existing customer contracts, some of them decades old. Although it may be possible to modify \emph on some \emph default of these contracts, you simply cannot fundamentally change \emph on all \emph default the hardware of these customers including their \emph on dedicated \emph default local disks, which was / is their \emph on main selling point \emph default . You cannot simply convert them to a shared \family typewriter CentralStorage \family default , even if it would be technically possible, and if it would deliver IOPS rates similar to those of tens of thousands of local spindles (and if you could reach the bundled performance of local SSDs from newer contracts), and even if you would introduce some interesting \series bold storage classes \series default for all of this. A dedicated server on top of a shared storage is no longer a dedicated one. You would have to migrate these customers to another product, with all of its consequences. For these machines alone, \emph on most \begin_inset Foot status open \begin_layout Plain Layout Only a few out of >1000 self-built or customized Debian packages are dealing with MARS and/or with the clustermanager \family typewriter cm3 \family default . \end_layout \end_inset \emph default of the current automation of \family typewriter LocalStorage \family default is needed \emph on anyway \emph default , although they are not geo-redundant at current stage. \end_layout \begin_layout Standard Conclusion: \family typewriter CentralStorage \family default is simply \emph on unrealistic \emph default .
\end_layout \begin_layout Subsubsection Theoretical Solution: \family typewriter BigCluster \end_layout \begin_layout Standard The main problem of \family typewriter BigCluster \family default is \series bold reliability \series default , as explained intuitively in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset , and graphically in section \begin_inset CommandInset ref LatexCommand nameref reference "sub:Detailed-explanation" plural "false" caps "false" noprefix "false" \end_inset , and mathematically in appendix \begin_inset CommandInset ref LatexCommand vref reference "chap:Mathematical-Model-of" \end_inset , and as observed in several installations not working as expected. It would be a bad idea to ignore the explanations from section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard Let us assume that all of these massive technical problems were solved, somehow. Then the business case would have to deal with the following: \end_layout \begin_layout Standard The total number of servers would need to be roughly \emph on doubled \emph default \begin_inset Foot status open \begin_layout Plain Layout One of the problems of the current Ceph \emph on implementation \emph default is its massive consumption of CPU power and RAM. Even if this would be improved in future, the \emph on architectural \emph default drawbacks will remain. \end_layout \end_inset . Not only their CAPEX, but also the corresponding OPEX (electrical power, rackspace, manpower) would increase. Alone their current electrical power cost, including cooling, is more than the current sysadmin manpower cost. Datacenter operations would also increase. On top, a dedicated storage network and its administration cost would also be needed. 
\end_layout \begin_layout Standard With respect to the tens of thousands of managed dedicated servers and their customer contracts, a similar argument as above holds. You simply cannot convert them to \family typewriter BigCluster \family default . \end_layout \begin_layout Standard Conclusion: \family typewriter BigCluster \family default is also \emph on unrealistic \emph default . There is nothing to win, but a lot to lose. \end_layout \begin_layout Subsubsection Current Solution: \family typewriter LocalSharding \family default , sometimes \family typewriter RemoteSharding \end_layout \begin_layout Standard Short story: the architecture as well as its current implementation has been working for decades, and has been both cheap and robust since geo-redundancy was added around 2010. \end_layout \begin_layout Standard With the advent of Football (see \family typewriter football-user-manual.pdf \family default ), the \family typewriter LocalSharding \family default architecture is rising to be on par with the most important management abilities of \family typewriter CentralStorage \family default and \family typewriter BigCluster \family default / Software Defined Storage. \end_layout \begin_layout Standard Pre-configured \family typewriter RemoteSharding \family default on top of dedicated Linux-based storage boxes is currently being reduced in favour of the cheaper and more reliable \family typewriter LocalSharding \family default combined with Football. The dedicated storage boxes are almost EOL due to their age, and should vanish some day. \end_layout \begin_layout Standard There is another story about tens of thousands of managed dedicated servers: without the traditional ShaHoLin sharding architecture and all of its automation, including the newest addition called Football, the product \begin_inset Quotes eld \end_inset managed dedicated servers \begin_inset Quotes erd \end_inset would not be possible at this scale.
By definition, the dedicated server product \emph on is \emph default a sharding implementation. Thanks to Football, further business opportunities like migration onto virtualized shared hardware (with optional \series bold resource partitioning \series default ) are possible. \end_layout \begin_layout Standard Summary: the sharded \begin_inset Quotes eld \end_inset shared \begin_inset Quotes erd \end_inset product enables another \begin_inset Quotes eld \end_inset dedicated \begin_inset Quotes erd \end_inset product, which is sharded by definition, and it actually is known to scale up by at least another order of magnitude (in terms of number of servers). \end_layout \begin_layout Subsection Scalability of Filesystem Layer vs Block Layer \begin_inset CommandInset label LatexCommand label name "subsec:Filesystem-Layer-vs" \end_inset \end_layout \begin_layout Standard The following factors are responsible for better architectural (cf section \begin_inset CommandInset ref LatexCommand ref reference "sec:What-is-Architecture" \end_inset ) scalability of the block layer vs the filesystem layer, with a few exceptions (list may be incomplete): \end_layout \begin_layout Enumerate \series bold Granularity \series default of access: \series bold metadata \series default is often smaller than the content data it refers to, but access to data is typically not possible without accessing corresponding metadata \emph on first \emph default . When \emph on masses \emph default of metadata are present (e.g. some billions of inodes like in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Scalability-Scenario" plural "false" caps "false" noprefix "false" \end_inset ), and/or when metadata is accessed \series bold more frequently \series default than the corresponding data (e.g. in stateless designs like Apache), it is likely to become the bottleneck.
\begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset Neglecting metadata and its access patterns is a major source of ill-designs. I know of projects which have failed (in their original setup) because of this. Repair may involve some non-trivial architectural changes. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset By default, the block layer itself has almost \begin_inset Foot status open \begin_layout Plain Layout There may be tiny metadata, such as describing the size of the whole block device. \end_layout \end_inset no metadata at all. Therefore it has an \emph on inherent advantage \emph default over the filesystem layer in such use cases. \end_layout \begin_layout Enumerate \series bold Caching \series default : shared memory caches in kernelspace (page cache + dentry cache) vs distributed caches over network. See the picture in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset There exist \emph on examples \emph default where shared distributed caches do not work at all. Frequently, this has to do with strict consistency requirements, and with runtime access patterns. I know of \emph on several \emph default projects which have failed. A project other than the one mentioned in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset has failed because of violations of POSIX filesystem semantics. \end_layout \begin_layout Enumerate Only in distributed systems: the \series bold cache coherence problem \series default , both on metadata and on data.
Depending on load patterns, this can lead to tremendous performance degradation, see example in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Enumerate Dimensioning of the \series bold network \series default : throughput, latencies, queueing behaviour. \end_layout \begin_layout Standard There exist a few known exceptions (list may be incomplete, please report further examples if you know some): \end_layout \begin_layout Itemize Databases: these are typically operating on specific container formats, where no frequent \emph on external \emph default metadata access is necessary, and where no sharing of the \emph on container as such \emph default is necessary. Typically, there is no big performance difference between storing them in block devices vs local filesystems (although it could be viewed as a minor Dijkstra regression). \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Exception from the exception: MyISAM is an old design from the 1980s, originally based on DBASE data structures under MSDOS. Don't try to access them over NFS or similar. Or, better, try to avoid them at all if possible. \end_layout \begin_layout Itemize VM images: these are logical BLOBS, so there is typically no big difference whether they are in an intermediate \emph on true \emph default filesystem layer, or not. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Filesystems on top of object stores (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Granularity-at-Architecture" plural "false" caps "false" noprefix "false" \end_inset ) are no true intermediate filesystems.
They are violating Dijkstra's important layering rules (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Layering-Rules" plural "false" caps "false" noprefix "false" \end_inset ) at \emph on several \emph default places. A similar argument holds for block devices on top of object stores. Another layering violation may result from VM container formats like \family typewriter *.vmdk \family default or \family typewriter *.qcow2 \family default , which cannot always be avoided. Be warned that such container formats \emph on themselves \emph default can act as game changers with respect to performance, parallelism degree, reliability, etc. This does not mean that you have to avoid them generally. Layering violations just create an additional \emph on risk \emph default , which need not always materialize, and need not always be fatal. However, be sure to \series bold check their influence \series default , and don't forget to measure their \emph on workingset \emph default and their \emph on caching behaviour \emph default (which can go both into positive and into negative direction), in order to really \emph on know what you are doing. \end_layout \begin_layout Standard There exist a few cases where a distributed filesystem, sometimes even actually with \begin_inset Formula $O(n^{2})$ \end_inset behaviour according to section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Error-Propagation-to" plural "false" caps "false" noprefix "false" \end_inset , \emph on must \emph default be used, because there exists a \emph on hard requirement \emph default for it. 
Some examples (list is certainly incomplete): \end_layout \begin_layout Itemize HPC = \series bold High Performance Computing \series default on modern supercomputers, consisting of a high number of \begin_inset Formula $n$ \end_inset compute nodes, are often requiring access to a shared persistent data pool, where each of the \begin_inset Formula $n$ \end_inset nodes must be sometimes able to access the same persistent data, sometimes both for reading and writing. Therefore, several supercomputers are using cluster filesystems like Lustre. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Care must be taken that high-frequency / fine granularity communication over the distributed filesystem and its dedicated storage network does not take place, but instead occurs over the ordinary low-latency communication fabrics each modern supercomputer is relying on. True \begin_inset Formula $O(n^{2})$ \end_inset storage access behaviour should be avoided as far as possible (given by the problem to be solved). When absolutely necessary, location transparency (as possible with cluster filesystems like Lustre) as well as its DSM = Distributed Shared Memory model must be given up, and an \series bold explicit communication model \series default must be used instead, which allows explicit control over replicas and their communication paths (e.g. propagation in a binary tree fashion), although it results in much more work for the programmers. Only low frequency / coarse granularity transfers of \emph on bulk data \emph default with \emph on high locality \emph default should run over distributed filesystems, preferably in \emph on streaming \emph default mode (c.f. section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset ). 
The total frequency of metadata access should be low, because metadata consistency may form a bottleneck when updated too frequently. The programmers of the distributed application software need to take care for this. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice that certain supercomputer workloads may be crying for a RemoteSharding or FlexibleSharding storage architecture in place of a BigCluster architecture. However, this is very application specific. \end_layout \begin_layout Itemize Student pools at universities, or location-independent workplaces at companies. This is just the usecase where NFS was originally constructed for. Typically, \series bold workstation workloads \series default are neither performance critical, nor prone to actual \begin_inset Formula $O(n^{2})$ \end_inset behaviour (although the network infrastructure would \emph on allow \emph default for it), because each user has her own home directory which is typically \emph on not shared \emph default with others, and she cannot split herself and sit in front of multiple workstations at the same time. Thus the \emph on local per-workstation \emph default NFS caching strategies have a good chance to hide much of the network latencies , and thus the actual total network workload is typically only \begin_inset Formula $O(n).$ \end_inset \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset This can lead to a dangerous misinterpretation: because it apparently works even for a few thousands of workstations, people conclude \emph on wrongly \emph default that the network filesystem \begin_inset Quotes eld \end_inset must be scalable \begin_inset Quotes erd \end_inset . 
Some people are then applying their experience to completely different usecases, where much higher metadata traffic by several orders of magnitude is occurring (such as in webhosting), or even where true \begin_inset Formula $O(n^{2})$ \end_inset runtime behaviour is occurring (see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Standard \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset In general: when something works for usecase A, this \series bold does \emph on not \emph default prove \series default that it will also work for another usecase B. See explanations from start of section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Scalability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Chapter Use Cases for MARS \begin_inset CommandInset label LatexCommand label name "chap:Use-Cases-for" \end_inset \end_layout \begin_layout Standard DRBD has a long history of successfully providing HA features to many users of Linux. With the advent of MARS, many people are wondering what the difference is. They ask for recommendations. In which use cases should DRBD be recommended, and in which other cases is MARS the better choice?
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Use cases MARS vs DRBD \end_layout \end_inset The following table is a short guide to the most important cases where the decision is rather clear: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Plain Layout \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout Use Case \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Recommendation \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout server pairs, each directly connected via \series bold crossover cables \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout DRBD \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \series bold active-active \series default / dual-primary, e.g. \family typewriter \series bold gfs2 \family default \series default , \family typewriter \series bold ocfs2 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout DRBD \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout distance \series bold > 50km \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout MARS \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout \series bold > 100 server pairs \series default over a short-distance \series bold shared \series default line \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout MARS \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout all else / intermediate cases \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout read the following details \end_layout \end_inset \end_inset \end_layout \end_inset \end_layout \begin_layout Standard \noindent There exist a few use cases where DRBD is clearly better than the current version of MARS. 
1&1 has a long history of experiences with DRBD where it works very fine, in particular coupling Linux devices rack-to-rack via crossover cables. DRBD is just \emph on constructed \emph default for that use case (RAID-1 over network). In such a scenario, DRBD is better than MARS because it uses up less disk space resources. In addition, newer DRBD versions can run over high-speed but short-distance interconnects like Infiniband (via the SDP protocol). Another use case for DRBD is active-active / dual-primary mode, e.g. \family typewriter ocfs2 \family default \begin_inset Foot status open \begin_layout Plain Layout Notice that \family typewriter ocfs2 \family default is apparently not constructed for long distances. 1&1 has some experiences on a specific short distance cluster where the \family typewriter ocfs2 \family default / \family typewriter DRBD \family default combination scaled a little bit better than \family typewriter NFS \family default , but worse than \family typewriter glusterfs \family default (using 2 clients in both cases – notice that \family typewriter glusterfs \family default showed extremely bad performance when trying to enable active-active \family typewriter glusterfs \family default replication between 2 server instances, therefore we ended up using active-passive DRBD replication below a single \family typewriter glusterfs \family default server). Conclusion: \family typewriter NFS \family default < \family typewriter ocfs2 \family default < \family typewriter glusterfs \family default < sharding. We found that \family typewriter glusterfs \family default on top of active-passive DRBD scalability was about 2 times better than \family typewriter NFS \family default on top of active-passive DRBD, while \family typewriter ocfs2 \family default on top of \family typewriter DRBD \family default in active-active mode was somewhere in between. 
All cluster comparisons with an increasing workload over time (measured as number of customers which could be safely operated). Each system was replaced by the next one when the respective scalability was at its respective end, each time leading to operational problems. The ultimate solution was to replace all of these clustering concepts by the general concept of \series bold sharding \series default . \end_layout \end_inset over short \begin_inset Foot status open \begin_layout Plain Layout Active-active won't work over long distances at all because of high network latencies (cf chapter \begin_inset CommandInset ref LatexCommand ref reference "chap:Cloud-Storage" \end_inset ). Probably, for replication of whole clusters over long distances DRBD and MARS could be stacked: using DRBD on top of MARS for active-active clustering of \family typewriter gfs2 \family default or \family typewriter ocfs2 \family default , and a MARS instance \emph on below \emph default for failover of \emph on one \emph default of the DRBD replicas over long distances. \end_layout \end_inset distances. \end_layout \begin_layout Standard On the other hand, there exist other use cases where DRBD did not work as expected, leading to incidents and other operational problems. We analyzed them for our specific use cases. The later author of MARS came to the conclusion that they could only be resolved by fundamental changes in the internal architecture of DRBD. The development of MARS started at the personal initiative of the author, first in form of a personal project during holidays, but later picked up by 1&1 as an official project. \end_layout \begin_layout Standard MARS and DRBD simply have \series bold different application areas \series default . \end_layout \begin_layout Standard In the following, we will discuss the pros and cons of each system in particular situations and contexts, and we shed some light on their conceptual and operational differences. 
\end_layout \begin_layout Section Network Bottlenecks \begin_inset CommandInset label LatexCommand label name "sec:Network-Bottlenecks" \end_inset \end_layout \begin_layout Subsection Behaviour of DRBD \begin_inset CommandInset label LatexCommand label name "subsec:Behaviour-of-DRBD" \end_inset \end_layout \begin_layout Standard In order to describe the most important problem we found when DRBD was used to couple whole datacenters (each encompassing thousands of servers) over metro distances, we strip down that complicated real-life scenario to a simplified laboratory scenario in order to demonstrate the effect with minimal means. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice that the following DRBD effect does not appear at crossover cables. The following scenario covers a non-standard case of DRBD. DRBD works fine when no network bottleneck appears. \end_layout \begin_layout Standard The following picture illustrates an effect which has been observed in 1&1 datacenters when running masses of DRBD instances through a single network bottleneck. In addition, the effect is also reproducible by an older version of the MARS test suite \begin_inset Foot status open \begin_layout Plain Layout The effect has been demonstrated some years ago with DRBD version 8.3.13. By construction, it is independent from any of the DRBD series 8.3.x, 8.4.x, or 9.0.x. \end_layout \end_inset : \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/network-bottleneck-drbd.fig width 80col% \end_inset \end_layout \begin_layout Standard \noindent The simplified scenario is the following: \end_layout \begin_layout Enumerate DRBD is loaded with a low to medium, but constant rate of write operations for the sake of simplicity of the scenario. 
\end_layout \begin_layout Enumerate The network has some throughput bottleneck, depicted as a red line. For the sake of simplicity, we just linearly decrease it over time, starting from full throughput, down to zero. The decrease is very slow over time (some minutes, or even hours). \end_layout \begin_layout Standard What will happen in this scenario? \end_layout \begin_layout Standard As long as the actual DRBD write throughput is lower than the network bandwidth (left part of the horizontal blue line), DRBD works as expected. \end_layout \begin_layout Standard Once the maximum network throughput (red line) starts to fall short of the required application throughput (first blue dotted line), we get into trouble. By its very nature, DRBD works \series bold synchronously \series default . Therefore, it \emph on must \emph default transfer all your application writes through the bottleneck, but now it is impossible \begin_inset Foot status open \begin_layout Plain Layout This is independent from the DRBD protocols A through C, because it depends on an information-theoretic argument independently from any protocol. We have a fundamental conflict between network capabilities and application demands here, which cannot be circumvented due to the \series bold synchronous \series default nature of DRBD. \end_layout \end_inset due to the bottleneck. \end_layout \begin_layout Standard As a consequence, the application running on top of DRBD will see increasingly higher IO latencies and/or stalls / hangs. We found practical cases (at least with former versions of DRBD) where IO latencies exceeded practical monitoring limits such as \begin_inset Formula $5$ \end_inset s by far, up to the range of \emph on minutes \emph default . Experienced sysadmins will know what happens next: your application will run into an \series bold incident \series default , and your customers will be dissatisfied. 
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout In order to deal with such situations, DRBD has lots of tuning parameters. In particular, the \family typewriter timeout \family default parameter and/or the \family typewriter ping-timeout \family default parameter will determine when DRBD will give up in such a situation and simply drop the network connection as an emergency measure. Dropping the network connection is roughly equivalent to an automatic \family typewriter disconnect \family default , followed by an automatic re-connect attempt after \family typewriter connect-int \family default seconds. During the dropped connection, the incident will appear as being resolved, but at some hidden cost \begin_inset Foot status open \begin_layout Plain Layout By appropriately tuning various DRBD parameters, such as \family typewriter timeout \family default and/or \family typewriter ping-timeout \family default , you can keep the impact of the incident below some viable limit. However, the automatic disconnect will then happen earlier and more often in practice. Flaky or overloaded networks may easily lead to an enormous number of automatic disconnects. \end_layout \end_inset . \end_layout \end_inset \end_layout \begin_layout Standard \noindent What happens next in our scenario? During the \family typewriter disconnect \family default , DRBD will record all positions of writes in its bitmap and/or in its activity log. As soon as the automatic re-connect succeeds after \family typewriter connect-int \family default seconds, DRBD has to do a partial re-sync of those blocks which were marked dirty in the meantime. This leads to an \emph on additional \emph default bandwidth demand \begin_inset Foot status open \begin_layout Plain Layout DRBD parameters \family typewriter sync-rate \family default resp \family typewriter resync-rate \family default may be used to tune the height of the additional demand. 
In addition, the newer parameters \family typewriter c-plan-ahead \family default , \family typewriter c-fill-target \family default , \family typewriter c-delay-target \family default , \family typewriter c-min-rate \family default , \family typewriter c-max-rate \family default and friends may be used to dynamically adapt to \emph on some \emph default situations where the application throughput \emph on could \emph default fit through the bottleneck. These newer parameters were developed in a cooperation between 1&1 and Linbit, the maker of DRBD. \end_layout \begin_layout Plain Layout Please note that lowering / dynamically adapting the resync rates may help in lowering the \emph on probability \emph default of occurrences of the above problems in practical scenarios where the bottleneck would recover to viable limits after some time. However, lowering the rates will also increase the \emph on duration \emph default of re-sync operations accordingly. The \emph on total amount of re-sync data \emph default simply does not decrease when lowering \family typewriter resync-rate \family default ; it even tends to increase over time when new requests arrive. Therefore, the \emph on expectancy value \emph default of problems caused by \emph on strong \emph default network bottlenecks (i.e. when not even the ordinary application rate is fitting through) is \emph on not \emph default improved by lowering or adapting \family typewriter resync-rate \family default , but rather the expectancy value mostly depends on the \emph on relation \emph default between the amount of holdback data versus the amount of application write data, both measured for the duration of some given strong bottleneck. \end_layout \end_inset as indicated by the upper dotted blue box. 
\end_layout \begin_layout Standard Of course, there is \emph on absolutely no chance \emph default to get the increased amount of data through our bottleneck, since not even the ordinary application load (lower dotted lines) could be transferred. \end_layout \begin_layout Standard Therefore, you run at a \series bold very high risk \series default that the re-sync cannot finish before the next \family typewriter timeout \family default / \family typewriter ping-timeout \family default cycle will drop the network connection again. \end_layout \begin_layout Standard What will be the final result when that risk becomes true? Simply, your secondary site will be \emph on permanently \emph default in state \family typewriter inconsistent \family default . This means, you have lost your redundancy. In our scenario, there is no chance at all to become consistent again, because the network bottleneck declines more and more, slowly. It is simply \emph on hopeless \emph default , by construction. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset In case you lose your primary site now, you are lost at all. \end_layout \begin_layout Standard \noindent Some people may argue that the probability for a similar scenario were low. We don't agree on such an argumentation. Not only because it really happens in practice, and it may even last some days until problems are fixed. In case of \series bold rolling disasters \series default , the network is very likely to become flaky and/or overloaded shortly before the final damage. Even in other cases, you can easily end up with inconsistent secondaries. It occurs not only in the lab, but also in practice if you operate some hundreds or even thousands of DRBD instances. 
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Resilience of DRBD \end_layout \end_inset The point is that you can produce an ill behaviour \emph on systematically \emph default just by overloading the network a bit for some sufficient duration. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset When coupling whole datacenters via some thousands of DRBD connections, any (short) network loss will almost certainly increase the re-sync network load each time the outage appears to be over. As a consequence, overload may be \emph on provoked \emph default by the re-sync repair attempts. This may easily lead to self-amplifying \series bold throughput storms \series default in some resonance frequency (similar to self-destruction of a bridge when an army is marching over it in lockstep). \end_layout \begin_layout Standard The only way for reliable prevention of loss of secondaries is to start any re-connect \emph on only \emph default in such situations where you can \emph on predict in advance \emph default that the re-sync is \emph on guaranteed \emph default to finish before any network bottleneck / loss will cause an automatic disconnect again. We don't know of any method which can reliably predict the future behaviour of a complex network. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Risks from non-crossover DRBD \end_layout \end_inset \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset Conclusion: in the presence of network bottlenecks, you run a considerable risk that your DRBD mirrors get destroyed just in that moment when you desperately need them. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice that \emph on classical \emph default crossover cables usually do not show a behaviour like depicted by the red line. Traditional crossover cables are \emph on passive components \emph default which normally \begin_inset Foot status open \begin_layout Plain Layout Exceptions might be mechanical jiggling of plugs, or electro-magnetical interferences. We never noticed any of them. \end_layout \end_inset either work, or not. The binary connect / disconnect behaviour of DRBD has no problems to cope with that. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset However, some newer Ethernet cable technologies like SFP+ and faster are no longer passive. They have some internal chips inside of their plugs. Thus they may \series bold fail independently \series default from your storage nodes. Then you run at least the risks from the CAP theorem, see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Explanation-via-CAP" plural "false" caps "false" noprefix "false" \end_inset . In addition to CAP effects, intermittent errors such as flaky electrical contacts may raise the above risk of permanent data loss. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset or \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Linbit recommends a \series bold workaround \series default for the inconsistencies during re-sync: LVM snapshots. We tried it, but found a \emph on performance penalty \emph default which made it prohibitive for our concrete application. A problem seems to be the cost of destroying snapshots. LVM uses by default a BOW strategy (Backup On Write, which is the counterpart of COW = Copy On Write). BOW increases IO latencies during ordinary operation. Retaining snapshots is cheap, but reverting them may be very costly, depending on workload. We didn't fully investigate that effect, and our experience is a few years old. You might come to a different conclusion for a different workload, for newer versions of system software, or for a different strategy if you carefully investigate the field. \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset DRBD problems usually arise \emph on only \emph default when the network throughput shows some \begin_inset Quotes eld \end_inset awkward \begin_inset Quotes erd \end_inset analog behaviour, such as overload, or as occasionally produced by various switches / routers / transmitters, or other potential sources of packet loss. 
\end_layout \end_inset \end_layout \begin_layout Subsection Behaviour of MARS \begin_inset CommandInset label LatexCommand label name "subsec:Behaviour-of-MARS" \end_inset \end_layout \begin_layout Standard The behaviour of MARS in the above scenario: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/network-bottleneck-mars.fig width 80col% \end_inset \end_layout \begin_layout Standard \noindent When the network is restrained, an asynchronous system like MARS will continue to serve the user IO requests (dotted green line) without any impact / incident while the actual network throughput (solid green line) follows the red line. In the meantime, all changes to the block device are recorded at the transaction logfiles. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Here is one point in favour of DRBD: MARS stores its transaction logs on the filesystem \family typewriter /mars/ \family default . When the network bottleneck is lasting very long (some days or even some weeks), the filesystem will eventually run out of space some day. \family typewriter mars-user-manual.pdf \family default discusses countermeasures against that in detail. In contrast to MARS, DRBD allocates its bitmap \emph on statically \emph default at resource creation time. It uses up less space, and you don't have to monitor it for (potential) overflows. The space for transaction logs is the price you have to pay if you want or need anytime consistency, or asynchronous replication in general. 
\end_layout \begin_layout Standard In order to really grasp the \emph on heart \emph default of the difference between synchronous and asynchronous replication, we look at the following modified scenario: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/network-flaky-mars.fig width 80col% \end_inset \end_layout \begin_layout Standard \noindent This time, the network throughput (red line) is varying \begin_inset Foot status open \begin_layout Plain Layout In real life, many long-distance lines or even some heavily used metro lines usually show fluctuations of their network bandwidth by an order of magnitude, or even higher. We have measured them. The overall behaviour can be characterized as \begin_inset Quotes eld \end_inset \series bold chaotic \series default \begin_inset Quotes erd \end_inset . \end_layout \end_inset in some unpredictable way. As before, the application throughput served by MARS is assumed to be constant (dotted green line, often superseded by the solid green line). The actual replication network throughput is depicted by the solid green line. \end_layout \begin_layout Standard As you can see, a network dropdown undershooting the application demand has no impact onto the application throughput, but only onto the replication network throughput. Whenever the network throughput is held back due to the flaky network, it simply catches up as soon as possible by overshooting the application throughput. The amount of lag-behind is visualized as shaded area: downward shading (below the application throughput) means an increase of the lag-behind, while the upwards shaded areas (beyond the application throughput) indicate a decrease of the lag-behind (catch-up). Once the lag-behind has been fully caught up, the network throughput suddenly jumps back to the application throughput (here visible in two cases). 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Note that the existence of lag-behind areas is roughly corresponding to DRBD disconnect states, and in turn to DRBD inconsistent states of the secondary as long as the lag-behind has not been fully caught up. The very rough \begin_inset Foot status open \begin_layout Plain Layout Of course, this visualization is not exact. On one hand, the DRBD inconsistency phase may start later than depicted here, because it only starts \emph on after \emph default the first automatic disconnect, upon the first automatic re-connect. In addition, the amount of resync data may be smaller than the amount of corresponding MARS transaction logfile data, because the DRBD bitmap will coalesce multiple writes to the same block into one single transfer. On the other hand, DRBD will transfer no data at all during its disconnected state, while MARS continues its best. This leads to a prolongation of the DRBD inconsistent phase. Depending on properties of the workload and of the network, the real duration of the inconsistency phase may be both shorter or longer. \end_layout \end_inset duration of the corresponding DRBD inconsistency phase is visualized as magenta line at the time scale. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Optimum throughput via MARS \end_layout \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset MARS utilizes the existing network bandwidth as best as possible in order to pipe through as much data as possible, provided that there exists some data requiring expedition. Conceptually, there exists no better way due to information theoretic limits (besides data compression). 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Note that \emph on on average \emph default during a longer period of time, the network must have enough capacity for transporting \emph on all \emph default of your data. MARS cannot magically break through information-theoretic limits. It cannot magically transport terabytes of data in a few seconds over very slow modem \begin_inset Foot status open \begin_layout Plain Layout A certain colleague at 1&1 is using MARS for a private application: CDP = Continuous Data Protection of a critical Windows VM over his home DSL line. \end_layout \end_inset lines. Only \emph on relatively short \emph default network problems / packet loss can be compensated, depending on the capacity of the \family typewriter /mars \family default filesystem. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In case of lag-behind, the version of the data replicated to the secondary site corresponds to some time in the past. Since the data is always transferred in the same order as originally submitted at the primary site, the secondary never gets inconsistent. Your mirror always remains usable. Your only potential problem could be the outdated state, corresponding to some state in the past. However, the \begin_inset Quotes eld \end_inset as-best-as-possible \begin_inset Quotes erd \end_inset approach to the network transfer ensures that your version is always \emph on as up-to-date as possible \emph default even under ill-behaving network bottlenecks. \series bold There is simply no better way to do it. \series default In presence of temporary network bottlenecks such as network congestion, there exists no better method than prescribed by the information theoretic limit (red line, neglecting data compression). 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In order to get all of your data through the line, somewhen the network must be healthy again. Otherwise, data will be recorded until the capacity of the \family typewriter /mars/ \family default filesystem is exhausted, leading to an emergency mode (see \family typewriter mars-user-manual.pdf \family default ). \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Risk reduction via MARS \end_layout \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset MARS' property of never sacrificing local data consistency (at the possible cost of actuality, as long as you have enough capacity in \family typewriter /mars/ \family default ) is called \series bold Anytime Consistency \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Even when the capacity of \family typewriter /mars/ \family default is exhausted and thus emergency mode is entered, the replicas will \emph on not \emph default become inconsistent by themselves. However, when the emergency mode is later \emph on cleaned up \emph default for a replica via \family typewriter marsadm invalidate \family default , it will become \emph on temporarily \emph default inconsistent during the fast full sync. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset When you have a total of \begin_inset Formula $k\geq3$ \end_inset replicas, you don't need to invalidate them \emph on all in parallel \emph default . 
By cascading the full syncs sequentially, you can retain some consistent, but outdated replica for the meantime, until all syncs have finished. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Conclusion: you can even use \series bold traffic shaping \series default on MARS' TCP connections in order to globally balance your network throughput (of course at the cost of actuality, but without sacrificing local data consistency). If you would try to do the same with DRBD, you could easily provoke a disaster. MARS simply tolerates any network problems, provided that there is enough disk space for transaction logfiles. Even in case of completely filling up your disk with transaction logfiles after some days or weeks, you will not lose local consistency anywhere. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Simple traffic shaping by default \end_layout \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Newer versions of MARS are automatically setting the so-called TOS fields in standard TCP/IP packets for you, which is backwards compatible with the newer DSCP feature. You just need to properly configure your network equipment for this type of traffic shaping, unless it is already enabled by default from various network vendors. In the latter case, you don't need to do anything, in order to get some improvements by automatic traffic shaping for free. Details are in \family typewriter mars-user-manual.pdf \family default . 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent Finally, here is yet another scenario where MARS can cope with the situation: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/network-constant-mars.fig width 80col% \end_inset \end_layout \begin_layout Standard \noindent This time, the network throughput limit (solid red line) is assumed to be constant. However, the application workload (dotted green line) shows some heavy peaks. We know from our 1&1 datacenters that such an application behaviour is very common (e.g. in case of certain kinds of DDOS attacks etc). \end_layout \begin_layout Standard When the peaks are exceeding the network capacities for some short time, the replication network throughput (solid green line) will be limited for a short time, stay a little bit longer at the limit, and finally drop down again to the normal workload. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Resilience against load peaks \end_layout \end_inset In other words, you get a flexible buffering behaviour, coping with application load peaks. \end_layout \end_inset \end_layout \begin_layout Standard \noindent Similar scenarios (where both the application workload has peaks and the network is flaky to some degree) are rather common. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset If you would use DRBD in place of MARS, you were likely to run into regular application performance problems and/or frequent automatic disconnect cycles, depending on the height and on the duration of the peaks, and on network resources. As observed at 1&1, even permanent data loss is possible, with some residual probability. 
\end_layout \begin_layout Section Long Distances / High Latencies \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout In general and in some theories, latencies are conceptually independent from throughput, at least to some degree. There exist all 4 possible combinations: \end_layout \begin_layout Enumerate There exist communication lines with high latencies but also high throughput. Examples are raw fibre cables on the floor of the Atlantic. \end_layout \begin_layout Enumerate High latencies on low-throughput lines are very easy to achieve. If you have never seen it, you have never run interactive \family typewriter vi \family default over \family typewriter ssh \family default in parallel to downloads on your old-fashioned modem line. \end_layout \begin_layout Enumerate Low latencies need not be incompatible with high throughput. See Myrinet, InfiniBand or high-speed point-to-point interconnects, such as modern RAM busses. \end_layout \begin_layout Enumerate Low latency combined with low throughput is also possible: in an ATM system (or another pre-reservation system for bandwidth), just increase the multiplex factor on low-capacity but short lines, which is only possible at the cost of assigned bandwidth. \end_layout \end_inset \end_layout \begin_layout Standard \noindent In the \emph on internet \emph default practice, it is very likely that \series bold high network latencies will also lead to worse throughput \series default , because of the \emph on congestion control algorithms \emph default running all over the world. \end_layout \begin_layout Standard We have experimented with extremely large TCP send/receive buffers plus various window sizes and congestion control algorithms over long-distance lines between the USA and Europe. Yes, it is possible to improve the behaviour to some degree. But magic does not happen. Natural laws like Einstein's laws will always hold.
You simply cannot travel faster than the speed of light. \end_layout \begin_layout Standard Our experience leads to the following rule of thumb, not formally proven by anything, but just observed in practice: \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Safety rule for synchronous replication \end_layout \end_inset In general, \emph on synchronous \emph default data replication (not limited to applications of DRBD) works reliably only over distances \begin_inset Formula $<50$ \end_inset km, or sometimes even less. \end_layout \end_inset \end_layout \begin_layout Standard \noindent There may be some exceptions \begin_inset Foot status open \begin_layout Plain Layout We have heard of cases where even \emph on less \emph default than 50 km were not working with DRBD. It depends on application workload, on properties of the line, and on congestion caused by other traffic. Some other people told us that according to \emph on their \emph default experience, only much shorter distances should be considered operable, in the range of a few kilometers. However, they agree that DRBD is rock stable when used on crossover cables. \end_layout \end_inset , e.g. when dealing with low-end workstation loads. But when you are \series bold responsible \series default for a whole datacenter and/or for \series bold enterprise-critical data \series default , don't waste your time by trying (almost) impossible things. We recommend using MARS in such use cases.
\end_layout \begin_layout Section Explanation via CAP Theorem \begin_inset CommandInset label LatexCommand label name "sec:Explanation-via-CAP" \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/cap-theorem.fig width 60col% \end_inset \end_layout \begin_layout Standard \noindent The famous CAP theorem, also called Brewer's theorem, is important for a deeper understanding of the differences between DRBD and MARS. A good explanation can be found at \begin_inset Flex URL status open \begin_layout Plain Layout https://en.wikipedia.org/wiki/CAP_theorem \end_layout \end_inset (retrieved July 2018). \end_layout \begin_layout Standard The CAP theorem states that only 2 out of 3 properties can be achieved at the same time, when a Distributed System is under pressure: C = Consistency means \series bold \emph on Strict \series default \emph default Consistency at the level of the \emph on distributed \emph default system (which is \emph on not \emph default the same as strict consistency \emph on inside \emph default of one of the \emph on local \emph default systems), A = Availability = intuitively clear from a user's perspective, and P = Partitioning Tolerance = the network may have its own outages at any time (which is a negative criterion). \end_layout \begin_layout Standard As explained in the Wikipedia article, the P = Partitioning Tolerance is a property which is important at least in \emph on wide-distance \emph default data replication scenarios, and possibly in some other scenarios. There the property P cannot be chosen at runtime, but is \emph on given \emph default by \emph on setup \emph default of the Distributed System.
\end_layout \begin_layout Subsection CAP Differences between DRBD and MARS \begin_inset CommandInset label LatexCommand label name "subsec:CAP-Differences" \end_inset \end_layout \begin_layout Standard If you are considering only short distances like passive crossover cables between racks, \emph on then \emph default (and \emph on only then \emph default ) you may \emph on assume(!) \emph default that no effort for achieving property P is required, because it is already given for free. Then, and only then, you can get both A and C at the same time, without sacrificing P, because P is already for free by \emph on assumption \emph default . In such a passive crossover cable scenario, getting all three properties C and A and P is possible, similarly to an explanation in the Wikipedia article. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Newer types of network cables for 10 GBit and more (e.g. SFP+) may have some active chips internally in their plugs. Such technologies are no longer passive. Consequently, the assumption \begin_inset Quotes eld \end_inset passive component which cannot fail \begin_inset Quotes erd \end_inset is no longer true by construction. \end_layout \begin_layout Standard Relying on the assumption \begin_inset Quotes eld \end_inset P is for free = the network cannot fail \begin_inset Quotes erd \end_inset leads us to classical use cases for DRBD: when both DRBD replicas are always staying physically connected via a passive crossover cable (which is \emph on assumed \emph default to never break down), you \emph on could potentially \emph default get both strict global consistency and availability. \end_layout \begin_layout Standard Whether this is real in practice for DRBD, is a different story. It depends on the \emph on implementation \emph default of DRBD.
Some sysadmins at 1&1 Ionos have made the experience that there is no 100% CAP guarantee, regardless of DRBD protocol configuration, while they were testing only some cases where only \emph on one \emph default of the DRBD nodes was failing \begin_inset Foot status open \begin_layout Plain Layout In addition, you will need some further components like Pacemaker, iSCSI failover, etc. These might also be involved in the practically observed behaviour. \end_layout \end_inset . Both C and A are provided by DRBD during \family typewriter connected \family default state, while P is \emph on assumed \emph default to be provided by a passive component. \end_layout \begin_layout Standard By addition of iSCSI failover (e.g. ALUA and similar technologies), it \emph on should \emph default be possible to achieve A, even in case of single storage node failures, while retaining C from the viewpoint \begin_inset Foot status open \begin_layout Plain Layout Notice: the CAP theorem does not deal with node failures, only with \emph on network \emph default failures. Node failures would always violate C by some \begin_inset Quotes eld \end_inset strong \begin_inset Quotes erd \end_inset definition. By some \begin_inset Quotes eld \end_inset weaker \begin_inset Quotes erd \end_inset definition, the downtime plus recovery time (e.g. DRBD re-sync) can be taken out of the game. Notice: while a node can always \begin_inset Quotes eld \end_inset know \begin_inset Quotes erd \end_inset whether it has failed (at least after reboot), network failures cannot be distinguished from failures of remote nodes in general. Therefore node failures and network failures are fundamentally different by their nature. \end_layout \end_inset of the application. 
\end_layout \begin_layout Standard This is explained by the thick line in the following variant of the graphics, which is only valid for passive crossover cables where P need not be guaranteed by the replication because it is already assumed for free: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/cap-drbd-operational.fig width 60col% \end_inset \end_layout \begin_layout Standard \noindent Now look at the case of a truly Distributed System, where P cannot be assumed as for free. For example, try to use DRBD in a long-distance replication scenario. There we cannot assume P as already given. We \series bold must \emph on tolerate \series default \emph default replication network outages. DRBD is reacting to this differently in two different modes. \end_layout \begin_layout Standard First we look at the (short) time interval \emph on before \emph default DRBD recognizes the replication network incident, and before it leaves the \family typewriter connected \family default state. During this phase, the application IO will \series bold hang \series default for some time, indicating the (temporary) sacrifice (from a user's perspective) by a red X: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/cap-drbd-connected.fig width 60col% \end_inset \end_layout \begin_layout Standard \noindent Because Availability is one of the highest goods of enterprise-critical IT operations, you will typically configure DRBD such that it automatically switches to some variant of a \family typewriter disconnected \family default state after some timeout, thereby giving up consistency between both replicas. 
The red X indicates not only loss of global strict consistency in the sense of the CAP theorem, but also that your replica will become \family typewriter Inconsistent \family default during the following re-sync: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/cap-drbd-disconnected.fig width 60col% \end_inset \end_layout \begin_layout Standard \noindent You may wonder what the difference to MARS is. As explained in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Requirements-for-Cloud" plural "false" caps "false" noprefix "false" \end_inset , MARS is not only intended for wide distances, but also for \series bold Cloud Storage \series default where no strict consistency is required at global level by definition, but instead \series bold Eventually Consistent \series default is the preferred model for the Distributed System. Therefore, \emph on strict \emph default consistency (in the sense of the CAP theorem) is \emph on not required by definition \emph default . \end_layout \begin_layout Standard Consequently, the red X is not present in the following graphics, showing the state where MARS is remaining \emph on locally consistent \emph default all the time \begin_inset Foot status open \begin_layout Plain Layout Notice that the \emph on initial \emph default full sync is not considered here, neither for DRBD, nor for MARS. \emph on Setup \emph default of the Distributed System is its own scenario, not considered here. \emph on Repair \emph default of a \emph on damaged \emph default system is also a different scenario, also not considered here. Notice that MARS' emergency mode also belongs to the class of \begin_inset Quotes eld \end_inset damages \begin_inset Quotes erd \end_inset , as well as DRBD's disk failure modes, where it has some additional functionality compared to the current version of MARS.
\end_layout \end_inset , even when a network outage occurs: \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/cap-mars.fig width 60col% \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice: MARS does not guarantee strict consistency \emph on between \emph default LV replicas at the level of the Distributed System, but only Eventually Consistent. However, \emph on at the same time \emph default it \emph on also \emph default guarantees strict consistency \emph on locally \emph default , and even at \emph on each \emph default of the passive replicas, each by each. Don't confuse these different levels. There are two different consistency guarantees at different levels, both at the same time. This might be confusing if you are not looking at the system at different levels: (1) overall Distributed System versus (2) each of the local system instances. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Why does MARS do this? Because a better way is not possible at all. The CAP theorem tells us that there exists no better way when both A has to be guaranteed (as almost everywhere in enterprise-critical IT operations except database systems), and P has to be ensured in geo-redundant datacenter disaster scenarios or some other scenarios. Similarly to natural laws like Einstein's laws of the speed of light, there \emph on does not exist \emph default a better way!
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Solution classification of DRBD \end_layout \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Conclusion from the CAP theorem: when P is a \emph on hard \emph default \emph on requirement \emph default , don't use DRBD (or any other \emph on synchronous \emph default replication implementation) for long-distance and/or true Cloud Storage scenarios. It is only well-suited for short-distance crossover cable scenarios. \end_layout \end_inset \end_layout \begin_layout Standard \noindent The red X is in particular problematic during re-sync, after the network has become healthy again (cf. section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Behaviour-of-DRBD" plural "false" caps "false" noprefix "false" \end_inset ). MARS has no red X at C because of its \series bold Anytime Consistency \series default , which refers to \emph on local \emph default consistency, and which is violated by DRBD during certain important phases of its regular operation. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Impossible requirements \end_layout \end_inset \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset If you think that you require all three properties C+A+P, but you don't have passive crossover cables over short distances, you are requiring something which is \series bold impossible \series default in general. You need to give up one of them, at least with a certain probability.
\end_layout \end_inset \end_layout \begin_layout Standard \noindent There exists no solution, with whatever component, or from whatever commercial storage vendor. Although some \begin_inset Quotes eld \end_inset marketing drones \begin_inset Quotes erd \end_inset are claiming the impossible, e.g. by citing \emph on examples \emph default , which are then incorrectly generalized. You might have luck, and there might be \emph on exceptional examples \emph default where all three C+A+P were ok, \series bold by chance \series default . But there remains a \series bold risk \series default . The CAP theorem is as hard as Einstein's natural laws are. \end_layout \begin_layout Standard You need a conscious decision about \series bold priorities \series default , which property to drop first. Rethink your complete concept, from end to end. Something is wrong, somewhere. Ignoring a fundamental law like CAP on enterprise-critical use cases can endanger a company and/or your career. \end_layout \begin_layout Subsection CAP Commonalities between DRBD and MARS \begin_inset CommandInset label LatexCommand label name "subsec:CAP-Commonalities" \end_inset \end_layout \begin_layout Standard In this subsection, we look at the case that P is not for free, but has to be ensured by the Distributed Storage system. \end_layout \begin_layout Standard You may have noticed that MARS' ordinary CAP behaviour is similar to DRBD's CAP picture in \family typewriter disconnected \family default state, or during similar states when the replication network is interrupted. \end_layout \begin_layout Standard Replication network interruption is also known as \begin_inset Quotes eld \end_inset Network Partitioning \begin_inset Quotes erd \end_inset . This is where property P = Partitioning Tolerance comes into play. 
\end_layout \begin_layout Standard When a network partition has \emph on actually occurred \emph default , both DRBD and MARS allow you to do the same: you may \series bold forcefully switch \series default the \family typewriter primary \family default role, which means activation of a former \family typewriter secondary \family default node. In such a situation, you can issue commands like \family typewriter drbdadm primary --force \family default or \family typewriter marsadm primary --force \family default . It is no accident that both commands are looking similar to each other. \end_layout \begin_layout Standard The outcome will be the same: you will most likely get a \family typewriter \series bold SplitBrain \family default \series default situation. \end_layout \begin_layout Standard The possibility of getting a split brain is not a specific property of either DRBD or MARS. It will also happen with any other replication system, whether synchronous or asynchronous. \end_layout \begin_layout Standard It is one of the consequences from the CAP theorem when (1a) P has to be assured, and (1b) a network partition has \emph on actually occurred \emph default , and (2) when A = Availability is enforced at both sides of the network partition. The result is that C = \emph on global \emph default Consistency may be violated, by creation of two or more versions of the data. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Fortunately, \emph on \emph default there is a method for \emph on dynamic \emph default control of SplitBrain at \emph on runtime \emph default . The decision about forceful creation of SplitBrain can be made \emph on dynamically dependent \emph default on further external factors, like current customer demands, or forecasts, etc.
\end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Careful: at least for some application classes, it is a bad idea to systematically create split brain via automatic cluster managers, e.g. Pacemaker or similar. As explained in section \begin_inset CommandInset ref LatexCommand vref reference "sec:Inappropriate-Clustermanger" \end_inset , some cluster managers were originally constructed for truly shared disk scenarios, where no split brain can occur by construction. Using them in masses on versioned data in truly distributed systems can result in existential surprises, once a bigger network partition and/or a flaky replication network triggers them in masses, and possibly at unexpected moments. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent Split brain should not be provoked when not \emph on absolutely \emph default necessary. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Split brain resolution is anything but easy \emph on in general \emph default . When the data is in a generic block device, you typically will have no general means for \emph on merging \emph default both versions. This means, split brain resolution is typically only possible by \series bold throwing away \series default some of the versions. \end_layout \begin_layout Standard This kind of split brain resolution problem is not specific for DRBD or MARS. It is a fundamental property of Distributed Systems, and the difficulty of resolution is an inherent property of generic block devices. \end_layout \begin_layout Standard DRBD and MARS have some commands like \family typewriter drbdadm invalidate \family default or \family typewriter marsadm invalidate \family default for this.
Again, the similarity is no accident. \end_layout \begin_layout Standard Notice that classical filesystems aren't typically better than raw block devices. There are even more possibilities for tricky types of \series bold conflicts \series default (e.g. on path names in addition to file content). Anyway, long-distance replication should not be done at filesystem layer, see section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Performance-Penalties-Layer" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard Similarly, BigCluster object stores are often suffering from similar (or even worse) problems, because higher application layers may have some hidden internal dependencies between object versions, while the object store itself is agnostic of version dependencies in general \begin_inset Foot status open \begin_layout Plain Layout There exist lots of types of potential dependencies between objects. Timely ones are easy to capture, but this is not sufficient in general for everything. \end_layout \end_inset . \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresToxiques.png lyxscale 50 scale 17 \end_inset Caution: when stacking block devices or filesystems, or any other complex \emph on structured aggregates \emph default on top of some BigCluster object store, you are creating another fundamental risk, in addition to Dijkstra regressions explained in section \begin_inset CommandInset ref LatexCommand nameref reference "par:Negative-Example:-object" plural "false" caps "false" noprefix "false" \end_inset . Several types \begin_inset Foot status open \begin_layout Plain Layout Notice: BigCluster architectures are typically discriminating between client servers and storage servers. This will typically introduce some more possibilities into the game, such as forced client failover, independently from forced storage failover.
\end_layout \end_inset of object stores will not magically resolve any split brain for you. Check whether your favorite object store implementation has some kind of equivalent of a \family typewriter primary --force \family default command. If it doesn't have one, or only a restricted one, you should be \series bold \emph on alerted \series default \emph default . In case of a \emph on long-lasting(!) \emph default storage network partition, you might need such a command \emph on desperately \emph default for ensuring A, even at the cost of C \begin_inset Foot status open \begin_layout Plain Layout \noindent Notice that the C functionality is often not implemented by the object store itself (which typically provides only \emph on eventually consistent \emph default at object granularity), but implemented by the distributed block device or distributed filesystem, if it is implemented at all. There is a fundamental problem with at least 3 different granularities to be resolved: in order to guarantee strict consistency at (1) aggregate granularity, which is independent from the (2) network partition granularity, in general multiple versions of objects may be required at (3) object granularity. Does your object store have a means for this, similarly to multiversion databases, e.g. multiversion timestamp ordering? \end_layout \end_inset . \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Check: whether you need this is heavily depending on the \series bold \emph on application class \series default \emph default (see also the Cloud Storage definition in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Requirements-for-Cloud" plural "false" caps "false" noprefix "false" \end_inset ).
If you \emph on would \emph default need it, but you are \series bold not prepared for suchalike scenarios at your enterprise-critical data \series default , it could cost you a lot of money and/or reputation and/or even your existence. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Notice: the \emph on concept \emph default of \family typewriter SplitBrain \family default is occurring almost everywhere in truly Distributed Systems when C can be violated in favour of A+P. It is a very general consequence \begin_inset Foot status open \begin_layout Plain Layout There exist only few opportunities for generic conflict resolution, even in classical databases where \emph on some \emph default knowledge about the structure of the data is available. Typically, there exist some more \emph on hidden \emph default dependencies than people are expecting. Lossless \family typewriter SplitBrain \family default resolution will thus need to be implemented at application layer, if it is possible at all. \end_layout \end_inset of the CAP theorem. \end_layout \begin_layout Standard The only reliable way for avoiding split brain in truly distributed systems would be: don't insist on A = Availability. Notice that there exist only a few application classes, like certain types of banking, where C is typically a higher good than A. \end_layout \begin_layout Standard Notice that both DRBD and MARS are supporting suchalike application classes also: just \emph on don't \emph default add the option \family typewriter --force \family default to the \family typewriter primary \family default switch command. \end_layout \begin_layout Standard However: even in banking, some \emph on extremely extraordinary \emph default scenarios might occur, where sacrifice of C in favour of A could be necessary (e.g. when \emph on manual cleanup \emph default of C is cheaper than long-lasting violations of A). 
\end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Summary CAP decisions \end_layout \end_inset Both DRBD and MARS have some emergency measure for killing C in favour of A. It requires your \series bold conscious decision \series default whether / where / when to use it, \emph on or not \emph default . \end_layout \end_inset \end_layout \begin_layout Section Higher Consistency Guarantees vs Actuality \end_layout \begin_layout Standard We already saw in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Network-Bottlenecks" plural "false" caps "false" noprefix "false" \end_inset that certain types of network bottlenecks can easily (and reproducibly) destroy the consistency of your DRBD secondary, while MARS will preserve local consistency at the cost of actuality ( \series bold anytime consistency \series default ). \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout Some people, often located at database operations, are obtrusively arguing that actuality is such a high good that it must not be sacrificed under any circumstances. \end_layout \begin_layout Plain Layout Anyone arguing this way has at least the following choices (list may be incomplete): \end_layout \begin_layout Enumerate None of the above use cases for MARS apply. For instance, short distance replication over crossover cables is sufficient, and the network is reliable enough such that bottlenecks can never occur (e.g. because the total load is extremely low, or conversely the network is extremely overengineered / expensive), or the occurrence of bottlenecks can \emph on provably \emph default be taken into account. In such cases, DRBD is clearly the better solution than MARS, because it provides better actuality than the current version of MARS, and it uses up less disk resources. 
\end_layout \begin_layout Enumerate In the presence of network bottlenecks, people didn't notice and/or didn't understand and/or underestimated the risk of accidental invalidation of their DRBD secondaries. They should carefully check that risk. They should convince themselves that the risk is \emph on really \emph default bearable. Once they are hit by a \emph on systematic chain \emph default of events which \emph on reproducibly \emph default provoke the bad effect, it is too late \begin_inset Foot status open \begin_layout Plain Layout Some people seem to need a bad experience before they get the difference between risk caused by reproducible effects and inverted luck. \end_layout \end_inset . \end_layout \begin_layout Enumerate In the presence of network bottlenecks, people found a solution such that DRBD does not automatically re-connect after the connection has been dropped due to network problems (cf. \family typewriter ko-count \family default parameter). So the risk of inconsistency \emph on appears \emph default to have vanished. In some cases, people did not notice that the risk has \emph on not completely \begin_inset Foot status open \begin_layout Plain Layout Hint: what's the \emph on conceptual \emph default difference between an automatic and a manual re-connect? Yes, you can try to \emph on lower \emph default the risk in some cases by transferring risks to human analysis and human decisions, but did you take into account the possibility of human errors? \end_layout \end_inset \emph default vanished, and/or they did not notice that now the actuality produced by DRBD is even drastically worse than that of MARS (in the same situation).
It is true that DRBD provides better actuality in \family typewriter connected \family default state, but for a \emph on full picture \emph default the actuality in \family typewriter disconnected \family default state must not be neglected \begin_inset Foot status open \begin_layout Plain Layout Hint: a potential hurdle may be the fact that the current format of \family typewriter /proc/drbd \family default displays neither the timestamp of the first \emph on relevant \emph default network drop nor the total amount of lag-behind user data (which is \emph on not \emph default the same as the number of dirty bits in the bitmap), while \family typewriter marsadm view \family default can display it. So it is difficult to judge the risks. Possibly a chance is inspection of DRBD messages in the syslog, but quantification could remain hard. \end_layout \end_inset . So they didn't notice that their argumentation on the importance of actuality may be fundamentally wrong. A possible way to overcome that may be re-reading section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Behaviour-of-MARS" plural "false" caps "false" noprefix "false" \end_inset and comparing its outcome with the corresponding outcome of DRBD in the same situation. \end_layout \begin_layout Enumerate People do not know the CAP theorem (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Explanation-via-CAP" plural "false" caps "false" noprefix "false" \end_inset ), and are trying to require something which simply is \series bold impossible \series default . \end_layout \begin_layout Enumerate People are stuck in contradictory requirements because the current version of MARS does not yet support synchronous or pseudo-synchronous operation modes. This should be resolved some day.
\end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset A common misunderstanding is about the actuality guarantees provided by filesystems. The buffer cache / page cache uses by default a \series bold writeback strategy \series default for performance reasons. Even modern journalling filesystems will (by default) provide only consistency guarantees, but no strong actuality guarantee. In case of power loss, some transactions may be even \emph on rolled back \emph default in order to restore consistency. According to POSIX \begin_inset Foot status open \begin_layout Plain Layout The above argumentation also applies to Windows filesystems in analogous way. \end_layout \end_inset and other standards, the only \emph on reliable \emph default way to achieve actuality is usage of system calls like \family typewriter sync() \family default , \family typewriter fsync() \family default , \family typewriter fdatasync() \family default , flags like \family typewriter O_DIRECT \family default , or similar. For performance reasons, the \emph on vast majority of applications \emph default don't use them at all, or use them only sparingly! \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset It makes no sense to require strong actuality guarantees from any block layer replication (whether DRBD or future versions of MARS) while higher layers such as filesystems or even applications are already sacrificing them! \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In summary, the \series bold anytime consistency \series default provided by MARS is an argument you should consider, even if you need an extra hard disk for transaction logfiles. 
\end_layout \begin_layout Chapter Requirements of Long-Distance Replication \end_layout \begin_layout Section Avoiding Inappropriate Clustermanager Types for Medium and Long-Distance Replication \begin_inset CommandInset label LatexCommand label name "sec:Inappropriate-Clustermanger" \end_inset \end_layout \begin_layout Standard This section addresses some wide-spread misconceptions. Its main target audience is \emph on userspace \emph default developers, but others may profit from \series bold detailed explanations of problems and pitfalls \series default . When the problems described in this section are solved somewhen in future, this section will be shortened and some relevant parts moved to the appendix. \end_layout \begin_layout Standard Doing \series bold HA = High Availability \series default (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-HA" plural "false" caps "false" noprefix "false" \end_inset ) wrong at \emph on concept level \emph default may easily get you into trouble, and may cost you several millions of € or $ in larger installations, or even knock you out of business when disasters are badly dealt with at higher levels such as clustermanagers. \end_layout \begin_layout Subsection General Cluster Models \end_layout \begin_layout Standard The most commonly known cluster model is called \series bold shared-disk \series default , and typically controlled by clustermanagers like \family typewriter PaceMaker \family default : \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/shared-disk-model.fig width 50col% \end_inset \end_layout \begin_layout Standard \noindent The most important property of shared-disk is that there exists only a single disk instance. Nowadays, this disk often has some \emph on internal \emph default redundancy such as RAID. 
At \emph on system \emph default architecture layer / network level, there exists no redundant disk at all. Only the application cluster is built redundantly. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset It should be immediately clear that shared-disk clusters are only suitable for short-distance operations in the same datacenter, or better in the same room / rack. Although running one of the data access lines over short distances between very near-by datacenters (e.g. 1 km) would be theoretically possible, there would be no sufficient protection against failure of a whole datacenter. \end_layout \begin_layout Standard Both DRBD and MARS belong to a different architectural model called \series bold shared-nothing \series default : \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/shared-nothing-model.fig width 50col% \end_inset \end_layout \begin_layout Standard \noindent The characteristic feature of a shared-nothing model is (additional) \series bold data redundancy at network level \series default . \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Shared-nothing \begin_inset Quotes eld \end_inset clusters \begin_inset Foot status open \begin_layout Plain Layout Notice that the term \begin_inset Quotes eld \end_inset cluster computing \begin_inset Quotes erd \end_inset usually refers to short-distance only. Long-distance coupling should be called \begin_inset Quotes eld \end_inset grid computing \begin_inset Quotes erd \end_inset in preference. As known from the scientific literature, grid computing requires different concepts and methods in general. 
Only for the sake of simplicity, we use \begin_inset Quotes eld \end_inset cluster \begin_inset Quotes erd \end_inset and \begin_inset Quotes eld \end_inset grid \begin_inset Quotes erd \end_inset interchangeably. \end_layout \end_inset \begin_inset Quotes erd \end_inset could theoretically be built for \emph on any \emph default distances, from short to medium to long distances. However, concrete technologies of disk coupling such as synchronous operation may pose practical limits on the distances (see chapter \begin_inset CommandInset ref LatexCommand nameref reference "chap:Use-Cases-for" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Standard In general, clustermanagers must fit to the model. Some clustermanagers can be configured to fit to multiple models. If so, this must be done properly, or you may get into serious trouble. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout Some people don't know, or they don't believe it even when told, that different architectural models like shared-disk or shared-nothing will \emph on require \emph default an \emph on appropriate \emph default type of clustermanager and/or at least a different configuration. Failing to do so, by selection of an inappropriate clustermanager type and/or an inappropriate configuration may be \series bold hazardous \series default . \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Pitfall: suchalike problems are typically appearing \series bold only during incidents \series default . \end_layout \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset It is dangerous to conclude from \begin_inset Quotes eld \end_inset stable ordinary operation \begin_inset Quotes erd \end_inset that the system is reliable. 
The real \series bold risk \series default is that \series bold data inconsistencies \series default are showing up at the \series bold wrong moment \series default , when the clustermanager has to execute the right actions for compensation of a certain component failure. \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Selection of the right model alone is not sufficient. Some, if not many, clustermanagers have not been designed for long distances (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Geo-Redundancy" plural "false" caps "false" noprefix "false" \end_inset ). \end_layout \begin_layout Standard As explained in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Special-Requirements-for" plural "false" caps "false" noprefix "false" \end_inset , long distances have further \series bold hard requirements \series default . Disregarding them may be also hazardous! \end_layout \begin_layout Subsection Handover / Failover Reasons and Scenarios \end_layout \begin_layout Standard From a sysadmin perspective, there exist a number of different \series bold reasons \series default why the application workload must be switched from the currently active side A to the currently passive side B: \end_layout \begin_layout Enumerate Some \series bold defect \series default has occurred at cluster side A or at some corresponding part of the network. \end_layout \begin_layout Enumerate Some \series bold maintenance \series default has to be done at side A which would cause a longer downtime (e.g. security kernel update or replacement of core network equipment or maintenance of UPS or of the BBU cache etc - hardware isn't 24/7/365 in practice, although some vendors \emph on claim \emph default it - it is either not really true, or it becomes \emph on extremely \emph default expensive). 
\end_layout \begin_layout Standard Both reasons are valid and must be automatically \emph on handled \emph default (but not necessarily automatically \emph on triggered \emph default ) in larger installations. In order to deal with all of these reasons, the following basic mechanisms can be used in either model: \end_layout \begin_layout Enumerate \series bold Failover \series default (triggered either manually or automatically) \end_layout \begin_layout Enumerate \series bold Handover \series default (triggered manually \begin_inset Foot status open \begin_layout Plain Layout Automatic triggering could be feasible for prophylactic treatments. \end_layout \end_inset ) \end_layout \begin_layout Standard It is important to not confuse handover with failover at concept level. Not only the reasons / preconditions are very different, but also the \emph on requirements \emph default . \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout Precondition for handover is that \emph on both \emph default cluster sides are healthy, while precondition for failover is that \emph on some really relevant(!) \emph default failure has been \emph on detected \emph default somewhere (whether this is \emph on really \emph default true is another matter). Typically, failover must be able to run in masses, while planned handover often has lower scaling requirements. \end_layout \end_inset \end_layout \begin_layout Standard \noindent Not all existing clustermanagers are dealing with all of these cases (or their variants) equally well, and some are not even dealing with some of these cases / variants \emph on at all \emph default . \end_layout \begin_layout Standard Some clustermanagers cannot easily express the concept of \begin_inset Quotes eld \end_inset automatic triggering \begin_inset Quotes erd \end_inset versus \begin_inset Quotes eld \end_inset manual triggering \begin_inset Quotes erd \end_inset of an action. 
There exists simply no cluster-global switch which selects either \begin_inset Quotes eld \end_inset manual mode \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset automatic mode \begin_inset Quotes erd \end_inset (except when you start to hack the code and/or write new plugins; then you might notice that there is no sufficient architectural layering / sufficient separation between mechanism and strategy). \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout Being forced to permanently use an automatic mode for \series bold triggering \series default several hundreds or even thousands of clusters is not only boring, but bears a \series bold considerable risk \series default when automatics make a wrong decision at hundreds of instances in parallel. \end_layout \end_inset \end_layout \begin_layout Subsection Granularity and Layering Hierarchy for Long Distances \begin_inset CommandInset label LatexCommand label name "subsec:Granularity-and-Layering" \end_inset \end_layout \begin_layout Standard Many existing clustermanager solutions are dealing with a single cluster instance, as the term \begin_inset Quotes eld \end_inset \emph on cluster \emph default manager \begin_inset Quotes erd \end_inset suggests. However, when running several hundreds or thousands of cluster instances, you likely will not want to manage each of them individually. In addition, failover should \emph on not only \emph default be \emph on triggered \emph default (not to be confused with \emph on executed \emph default ) individually at cluster level, but likely \emph on also \emph default at a higher granularity such as a room, or a whole datacenter. Otherwise, some chaos is likely to happen. \end_layout \begin_layout Standard Here is what you probably will \series bold need \series default , possibly in difference to what you may find on the market (whether OpenSource or not). 
For simplicity, the following diagram shows only two levels of granularity, but can be easily extended to multiple layers of granularity, or to some concept of various \emph on subsets of clusters \emph default : \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/clustermanager-hierarchy.fig width 70col% \end_inset \end_layout \begin_layout Standard \noindent Notice that many existing clustermanager solutions are not addressing the datacenter granularity at all. Typically, they use concepts like \series bold quorums \series default for determining failures \emph on at cluster level \emph default solely, and then immediately executing failover of the cluster, sometimes without clean architectural distinction between trigger and execution (similar to the \begin_inset Quotes eld \end_inset separation of concerns \begin_inset Quotes erd \end_inset between \series bold mechanism \series default and \series bold strategy \series default in Operating Systems). Sometimes there is even no internal software layering / modularization according to this separation of concerns at all. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset When there is no distinction between different levels of granularity, you are hopelessly bound to a non-extensible and thus non-adaptable system when you need to operate masses of clusters. 
\end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Minimum requirements for larger installations \end_layout \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset A lacking distinction between automatic mode and manual mode in a cluster management solution, and/or lack of corresponding \series bold architectural software layers \series default is not only a blatant ignoration of well-established best practices of \series bold software engineering \series default , but will bind you even more firmly to an \series bold inflexible system \series default , producing direct and indirect \series bold long-term follow-up cost \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Terminology: for practical reasons, we use the general term \begin_inset Quotes eld \end_inset clustermanager \begin_inset Quotes erd \end_inset also for speaking about layers dealing with higher granularity, such as datacenter layers, and also for long-distance replication scenarios, although some terminology from grid computing would be more appropriate in a scientific background. \end_layout \begin_layout Standard Please consider the following: when it comes to long-distance HA, the above layering architecture is also motivated by vastly different numbers of instances for each layer. Ideally, the topmost automatics layer should be able to overview several datacenters in parallel, in order to cope with (almost) global network problems such as network partitions. 
Additionally, it should also detect single cluster failures, or intermediate problems like \begin_inset Quotes eld \end_inset rack failure \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset room failure \begin_inset Quotes erd \end_inset , as well as various types of (partial / intermediate) (replication) network failures. Incompatible decisions at each of the different granularities would be a no-go in practice. Somewhere and somehow, you need one single \begin_inset Foot status open \begin_layout Plain Layout If you have \emph on logical pairs of datacenters \emph default which are firmly bound together, you could also have several topmost automatics instances, e.g. for each \emph on pair \emph default of datacenters. However, that would be very \series bold inflexible \series default , because then you cannot easily mix locations or migrate your servers between datacenters. Using \begin_inset Formula $k>2$ \end_inset replicas with MARS would also become a nightmare. In your own interest, please don't create any concepts where masses of hardware are firmly bound to fixed constants at some software layers. \end_layout \end_inset top-most \emph on logical \emph default problem detection / ranking instance, which should be \emph on internally distributed \emph default of course, typically using some \series bold distributed consensus protocol \series default ; but in difference to many published distributed consensus algorithms it should be able to work with \emph on multiple \emph default granularities at the same time. \end_layout \begin_layout Subsection Discussion of Handover / Failover Methods \end_layout \begin_layout Subsubsection Failover Methods \begin_inset CommandInset label LatexCommand label name "subsec:Failover-Methods" \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout Failover methods are only needed in case of an incident. 
They should not be used for regular handover, because preconditions are different. Inappropriate merges of both method classes will cause unnecessary \series bold indirect cost \series default . \end_layout \end_inset \end_layout \begin_layout Paragraph STONITH-like Methods \end_layout \begin_layout Standard STONITH = Shoot The Other Node In The Head \end_layout \begin_layout Standard These methods are widely known, although they have several serious drawbacks. Some people even believe that \emph on any \emph default clustermanager must \emph on always \emph default have some STONITH-like functionality. This is wrong. There \emph on exist \emph default alternatives, as shown in the next paragraph. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset A historical motivation for STONITH was prevention of illegal modifications of the \emph on shared disk \emph default by amok-running defective clients. In those ancient times, disks were \emph on passive \emph default mechanical components, while their disk controller was often belonging to the server. In modern shared-nothing scenarios, this motivation no longer exists. Anyway, you can achieve \series bold disk fencing \series default by various software means nowadays. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset The most obvious drawback is that STONITH will always create a \series bold damage \series default , by definition. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout Typical contemporary STONITH implementations are using IPMI and relatives for automatically powering off your server, or at least pushing the (virtual) reset button. 
This will \emph on always \emph default create a certain type of damage: the affected system will definitely not be available, at least for some time until it has (manually) rebooted. \end_layout \end_inset \end_layout \begin_layout Standard \noindent The STONITH damage leads to a \emph on conceptual \emph default contradiction: the reason for starting failover is that you want to restore availability as soon as possible, but in order to do so you will first \emph on destroy \emph default the availability of a particular \emph on component \emph default . This may be counter-productive. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout When your hot standby node B does not work as expected, or if it works even \emph on worse \emph default than A before, you will \emph on at least \emph default lose some time until you \emph on can \emph default become operational again at the old side A. In addition, pushing the reset button bears the \series bold risk of unnecessary data loss \series default from RAM buffers not yet written to disk, and in turn to \series bold risk of data inconsistencies \series default , like need for a filesystem check. When some of the hardware is defective, like for example the boot disk or the boot sector, the system may not come up at all after reset. \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold STONITH variant for shared-nothing \end_layout \end_inset Here is an example method for handling a failure scenario. The old active side A is assumed to be no longer healthy. The method uses a sequential state transition chain with a STONITH-like step: \end_layout \begin_layout Description Phase1 Check whether the hot standby B is currently usable. 
If this is violated (which may happen during certain types of disasters), abort the failover for any affected resources. \end_layout \begin_layout Description Phase2 \emph on Try \emph default to shutdown the damaged side A (in the \emph on hope \emph default that there is no \emph on serious \emph default damage). \end_layout \begin_layout Description Phase3 In case phase2 did not work during a grace period / after a timeout, assume that A is badly damaged and therefore STONITH it. \end_layout \begin_layout Description Phase4 Start the application at the hot standby B. \end_layout \begin_layout Plain Layout Notice: any cleanup actions, such as \series bold repair \series default of defective hard- or software etc, are outside the scope of failover processes. Typically, they are executed much later when restoring redundancy. \end_layout \begin_layout Plain Layout Also notice: this method is a \emph on heavily \emph default distributed one, in the sense that sequential actions are alternated multiple times on different hosts. This is known to be cumbersome in distributed systems, in particular in presence of network problems. 
\end_layout \begin_layout Plain Layout \begin_inset CommandInset label LatexCommand label name "Phase4-in-more" \end_inset Phase4 in more detail for DRBD, augmented with some pseudo code for application control: \end_layout \begin_layout Enumerate at side B: \family typewriter drbdadm disconnect all \end_layout \begin_layout Enumerate at side B: \family typewriter drbdadm primary --force all \end_layout \begin_layout Enumerate at side B: \family typewriter applicationmanager start all \end_layout \begin_layout Plain Layout The same phase4 using MARS: \end_layout \begin_layout Enumerate at side B: \family typewriter marsadm pause-fetch all \end_layout \begin_layout Enumerate at side B: \family typewriter marsadm primary --force all \end_layout \begin_layout Enumerate at side B: \family typewriter applicationmanager start all \end_layout \end_inset \end_layout \begin_layout Standard \noindent This sequential 4-phase method is far from optimal, for the following reasons: \end_layout \begin_layout Itemize The method tries to handle both failover and handover scenarios with one single sequential recipe. In case of a true failover scenario where it is \emph on already known for sure \emph default that side A is badly damaged, this method will unnecessarily waste time for phase 2. This could be fixed by introduction of a conceptual distinction between handover and failover, but it would not fix the following problems. \end_layout \begin_layout Itemize Before phase4 is started (which will re-establish the service from a user's perspective), a lot of time is wasted by \emph on both \emph default phases 2 \emph on and \emph default 3. Even if phase 2 would be skipped, phase 3 would unnecessarily cost some time. In the next paragraph, an alternative method is explained which eliminates any unnecessary waiting time at all. \end_layout \begin_layout Itemize The above method is adapted from the shared-disk model. 
It does not take advantage of the shared-nothing model, where further possibilities for better solutions exist. \end_layout \begin_layout Itemize In case of long-distance network partitions and/or sysadmin / system management subnetwork outages, you may not even be able to (remotely) execute STONITH at all. Thus the above method misses an important failure scenario. \end_layout \begin_layout Standard Some people seem to have a \emph on binary \emph default view at the healthiness of a system: in their view, a system is either operational, or it is damaged. This kind of view is ignoring the fact that some systems may be half-alive, showing only \emph on minor \emph default problems, or occurring only from time to time. \end_layout \begin_layout Standard It is obvious that damaging a healthy system is a bad idea by itself. Even \emph on generally \emph default damaging a half-alive system in order to \begin_inset Quotes eld \end_inset fix \begin_inset Quotes erd \end_inset problems is not generally a good idea, because it may increase the damage when you don't know the \emph on real \emph default reason \begin_inset Foot status open \begin_layout Plain Layout Example, occurring in masses: an incorrectly installed bootloader, or a wrong BIOS boot priority order which unexpectedly lead to hangs or infinite reboot cycles once the DHCP or BOOTP servers are no longer available / reachable. \end_layout \end_inset . \end_layout \begin_layout Standard Even worse: in a distributed system \begin_inset Foot status open \begin_layout Plain Layout Notice: the STONITH concept is more or less associated with short-distance scenarios where \series bold crossover cables \series default or similar equipment are used. The assumption is that crossover cables can't go defective, or at least it would be an extremely unlikely scenario. For long-distance replication, this assumption is simply not true. \end_layout \end_inset you sometimes \emph on cannot(!) 
\emph default know whether a system is healthy, or to what degree it is healthy. Typical STONITH methods as used in some contemporary clustermanagers are \series bold assuming a worst case \series default , even if that worst case is currently not for real. \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Advice \end_layout \end_inset Avoid the following \series bold fundamental flaws \series default in failover concepts and healthiness models, which apply to implementors / configurators of clustermanagers: \end_layout \begin_layout Itemize Don't mix up knowledge with conclusions about a (sub)system, and also don't mix this up with the real state of that (sub)system. In reality, you don't have any knowledge about a complex distributed system. You only may have \emph on some \emph default knowledge about \emph on some \emph default parts of the system, but you cannot \begin_inset Quotes eld \end_inset see \begin_inset Quotes erd \end_inset a complex distributed system as a whole. What you think is your knowledge, isn't knowledge in reality: in many cases, it is \emph on conclusion \emph default , not knowledge. Don't mix this up! \end_layout \begin_layout Itemize Some systems are more complex than your model of them. Don't neglect important parts (such as networks, routers, switches, cables, plugs) which may lead you to wrong conclusions! \end_layout \begin_layout Itemize Don't restrict your mind to boolean models of healthiness. Doing so can easily create unnecessary damage by construction, and even at concept level. You should know from software engineering that defects in concepts or models are much more serious than simple bugs in implementations. Choosing the wrong model cannot be fixed as easily as a typical bug or a typo. 
\end_layout \begin_layout Itemize Try to deduce the state of a system as \series bold reliably \series default as possible. If you don't know something for sure, don't generally assume that it has gone wrong. Don't confuse missing knowledge with the conclusion that something is bad. Boolean algebra restricts your mind to either \begin_inset Quotes eld \end_inset good \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset bad \begin_inset Quotes erd \end_inset . Use at least \series bold tri-state algebra \series default which has a means for expressing \series bold \begin_inset Quotes eld \end_inset unknown \begin_inset Quotes erd \end_inset \series default . Even better: attach a probability to anything you (believe to) know. Errare humanum est: nothing is absolutely for sure. \end_layout \begin_layout Itemize Oversimplification: don't report an \begin_inset Quotes eld \end_inset unknown \begin_inset Quotes erd \end_inset or even a \begin_inset Quotes eld \end_inset broken \begin_inset Quotes erd \end_inset state for a complex system whenever a smaller subsystem exists for which you have some knowledge (or you can conclude something about it with reasonable evidence). Otherwise, your users / sysadmins may draw wrong conclusions, and assume that the whole system is broken, while in reality only some minor part has some minor problem. Users could then likely make wrong decisions, which may then easily lead to bigger damages. \end_layout \begin_layout Itemize Murphy's law: \series bold never assume that something can't go wrong! \series default Doing so is a blatant misconception at topmost level: the \emph on purpose \emph default of a clustermanager is creating High Availability (HA) out of more or less \begin_inset Quotes eld \end_inset unreliable \begin_inset Quotes erd \end_inset components. 
It is the damn duty of both a clustermanager and its configurator to try to compensate \emph on any \emph default failures, \emph on regardless of their probability \emph default \begin_inset Foot status open \begin_layout Plain Layout Never claim that something has only low probability (and therefore it were not relevant). In the HA area, you simply \series bold cannot know \series default that, because you typically have \emph on sporadic \emph default incidents. In extreme cases, the \emph on purpose \emph default of your HA solution is protection against 1 failure per 10 years. You simply don't have the time to wait for creating an incident statistics about that! \end_layout \end_inset , as best as possible. \end_layout \begin_layout Itemize Never confuse \series bold probability \series default with \series bold expectancy value! \series default If you don't know the mathematical term \begin_inset Quotes eld \end_inset expectancy value \begin_inset Quotes erd \end_inset , or if you don't know what this means \emph on in practice \emph default , don't take responsibility for millions of € or $. \end_layout \begin_layout Itemize When operating masses of hard- and software: never assume that a particular failure can occur only at a low number of instances. There are \series bold \emph on unknown(!) \emph default systematic errors \series default which may pop up at the wrong time and in huge masses when you don't expect them. \end_layout \begin_layout Itemize Multiple layers of fallback: \emph on any \emph default action can fail. Be prepared to have a plan B, and even a plan C, and even better a plan D, wherever possible. \end_layout \begin_layout Itemize Never increase any damage anywhere, unnecessarily! Always try to \emph on minimize \emph default any damage! 
It can be mathematically proven that in deterministic probabilistic systems having finite state, increases of a damage level \emph on at the wrong place \emph default will \emph on introduce \emph default an \emph on additional \emph default \emph on risk \emph default of getting into an \series bold endless loop \series default . This is also true for nondeterministic systems, as known from formal language theory \begin_inset Foot status open \begin_layout Plain Layout Finite automatons are known to be transformable to deterministic ones, usually by an exponential increase in the number of states. \end_layout \end_inset . \end_layout \begin_layout Itemize Apply the \series bold best effort principle \series default . You should be aware of the following fact: in general, it is impossible to create an \emph on absolutely reliable system \emph default out of unreliable components. You can \emph on lower \emph default the risk of failures to any \begin_inset Formula $\epsilon>0$ \end_inset by investing a lot of resources and of money, but whatever you do: \begin_inset Formula $\epsilon=0$ \end_inset is impossible. Therefore, be careful with boolean algebra. Prefer approximation methods / optimizing methods instead. Always do \emph on your \emph default best, instead of trying to reach a \emph on global \emph default optimum which likely does not exist at all (because the \begin_inset Formula $\epsilon$ \end_inset can only \emph on converge \emph default to an optimum, but will never actually reach it). \begin_inset Newline newline \end_inset The best effort principle means the following: if you discover a method for improving your operating state by reduction of a (potential) damage in a reasonable time and with reasonable effort, then \series bold simply do it \series default . Don't argue that a particular step is no 100% solution for all of your problems. \emph on Any \emph default \emph on improvement \emph default is valuable. 
\series bold Don't miss any valuable step \series default having reasonable cost with respect to your budget. Missing valuable measures which have low cost is certainly a violation of the best effort principle, because you are not doing \emph on your \emph default best. Keep that in mind. \begin_inset Newline newline \end_inset If you have \emph on understood \emph default this (e.g. deeply think at least one day about it), you will no longer advocate STONITH methods \emph on in general \emph default , when there are alternatives. STONITH methods are only valuable when you \emph on know in advance \emph default that the final outcome (after reboot) will most likely be better, and that waiting for reboot will most likely \emph on pay off \emph default . In general, this condition is \emph on not true \emph default if you have a healthy hot standby system. This should be easy to see. But there exist well-known clustermanager solutions / configurations blatantly ignoring \begin_inset Foot status open \begin_layout Plain Layout For some \emph on special(!) \emph default cases of the shared-disk model, there exist some justifications for doing STONITH \emph on before \emph default starting the application at the hot standby. Under certain circumstances, it can happen that system A running amok could destroy the data on your single shared disk (example: a filesystem doubly mounted \emph on in parallel \emph default , which will certainly destroy your data, unless you are using \family typewriter ocfs2 \family default or suchalike). This argument is only valid for \emph on passive \emph default disks which are \emph on directly \emph default attached to \emph on both \emph default systems A and B, such that there is no \emph on external \emph default means for fencing the disk. 
In case of iSCSI running over ordinary network equipment such as routers or switches, the argument \begin_inset Quotes eld \end_inset fencing the disk is otherwise not possible \begin_inset Quotes erd \end_inset does not apply. You can interrupt iSCSI connections at the network gear, or you can often do it at cluster A or at the iSCSI target. Even commercial storage appliances speaking iSCSI can be remotely controlled for forcefully aborting iSCSI sessions. In modern times, the STONITH method no longer has such a justification. The justification stems from ancient times when a disk was a purely passive mechanical device, and its disk controller was part of the server system. \end_layout \end_inset this. Only when the former standby system does not work as expected (this means that \emph on all \emph default of your redundant systems are not healthy enough for your application), \emph on only then \begin_inset Foot status open \begin_layout Plain Layout Notice that STONITH may be needed for (manual or partially automatic) \emph on repair \emph default in some cases, e.g. when you know that a system has a kernel crash. Don't mix up the repair phase with failover or handover phases. Typically, they are executed at different times. The repair phase is outside the scope of this section. \end_layout \end_inset \emph default STONITH is inevitable as a \emph on last resort \emph default option. \begin_inset Newline newline \end_inset In short: blindly using STONITH without true need during failover is a violation of the best effort principle. You are simply not doing your best. \end_layout \begin_layout Itemize When your budget is limited, carefully select those improvements which make your system \series bold as reliable as possible \series default , given your fixed budget. \end_layout \begin_layout Itemize Create statistics on the duration of your actions. Based on this, try to get a \emph on balanced \emph default optimum between time and cost. 
\end_layout \begin_layout Itemize Whatever actions you can \series bold start in parallel \series default for saving time, do it. Otherwise you are disregarding the best effort principle, and your solution will be sub-optimal. You will require deep knowledge of parallel systems, as well as experience with dealing with problems like (distributed) races. Notice that \emph on any \emph default distributed system is \emph on inherently parallel \emph default . Don't believe that sequential methods can deliver an optimum solution in such a difficult area. \end_layout \begin_layout Itemize If you don't have the \series bold necessary skills \series default for (a) recognizing already existing parallelism, (b) dealing with parallelism at concept level, (c) programming and/or configuring parallelism race-free and deadlock-free (or if you even don't know what a race condition is and where it may occur in practice), then don't take responsibility for millions of € or $. \end_layout \begin_layout Itemize Avoid hard timeouts wherever possible. Use \series bold adaptive timeouts \series default instead. Reason: depending on hardware or workload, the same action A may take a very short time on cluster 1, but take a very long time on cluster 2. If you need to guard action A from hanging (which is almost always the case because of Murphy's law), don't configure any fixed timeout for it. When having several hundreds of clusters, you would need to use the \emph on worst case value \emph default , which is the longest time occurring somewhere at the very slow clusters / slow parts of the network. This wastes a lot of time in case one of the fast clusters is hanging. Adaptive timeouts work differently: they use a kind of \begin_inset Quotes eld \end_inset progress bar \begin_inset Quotes erd \end_inset to monitor the \emph on progress \emph default of an action. They will abort only if there is \emph on no progress \emph default for a certain amount of time. 
Hint: among others, \family typewriter marsadm view-*-rest \family default commands or macros are your friend. \end_layout \end_inset \end_layout \begin_layout Paragraph ITON = Ignore The Other Node \end_layout \begin_layout Standard This strategy means \series bold fencing from application traffic \series default , and can be used as an alternative to STONITH when done properly. \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/fencing-hierarchy.fig width 60col% \end_inset \end_layout \begin_layout Standard \noindent Fencing from application traffic is best suited for the shared-nothing model, but can also be adapted to the shared-disk model with some quirks. \end_layout \begin_layout Standard The idea is simple: always route your application network traffic to the current (logically) active side, whether it is currently A or B. Just don't route any application requests to the current (logically) passive side at all. 
\end_layout \begin_layout Standard For failover (and \emph on only \emph default for that), you \emph on should not care about \emph default any split brain occurring at the low-level generic block device: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/split-brain-history.fig width 50col% \end_inset \end_layout \begin_layout Standard \noindent Although having a split brain at the generic low-level block device, you now define the \begin_inset Quotes eld \end_inset logically active \begin_inset Quotes erd \end_inset and \begin_inset Quotes eld \end_inset logically passive \begin_inset Quotes erd \end_inset side by yourself by \emph on logically ignoring \emph default the \begin_inset Quotes eld \end_inset wrong \begin_inset Quotes erd \end_inset side as defined by yourself: \begin_inset Separator latexpar \end_inset \end_layout \begin_layout Standard \noindent \align center \begin_inset Graphics filename images/split-brain-resolved.fig width 50col% \end_inset \end_layout \begin_layout Standard \noindent This is possible because the generic block devices provided by DRBD or MARS are completely \series bold agnostic \series default of the \begin_inset Quotes eld \end_inset meaning \begin_inset Quotes erd \end_inset of either version A or B. Higher levels such as clustermanagers (or humans like sysadmins) can assign them a meaning like \begin_inset Quotes eld \end_inset relevant \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset not relevant \begin_inset Quotes erd \end_inset , or \begin_inset Quotes eld \end_inset logically active \begin_inset Quotes erd \end_inset or \begin_inset Quotes eld \end_inset logically passive \begin_inset Quotes erd \end_inset . 
\end_layout \begin_layout Standard As a result of fencing from application traffic, the \begin_inset Quotes eld \end_inset logically passive \begin_inset Quotes erd \end_inset side will \emph on logically \emph default cease any actions such as updating user data, even if it is \begin_inset Quotes eld \end_inset physically active \begin_inset Quotes erd \end_inset during split-brain (when two primaries exist in DRBD or MARS sense \begin_inset Foot status open \begin_layout Plain Layout Hint: some clustermanagers and/or some people seem to define the term \begin_inset Quotes eld \end_inset split-brain \begin_inset Quotes erd \end_inset differently from DRBD or MARS. In the context of generic block devices, split brain means that the \emph on history \emph default of both versions has been split to a Y-like \series bold fork \series default (for whatever reason), such that re-joining them \emph on incrementally \emph default by ordinary write operations is no longer guaranteed to be possible. As a slightly simplified definition, you might alternatively use the definition \begin_inset Quotes eld \end_inset two incompatible primaries are existing in parallel \begin_inset Quotes erd \end_inset , which means almost the same in practice. Details of formal semantics are not the scope of this treatment. \end_layout \end_inset ). \end_layout \begin_layout Standard If you already have some load balancing at the network, or BGP, or another \emph on mechanism \emph default for dynamic routing, you already have an important part for the ITON method. 
Additionally, ensure by an appropriate \emph on strategy \emph default that your balancer status / BGP announcement etc does always coincide with the \begin_inset Quotes eld \end_inset logically active \begin_inset Quotes erd \end_inset side (recall that even during split-brain \emph on you \emph default must define \begin_inset Quotes eld \end_inset logically active \begin_inset Quotes erd \end_inset \series bold uniquely \series default \begin_inset Foot status open \begin_layout Plain Layout A possible strategy is to use a Lamport clock for route changes: the change with the most recent Lamport timestamp will always win over previous changes. \end_layout \end_inset by yourself). \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Application fencing \end_layout \end_inset \end_layout \begin_layout Description Phase1 Check whether the hot standby B is currently usable. If this is violated (which may happen during certain types of disasters), do not start failover for any affected resources. \end_layout \begin_layout Description Phase2 Do the following \emph on in parallel \begin_inset Foot status open \begin_layout Plain Layout For database applications where no transactions should get lost, you should slightly modify the order of operations: first fence the old side A, then start the application at standby side B. However, be warned that even this cannot guarantee that no transaction is lost. When the network between A and B is interrupted \emph on before \emph default the incident happens, DRBD will automatically disconnect, and MARS will show a lagbehind. In order to fully eliminate this possibility, you can either use DRBD and configure it to hang forever during network outages (such that users will be unable to commit any transactions at all), or you can use the shared-disk model instead. 
But in the latter case, you are introducing a SPOF at the single shared disk. The former case is logically almost equivalent to shared-disk, but avoiding some parts of the physical SPOF. In a truly distributed system, the famous CAP theorem is limiting your possibilities. Therefore, no general solution exists fulfilling all requirements at the same time. \end_layout \end_inset : \begin_inset Separator latexpar \end_inset \end_layout \begin_deeper \begin_layout Itemize Start all affected applications at the hot standby B. This can be done with the same DRBD or MARS procedure as described in \begin_inset CommandInset ref LatexCommand nameref reference "Phase4-in-more" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Itemize Fence A by fixedly routing all affected application traffic to B. \end_layout \end_deeper \begin_layout Plain Layout That's all which has to be done for a shared-nothing model. Of course, this will likely produce a split-brain (even when using DRBD in place of MARS), but that will not matter from a user's perspective, because the users will no longer \begin_inset Quotes eld \end_inset see \begin_inset Quotes erd \end_inset the \begin_inset Quotes eld \end_inset logically passive \begin_inset Quotes erd \end_inset side A through their network. Only during the relatively small time period where application traffic was going to the old side A while not replicated to B due to the incident, a very small number of updates \emph on could \emph default have gone lost. In fields like webhosting, this can be taken into account. Users will usually not complain when some (smaller amount of) data is lost due to split-brain. They will complain when the service is unavailable. \end_layout \end_inset \end_layout \begin_layout Standard \noindent This method is the \series bold fastest \series default for restoring HA, because it doesn't try to execute any (remote) action at side A. 
Only from a sysadmin's perspective, there remain some cleanup tasks to be done during the following repair phase, such as split-brain resolution, which are outside the scope of this treatment. \end_layout \begin_layout Standard By running the application fencing step \emph on sequentially \emph default (including wait for its partial successfulness such that the old side A can no longer be reached by any users) in front of the failover step, you may minimize the amount of lost data, but at the cost of total duration. Your service will take longer to be available again, while the amount of lost data could be \emph on theoretically \emph default somewhat smaller. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset A few people might clamour when some data is lost. In long-distance replication scenarios with high update traffic, there is \emph on simply no way at all \emph default for guaranteeing that no data can be lost ever. According to the laws of Einstein and the laws of Distributed Systems like the famous CAP theorem (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Explanation-via-CAP" plural "false" caps "false" noprefix "false" \end_inset ), this isn't the fault of DRBD+proxy or MARS, but simply the \emph on consequence \emph default of having long distances. If you want to protect against data loss as best as possible, and when you can afford it financially, then don't use \begin_inset Formula $k=2$ \end_inset replicas. Use \begin_inset Formula $k\geq3$ \end_inset , and spread them over different distances, such as mixed small + medium + long distances. Future versions of MARS are planned to support adaptive pseudo-synchronous modes, which will allow individual adaptation to network latencies / distances. 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent The ITON method can be adapted to shared-disk by additionally fencing the common disk from the (presumably) failed cluster node A. \end_layout \begin_layout Subsubsection Handover Methods \end_layout \begin_layout Standard Planned handover is conceptually simpler, because both sides must be (almost) healthy as a \emph on precondition \emph default . There are simply no pre-existing failures to deal with. \end_layout \begin_layout Standard Here is an example using DRBD, some application commands denoted as pseudo code: \end_layout \begin_layout Enumerate at side A: \family typewriter applicationmanager stop all \end_layout \begin_layout Enumerate at side A: \family typewriter drbdadm secondary all \end_layout \begin_layout Enumerate at side B: \family typewriter drbdadm primary all \end_layout \begin_layout Enumerate at side B: \family typewriter applicationmanager start all \end_layout \begin_layout Standard MARS already has a conceptual distinction between handover and failover. With MARS, it becomes even simpler, because a generic handover procedure is already built in: \end_layout \begin_layout Enumerate at side A: \family typewriter applicationmanager stop all \end_layout \begin_layout Enumerate at side B: \family typewriter marsadm primary all \end_layout \begin_layout Enumerate at side B: \family typewriter applicationmanager start all \end_layout \begin_layout Standard When using the \family typewriter systemd \family default interface of \family typewriter marsadm \family default (see \family typewriter mars-user-manual.pdf \family default ), this can be shortened into only one command: \end_layout \begin_layout Enumerate at side B: \family typewriter marsadm primary all \end_layout \begin_layout Subsubsection Hybrid Methods \end_layout \begin_layout Standard In general, a planned handover may fail at any stage. 
Notice that such a failure is also a failure, but (partially) caused by the planned handover. You have the following alternatives for automatically dealing with such cases: \end_layout \begin_layout Enumerate In case of a failure, switch back to the old side A. \end_layout \begin_layout Enumerate Instead, forcefully switch to the new side B, similar to the methods described in section \begin_inset CommandInset ref LatexCommand ref reference "subsec:Failover-Methods" \end_inset . \end_layout \begin_layout Standard Similar options exist for a failed failover (at least in theory), but chances are lower for actually recovering if you have only \begin_inset Formula $k=2$ \end_inset replicas in total. \end_layout \begin_layout Standard Whatever you decide to do in what case in whatever priority order, whether you decide it in advance or during the course of a failing action: it simply means that according to the best effort principle, you should \series bold never leave your system in a broken state \series default when there exists a chance to recover availability with any method. \end_layout \begin_layout Standard Therefore, you should \emph on implement \emph default neither handover nor failover in their pure forms. Always implement hybrid forms following the best effort principle. \end_layout \begin_layout Subsection Special Requirements for Long Distances \begin_inset CommandInset label LatexCommand label name "subsec:Special-Requirements-for" \end_inset \end_layout \begin_layout Standard Most contemporary clustermanagers have been constructed for short distance shared-nothing clusters, or even for \emph on local \emph default shared-nothing clusters (c.f. DRBD over crossover cables), or even for shared-disk clusters ( \emph on originally \emph default , when their \emph on concepts \emph default were developed). Blindly using them for long-distance replication without modification / adaptation bears some additional risks. 
\end_layout \begin_layout Itemize Notice that long-distance replication always \emph on requires \emph default a \series bold shared-nothing \series default model. \end_layout \begin_layout Itemize As a consequence, \series bold split brain \series default can appear \emph on regularly \emph default during failover. There is no way for preventing it! This is an \emph on inherent property \emph default of distributed systems, not limited to MARS (e.g. also occurring with DRBD if you try to use it over long distances). Therefore, you \emph on must \emph default deal with occurrences of split-brain as a \emph on requirement \emph default . \end_layout \begin_layout Itemize The probability of \series bold network partitions \series default is much higher: although you should have been required by Murphy's law to deal with network partitions already in short-distance scenarios, it now becomes \emph on mandatory \emph default . \end_layout \begin_layout Itemize Be prepared that in case of certain types of (more or less global) internet partitions, you may not be able to trigger STONITH actions \emph on at all \emph default . Therefore, \series bold fencing of application traffic \series default is \emph on mandatory \emph default . 
\end_layout \begin_layout Chapter Advice for Managers and Architects \begin_inset CommandInset label LatexCommand label name "chap:Advice-for-Managers" \end_inset \end_layout \begin_layout Section Maturity Considerations for Managers \begin_inset CommandInset label LatexCommand label name "sec:Maturity-Considerations" \end_inset \end_layout \begin_layout Subsection Maturity of Architectures \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout Instances of storage system \emph on architectures \emph default (see section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset ) typically have a \series bold lifetime \series default of \series bold decades \series default . \end_layout \begin_layout Plain Layout While implementations / components / storage vendors etc can often be exchanged or updated more frequently (typically lifecycles of 3 to 5 years for CAPEX reasons), \series bold fundamental architectures \series default are much less flexible to change, and thus are \emph on forcing \emph default you into a \series bold long-term strategy \series default . \end_layout \end_inset \end_layout \begin_layout Standard \noindent In contrast, certain hardware technologies have a much lower lifetime, typically between 1 and 2 years. New server hardware / new disks / SSDs etc are hitting their market all the time, like waves in the ocean. \end_layout \begin_layout Standard \emph on System software \emph default technologies (OS layer) typically have a lifetime inbetween hardware and architecture lifetimes. 
Although their update cycles / minor release cycles are typically even faster than hardware releases, their \emph on fundamental product appearance points \emph default are rather stable \begin_inset Foot status open \begin_layout Plain Layout Appearance of certain technologies may occur in \series bold hype cycles \series default , caused by \emph on social \emph default effects. While there are founding waves for (sometimes similar) product classes, other solution appearances are more evenly spread over the decades. For example, appearance of many Unix clones / descendants appears to be rather smoothly distributed over half a century. \end_layout \end_inset . For example, the Linux kernel is now more than 20 years old, while its \emph on fundamental architecture \emph default has been copied from Unix and is now almost 50 years old. \end_layout \begin_layout Standard Certain advocates are arguing with the \emph on current \emph default status of maturity of \emph on components \emph default . In a long-term business operated by professionals, there is an observable long-term trend: \end_layout \begin_layout Quote \series bold \size large Maturity of components is (almost) always improving over the years. \end_layout \begin_layout Standard Of course, maturity is important. In sensitive areas, so-called \begin_inset Quotes eld \end_inset banana software \begin_inset Quotes erd \end_inset may even kill you. In such a situation, the \emph on current \emph default maturity status is important. However, once an implementation is \emph on mature enough \emph default , and/or once only some nice-to-have features are desirable, the long-term maturity trend / forecast of implementations / components is more important than the current status. You can influence this with your \series bold long-term investment decisions \series default . 
\end_layout \begin_layout Standard There exists something which is even more important: \end_layout \begin_layout Quote \series bold \size large Maturity of fundamental architectures is most important, because they \emph on cannot \emph default improve. Architectures need to be \uuline on right from scratch \uuline default . \end_layout \begin_layout Standard This is similar to mathematics: Pythagoras' theorem or Einstein's laws cannot be improved. They will last forever. At most, they can get old-fashioned or otherwise \series bold outdated \series default / obsoleted. However, there are other chances and \series bold opportunities \series default : \end_layout \begin_layout Itemize New / better architectures may appear (rarely). \end_layout \begin_layout Itemize Implementations of architectures should evolve slowly over time. \end_layout \begin_layout Itemize Implementations may slowly migrate to other architectures, or even support multiple architectures at the same time (convergence properties). \end_layout \begin_layout Standard \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold General advice \end_layout \end_inset \end_layout \begin_layout Quote \series bold \size large Pay more attention to fundamental architectures. Develop a \uuline on long-term strategy \uuline default for maturity of components and implementations. \end_layout \end_inset \end_layout \begin_layout Subsection Maturity of MARS \begin_inset CommandInset label LatexCommand label name "subsec:Maturity-of-MARS" \end_inset \end_layout \begin_layout Standard Notice that MARS itself is just a component. For a fully functional system, you will need some more infrastructure at several layers. \end_layout \begin_layout Itemize \series bold MARS \series default itself is in production since 2013, and on mass data (several petabytes) since 2014. 
MARS itself is \emph on generic \emph default , and can be used for a multitude of Linux application stacks. \end_layout \begin_layout Itemize A \series bold cluster manager \series default \begin_inset Foot status open \begin_layout Plain Layout 1&1 Ionos ShaHoLin uses a self-built proprietary cluster manager called \family typewriter cm3 \family default . It works only with the internal 1&1 database infrastructure, and is not generic. \end_layout \end_inset is typically also needed for mass installations. You can use the \family typewriter systemd \family default template engine of \family typewriter marsadm \family default , see \family typewriter mars-user-manual.pdf \family default , which is easily configurable by Linux sysadmins. \end_layout \begin_layout Itemize Typically, \series bold monitoring \series default is anyway specific for each application stack. Adding some simple Icinga scripts or similar should be no problem for professional Linux admins. \end_layout \begin_layout Itemize Automatic \series bold mass deployment \series default : this is anyway specific for the deployment system used for your system plus application stack. At the moment, plugins for generic solutions like OpenStack etc are missing. This is an opportunity for other OpenSource projects! \end_layout \begin_layout Itemize The \series bold Football framework \series default is in mass production at 1&1 Ionos ShaHoLin since 2018. It has some plugin for driving the \family typewriter systemd \family default cluster manager. Its plugin architecture should allow easy adaptation to other system and application stacks. \end_layout \begin_layout Itemize Another opportunity for OpenSource projects: some web-based point-and-click \series bold dashboard \series default similar to the Ceph Dashboard, but displaying and controlling sharded LVM pools which are replicated via MARS, and also controlling Football, would be a highly appreciated addendum. 
\end_layout \begin_layout Section Recommendations for Design and Operation of Storage Systems \begin_inset CommandInset label LatexCommand label name "sec:Recommendations-for-Designing" \end_inset \end_layout \begin_layout Subsection Recommendations for Managers \begin_inset CommandInset label LatexCommand label name "subsec:Recommendations-for-Managers" \end_inset \end_layout \begin_layout Standard When you are responsible for \series bold masses of enterprise-critical data \series default , the most important point is to get people with \series bold the right skills \series default , in \emph on addition(!) to \emph default the \emph on right mindset \emph default , and to assign the right roles to them. \end_layout \begin_layout Standard Practical observation from many groups in many companies: which storage systems / architectures are in use, and how much they are \emph on really \emph default \series bold failure resistant \series default and \series bold reliable \series default , and how much they are \emph on really \emph default \series bold scalable \series default for their workload, and what is their \series bold TCO = Total Cost of Ownership \series default , does often \emph on not \emph default depend on real knowledge and on facts. It often depends \series bold randomly \series default on \series bold personal habits \series default and \series bold pre-judgement \series default of staff \begin_inset Foot status open \begin_layout Plain Layout \noindent This can be seen in a bigger company (e.g. after mergers etc) when very different architectures have been built by different teams for very similar usecases, although they are sometimes even roughly comparable in size and workload. \end_layout \end_inset . 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In essence, this results in a \series bold gambling game \series default how safe / cost-effective etc your critical data \emph on really \emph default is. \end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In particular after company mergers, suchalike varieties need not remain a permanent disadvantage. You may turn it into an advantage. Once you have enough reliable and validated KPIs about each of the systems, and after you have checked that they are \emph on really \emph default comparable, you can derive a detailed comparison of competing architectures and/or of their actual implementations. Then you may start \series bold merging \series default some of the technical platforms, provided there is a business case for it. Or, you may \series bold bleed out \series default some old / obsolete technology. \end_layout \begin_layout Standard When the game is about building up \series bold new functionality \series default from scratch, it is much different. There are two main possibilities: \end_layout \begin_layout Enumerate check whether your \emph on best \emph default platform can be extended with the new functionality. Good architectures are also \series bold easily extensible \series default . \end_layout \begin_layout Enumerate build a new platform. \end_layout \begin_layout Standard The rest of this section focusses on architecture of \emph on new \emph default platforms. Always check whether existing \emph on experience \emph default can be re-used. 
\end_layout \begin_layout Standard \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset As explained throughout section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Scalability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset , there are many pitfalls, and there are only a few people who know them, because more people are working in small-scale systems than in large-scale enterprise ones. There are lots of people on the market who \emph on claim \emph default to have some experience, but in reality they don't know what they don't know ( \series bold second-order ignorance \series default ). \end_layout \begin_layout Standard Second-order ignorance is very dangerous, even for affected people themselves, because they believe in good faith in their own skills, and that they would be able to control everything (sometimes they really want to control literally \emph on everything \emph default , even other people who have more real experience and knowledge). See for example wrong assumptions and \begin_inset Quotes eld \end_inset false proofs \begin_inset Quotes erd \end_inset about scalability, derived from different use cases (or even from workstation workloads). See the failed scalability scenario in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset where some freelancers were consulted as \begin_inset Quotes eld \end_inset external experts \begin_inset Quotes erd \end_inset . 
\end_layout \begin_layout Quotation \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Pitfall \begin_inset Quotes eld \end_inset false experts \begin_inset Quotes erd \end_inset \end_layout \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Check your information sources! There is a \emph on systematic reason \emph default for ill-informed \begin_inset Quotes eld \end_inset experts \begin_inset Quotes erd \end_inset : the internet. \end_layout \end_inset \end_layout \begin_layout Quotation \noindent \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset On the internet, you can find a lot of so-called \begin_inset Quotes eld \end_inset best practices \begin_inset Quotes erd \end_inset . Many of them are propagating badly scaling storage architectures for enterprise workloads, sometimes even \emph on generally \emph default claiming they would \begin_inset Quotes eld \end_inset scale very well \begin_inset Quotes erd \end_inset , which is however often based on \emph on assumptions \emph default instead of knowledge (and rarely based on \emph on measurements \emph default at the right measurement points for deriving substantial knowledge about your \emph on real \emph default application behaviour). Literally \emph on anyone \emph default can post incorrectly generalized \begin_inset Quotes eld \end_inset best practices \begin_inset Quotes erd \end_inset to the internet. Together with second-order ignorance about the non-transferability of \begin_inset Quotes eld \end_inset success stories \begin_inset Quotes erd \end_inset from usecase A to usecase B (resulting in \emph on false \begin_inset Quotes eld \end_inset proofs \emph default \begin_inset Quotes erd \end_inset ), the internet is creating \series bold information bubbles \series default . 
\end_layout \begin_layout Quotation \begin_inset Flex Custom Color Box 1 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Superfluous load balancers \end_layout \end_inset Good examples are HTTP or other IP-based load balancers placed in front of VMs. Almost always, this is an \series bold expensive ill-design \series default . \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Notice: as long as \emph on multiple \emph default VM instances are hosted on \emph on one \emph default hypervisor iron, load balancers are most likely completely useless \begin_inset Foot status open \begin_layout Plain Layout Reason: on SMP servers, there \emph on already exists \emph default a \begin_inset Quotes eld \end_inset load balancer \begin_inset Quotes erd \end_inset . The kernel and its \series bold process scheduler \series default can do even better than any external load balancer, by better distribution of physical CPUs to processes, and by exploitation of \series bold shared memory \series default , for example shared filesystem kernel caches, such as the Dentry Cache, and the fscache / Page Cache. Exceptions would only occur when there were per-VM global bottlenecks, such as interdependent processes. For instance, it is easy to \emph on misconfigure \emph default Apache logfiles to become such a bottleneck. Just fix such misconfigurations, before claiming that SMP scalability would be limited. \end_layout \end_inset . Instead, just assign more physical resources to a single VM. Only when the application load is \emph on really \emph default so high that 1 VM would fill up a hypervisor \emph on completely \emph default , only then a load balancer \emph on might \emph default be potentially useful. However, \emph on first \emph default check that there are enough RAM and SMP hardware threads. 
Only when state-of-the-art multi-socket CPUs with \begin_inset Formula $\approx128$ \end_inset or more CPU threads would be insufficient for a very high connection rate, and after tuning measures like PHP OpCache were not sufficient, a load balancer or another means for load distribution \emph on could \emph default become necessary. \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Even then, there are often more intelligent alternative solutions, like wide-area \emph on distributed \emph default \series bold input traffic partitioning \series default to geo-distributed servers, in place of a central load balancer acting as a SPOF in a single datacenter. For example, source-IP based routing can partition global traffic into per-continent datacenters, drastically reducing application traffic latencies. In essence, this is coarse granularity sharding at global level. \end_layout \end_inset \end_layout \begin_layout Quotation \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset In a nutshell: compared to the scalability of sharding, load balancers would be \series bold only suitable for small-scale scalability \series default . However, small-scale scalability is much easier to achieve via hardware-based SMP = Symmetric MultiProcessing, at least in \emph on most \emph default \begin_inset Foot status open \begin_layout Plain Layout Personally, I have never seen a situation where a load balancer was really necessary. In all example cases, they were superfluous. In a few cases, they were even counter-productive. \end_layout \end_inset cases. \end_layout \begin_layout Quotation \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Never start a design with a load balancer \emph on by default \emph default . 
Only use load balancers when there is \emph on well-founded strong evidence \emph default that other scalability measures won't suffice. In particular, it needs to be very clear that sharding is really impossible, which in turn implies that there exists only 1 big customer, and that its data cannot be partitioned at all. \end_layout \begin_layout Quotation \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Cost explosion by superfluous load balancers \end_layout \end_inset Unnecessary load balancers are causing \series bold follow-up cost by increased complexity \series default . In addition to the load balancer and its administration, \emph on multiple \emph default servers and/or VMs need to be set up and administered. \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset If you just need a redirection mechanism, read sections \begin_inset CommandInset ref LatexCommand nameref reference "sec:Location-transparency" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "sec:Where-implement-Location-Transparency" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset For example, the traffic from BGP = Border Gateway Protocol is executed by your \series bold ordinary network routers \series default , without additional hardware, and they can distribute sharded traffic to wide-area geo-locations. In comparison, load balancers are just restricted \series bold overkill \series default . 
\end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Never accept a system design with a \emph on mandatory \emph default load balancer. It will likely imply a BigCluster-like \emph on architecture \emph default , though typically only \emph on implemented \emph default as a SmallCluster. \end_layout \end_inset \begin_inset Flex Custom Color Box 2 status open \begin_layout Plain Layout \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Mandatory load balancers are often \begin_inset Foot status open \begin_layout Plain Layout There are some rare potential exceptions, like \series bold game servers \series default rendering scenes in \series bold realtime \series default , consuming \emph on massive \emph default CPU and/or GPU power in relation to network bandwidth. Even there, sharding is often a better alternative. In contrast, ordinary video streaming typically consumes very low CPU power, because file streaming is executed by kernel \family typewriter sendpage() \family default and partly offloaded to DMA hardware acceleration. \end_layout \end_inset creating some \begin_inset Formula $O(n^{2})$ \end_inset behaviour, showing up somewhere, often unexpectedly. Even when reduced to \begin_inset Formula $O(n)$ \end_inset , load balancers are close to the \series bold opposite of sharding \series default at \emph on concept level \emph default , because they try to \emph on distribute \emph default an \emph on unpartitioned load \emph default to servers needing \series bold shared data \series default similar to DSM (see section \begin_inset CommandInset ref LatexCommand ref reference "subsec:Explanations-from-DSM" plural "false" caps "false" noprefix "false" \end_inset ), instead of first \emph on partitioning the data \emph default and thus also partitioning the corresponding traffic. 
Read section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Error-Propagation-to" plural "false" caps "false" noprefix "false" \end_inset about typical \emph on real \emph default scalability and reliability. When this doesn't help, read section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset where the load balancer was a major \emph on source(!) \emph default of massive scalability problems. \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset \series bold Sharding \series default architectures typically don't need any load balancers, although they are \series bold massively scalable \emph on horizontally \series default \emph default . Typically, they rely on the scalability of DNS, and of IP routing. Notice: when DNS would reach its scalability limit, then the internet as such would not scale anymore. \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In comparison, a load balancer is a SPOB = Single Point Of \series bold Bottleneck \series default , where the traffic must physically \series bold flow through \series default (thereby increasing hops and latencies), instead of dynamic wide-area routing. 
\end_layout \end_inset \end_layout \begin_layout Quotation \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Load balancers vs sharding \end_layout \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset As a manager, if you \begin_inset Quotes eld \end_inset buy \begin_inset Quotes erd \end_inset a \emph on mandatory \emph default load balancer, there is a high risk for \series bold architecturally hindering long-term scalability \series default by sharding. \end_layout \begin_layout Plain Layout \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Check whether people are \emph on really \emph default experts, when they want to solve suspected(!) scalability problems via mandatory load balancers. It is just poor system design, often inducing DSM problems, and producing unnecessary follow-up cost. Unfortunately, load balancers are systematically promoted by \series bold internet information bubbles \series default . \end_layout \end_inset \end_layout \begin_layout Quotation \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Real knowledge originates from evaluated sources, such as \series bold scientific publications \series default which have undergone at least some minimum \emph on quality check \emph default , and which are trying to describe their preconditions and operating environments as precisely \begin_inset Foot status open \begin_layout Plain Layout \noindent Therefore, chances are better to get a real expert when he has some (higher) academic degrees, and was working in the area for a longer time. \end_layout \end_inset as possible. 
\end_layout \begin_layout Quotation \noindent \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset Real experts will tell you when they don't know something. In addition, they will tell you \emph on multiple \emph default ways for obtaining such information, such as measurements, simulation, etc. In addition, real experts are able to do well-founded measurements and derive forecasts from them. Later, when it works, their forecasts were roughly correct. Check the quality of forecasts afterwards! \end_layout \begin_layout Standard If you don't have anyone in your teams who knows how \series bold caching \series default \emph on really \emph default works, or if it is a single guy who cannot withstand the pressure from a whole group of \begin_inset Quotes eld \end_inset alpha animals \begin_inset Quotes erd \end_inset , you are running an \series bold increased risk \series default of unnecessary expenses \begin_inset Foot status open \begin_layout Plain Layout I know of cases which have produced unnecessary \emph on direct \emph default cost of at least € 20 million, not counting further indirect cost such as power and rackspace consumption. \end_layout \end_inset , worse services (indirect cost), failed projects, and sometimes even resulting in loss of market share and/or of stock exchange value. \end_layout \begin_layout Standard The problem is that it \emph on looks so easy \emph default , as if everyone could build a \emph on large(!) \emph default storage and/or application system, with ease. It looks easy once a small prototype is running at a workstation. Some people believe that \begin_inset Quotes eld \end_inset just spend some more money \begin_inset Quotes erd \end_inset would be all that is needed. 
Unfortunately, both \begin_inset Quotes eld \end_inset marketing drones \begin_inset Quotes erd \end_inset from commercial storage vendors, and even a few OpenSource advocates, are propagating this \series bold dangerous mindset \series default . \end_layout \begin_layout Standard As a responsible manager, \series bold how can you detect \series default dangerous partial knowledge? \end_layout \begin_layout Standard Good indicators are wrong usage of the term \begin_inset Quotes eld \end_inset architecture \begin_inset Quotes erd \end_inset (see definition in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset ), and/or \series bold confusion of architecture with implementation \series default . When somebody confuses \begin_inset Foot status open \begin_layout Plain Layout Notice that there exist people who use the term \begin_inset Quotes eld \end_inset architecture \begin_inset Quotes erd \end_inset inadvertently. They don't even know that they are confusing architecture with implementation. Pure usage of a certain term is no clear indicator that somebody is really an expert. \end_layout \end_inset this, he does not really have an overview of different architectural solution classes. Instead, such people are tending to propagate their random \begin_inset Quotes eld \end_inset favourite solution \begin_inset Quotes erd \end_inset or their random \begin_inset Quotes eld \end_inset favourite product \begin_inset Quotes erd \end_inset . For you as a responsible manager, this increases the \series bold risk \series default of getting a non-optimum, or possibly even a bad / dangerous solution. \end_layout \begin_layout Standard Another good indicator is advocacy of load balancers. See above boxes about the size of their real application area and their real value. Do not confuse people's belief with deep knowledge. 
The latter also requires theoretical background, in addition to practical experience. \end_layout \begin_layout Standard Not everything which works in a garage, or in a student pool, or in the testlab (whether it's yours or from a commercial storage vendor), or in a PoC with so-called \begin_inset Quotes eld \end_inset friendly customers \begin_inset Quotes erd \end_inset , is well-suited for large enterprises and their critical data (measured in petabytes / billions of files / etc), or is the optimum solution for TCO. Some rules of thumb, out of experience and observation: \end_layout \begin_layout Itemize For each 1 or 2 orders of magnitude of the \series bold size \series default of your data, you will need \series bold better methods \series default for safe construction and operation, than would be sufficient for lower demands. \end_layout \begin_layout Itemize For each 3 to 4 orders of magnitude (sometimes even for less), you will need \series bold better architectures \series default , and people who can deal with them. \end_layout \begin_layout Itemize For each 1 or 2 orders of magnitude of \series bold criticality \series default of your data (measured by \emph on losses \emph default in case of certain incidents), you will also need better architecture, not just better components. \end_layout \begin_layout Standard \noindent \begin_inset Flex Custom Color Box 3 status open \begin_layout Plain Layout \noindent \begin_inset Argument 1 status open \begin_layout Plain Layout \series bold Important advice \end_layout \end_inset \begin_inset Graphics filename images/lightbulb_brightlit_benj_.png lyxscale 12 scale 7 \end_inset If you start a new platform from scratch, always \series bold start with a \emph on good \emph default architecture \series default . 
\end_layout \end_inset \end_layout \begin_layout Standard \noindent Once a platform is in production, even with a small number of customers, it becomes increasingly difficult to change its fundamental architecture. While bugs can be relatively easily fixed, and while single components can be exchanged with some effort, changing an architecture may turn out \emph on close to impossible \emph default , or at least very expensive. \end_layout \begin_layout Subsection Recommendations for Architects \begin_inset CommandInset label LatexCommand label name "subsec:Recommendations-for-Architects" \end_inset \end_layout \begin_layout Standard In order of precedence, do the following: \end_layout \begin_layout Enumerate \series bold Fix and/or limit and/or tune the \emph on application \series default \emph default . \begin_inset Newline newline \end_inset Some extreme examples: \end_layout \begin_deeper \begin_layout Itemize When you encounter a classical Unix \series bold fork bomb \series default , you have no chance against it. Even the \begin_inset Quotes eld \end_inset best and the most expensive hardware \begin_inset Foot status open \begin_layout Plain Layout There is an old joke from the 1980s: a Cray is a computer capable of running an endless loop in 10 seconds. \end_layout \end_inset \begin_inset Quotes erd \end_inset is unable to successfully run a fork bomb. The only countermeasure is \emph on limitation of resources \emph default . Reason: unlimited resources do not exist on earth. \end_layout \begin_layout Itemize If you think that this were only of academic interest: several types of internet \series bold DDOS attacks \series default are acting like a fork bomb, and \series bold Apache \series default is also acting similar to a fork bomb when not configured properly. This is not about academics, it is about \emph on your survival \emph default (in the sense of Darwin). 
\end_layout \begin_layout Itemize If you think it cannot hurt you because you are running \family typewriter fast-cgi \family default or another application scheme where forks are not part of the game (e.g. databases and many others): please notice that \series bold network queues \series default are often acting as a replacement for processes. Overflow of queues can have a similar effect to fork bombs from the viewpoint of customers: they simply don't get the service they are expecting. \end_layout \begin_layout Itemize If you think this cannot hurt you, because you are working in a completely different area from Apache: \emph on any \emph default type of IP-based network traffic can show queueing behaviour. Complex queuing systems can show \begin_inset Quotes eld \end_inset unexpected \begin_inset Quotes erd \end_inset behaviour, and sometimes even a dangerous one. \end_layout \begin_layout Itemize Real-life example for application-level problems: some percentage of \family typewriter WordPress \family default customers are typically and \emph on systematically \emph default \series bold misconfiguring \series default their \family typewriter wp-cron \family default cron jobs. They create backups of their website, which \emph on include \emph default their old backups. Result: in each generation of the backups, the needed disk space will roughly \emph on double \emph default . Even if you had \begin_inset Quotes eld \end_inset unlimited storage \begin_inset Quotes erd \end_inset on top of the \begin_inset Quotes eld \end_inset best and the most expensive storage system \begin_inset Quotes erd \end_inset , and even if you would like to give \begin_inset Quotes eld \end_inset unlimited storage \begin_inset Quotes erd \end_inset to your customers, it simply cannot work at all. Exponential growth is exponential growth. After a few months of this kind of daily backup, you would need more storage than atoms exist in the whole universe. 
You \emph on must \emph default introduce some quota limits somewhere. And you \emph on must \emph default ensure that the \family typewriter wp-cron \family default misconfiguration is fixed, whoever is responsible for fixing it. \end_layout \begin_layout Itemize Another \family typewriter WordPress \family default example: the \family typewriter wp-cron \family default configuration syntax is not easily understandable by laymen. It is easy to \series bold misconfigure \series default such that a backup is created \emph on once per minute \emph default . As long as the website is very small, this will not even be noticed by sysadmins. However, for bigger websites (and they are typically growing over time), the IO load may increase to a point until even asynchronous replication over 10Gig interfaces cannot catch up. Even worse: the next run of \family typewriter wp-cron \family default may start before the old one has finished within a minute. Again, there is no chance except fixing the \emph on root cause \emph default at application level. \end_layout \end_deeper \begin_layout Enumerate \series bold Choose the right \emph on overall \emph default architecture \series default (not limited to storage). \begin_inset Newline newline \end_inset An impressive example for architectural (cf section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset ) ill-design can be found in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset . 
Important explanations are in section \begin_inset CommandInset ref LatexCommand ref reference "subsec:Properties-Scalability" \end_inset , in particular subsection \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Influence-Factors-Scalability" plural "false" caps "false" noprefix "false" \end_inset , and section \begin_inset CommandInset ref LatexCommand vref reference "subsec:Filesystem-Layer-vs" \end_inset . A strategic example is in subsection \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Scalability-Scenario" plural "false" caps "false" noprefix "false" \end_inset . It is absolutely necessary to know the standard cache hierarchy of Unix (similarly also found in Windows) from section \begin_inset CommandInset ref LatexCommand nameref reference "sec:Performance-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset . More explanations are in this manual at many places. \begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset In general, major ill-designs of overall architectures (end-to-end) cannot be fixed at component level. Even the \begin_inset Quotes eld \end_inset best tuning of the world \begin_inset Quotes erd \end_inset executed by the \begin_inset Quotes eld \end_inset best tuning expert \begin_inset Quotes erd \end_inset on top of the \begin_inset Quotes eld \end_inset best and most expensive storage \emph on components \emph default over the best storage \emph on network \emph default of the world \begin_inset Quotes erd \end_inset cannot compensate major ill-designs, such as \begin_inset Formula $O(n^{2})$ \end_inset behaviour. 
\begin_inset Newline newline \end_inset \begin_inset Graphics filename images/MatieresCorrosives.png lyxscale 50 scale 17 \end_inset Similarly for reliability: if you have problems with too many and/or too large incidents affecting too many customers, read sections \begin_inset CommandInset ref LatexCommand nameref reference "sec:Reliability-Arguments-from" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Reliability-Differences-CentralStorage" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Enumerate \series bold Choice and tuning of components \series default . \begin_inset Newline newline \end_inset No further explanations necessary, because most people already know this. In case you think this is the \emph on only \emph default way: no, it is typically the \emph on worst \emph default and typically only the \emph on last resort \emph default when compared to the previous enumeration items. See example in section \begin_inset CommandInset ref LatexCommand nameref reference "subsec:Example-Failures-of" plural "false" caps "false" noprefix "false" \end_inset . \begin_inset Newline newline \end_inset Exception: choice of wrong components with insufficient properties for your particular application / use case, or even hard restrictions as mentioned in section \begin_inset CommandInset ref LatexCommand nameref reference "sec:What-is-Architecture" plural "false" caps "false" noprefix "false" \end_inset . But this is an \emph on architectural \emph default problem in reality, and belongs to the previous item, not to this one. 
\end_layout \begin_layout Chapter \start_of_appendix Mathematical Model of Architectural Reliability \begin_inset CommandInset label LatexCommand label name "chap:Mathematical-Model-of" \end_inset \end_layout \begin_layout Standard The assumptions used in the model are explained in detail in section \begin_inset CommandInset ref LatexCommand vref reference "sub:Detailed-explanation" \end_inset . Here is a quick recap of the main parameters: \end_layout \begin_layout Itemize \begin_inset Formula $n$ \end_inset is the number of basic storage units. It is also used for the number of application units, assumed to be the same. \end_layout \begin_layout Itemize \begin_inset Formula $k$ \end_inset is the replication degree, or number of replicas. In general, you will have to deploy \begin_inset Formula $N=k*n$ \end_inset storage servers for getting \begin_inset Formula $n$ \end_inset basic storage units. This applies to any of the competing architectures. \end_layout \begin_layout Itemize \begin_inset Formula $s$ \end_inset is the architecture-dependent spread exponent: it tells whether a storage incident will spread to the application units. Examples: \begin_inset Formula $s=0$ \end_inset means that there is no spread between storage unit failures and application unit failures, other than a local 1:1 one. \begin_inset Formula $s=1$ \end_inset means that an uncompensated storage node incident will cause \begin_inset Formula $n$ \end_inset application incidents. \end_layout \begin_layout Itemize \begin_inset Formula $p$ \end_inset is the probability of a storage server incident. In the examples at section \begin_inset CommandInset ref LatexCommand vref reference "sec:Reliability-Arguments-from" \end_inset , a fixed \begin_inset Formula $p=0.0001$ \end_inset was used for easy understanding, but the following formulae should also hold for any other \begin_inset Formula $p\in(0,1)$ \end_inset . 
\end_layout \begin_layout Itemize \begin_inset Formula $T$ \end_inset is the observational period, introduced for convenience of understanding. The following can also be computed independently from any \begin_inset Formula $T$ \end_inset , as long as the probability \begin_inset Formula $p$ \end_inset does not change over time, which is assumed. Because \begin_inset Formula $T$ \end_inset is only here for convenience of understanding, we set it to \begin_inset Formula $T=1/p$ \end_inset . In the examples from section \begin_inset CommandInset ref LatexCommand vref reference "sub:Detailed-explanation" \end_inset , a fixed \begin_inset Formula $T=10{,}000$ \end_inset hours was used. \end_layout \begin_layout Section Formula for DRBD / MARS \end_layout \begin_layout Standard We need not discriminate between a storage failure probability S and an application failure probability A because applications are run locally at the storage servers 1:1. The probability for failure of a single shard consisting of \begin_inset Formula $k$ \end_inset nodes is \end_layout \begin_layout Standard \begin_inset Formula \[ A_{p}(k)=p^{k} \] \end_inset because all \begin_inset Formula $k$ \end_inset shard members have to be down all at the same time. In section \begin_inset CommandInset ref LatexCommand vref reference "sub:Detailed-explanation" \end_inset we assumed that there is no cross-communication between shards. 
Therefore they are completely independent from each other, and the total downtime of \begin_inset Formula $n$ \end_inset shards during the observational period \begin_inset Formula $T$ \end_inset is \end_layout \begin_layout Standard \begin_inset Formula \[ A_{p,T}(k,n)=T*n*p^{k} \] \end_inset \end_layout \begin_layout Standard \noindent When introducing the spread exponent \begin_inset Formula $s$ \end_inset , the formula turns into \end_layout \begin_layout Standard \begin_inset Formula \[ A_{s,p,T}(k,n)=T*n^{s+1}*p^{k} \] \end_inset \end_layout \begin_layout Section Formula for Unweighted BigCluster \end_layout \begin_layout Standard This is based on the Bernoulli formula. The probability that exactly \begin_inset Formula $\bar{k}$ \end_inset storage nodes out of \begin_inset Formula $N=k*n$ \end_inset total storage nodes are down is \end_layout \begin_layout Standard \begin_inset Formula \[ \bar{S}_{p}(\bar{k},N)=\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}} \] \end_inset \end_layout \begin_layout Standard \noindent Similarly, the probability for getting \begin_inset Formula $k$ \end_inset or more storage node failures (up to \begin_inset Formula $N$ \end_inset ) at the same time is \end_layout \begin_layout Standard \begin_inset Formula \[ S_{p}(k,N)=\sum_{\bar{k}=k}^{N}\bar{S}_{p}(\bar{k},N)=\sum_{\bar{k}=k}^{N}\binom{N}{\bar{k}}*p^{\bar{k}}*(1-p)^{N-\bar{k}} \] \end_inset \end_layout \begin_layout Standard \noindent By replacing \begin_inset Formula $N$ \end_inset with \begin_inset Formula $k*n$ \end_inset (for conversion of the x axis into basic storage units) and by introducing \begin_inset Formula $T$ \end_inset we get \end_layout \begin_layout Standard \begin_inset Formula \[ S_{p,T}(k,n)=T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}} \] \end_inset \end_layout \begin_layout Standard \noindent For comparability with DRBDorMARS, we have to compute the application downtime A instead of the storage downtime S, which depends 
on the spread exponent \begin_inset Formula $s$ \end_inset as follows: \end_layout \begin_layout Standard \begin_inset Formula \[ A_{s,p,T}(k,n)=n^{s+1}*S_{p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}} \] \end_inset \end_layout \begin_layout Standard \noindent Notice that at \begin_inset Formula $s=0$ \end_inset we have introduced a factor of \begin_inset Formula $n$ \end_inset , which corresponds to the hashing effect (teardown of \begin_inset Formula $n$ \end_inset application instances by a single uncompensated storage incident) as described in section \begin_inset CommandInset ref LatexCommand vref reference "sub:Detailed-explanation" \end_inset . \end_layout \begin_layout Section Formula for SizeWeighted BigCluster \end_layout \begin_layout Standard In contrast to the above, we need to introduce a correction factor by the fraction of affected objects, relative to basic storage units. Otherwise the y axis would not stay comparable due to different units. \end_layout \begin_layout Standard For the special case of \begin_inset Formula $k=1$ \end_inset , there is no difference to the above. \end_layout \begin_layout Standard For the special case of \begin_inset Formula $k=2$ \end_inset replicas, the correction factor is \begin_inset Formula $1/(N-1)$ \end_inset , because we assume that all the replicas of the affected first node are uniformly spread over all other nodes, of which there are \begin_inset Formula $N-1$ \end_inset . The probability for hitting the intersection of the first node with the second node is thus \begin_inset Formula $1/(N-1)$ \end_inset . 
\end_layout \begin_layout Standard For higher values of \begin_inset Formula $k$ \end_inset , and with a similar argument (never put another replica of the same object onto the same storage node) we get the correction factor as \end_layout \begin_layout Standard \begin_inset Formula \[ C(k,N)=\prod_{l=1}^{k-1}\frac{1}{N-l} \] \end_inset \end_layout \begin_layout Standard \noindent Hint: there are at most \begin_inset Formula $k$ \end_inset physical replicas on the disks. For higher values of \begin_inset Formula $\bar{k}\geq k$ \end_inset , there are \begin_inset Formula $\binom{\bar{k}}{k}$ \end_inset combinations of object intersections (when assuming that the number of objects on a node is so large that no further object repetition can occur except for the \begin_inset Formula $k$ \end_inset -fold replica placement). Thus the generalization to \begin_inset Formula $\bar{k}\geq k$ \end_inset is \end_layout \begin_layout Standard \begin_inset Formula \[ C(k,\bar{k},N)=\binom{\bar{k}}{k}\prod_{l=1}^{k-1}\frac{1}{N-l} \] \end_inset \end_layout \begin_layout Standard \noindent By inserting this into the above formula, we get \end_layout \begin_layout Standard \begin_inset Formula \[ A_{s,p,T}(k,n)=n^{s+1}*T*\sum_{\bar{k}=k}^{k*n}C(k,\bar{k},k*n)*\binom{k*n}{\bar{k}}*p^{\bar{k}}*(1-p)^{k*n-\bar{k}} \] \end_inset \end_layout \begin_layout Standard \begin_inset CommandInset include LatexCommand input preview true filename "common-back-matter.lyx" \end_inset \end_layout \end_body \end_document