mirror of
https://github.com/schoebel/mars
synced 2024-12-22 22:53:41 +00:00
2535 lines
50 KiB
Plaintext
2535 lines
50 KiB
Plaintext
#LyX 2.3 created this file. For more info see http://www.lyx.org/
|
|
\lyxformat 544
|
|
\begin_document
|
|
\begin_header
|
|
\save_transient_properties true
|
|
\origin unavailable
|
|
\textclass scrreprt
|
|
\begin_preamble
|
|
\usepackage{listings}
|
|
\end_preamble
|
|
\options abstracton,dvipsnames
|
|
\use_default_options true
|
|
\begin_modules
|
|
customHeadersFooters
|
|
enumitem
|
|
fixltx2e
|
|
\end_modules
|
|
\maintain_unincluded_children true
|
|
\language english
|
|
\language_package default
|
|
\inputencoding auto
|
|
\fontencoding global
|
|
\font_roman "default" "default"
|
|
\font_sans "default" "default"
|
|
\font_typewriter "default" "default"
|
|
\font_math "auto" "auto"
|
|
\font_default_family rmdefault
|
|
\use_non_tex_fonts false
|
|
\font_sc false
|
|
\font_osf false
|
|
\font_sf_scale 100 100
|
|
\font_tt_scale 100 100
|
|
\use_microtype false
|
|
\use_dash_ligatures false
|
|
\graphics default
|
|
\default_output_format default
|
|
\output_sync 0
|
|
\bibtex_command default
|
|
\index_command default
|
|
\paperfontsize 10
|
|
\spacing single
|
|
\use_hyperref true
|
|
\pdf_title "MARS For Kernel Developers"
|
|
\pdf_author "Thomas Schöbel-Theuer"
|
|
\pdf_bookmarks true
|
|
\pdf_bookmarksnumbered false
|
|
\pdf_bookmarksopen true
|
|
\pdf_bookmarksopenlevel 2
|
|
\pdf_breaklinks true
|
|
\pdf_pdfborder true
|
|
\pdf_colorlinks true
|
|
\pdf_backref section
|
|
\pdf_pdfusetitle true
|
|
\papersize a4paper
|
|
\use_geometry true
|
|
\use_package amsmath 1
|
|
\use_package amssymb 1
|
|
\use_package cancel 1
|
|
\use_package esint 1
|
|
\use_package mathdots 1
|
|
\use_package mathtools 1
|
|
\use_package mhchem 1
|
|
\use_package stackrel 1
|
|
\use_package stmaryrd 1
|
|
\use_package undertilde 1
|
|
\cite_engine basic
|
|
\cite_engine_type default
|
|
\biblio_style plain
|
|
\use_bibtopic false
|
|
\use_indices false
|
|
\paperorientation portrait
|
|
\suppress_date false
|
|
\justification true
|
|
\use_refstyle 1
|
|
\use_minted 0
|
|
\index Index
|
|
\shortcut idx
|
|
\color #008000
|
|
\end_index
|
|
\leftmargin 3.7cm
|
|
\topmargin 2.7cm
|
|
\rightmargin 2.8cm
|
|
\bottommargin 2.3cm
|
|
\secnumdepth 3
|
|
\tocdepth 3
|
|
\paragraph_separation indent
|
|
\paragraph_indentation default
|
|
\is_math_indent 0
|
|
\math_numbering_side default
|
|
\quotes_style english
|
|
\dynamic_quotes 0
|
|
\papercolumns 1
|
|
\papersides 2
|
|
\paperpagestyle headings
|
|
\tracking_changes false
|
|
\output_changes false
|
|
\html_math_output 0
|
|
\html_css_as_file 0
|
|
\html_be_strict false
|
|
\end_header
|
|
|
|
\begin_body
|
|
|
|
\begin_layout Standard
|
|
\begin_inset ERT
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
|
|
\backslash
|
|
title{MARS for Kernel Developers}
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand input
|
|
preview true
|
|
filename "common-front-matter.lyx"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset toc
|
|
LatexCommand tableofcontents
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Chapter
|
|
Basic Working Principle
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
The Lamport Clock
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:The-Lamport-Clock"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
MARS is always
|
|
\emph on
|
|
asynchronously
|
|
\emph default
|
|
communicating in the distributed system on
|
|
\emph on
|
|
any
|
|
\emph default
|
|
topics, even strategic decisions.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
If there were a
|
|
\emph on
|
|
strict
|
|
\emph default
|
|
global consistency model, which would be roughly equivalent to a standalone
|
|
model, we would need
|
|
\emph on
|
|
locking
|
|
\emph default
|
|
in order to serialize conflicting requests.
|
|
It has been known for many decades that
|
|
\emph on
|
|
distributed locks
|
|
\emph default
|
|
do not only suffer from performance problems, but it is also cumbersome
|
|
to get them working reliably in scenarios where nodes or network links
|
|
may fail at any time.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Therefore, MARS uses a very different consistency model:
|
|
\series bold
|
|
Eventually Consistent
|
|
\series default
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/lightbulb_brightlit_benj_.png
|
|
lyxscale 12
|
|
scale 7
|
|
|
|
\end_inset
|
|
|
|
Notice that the network bottleneck problems described in section
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:Network-Bottlenecks"
|
|
|
|
\end_inset
|
|
|
|
are
|
|
\emph on
|
|
demanding
|
|
\emph default
|
|
an
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
eventually consistent
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
model.
|
|
You have
|
|
\series bold
|
|
no chance
|
|
\series default
|
|
against natural laws, like Einstein's laws.
|
|
In order to cope with the problem area, you have to
|
|
\emph on
|
|
invest some additional effort
|
|
\emph default
|
|
.
|
|
Unfortunately, asynchronous communication models are more tricky to program
|
|
and to debug than simple strictly consistent models.
|
|
In particular, you
|
|
\emph on
|
|
have to cope with
|
|
\emph default
|
|
additional
|
|
\series bold
|
|
race conditions
|
|
\series default
|
|
|
|
\emph on
|
|
inherent
|
|
\emph default
|
|
|
|
\emph on
|
|
to
|
|
\emph default
|
|
the
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
eventually consistent
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
model.
|
|
In the face of the laws of the universe, motivate yourself by looking at
|
|
the graphics on the cover page: the planets are a
|
|
\emph on
|
|
symbol
|
|
\emph default
|
|
for what you have to do!
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/MatieresCorrosives.png
|
|
lyxscale 50
|
|
scale 17
|
|
|
|
\end_inset
|
|
|
|
Example: the asynchronous communication protocol of MARS leads to a different
|
|
behaviour from DRBD in case of
|
|
\series bold
|
|
network partitions
|
|
\series default
|
|
(temporary interruption of communication between some cluster nodes), because
|
|
MARS
|
|
\emph on
|
|
remembers
|
|
\emph default
|
|
the old state of remote nodes over long periods of time, while DRBD knows
|
|
absolutely nothing about its peers in disconnected state.
|
|
Sysadmins familiar with DRBD might find the following behaviour unusual:
|
|
\begin_inset Separator latexpar
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\align center
|
|
|
|
\size tiny
|
|
\begin_inset Tabular
|
|
<lyxtabular version="3" rows="6" columns="3">
|
|
<features tabularvalignment="middle">
|
|
<column alignment="left" valignment="top" width="0pt">
|
|
<column alignment="left" valignment="top" width="0pt">
|
|
<column alignment="left" valignment="top" width="0pt">
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
Event
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
DRBD Behaviour
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
MARS Behaviour
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row endhead="true">
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
1.
|
|
the network partitions
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
automatic disconnect
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
nothing happens, but replication lags behind
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
2.
|
|
on A:
|
|
\family typewriter
|
|
umount $device
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
3.
|
|
on A:
|
|
\family typewriter
|
|
{drbd,mars}adm secondary
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
4.
|
|
on B:
|
|
\family typewriter
|
|
{drbd,mars}adm primary
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works, split brain happens
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\series bold
|
|
\size tiny
|
|
refused
|
|
\series default
|
|
because B believes that A is primary
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
5.
|
|
the network resumes
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
automatic connect attempt fails
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
communication automatically resumes
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
</lyxtabular>
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
If you intentionally want to switch over (and to produce a split brain as
|
|
a side effect), the following variant must be used with MARS:
|
|
\begin_inset Separator latexpar
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\align center
|
|
|
|
\size tiny
|
|
\begin_inset Tabular
|
|
<lyxtabular version="3" rows="9" columns="3">
|
|
<features tabularvalignment="middle">
|
|
<column alignment="left" valignment="top" width="0pt">
|
|
<column alignment="left" valignment="top" width="0pt">
|
|
<column alignment="left" valignment="top" width="0pt">
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
Event
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
DRBD Behaviour
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
MARS Behaviour
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row endhead="true">
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
1.
|
|
the network partitions
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
automatic disconnect
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
nothing happens, but replication lags behind
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
2.
|
|
on A:
|
|
\family typewriter
|
|
umount $device
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
3.
|
|
on A:
|
|
\family typewriter
|
|
{drbd,mars}adm secondary
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works (but
|
|
\emph on
|
|
not recommended!
|
|
\emph default
|
|
)
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
4.
|
|
on B:
|
|
\family typewriter
|
|
{drbd,mars}adm primary
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
split brain, but nobody knows
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\series bold
|
|
\size tiny
|
|
refused
|
|
\series default
|
|
because B believes that A is primary
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
5.
|
|
on B:
|
|
\family typewriter
|
|
marsadm disconnect
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
-
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works, nothing happens
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
6.
|
|
on B:
|
|
\family typewriter
|
|
marsadm primary --force
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
-
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works, split brain happens on B, but A doesn't know
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
7.
|
|
on B:
|
|
\family typewriter
|
|
marsadm connect
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
-
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
works, nothing happens
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
8.
|
|
the network resumes
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
automatic connect attempt fails
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="left" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\size tiny
|
|
communication resumes, A now detects the split brain
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
</lyxtabular>
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
In order to implement the consistency model
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
eventually consistent
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
, MARS uses a so-called Lamport
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
Published in the late 1970s by Leslie Lamport, also known as the inventor of
|
|
|
|
\begin_inset ERT
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
|
|
\backslash
|
|
LaTeX
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
clock.
|
|
MARS uses a special variant called
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
physical Lamport clock
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The physical Lamport clock is another almost-realtime clock which
|
|
\emph on
|
|
can
|
|
\emph default
|
|
run independently from the Linux kernel system clock.
|
|
However, the Lamport clock tries to remain as near as possible to the system
|
|
clock.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Both clocks can be queried at any time via
|
|
\family typewriter
|
|
cat /proc/sys/mars/lamport_clock
|
|
\family default
|
|
.
|
|
The result will show both clocks in parallel, in units of seconds since
|
|
the Unix epoch, with nanosecond resolution.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
When there are no network messages at all, both the system clock and the
|
|
Lamport clock will show almost the same time (except some minor differences
|
|
of a few nanoseconds resulting from the finite processor clock speed).
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The physical Lamport clock works rather simply:
|
|
\emph on
|
|
any
|
|
\emph default
|
|
message on the network is augmented with a Lamport time stamp telling when
|
|
the message was
|
|
\emph on
|
|
sent
|
|
\emph default
|
|
according to the local Lamport clock of the sender.
|
|
Whenever that message is received by some receiver, it checks whether the
|
|
time ordering relation would be violated: whenever the Lamport timestamp
|
|
in the message would claim that the sender had sent it
|
|
\emph on
|
|
after
|
|
\emph default
|
|
it arrived at the receiver (according to drifts in their respective local
|
|
clocks), something must be wrong.
|
|
In this case, the local Lamport clock of the
|
|
\emph on
|
|
receiver
|
|
\emph default
|
|
is advanced to shortly after the sender's Lamport timestamp, such that the time
|
|
ordering relation is no longer violated.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
As a consequence, any local Lamport clock may precede the corresponding
|
|
local system clock.
|
|
In order to avoid accumulation of deltas between the Lamport and the system
|
|
clock, the Lamport clock will run slower after that, possibly until it
|
|
reaches the system clock again (if no other message arrives which sets
|
|
it forward again).
|
|
After having reached the system clock, the Lamport clock will continue
|
|
with
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
normal
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
speed.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
MARS uses the local Lamport clock for anything where other systems would
|
|
use the local system clock: for example, timestamp generation in the
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
filesystem.
|
|
Even symlinks created there are timestamped according to the Lamport clock.
|
|
Both the kernel module and the userspace tool
|
|
\family typewriter
|
|
marsadm
|
|
\family default
|
|
are always operating in the timescale of the Lamport clock.
|
|
Most importantly, all timestamp comparisons are always carried out with
|
|
respect to Lamport time.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/MatieresCorrosives.png
|
|
lyxscale 50
|
|
scale 17
|
|
|
|
\end_inset
|
|
|
|
Bigger differences between the Lamport and the system clock can be annoying
|
|
from a human point of view: when typing
|
|
\family typewriter
|
|
ls -l /mars/resource-mydata/
|
|
\family default
|
|
many timestamps may appear as if they were created in the
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
future
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
, because the
|
|
\family typewriter
|
|
ls
|
|
\family default
|
|
command compares the output formatting against the system clock (it does
|
|
not even know of the existence of the MARS Lamport clock).
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/MatieresToxiques.png
|
|
lyxscale 50
|
|
scale 17
|
|
|
|
\end_inset
|
|
|
|
Always use
|
|
\family typewriter
|
|
ntp
|
|
\family default
|
|
(or another clock synchronization service) in order to pre-synchronize
|
|
your system clocks as closely as possible.
|
|
Bigger differences are not only annoying, but may lead some people to wrong
|
|
conclusions and therefore even lead to bad human decisions!
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In a professional datacenter, you should use
|
|
\family typewriter
|
|
ntp
|
|
\family default
|
|
anyway, and you should also monitor its effectiveness.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/lightbulb_brightlit_benj_.png
|
|
lyxscale 12
|
|
scale 7
|
|
|
|
\end_inset
|
|
|
|
Hint: many internal logfiles produced by the MARS kernel module contain
|
|
Lamport timestamps written as numerical values.
|
|
In order to convert them into human-readable form, use the command
|
|
\family typewriter
|
|
marsadm cat /mars/5.total.status
|
|
\family default
|
|
or similar.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
The Symlink Tree
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:The-Symlink-Tree"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Graphics
|
|
filename images/MatieresCorrosives.png
|
|
lyxscale 50
|
|
scale 17
|
|
|
|
\end_inset
|
|
|
|
The symlink tree as described here will be replaced by another representation
|
|
in future versions of MARS.
|
|
Therefore, don't do any scripting by directly accessing symlinks! Use the
|
|
primitive macros described in section
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "subsec:Predefined-Trivial-Macros"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The current
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
filesystem container format contains not only transaction logfiles, but
|
|
also acts as a generic storage for (persistent) state information.
|
|
Both configuration information and runtime state information are currently
|
|
stored in symlinks.
|
|
Symlinks are
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
misused
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
This means, the symlink targets need not be other files or directories,
|
|
but just any values like integers or strings.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
in order to represent some
|
|
\family typewriter
|
|
key -> value
|
|
\family default
|
|
pairs.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/lightbulb_brightlit_benj_.png
|
|
lyxscale 12
|
|
scale 7
|
|
|
|
\end_inset
|
|
|
|
It is not yet clear / decided, but there is a
|
|
\emph on
|
|
chance
|
|
\emph default
|
|
that the
|
|
\emph on
|
|
concept
|
|
\emph default
|
|
of
|
|
\family typewriter
|
|
key -> value
|
|
\family default
|
|
pairs will be retained in future versions of MARS.
|
|
Instead of being represented by symlinks, another representation will be
|
|
used, such that hopefully the
|
|
\family typewriter
|
|
key
|
|
\family default
|
|
part will remain in the form of a pathname, even if there were no longer
|
|
a physical representation in an actual filesystem.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/lightbulb_brightlit_benj_.png
|
|
lyxscale 12
|
|
scale 7
|
|
|
|
\end_inset
|
|
|
|
A fundamentally different behaviour than DRBD: when your DRBD primary crashed
|
|
some time ago, and now comes up again, you have to set up DRBD again by
|
|
a sequence of commands like
|
|
\family typewriter
|
|
modprobe drbd; drbdadm up all; drbdadm primary all
|
|
\family default
|
|
or similar.
|
|
In contrast, MARS needs only
|
|
\family typewriter
|
|
modprobe mars
|
|
\family default
|
|
(after
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
has been mounted by
|
|
\family typewriter
|
|
/etc/fstab
|
|
\family default
|
|
).
|
|
The
|
|
\emph on
|
|
persistence
|
|
\emph default
|
|
of the symlinks residing in
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
will automatically remember your previous state, even if some of your resources
|
|
were primary while others were secondary (mixed operations).
|
|
You don't need to do any actions in order to
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
restore
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
a previous state, no matter how
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
complex
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
it was.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
(Almost) all symlinks appearing in the
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
directory tree are automatically replicated throughout the whole cluster,
|
|
provided that the cluster
|
|
\family typewriter
|
|
uuid
|
|
\family default
|
|
s are equal
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
This is protection against accidental
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
merging
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
of two unrelated clusters which had been created at different times with
|
|
different
|
|
\family typewriter
|
|
uuids
|
|
\family default
|
|
.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
at all sites.
|
|
Thus the
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
directory forms some kind of
|
|
\emph on
|
|
global namespace
|
|
\emph default
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In order to avoid name clashes, each pathname created at node A follows
|
|
a convention: the node name A should be a suffix of the pathname.
|
|
Typically, internal MARS names follow the scheme
|
|
\family typewriter
|
|
/mars/
|
|
\emph on
|
|
something
|
|
\emph default
|
|
/myname-A
|
|
\family default
|
|
.
|
|
When using the expert command
|
|
\family typewriter
|
|
marsadm {get,set}-link
|
|
\family default
|
|
(which will likely be replaced by something else in future MARS releases),
|
|
you should follow the best practice of systematically using pathnames like
|
|
|
|
\family typewriter
|
|
/mars/userspace/myname-A
|
|
\family default
|
|
or similar.
|
|
As a result, each node will automatically get informed about the state
|
|
at any other node, like B, when the corresponding information is recorded
|
|
on node B under the name
|
|
\family typewriter
|
|
/mars/userspace/myname-B
|
|
\family default
|
|
(context-dependent names).
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/lightbulb_brightlit_benj_.png
|
|
lyxscale 12
|
|
scale 7
|
|
|
|
\end_inset
|
|
|
|
Experts only: the symlink replication works generically.
|
|
You might use the
|
|
\family typewriter
|
|
/mars/userspace/
|
|
\family default
|
|
directory in order to place your own symlink there (for whatever purpose,
|
|
which need not have anything to do with MARS).
|
|
However, the symlinks are likely to disappear.
|
|
Use
|
|
\family typewriter
|
|
marsadm {get,set}-link
|
|
\family default
|
|
instead.
|
|
There is a chance that these abstract commands (or variants thereof) will
|
|
be retained, by acting on the new data representation in future, even if
|
|
the old symlink format will vanish some day.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\begin_inset Graphics
|
|
filename images/lightbulb_brightlit_benj_.png
|
|
lyxscale 12
|
|
scale 7
|
|
|
|
\end_inset
|
|
|
|
Important: the convention of placing the
|
|
\series bold
|
|
creator host name
|
|
\series default
|
|
inside your pathnames should be used wherever possible.
|
|
The name part is a kind of
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
ownership indicator
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
.
|
|
It is crucial that no other host writes any symlink not
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
belonging
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
to it.
|
|
Other hosts may read foreign information as often as they want, but never
|
|
modify it.
|
|
This way, your cluster nodes are able to
|
|
\emph on
|
|
communicate
|
|
\emph default
|
|
with each other via symlink / information updates.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Although experts might create (and change) the current symlinks with userspace
|
|
tools like
|
|
\family typewriter
|
|
ln -s
|
|
\family default
|
|
, you should use the following marsadm commands instead:
|
|
\end_layout
|
|
|
|
\begin_layout Itemize
|
|
|
|
\family typewriter
|
|
marsadm set-link myvalue /mars/userspace/mykey-A
|
|
\end_layout
|
|
|
|
\begin_layout Itemize
|
|
|
|
\family typewriter
|
|
marsadm delete-file /mars/userspace/mykey-A
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
There are many reasons for this: first, the
|
|
\family typewriter
|
|
marsadm set-link
|
|
\family default
|
|
command will automatically use the Lamport clock for symlink creation,
|
|
and therefore will avoid any errors resulting from a
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
wrong
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
system clock (as in
|
|
\family typewriter
|
|
ln -s
|
|
\family default
|
|
).
|
|
Second, the
|
|
\family typewriter
|
|
marsadm delete-file
|
|
\family default
|
|
(which also deletes symlinks) works on the
|
|
\emph on
|
|
whole cluster
|
|
\emph default
|
|
.
|
|
And finally, there is a chance that this will work in future versions of
|
|
MARS even after the symlinks have vanished.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
What's the difference? If you try to remove your symlink locally by
|
|
hand via
|
|
\family typewriter
|
|
rm -f
|
|
\family default
|
|
, you will be surprised: since the symlink has been replicated to the other
|
|
cluster nodes, it will be re-transferred from there and will be resurrected
|
|
locally after some short time.
|
|
This way, you cannot delete any object reliably, because your whole cluster
|
|
(which may consist of many nodes) remembers all your state information
|
|
and will
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
correct
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
it whenever
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
necessary
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In order to solve the deletion problem, MARS uses some internal deletion
|
|
protocol using auxiliary symlinks residing in
|
|
\family typewriter
|
|
/mars/todo-global/.
|
|
|
|
\family default
|
|
The deletion protocol ensures that all replicas get deleted in the whole
|
|
cluster, and only thereafter the auxiliary symlinks in
|
|
\family typewriter
|
|
/mars/todo-global/
|
|
\family default
|
|
are also deleted eventually.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
You may update your already existing symlink via
|
|
\family typewriter
|
|
marsadm set-link some-other-value /mars/userspace/mykey-A
|
|
\family default
|
|
.
|
|
The new value will be propagated throughout the cluster according to a
|
|
|
|
\series bold
|
|
timestamp comparison protocol
|
|
\series default
|
|
: whenever node B notices that A has a
|
|
\emph on
|
|
newer
|
|
\emph default
|
|
version of some symlink (according to the Lamport timestamp), it will replace
|
|
its older version by the newer one.
|
|
The opposite does
|
|
\emph on
|
|
not
|
|
\emph default
|
|
work: if B notices that A has an older version, just nothing happens.
|
|
This way, the timestamps of symlinks can only progress in forward direction,
|
|
but never backwards in time.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
As a consequence, symlink updates made
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
by hand
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
via
|
|
\family typewriter
|
|
ln -sf
|
|
\family default
|
|
may get lost when the local system clock is much earlier than the
|
|
Lamport clock.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
When your cluster is fully connected by the network, the last timestamp
|
|
will finally win everywhere.
|
|
Only in case of network outages leading to
|
|
\emph on
|
|
network partitions
|
|
\emph default
|
|
, some information may be
|
|
\emph on
|
|
temporarily inconsistent
|
|
\emph default
|
|
, but only for the duration of the network outage.
|
|
The timestamp comparison protocol in combination with the Lamport clock
|
|
and with the persistence of the
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
filesystem will automatically heal any temporary inconsistencies as soon
|
|
as possible, even in case of temporary node shutdown.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The meaning of some internal MARS symlinks residing in
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
will hopefully be documented in section
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:Documentation-of-the"
|
|
|
|
\end_inset
|
|
|
|
some day.
|
|
\end_layout
|
|
|
|
\begin_layout Chapter
|
|
MARS for Developers
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
This chapter is organized strictly top-down.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
If you are a sysadmin and want to inform yourself about internals (useful
|
|
for debugging), the relevant information is at the beginning, and you don't
|
|
need to dive into all technical details at the end.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
If you are a kernel developer and want to contribute code to the emerging
|
|
MARS community, please read it (almost) all.
|
|
Due to the top-down organization, sometimes you will need to follow some
|
|
forward references in order to understand details.
|
|
Therefore I recommend reading this chapter twice in two different reading
|
|
modes: in the first reading pass, you just get a raw network of principles
|
|
and structures in your brain (you don't want to grasp details, therefore
|
|
don't strive for a full understanding).
|
|
In the second pass, you will exploit your knowledge from the first pass
|
|
for a deeper understanding of the details.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Alternatively, you may first read the sections about general architecture,
|
|
and then start a bottom-up scan by first reading the last section about
|
|
generic objects and aspects, and working in reverse
|
|
\emph on
|
|
section
|
|
\emph default
|
|
order (but read
|
|
\emph on
|
|
sub
|
|
\emph default
|
|
sections in-order) until you finally reach the kernel interfaces / symlink
|
|
trees.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Motivation / Politics
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
MARS is not yet upstream in the Linux kernel.
|
|
This section tries to clear up some potential doubts.
|
|
Some people have asked why MARS uses its own internal framework instead
|
|
of
|
|
\emph on
|
|
directly
|
|
\emph default
|
|
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
Notice that
|
|
\emph on
|
|
indirect
|
|
\emph default
|
|
use of pre-existing Linux infrastructure is not only possible, but actually
|
|
implemented, by using it
|
|
\emph on
|
|
internally
|
|
\emph default
|
|
in brick
|
|
\emph on
|
|
implementations
|
|
\emph default
|
|
(black-box principle).
|
|
However, such bricks are not portable to other environments like userspace.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
being based on some already existing Linux kernel infrastructures like
|
|
the device mapper.
|
|
Here is a list of technical reasons:
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
The existing device mapper infrastructure is based on
|
|
\family typewriter
|
|
struct bio
|
|
\family default
|
|
.
|
|
In contrast, the new XIO personality of the generic brick infrastructure
|
|
is based on the concept of AIO (Asynchronous IO), which is a
|
|
\series bold
|
|
true superset
|
|
\series default
|
|
of block IO.
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
In particular,
|
|
\family typewriter
|
|
struct bio
|
|
\family default
|
|
firmly refers to
|
|
\family typewriter
|
|
struct page
|
|
\family default
|
|
(via intermediate
|
|
\family typewriter
|
|
struct bio_vec
|
|
\family default
|
|
), using types like
|
|
\family typewriter
|
|
sector_t
|
|
\family default
|
|
in the field
|
|
\family typewriter
|
|
bi_sector
|
|
\family default
|
|
.
|
|
Basic transfer units are blocks, or sectors, or pages, or the like.
|
|
In contrast,
|
|
\family typewriter
|
|
struct aio_object
|
|
\family default
|
|
used by the XIO personality can address
|
|
\series bold
|
|
arbitrary granularity
|
|
\series default
|
|
memory with byte resolution even at odd
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
Some brick
|
|
\emph on
|
|
implementations
|
|
\emph default
|
|
(as opposed to the capabilities of the
|
|
\emph on
|
|
interface
|
|
\emph default
|
|
) may be (and, in fact,
|
|
\emph on
|
|
are
|
|
\emph default
|
|
) restricted to
|
|
\family typewriter
|
|
PAGE_SIZE
|
|
\family default
|
|
operations or the like.
|
|
This is no general problem, because IOP can automatically insert some translato
|
|
r bricks extending the capabilities to universal granularity (of course
|
|
at some performance costs).
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
positions in (virtual) files / devices, similar to classical Unix file
|
|
IO, but
|
|
\emph on
|
|
asynchronously
|
|
\emph default
|
|
.
|
|
Practical experience shows that even non-functional properties like performance
|
|
of many datacenter workloads are profiting from that
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
The current transaction logger uses variable-sized headers at
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
odd
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
addresses.
|
|
Although this increases
|
|
\family typewriter
|
|
memcpy()
|
|
\family default
|
|
load due to
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
misalignment
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
, the
|
|
\emph on
|
|
overall performance
|
|
\emph default
|
|
was provably better than in variants where sector / page alignment was
|
|
strictly obeyed, but space was wasted for alignments.
|
|
Such functionality is only possible if the XIO infrastructure
|
|
\emph on
|
|
allows
|
|
\emph default
|
|
|
|
\emph on
|
|
for
|
|
\emph default
|
|
(but doesn't force)
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
mis-aligned
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
IO operations.
|
|
In future, many different transaction logfile formats showing different
|
|
runtime behaviour (e.g.
|
|
optimized for high-throughput SSD loads) may co-exist in parallel.
|
|
Note that properly aligned XIO operations bear no noticeable overhead compared
|
|
to classical block IO, at least in typical datacenter RAID scenarios.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
.
|
|
The AIO/XIO abstraction contains no fixed link to kernel abstractions and
|
|
should be
|
|
\series bold
|
|
easily portable
|
|
\series default
|
|
to other environments.
|
|
In summary, the new personality provides a uniform abstraction which abstracts
|
|
away from multiple different kernel interfaces; it is designed to be useful
|
|
even in userspace.
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
Kernel infrastructures for the concept of
|
|
\emph on
|
|
direct IO
|
|
\emph default
|
|
are different from those for
|
|
\emph on
|
|
buffered IO
|
|
\emph default
|
|
.
|
|
The XIO personality used by MARS subsumes both concepts as use case
|
|
\emph on
|
|
variants
|
|
\emph default
|
|
.
|
|
|
|
\series bold
|
|
Buffering
|
|
\series default
|
|
is an optional internal property of XIO bricks (almost non-functional property
|
|
with support for consistency guarantees).
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
The AIO/XIO personality is generically designed for remote operations over
|
|
networks, at arbitrary places in the IO stack, with (almost
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
By default, automatic network connection re-establishment and infinite network
|
|
retries are already implemented in the
|
|
\family typewriter
|
|
xio_client
|
|
\family default
|
|
and
|
|
\family typewriter
|
|
xio_server
|
|
\family default
|
|
bricks to provide fully transparent semantics.
|
|
However, this may be undesirable in case of fatal crashes.
|
|
Therefore, abort operations are also configurable, as well as network timeouts
|
|
which are then mapped to classical IO errors.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
) no semantic differences to local operations (built-in
|
|
\series bold
|
|
network transparency
|
|
\series default
|
|
).
|
|
There are universal provisions for mixed operation of different versions
|
|
(
|
|
\series bold
|
|
rolling software updates
|
|
\series default
|
|
in clusters / grids).
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
The generic brick infrastructure (as well as its personalities like XIO
|
|
or any other future personality) supports
|
|
\series bold
|
|
dynamic re-wiring / re-configuration
|
|
\series default
|
|
|
|
\emph on
|
|
during
|
|
\emph default
|
|
operation (even while parallel IO requests are flying, some of them taking
|
|
different paths in the IO stack in parallel).
|
|
This is absolutely needed for MARS logfile rotation.
|
|
In the long term, this would be useful for many advanced new features and
|
|
products, not limited to multipathing.
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
The generic brick infrastructure (and in turn all personalities) provides
|
|
|
|
\series bold
|
|
additional comfort
|
|
\series default
|
|
to the programmer while enabling
|
|
\series bold
|
|
increased functionality
|
|
\series default
|
|
: by use of a generalization of
|
|
\series bold
|
|
aspect orientation
|
|
\series default
|
|
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
Similar to AOP, insertion of IOP bricks for checking / debugging etc is
|
|
one of the key advantages of the generic brick infrastructure.
|
|
In contrast to AOP where debugging is usually {en,dis}abled statically
|
|
at compile time, IOP allows for
|
|
\emph on
|
|
dynamic
|
|
\emph default
|
|
(re-)configuration of debugging bricks, automatic repair, and many more
|
|
features promoted by
|
|
\emph on
|
|
organic computing
|
|
\emph default
|
|
.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
, the programmer need no longer worry about dynamic memory allocations for
|
|
|
|
\emph on
|
|
local state
|
|
\emph default
|
|
in a brick instance.
|
|
MARS is
|
|
\series bold
|
|
automating local state
|
|
\series default
|
|
even when dynamically instantiating new bricks (possibly having the same
|
|
brick type) at runtime.
|
|
Specifically, XIO is automating
|
|
\series bold
|
|
request stacking
|
|
\series default
|
|
at the completion path this way, even while dynamically reconfiguring the
|
|
IO stack
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
The generic aspect orientation approach leads to better
|
|
\series bold
|
|
separation of concerns
|
|
\series default
|
|
: local state needed by brick implementations is not visible from outside
|
|
by default.
|
|
In other words, local state is also
|
|
\series bold
|
|
private state
|
|
\series default
|
|
.
|
|
Accidental hampering of internal operations is impeded.
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
Example from the kernel: in
|
|
\family typewriter
|
|
include/linux/blkdev.h
|
|
\family default
|
|
the definition of
|
|
\family typewriter
|
|
struct request
|
|
\family default
|
|
contains the following comment:
|
|
\family typewriter
|
|
/* the following two fields are internal, NEVER access directly */
|
|
\family default
|
|
.
|
|
It appears that
|
|
\family typewriter
|
|
struct request
|
|
\family default
|
|
contains not only fields relevant for the caller, but also
|
|
\series bold
|
|
internal fields
|
|
\series default
|
|
needed only in
|
|
\emph on
|
|
some
|
|
\emph default
|
|
|
|
\emph on
|
|
specific
|
|
\emph default
|
|
callees.
|
|
For example,
|
|
\family typewriter
|
|
rb_node
|
|
\family default
|
|
is documented to be used only in IO schedulers.
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
XIO goes one step further: there need not exist exactly one IO scheduler
|
|
instance in the IO stack for a single device.
|
|
Future
|
|
\family typewriter
|
|
xio_scheduler_{deadline,cfq,...}
|
|
\family default
|
|
brick types could be each instantiated many times, and in arbitrary places,
|
|
even for the same (logical) device.
|
|
The equivalent of
|
|
\family typewriter
|
|
rb_node
|
|
\family default
|
|
would then be automatically instantiated multiple times for the same IO
|
|
request, by automatically instantiating the right local aspect instances.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
.
|
|
A similar automation
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
DM can achieve stacking and dynamic routing by a workaround called
|
|
\emph on
|
|
request cloning
|
|
\emph default
|
|
, potentially leading to mass creation of temporary / intermediate object
|
|
instances.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
does not exist in the rest of the Linux kernel.
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
The generic brick infrastructure, together with personalities like XIO,
|
|
enables
|
|
\series bold
|
|
new long-term functional and non-functional opportunities
|
|
\series default
|
|
by use of concepts from instance-oriented programming (IOP
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
See
|
|
\begin_inset Flex URL
|
|
status collapsed
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
http://athomux.net/papers/paper_inst2.pdf
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
).
|
|
The application area is
|
|
\series bold
|
|
not limited to device drivers
|
|
\series default
|
|
.
|
|
For example, a new personality for
|
|
\emph on
|
|
stackable filesystems
|
|
\emph default
|
|
could be developed in future.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In summary, anyone who would insist that MARS should be
|
|
\emph on
|
|
directly
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
Notice that kernel-specific structures like
|
|
\family typewriter
|
|
struct bio
|
|
\family default
|
|
are of course used by MARS, but only
|
|
\emph on
|
|
inside
|
|
\emph default
|
|
the blackbox implementation of bricks like
|
|
\family typewriter
|
|
mars_bio
|
|
\family default
|
|
or
|
|
\family typewriter
|
|
mars_if
|
|
\family default
|
|
which act as
|
|
\series bold
|
|
adaptors
|
|
\series default
|
|
to/from that structure.
|
|
It is possible to write further adaptors, e.g.
|
|
for direct interfacing to the device mapper infrastructure.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\emph default
|
|
based on pre-existing kernel structures / frameworks instead of contributing
|
|
a new framework would cause a
|
|
\emph on
|
|
massive regression of functionality
|
|
\emph default
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Itemize
|
|
On one hand, all code contributed by the MARS project is
|
|
\series bold
|
|
non-intrusive
|
|
\series default
|
|
into the rest of the Linux kernel.
|
|
From the viewpoint of other parts of the kernel, the whole addition
|
|
\emph on
|
|
behaves
|
|
\emph default
|
|
|
|
\emph on
|
|
like
|
|
\emph default
|
|
a driver (although its infrastructure is much more than a driver).
|
|
\end_layout
|
|
|
|
\begin_layout Itemize
|
|
On the other hand, if people are interested, the contributed infrastructure
|
|
|
|
\emph on
|
|
may
|
|
\emph default
|
|
be used to
|
|
\emph on
|
|
add
|
|
\emph default
|
|
to the power of the Linux kernel.
|
|
It is designed to be
|
|
\series bold
|
|
open for contributions
|
|
\series default
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Itemize
|
|
A
|
|
\emph on
|
|
possible
|
|
\emph default
|
|
(but not the only possible) way to do this is giving the generic brick
|
|
framework / the XIO personality as well as future personalities / the MARS
|
|
application the status of a
|
|
\emph on
|
|
subsystem
|
|
\emph default
|
|
inside the kernel (in the long term), similar to the SCSI subsystem or
|
|
the network subsystem.
|
|
No one is forced to use it, but anybody may use it if he/she likes.
|
|
\end_layout
|
|
|
|
\begin_layout Itemize
|
|
Politically, the author is a FOSS advocate willing to collaborate and to
|
|
support anyone interested in contributions.
|
|
The author's personal interest is long-term and is open for both in-tree
|
|
and out-of-tree extensions of both the framework and MARS by any other
|
|
party obeying the GPL and not hazarding FOSS by patents (instead supporting
|
|
organizations like the Open Invention Network).
|
|
The author is open to closer relationships with the Linux Foundation and
|
|
other parts of the Linux ecosystem.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Architecture Overview
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Graphics
|
|
filename images/MARS_Framework_Architecture.pdf
|
|
width 100col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Some Architectural Details
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The following pictures show some
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
zones of responsibility
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
, not necessarily a strict hierarchy (although Dijkstra's famous layering
|
|
rules from THE are tried to be respected as much as possible).
|
|
The construction principle follows the concept of
|
|
\series bold
|
|
Instance Oriented Programming
|
|
\series default
|
|
(IOP) described in
|
|
\begin_inset Flex URL
|
|
status collapsed
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
http://athomux.net/papers/paper_inst2.pdf
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
.
|
|
Please note that MARS is only instance-
|
|
\emph on
|
|
based
|
|
\emph default
|
|
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
Similar to OOP, where
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
object-based
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
means a weaker form of
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
object-oriented
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
, the term
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
instance-based
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
means that the
|
|
\emph on
|
|
strategy
|
|
\emph default
|
|
brick layer need not be fully modularized according to the IOP principles,
|
|
but the
|
|
\emph on
|
|
worker
|
|
\emph default
|
|
brick layer already is.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
, while MARS Full is planned to be fully instance-
|
|
\emph on
|
|
oriented
|
|
\emph default
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
MARS Architecture
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename images/mars-light-architecture.fig
|
|
width 40col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
MARS Full Architecture (planned)
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename images/mars-full-architecture.fig
|
|
width 80col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Documentation of the Symlink Trees
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:Documentation-of-the"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The
|
|
\family typewriter
|
|
/mars/
|
|
\family default
|
|
symlink tree is serving the following purposes, all at the same time:
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
For
|
|
\series bold
|
|
communication
|
|
\series default
|
|
between cluster nodes, see sections
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:The-Lamport-Clock"
|
|
|
|
\end_inset
|
|
|
|
and
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:The-Symlink-Tree"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
This communication is even the
|
|
\emph on
|
|
only
|
|
\emph default
|
|
communication between cluster nodes (apart from the
|
|
\emph on
|
|
contents
|
|
\emph default
|
|
of transaction logfiles and sync data).
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
|
|
\series bold
|
|
\emph on
|
|
Internal
|
|
\emph default
|
|
interface
|
|
\series default
|
|
between the kernel module and the userspace tool
|
|
\family typewriter
|
|
marsadm
|
|
\family default
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Enumerate
|
|
|
|
\series bold
|
|
\emph on
|
|
Internal
|
|
\emph default
|
|
persistent repository
|
|
\series default
|
|
which keeps state information between reboots (also in case of node crashes).
|
|
It is even the
|
|
\emph on
|
|
only
|
|
\emph default
|
|
place where state information is kept.
|
|
There is no other place like
|
|
\family typewriter
|
|
/etc/drbd.conf
|
|
\family default
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Graphics
|
|
filename images/MatieresCorrosives.png
|
|
lyxscale 50
|
|
scale 17
|
|
|
|
\end_inset
|
|
|
|
Because of its internal character, its representation and semantics may
|
|
change at any time without notice (e.g.
|
|
via an
|
|
\emph on
|
|
internal
|
|
\emph default
|
|
upgrade procedure between major releases).
|
|
It is
|
|
\emph on
|
|
not
|
|
\emph default
|
|
an external interface to the outer world.
|
|
Don't build anything on it.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
However, knowledge of the symlink tree is useful for advanced sysadmins,
|
|
for
|
|
\series bold
|
|
human inspection
|
|
\series default
|
|
and for
|
|
\series bold
|
|
debugging
|
|
\series default
|
|
.
|
|
And, of course, for developers.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
As an
|
|
\begin_inset Quotes eld
|
|
\end_inset
|
|
|
|
official
|
|
\begin_inset Quotes erd
|
|
\end_inset
|
|
|
|
interface from outside, only the
|
|
\family typewriter
|
|
marsadm
|
|
\family default
|
|
command should be used.
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Documentation of the MARS Symlink Tree
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
XIO Worker Bricks
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
StrategY Worker Bricks
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
NYI
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
The XIO Brick Personality
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
The Generic Brick Infrastructure Layer
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
The Generic Object and Aspect Infrastructure
|
|
\end_layout
|
|
|
|
\begin_layout Chapter
|
|
\start_of_appendix
|
|
GNU Free Documentation License
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "chap:GNU-FDL"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
|
|
\family typewriter
|
|
\size footnotesize
|
|
\begin_inset ERT
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
|
|
\backslash
|
|
lstinputlisting{fdl.txt}
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_body
|
|
\end_document
|