mirror of
https://github.com/schoebel/mars
synced 2024-12-26 08:32:24 +00:00
Merge branch 'mars0.1.y' into mars0.1a.y
This commit is contained in:
commit
ac0677fcb7
63
ChangeLog
63
ChangeLog
@ -32,16 +32,21 @@ Release Conventions / Branches / Tagnames
|
||||
-----------------------------------------
|
||||
FLOW OF BUGFIXES: 0.1 -> 0.1a -> 0.1b -> 0.2 -> ...
|
||||
|
||||
mars0.1 series (stable):
|
||||
- Asynchronous replication.
|
||||
Currently operational at more than 3000 servers at
|
||||
1&1, more than 25,000,000 operating hours (Feb 2017)
|
||||
mars0.1 series (stable, will go EOL soon):
|
||||
- Will run in parallel to branch 0.1a for a few
|
||||
months, and then go EOL.
|
||||
- Unstable tagnames: light0.1beta%d.%d (obsolete)
|
||||
- Stable branch: mars0.1.y
|
||||
- Stable tagnames: mars0.1stable%02d
|
||||
|
||||
mars0.1a series:
|
||||
New designated master branch. Will become stable ASAP.
|
||||
mars0.1a series (stable):
|
||||
New master branch. Now stable.
|
||||
This branch is operational for several years on
|
||||
several thousands of servers, and several petabytes
|
||||
of data.
|
||||
- Unstable tagnames: light0.1abeta%d (obsolete)
|
||||
- Stable branch: mars0.1a.y
|
||||
- Stable tagnames: mars0.1astable%02d
|
||||
|
||||
mars0.1b series (currently alpha):
|
||||
This is an _intermediate_ series between 0.1 and 0.2.
|
||||
@ -283,28 +288,40 @@ mars0.1abeta0
|
||||
-----------------------------------
|
||||
Changelog for series 0.1:
|
||||
|
||||
Attention! This branch will go EOL in the next few months.
|
||||
Reason: branch 0.1a is productive for several months at 1&1.
|
||||
Experiences: seems to run better than 0.1.y with
|
||||
better performance, smoother, etc.
|
||||
Attention! This branch will go EOL around February 2019.
|
||||
Branch mars0.1a.y is now the new master branch, and all
|
||||
commits from the 0.1 series will be fully merged.
|
||||
Upgrade is easy: just roll out the new marsadm version,
|
||||
install the new kernel modules, and load them where possible.
|
||||
Mixed operation of different versions is no problem,
|
||||
but is of course not the desired state, so keep it short.
|
||||
but is of course not the desired state, so keep this period
|
||||
as short as possible.
|
||||
Rollback is also easy.
|
||||
If nobody complains, I'd like to get rid of this ancient
|
||||
branch ASAP.
|
||||
Before being closed, I am adding the new systemd templates
|
||||
for better upgrade / downgrade in case you need it. If you don't
|
||||
activate it, it should have zero impact.
|
||||
|
||||
Hint: branch 0.1a will get a merge from here, and then get the
|
||||
Remote Device (which was beta until now).
|
||||
The new code is not used when not activated, thus inclusion
|
||||
of new features into 0.1a should be low risk.
|
||||
Afterwards, 0.1a will be marked "stable", and newer features
|
||||
(except Football related ones) will then go to 0.1b.
|
||||
Finally, when 0.1a is stable, I will close this branch.
|
||||
Motivation: branch 0.1a is productive for several years at 1&1.
|
||||
Experiences: now runs provably better than 0.1.y with
|
||||
better performance, smoother, etc.
|
||||
And even more stable, although the 0.1a releases were
|
||||
called "beta" up to now.
|
||||
|
||||
mars0.1stable66
|
||||
* Critical fix, only relevant for kernels 4.3 to 4.4:
|
||||
Due to a forgotten adaptation to newer kernels,
|
||||
some userspace tools like xfs_repair could read/write
|
||||
wrong data upon _large_ IO requests, and/or kernel memory
|
||||
corruption could occur. Kernel-level filesystems
|
||||
are typically _not_ affected because they typically use 4k
|
||||
pages at maximum.
|
||||
If you are operating such a kernel, please upgrade to
|
||||
minimize any risks. You probably want userspace tools like
|
||||
xfs_repair to not crash your kernel ;)
|
||||
The problem was reproducibly detected at lab regression testing,
|
||||
_before_ updating a big installation from kernel 3.16 to 4.4.
|
||||
It did not show up with the old kernel.
|
||||
Notice: kernels >4.6 are not yet supported at the moment,
|
||||
but work on them is likely being continued during the next
|
||||
months. Stay tuned.
|
||||
* Minor doc updates.
|
||||
|
||||
mars0.1stable65
|
||||
* Major fix, only observed during KASAN debugging:
|
||||
|
59
docu/images/raid-lvm-architecture.fig
Normal file
59
docu/images/raid-lvm-architecture.fig
Normal file
@ -0,0 +1,59 @@
|
||||
#FIG 3.2 Produced by xfig version 3.2.5c
|
||||
Landscape
|
||||
Center
|
||||
Metric
|
||||
A4
|
||||
100.00
|
||||
Single
|
||||
-2
|
||||
1200 2
|
||||
6 270 2610 630 2880
|
||||
1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 450 2700 180 90 450 2700 630 2610
|
||||
1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 450 2790 180 90 450 2790 630 2700
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
270 2700 270 2790
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
630 2700 630 2790
|
||||
-6
|
||||
6 1170 2610 1530 2880
|
||||
1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 1350 2700 180 90 1350 2700 1530 2610
|
||||
1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 1350 2790 180 90 1350 2790 1530 2700
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
1170 2700 1170 2790
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
1530 2700 1530 2790
|
||||
-6
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
720 2250 540 2610
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
1080 2250 1260 2610
|
||||
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
|
||||
450 1980 1350 1980 1350 2250 450 2250 450 1980
|
||||
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
|
||||
450 270 1350 270 1350 720 450 720 450 270
|
||||
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
|
||||
450 1620 1350 1620 1350 1890 450 1890 450 1620
|
||||
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
|
||||
450 1260 1350 1260 1350 1530 450 1530 450 1260
|
||||
2 2 1 1 0 -1 50 -1 -1 4.000 0 0 -1 0 0 5
|
||||
450 810 1350 810 1350 1080 450 1080 450 810
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
900 720 900 810
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
900 1080 900 1260
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
900 1530 900 1620
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
900 1890 900 1980
|
||||
4 1 0 50 -1 2 15 0.0000 4 45 180 900 2790 ...\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 135 780 900 3060 48 spindles\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 195 900 450 zfs\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 135 660 900 630 snapshots\001
|
||||
4 1 0 50 -1 2 15 0.0000 4 45 180 900 2430 ...\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 405 900 2160 RAID\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 120 660 900 1800 pvs + vgs\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 135 735 900 990 replication\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 195 900 1440 lvs\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 270 270 1440 10x\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 270 270 990 10x\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 270 270 540 10x\001
|
47
docu/images/zpool-architecture.fig
Normal file
47
docu/images/zpool-architecture.fig
Normal file
@ -0,0 +1,47 @@
|
||||
#FIG 3.2 Produced by xfig version 3.2.5c
|
||||
Landscape
|
||||
Center
|
||||
Metric
|
||||
A4
|
||||
100.00
|
||||
Single
|
||||
-2
|
||||
1200 2
|
||||
6 270 2610 630 2880
|
||||
1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 450 2700 180 90 450 2700 630 2610
|
||||
1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 450 2790 180 90 450 2790 630 2700
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
270 2700 270 2790
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
630 2700 630 2790
|
||||
-6
|
||||
6 1170 2610 1530 2880
|
||||
1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 1350 2700 180 90 1350 2700 1530 2610
|
||||
1 1 0 1 0 7 50 -1 -1 0.000 1 0.0000 1350 2790 180 90 1350 2790 1530 2700
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
1170 2700 1170 2790
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
1530 2700 1530 2790
|
||||
-6
|
||||
6 90 540 450 720
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 270 270 720 10x\001
|
||||
-6
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
720 2250 540 2610
|
||||
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
|
||||
1080 2250 1260 2610
|
||||
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
|
||||
450 270 1350 270 1350 2250 450 2250 450 270
|
||||
2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2
|
||||
450 1260 1350 1260
|
||||
4 1 0 50 -1 2 15 0.0000 4 45 180 900 2790 ...\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 135 780 900 3060 48 spindles\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 135 390 900 1620 zpool\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 135 885 900 1980 functionality\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 120 660 900 1800 pvs + vgs\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 195 900 450 zfs\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 135 660 900 630 snapshots\001
|
||||
4 1 0 50 -1 -1 10 0.0000 4 105 495 900 810 +RAID\001
|
||||
4 1 0 50 -1 2 15 0.0000 4 45 180 900 2430 ...\001
|
||||
4 1 0 50 -1 20 8 0.0000 4 135 810 900 1350 interface\001
|
||||
4 1 0 50 -1 20 8 0.0000 4 135 720 900 1260 internal\001
|
@ -141,7 +141,7 @@ tst@1und1.de
|
||||
\end_layout
|
||||
|
||||
\begin_layout Date
|
||||
Version 0.1a-18
|
||||
Version 0.1a-66
|
||||
\end_layout
|
||||
|
||||
\begin_layout Lowertitleback
|
||||
@ -398,15 +398,15 @@ too simple
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
However, with the advent of a new method called
|
||||
\begin_inset Quotes eld
|
||||
\end_inset
|
||||
|
||||
However, this picture has changed with the advent of a new
|
||||
\series bold
|
||||
load balancing
|
||||
\series default
|
||||
method called
|
||||
\series bold
|
||||
LV Football
|
||||
\begin_inset Quotes erd
|
||||
\end_inset
|
||||
|
||||
this picture has changed, see chapter
|
||||
\series default
|
||||
, see chapter
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand ref
|
||||
reference "chap:LV-Football"
|
||||
@ -943,11 +943,15 @@ sub-component
|
||||
\emph default
|
||||
).
|
||||
Typical granularity is replication of whole internal storage pools, or
|
||||
of LVs, or of filesystem data.
|
||||
of LVs, or of filesystem instances.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Description
|
||||
LocalStorage, and some further models like RemoteSharding (see section
|
||||
LocalStorage, and some further models like
|
||||
\family typewriter
|
||||
RemoteSharding
|
||||
\family default
|
||||
(see section
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand ref
|
||||
reference "subsec:Variants-of-Sharding"
|
||||
@ -985,8 +989,11 @@ Big Virtual LVM Pool
|
||||
\end_layout
|
||||
|
||||
\begin_layout Description
|
||||
(4) at least Eventually Consistent or better can be alternatively achieved
|
||||
by
|
||||
(4) at least
|
||||
\family typewriter
|
||||
Eventually Consistent
|
||||
\family default
|
||||
or better can be alternatively achieved by
|
||||
\end_layout
|
||||
|
||||
\begin_deeper
|
||||
@ -995,12 +1002,19 @@ Big Virtual LVM Pool
|
||||
\series bold
|
||||
DRBD
|
||||
\series default
|
||||
, which provides Strict consistency during
|
||||
, which provides
|
||||
\family typewriter
|
||||
Strict Consistency
|
||||
\family default
|
||||
during
|
||||
\family typewriter
|
||||
connected
|
||||
\family default
|
||||
state, but works only reliably with passive crossover cables over short
|
||||
distances (see CAP theorem in section
|
||||
state, but works only reliably with passive crossover cables over
|
||||
\series bold
|
||||
short distances
|
||||
\series default
|
||||
(see CAP theorem in section
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand vref
|
||||
reference "sec:Explanation-via-CAP"
|
||||
@ -1008,6 +1022,23 @@ reference "sec:Explanation-via-CAP"
|
||||
\end_inset
|
||||
|
||||
).
|
||||
\begin_inset Newline newline
|
||||
\end_inset
|
||||
|
||||
Notice: DRBD violates any type of consistency within your
|
||||
\emph on
|
||||
replicas
|
||||
\emph default
|
||||
during (automatic) re-sync, and thus does not
|
||||
\emph on
|
||||
fully
|
||||
\emph default
|
||||
comply with the above definition of cloud storage in a
|
||||
\emph on
|
||||
strong
|
||||
\emph default
|
||||
sense.
|
||||
But you can argue at a coarse time granularity level in order to fix this.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Description
|
||||
@ -1015,8 +1046,12 @@ reference "sec:Explanation-via-CAP"
|
||||
\series bold
|
||||
MARS
|
||||
\series default
|
||||
, which works over long distances and provides two different consistency
|
||||
guarantees at different levels,
|
||||
, which works over
|
||||
\series bold
|
||||
long distances
|
||||
\series default
|
||||
and provides two different consistency guarantees at different levels,
|
||||
|
||||
\emph on
|
||||
both at the same time
|
||||
\emph default
|
||||
@ -1025,7 +1060,11 @@ both at the same time
|
||||
|
||||
\begin_deeper
|
||||
\begin_layout Description
|
||||
locally: Strict local consistency at LV granularity, also
|
||||
locally:
|
||||
\family typewriter
|
||||
Strict Consistency
|
||||
\family default
|
||||
at local LV granularity, also
|
||||
\emph on
|
||||
within
|
||||
\emph default
|
||||
@ -1033,11 +1072,39 @@ within
|
||||
\end_layout
|
||||
|
||||
\begin_layout Description
|
||||
globally: Eventually consistent
|
||||
globally:
|
||||
\family typewriter
|
||||
Eventually Consistent
|
||||
\family default
|
||||
|
||||
\emph on
|
||||
between
|
||||
\emph default
|
||||
different LV replicas.
|
||||
different LV replicas (global level).
|
||||
\begin_inset Newline newline
|
||||
\end_inset
|
||||
|
||||
The CAP theorem (see section
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand ref
|
||||
reference "sec:Explanation-via-CAP"
|
||||
|
||||
\end_inset
|
||||
|
||||
) says that
|
||||
\family typewriter
|
||||
Strict Consistency
|
||||
\family default
|
||||
is
|
||||
\series bold
|
||||
not possible
|
||||
\series default
|
||||
in general at
|
||||
\emph on
|
||||
unplanned failover
|
||||
\emph default
|
||||
during long-distance network outages (P = Partitioning Tolerance), when
|
||||
A = Availability is also a requirement.
|
||||
\begin_inset Newline newline
|
||||
\end_inset
|
||||
|
||||
@ -1045,8 +1112,89 @@ However, in case of a
|
||||
\emph on
|
||||
planned handover
|
||||
\emph default
|
||||
, it is also strictly consistent at a global level, but may need some extra
|
||||
time for catching up.
|
||||
, MARS is also
|
||||
\family typewriter
|
||||
Strictly Consistent
|
||||
\family default
|
||||
at a global level, but may need some extra time for catching up.
|
||||
\begin_inset Newline newline
|
||||
\end_inset
|
||||
|
||||
Notice: global
|
||||
\family typewriter
|
||||
Strict Consistency
|
||||
\family default
|
||||
is also possible at a
|
||||
\emph on
|
||||
coarse timescale
|
||||
\emph default
|
||||
, in accordance with the CAP theorem, if you decide to sacrifice A = Availabilit
|
||||
y during such a network incident by simply
|
||||
\emph on
|
||||
not
|
||||
\emph default
|
||||
doing a failover action.
|
||||
Just wait until the network outage is gone, and MARS will automatically
|
||||
resume
|
||||
\begin_inset Foot
|
||||
status open
|
||||
|
||||
\begin_layout Plain Layout
|
||||
This automatic MARS behaviour is similar to the behaviour of DRBD in such
|
||||
situations, when DRBD can automatically go to
|
||||
\family typewriter
|
||||
disconnected
|
||||
\family default
|
||||
-like state, and you are later manually or automatically resuming the DRBD
|
||||
connection for an incremental re-sync.
|
||||
MARS does everything automatically because it has no firmly built-in assumption
|
||||
s about the actual duration of any network communication.
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
||||
everything ASAP, and thus you are using MARS
|
||||
\emph on
|
||||
only
|
||||
\emph default
|
||||
as a protection against
|
||||
\series bold
|
||||
fatal
|
||||
\series default
|
||||
storage failures / unplanned
|
||||
\series bold
|
||||
disasters
|
||||
\series default
|
||||
.
|
||||
\begin_inset Newline newline
|
||||
\end_inset
|
||||
|
||||
Notice: A = Availability is
|
||||
\emph on
|
||||
not generally
|
||||
\emph default
|
||||
required by the above definition of cloud storage, because from a user's
|
||||
perspective it would not generally make sense in the global internet where
|
||||
connection loss may anyway occur at any time.
|
||||
Thus it is a valid operational strategy to
|
||||
\emph on
|
||||
not
|
||||
\emph default
|
||||
fail-over your LVs during certain major network outages.
|
||||
\begin_inset Newline newline
|
||||
\end_inset
|
||||
|
||||
Notice: long-term
|
||||
\series bold
|
||||
disaster tolerance
|
||||
\series default
|
||||
(e.g.
|
||||
perpetual loss of some storage nodes during an earthquake) is
|
||||
\emph on
|
||||
not
|
||||
\emph default
|
||||
modeled by the CAP theorem, but is more or less required by (2) and (3)
|
||||
from the above definition of cloud storage.
|
||||
\end_layout
|
||||
|
||||
\end_deeper
|
||||
@ -1703,7 +1851,7 @@ Protect for logical data corruption
|
||||
\begin_inset Text
|
||||
|
||||
\begin_layout Plain Layout
|
||||
yes
|
||||
yes (partly)
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
@ -1834,7 +1982,7 @@ tical storage.
|
||||
\begin_inset Text
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
OpenSource Component
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
@ -2377,6 +2525,189 @@ The last item means that ZFS by itself does not protect against amok-running
|
||||
enterprise-critical applications.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/lightbulb_brightlit_benj_.png
|
||||
lyxscale 12
|
||||
scale 7
|
||||
|
||||
\end_inset
|
||||
|
||||
Notice that zfs snapshots can be combined with DRBD or MARS, because zfs
|
||||
snapshots are residing at
|
||||
\emph on
|
||||
filesystem
|
||||
\emph default
|
||||
layer, while DRBD / MARS replicas are located at
|
||||
\emph on
|
||||
block
|
||||
\emph default
|
||||
layer.
|
||||
Just create your zpools at the
|
||||
\emph on
|
||||
top
|
||||
\emph default
|
||||
of DRBD or MARS virtual devices, and import / export them
|
||||
\emph on
|
||||
individually
|
||||
\emph default
|
||||
upon handover / failover of each LV.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/MatieresCorrosives.png
|
||||
lyxscale 50
|
||||
scale 17
|
||||
|
||||
\end_inset
|
||||
|
||||
There is a
|
||||
\series bold
|
||||
\emph on
|
||||
fundamental
|
||||
\series default
|
||||
\emph default
|
||||
difference between zpools and classical RAID / LVM stacked architectures.
|
||||
Some zfs advocates are propagating zpools as a replacement for both RAID
|
||||
and LVM.
|
||||
However, there is a
|
||||
\series bold
|
||||
massive difference
|
||||
\series default
|
||||
in architecture, as explained in the following example (10 logical resources
|
||||
over 48 physical spindles), achieving practically the
|
||||
\series bold
|
||||
\emph on
|
||||
same
|
||||
\series default
|
||||
zfs snapshot functionality
|
||||
\emph default
|
||||
from a user's perspective, but in a different way:
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\align center
|
||||
\begin_inset Graphics
|
||||
filename images/raid-lvm-architecture.fig
|
||||
height 6cm
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\begin_inset Graphics
|
||||
filename images/zpool-architecture.fig
|
||||
height 6cm
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
When RAID functionality is executed by zfs, it will be located at the
|
||||
\emph on
|
||||
top
|
||||
\emph default
|
||||
of the hierarchy.
|
||||
On one hand, this easily allows for different RAID levels for each of the
|
||||
10 different logical resources.
|
||||
On the other hand, this
|
||||
\emph on
|
||||
exposes
|
||||
\emph default
|
||||
the
|
||||
\series bold
|
||||
physical spindle configuration
|
||||
\series default
|
||||
to the topmost filesystem layer (48 spindles in this example).
|
||||
There is no easy way for replication of these
|
||||
\emph on
|
||||
physical properties
|
||||
\emph default
|
||||
in a larger / heterogeneous distributed system, e.g.
|
||||
when some hardware components are replaced over a longer period of time
|
||||
(hardware lifecycle, or LV Football as explained in chapter
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand ref
|
||||
reference "chap:LV-Football"
|
||||
|
||||
\end_inset
|
||||
|
||||
).
|
||||
Essentially, only replication of
|
||||
\emph on
|
||||
logical
|
||||
\emph default
|
||||
structures like snapshots remains as the only reasonable option, with its
|
||||
drawbacks as explained above.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/MatieresCorrosives.png
|
||||
lyxscale 50
|
||||
scale 17
|
||||
|
||||
\end_inset
|
||||
|
||||
There is another argument: zfs tries to
|
||||
\emph on
|
||||
hide
|
||||
\emph default
|
||||
its internal structures and interfaces from the sysadmins, forming a more
|
||||
or less
|
||||
\series bold
|
||||
monolithic
|
||||
\begin_inset Foot
|
||||
status open
|
||||
|
||||
\begin_layout Plain Layout
|
||||
Some sysadmins acting as zfs advocates are claiming this as an advantage,
|
||||
because they need to understand only a single tool for managing
|
||||
\begin_inset Quotes eld
|
||||
\end_inset
|
||||
|
||||
everything
|
||||
\begin_inset Quotes erd
|
||||
\end_inset
|
||||
|
||||
.
|
||||
However, this is a short-sighted argument when it comes to
|
||||
\emph on
|
||||
true
|
||||
\emph default
|
||||
flexibility as offered by a component-based system, where multiple types
|
||||
of hardware / software RAID, multiple types of LVM functionality, and much
|
||||
more can be almost orthogonally combined in a very flexible way.
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
||||
architecture
|
||||
\series default
|
||||
as seen from outside.
|
||||
This violates the classical
|
||||
\emph on
|
||||
layering rules
|
||||
\emph default
|
||||
from Dijkstra.
|
||||
In contrast, classical LVM-based configurations are
|
||||
\series bold
|
||||
component oriented
|
||||
\series default
|
||||
, according to the
|
||||
\series bold
|
||||
Unix philosophy
|
||||
\series default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Section
|
||||
Local vs Centralized Storage
|
||||
\begin_inset CommandInset label
|
||||
@ -41872,6 +42203,23 @@ netstat
|
||||
\family default
|
||||
& friends.
|
||||
Updates may need some time to proceed (socket timeouts etc).
|
||||
\begin_inset Newline newline
|
||||
\end_inset
|
||||
|
||||
Hint: for safety, call this on
|
||||
\emph on
|
||||
all
|
||||
\emph default
|
||||
members of a cluster to ensure consistency.
|
||||
Otherwise it may happen that some cluster members do not know the
|
||||
\emph on
|
||||
new
|
||||
\emph default
|
||||
IP address where to fetch the
|
||||
\emph on
|
||||
new
|
||||
\emph default
|
||||
information from.
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
Binary file not shown.
@ -43,6 +43,7 @@
|
||||
#define USE_MAX_PHYS_SEGMENTS (MARS_MAX_SEGMENT_SIZE >> 9)
|
||||
#define USE_MAX_SEGMENT_SIZE MARS_MAX_SEGMENT_SIZE
|
||||
#define USE_LOGICAL_BLOCK_SIZE 512
|
||||
#define USE_MAX_HW_SECTORS 1
|
||||
#define USE_SEGMENT_BOUNDARY (PAGE_SIZE-1)
|
||||
|
||||
#define USE_CONGESTED_FN
|
||||
@ -74,6 +75,15 @@
|
||||
#define HAS_MERGE_BVEC
|
||||
#endif
|
||||
|
||||
/* 54efd50bfd873e2dbf784e0b21a8027ba4299a3e and 8ae126660fddbeebb9251a174e6fa45b6ad8f932,
|
||||
* detected via 4246a0b63bd8f56a1469b12eafeb875b1041a451
|
||||
*/
|
||||
#ifndef BIO_UPTODATE
|
||||
#define NEED_BIO_SPLIT
|
||||
#undef USE_MAX_PHYS_SEGMENTS
|
||||
#define USE_MAX_PHYS_SEGMENTS 1
|
||||
#endif
|
||||
|
||||
// end_remove_this
|
||||
///////////////////////// global tuning ////////////////////////
|
||||
|
||||
@ -341,17 +351,24 @@ void if_timer(unsigned long data)
|
||||
/* accept a linux bio, convert to mref and call buf_io() on it.
|
||||
*/
|
||||
static
|
||||
// remove_this
|
||||
/* see dece16353ef47d8d33f5302bc158072a9d65e26f */
|
||||
#ifdef BLK_QC_T_NONE
|
||||
#ifdef NEED_BIO_SPLIT
|
||||
// end_remove_this
|
||||
blk_qc_t if_make_request(struct request_queue *q, struct bio *bio)
|
||||
blk_qc_t _if_make_request(struct request_queue *q, struct bio *bio)
|
||||
// remove_this
|
||||
#else
|
||||
blk_qc_t if_make_request(struct request_queue *q, struct bio *bio)
|
||||
#endif
|
||||
#elif defined(BIO_CPU_AFFINE)
|
||||
int if_make_request(struct request_queue *q, struct bio *bio)
|
||||
#else
|
||||
#ifdef NEED_BIO_SPLIT
|
||||
void _if_make_request(struct request_queue *q, struct bio *bio)
|
||||
#else
|
||||
void if_make_request(struct request_queue *q, struct bio *bio)
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
struct if_input *input = q->queuedata;
|
||||
struct if_brick *brick = input->brick;
|
||||
@ -804,6 +821,24 @@ done:
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef NEED_BIO_SPLIT
|
||||
static
|
||||
#ifdef BLK_QC_T_NONE
|
||||
blk_qc_t if_make_request(struct request_queue *q, struct bio *bio)
|
||||
#else
|
||||
void if_make_request(struct request_queue *q, struct bio *bio)
|
||||
#endif
|
||||
{
|
||||
blk_queue_split(q, &bio, q->bio_split);
|
||||
#ifdef BLK_QC_T_NONE
|
||||
return _if_make_request(q, bio);
|
||||
#else
|
||||
_if_make_request(q, bio);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef BLK_MAX_REQUEST_COUNT
|
||||
//static
|
||||
void if_unplug(struct request_queue *q)
|
||||
@ -988,6 +1023,10 @@ static int if_switch(struct if_brick *brick)
|
||||
MARS_DBG("blk_queue_max_hw_segments()\n");
|
||||
blk_queue_max_hw_segments(q, USE_MAX_HW_SEGMENTS);
|
||||
#endif
|
||||
#ifdef USE_MAX_HW_SECTORS
|
||||
MARS_DBG("blk_queue_max_hw_sectors()\n");
|
||||
blk_queue_max_hw_sectors(q, USE_MAX_HW_SECTORS);
|
||||
#endif
|
||||
#ifdef USE_MAX_SEGMENT_SIZE
|
||||
MARS_DBG("blk_queue_max_segment_size()\n");
|
||||
blk_queue_max_segment_size(q, USE_MAX_SEGMENT_SIZE);
|
||||
|
@ -26,8 +26,7 @@
|
||||
|
||||
#include <linux/semaphore.h>
|
||||
|
||||
#define HT_SHIFT 6 //????
|
||||
#define MARS_MAX_SEGMENT_SIZE (1U << (9+HT_SHIFT))
|
||||
#define MARS_MAX_SEGMENT_SIZE (PAGE_SIZE)
|
||||
|
||||
#define MAX_BIO 32
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user