mirror of
https://github.com/schoebel/mars
synced 2024-12-27 00:52:21 +00:00
Merge branch 'mars0.1.y' into mars0.1a.y
This commit is contained in:
commit
abf5b84b6d
76
ChangeLog
76
ChangeLog
@ -30,7 +30,7 @@ Example: mars0.3alpha*:
|
||||
|
||||
Release Conventions / Branches / Tagnames
|
||||
-----------------------------------------
|
||||
FLOW OF BUGFIXES: 0.1 -> 0.1a -> 0.1b -> 0.2 -> ...
|
||||
FLOW OF BUGFIXES: 0.1 -> 0.1a -> ...
|
||||
|
||||
mars0.1 series (stable, will go EOL soon):
|
||||
- Will run in parallel to branch 0.1a for a few
|
||||
@ -48,64 +48,6 @@ Release Conventions / Branches / Tagnames
|
||||
- Stable branch: mars0.1a.y
|
||||
- Stable tagnames: mars0.1astable%02d
|
||||
|
||||
mars0.1b series (currently alpha):
|
||||
This is an _intermediate_ series between 0.1 and 0.2.
|
||||
The goal is to improve _scalability_ to thousands of
|
||||
hosts in one cluster, as well as thousands of resources.
|
||||
Likely, this intermediate branch will be merged into 0.2
|
||||
and then continue development there. When this point
|
||||
will arrive is uncertain at the moment.
|
||||
Likely, the stabilization of the new scalability features
|
||||
will occur together with the 0.2 series.
|
||||
Reason for this: the rollout strategy at 1&1 to
|
||||
thousands of machines wants to do small incremental
|
||||
steps. The risk of directly going to 0.2 in _masses_
|
||||
is minimized by first rolling out the really necessary
|
||||
changes, and to postpone those developments which are
|
||||
currently not yet really needed in mass deployment.
|
||||
|
||||
mars0.2 series (currently in beta stage):
|
||||
Mostly for internal needs of 1&1 (but not limited to that).
|
||||
- Getting rid of the kernel prepatch! MARS may be built
|
||||
as an external kernel module for any supported
|
||||
kernel version. First prototype is only tested for
|
||||
unaltered 3.2.x vanilla kernel, but compatibility to
|
||||
further vanilla kernel versions (maybe even
|
||||
Redhat-specific ones) will follow during the course of
|
||||
the MARS mars0.2 stable series. The problem is not
|
||||
compatibility as such, but _testing_ that it really
|
||||
works. These tests need a lot of time.
|
||||
=> further arguments for getting to kernel upstream ASAP.
|
||||
- Improved network throughput by parallel TCP connections
|
||||
(in particular under packet loss).
|
||||
Also called "socket bundling".
|
||||
First benchmarks show an impressive speedup over
|
||||
highly congested long-distance lines.
|
||||
- Future-proof updates in the network protocol:
|
||||
Mixed operation of 32/64bit and/or {big,little}endian
|
||||
- Support for multi-homed network interfaces.
|
||||
- Transparent data compression over low bandwidth lines.
|
||||
Consumes a lot of CPU, therefore only recommended for
|
||||
low write loads or for desperate network situations.
|
||||
- Remote device: bypassing iSCSI. In essence,
|
||||
/dev/mars/mydata can appear at any other cluster member
|
||||
which doesn't necessarily need any local disks.
|
||||
- Various smaller features and improvements.
|
||||
- Unstable tagnames: mars0.2beta%d.%d (current)
|
||||
- Stable branch: mars0.2.y (already in use for beta)
|
||||
- Stable tagnames: mars0.2stable%02d (planned)
|
||||
|
||||
mars0.3 series (planned):
|
||||
(some might possibly go to 1.0 series instead)
|
||||
- Improve replication latency.
|
||||
- New pseudo-synchronous replication modes.
|
||||
For the internal needs of database folks at 1&1.
|
||||
- (Maybe) old test suite could be retired, a new
|
||||
one is at github.com/schoebel/test-suite
|
||||
- Unstable tagnames: mars0.3beta%d.%d (planned)
|
||||
- Stable branch: mars0.3.y (planned)
|
||||
- Stable tagnames: mars0.3stable%02d (planned)
|
||||
|
||||
mars1.0 series (planned):
|
||||
- Replace symlink tree by transactional status files
|
||||
(future-proof)
|
||||
@ -130,16 +72,6 @@ Release Conventions / Branches / Tagnames
|
||||
necessary for a bugfix, or for an important usability improvement
|
||||
(such as clearer display of errors, hints for resolving them, etc).
|
||||
|
||||
-----------------------------------
|
||||
Changelog for series 0.2:
|
||||
|
||||
(you need to checkout branch mars0.2.y to see any details)
|
||||
|
||||
-----------------------------------
|
||||
Changelog for series 0.1b:
|
||||
|
||||
(you need to checkout branch mars0.1b.y to see any details)
|
||||
|
||||
-----------------------------------
|
||||
Changelog for series 0.1a:
|
||||
|
||||
@ -401,6 +333,12 @@ Attention! This branch will go EOL around March 2019.
|
||||
And even more stable, although the 0.1a releases were
|
||||
called "beta" up to now.
|
||||
|
||||
mars0.1stable72
|
||||
* Minor fix: writeback improved in a corner case.
|
||||
* Minor improvement: display WriteBack data amount in
|
||||
marsadm view.
|
||||
* Major doc improvement: describe IO performance tuning.
|
||||
|
||||
mars0.1stable71
|
||||
* Major fix: writeback at the primary was unnecessarily
|
||||
slow at certain situations.
|
||||
|
Binary file not shown.
After Width: | Height: | Size: 53 KiB |
Binary file not shown.
After Width: | Height: | Size: 108 KiB |
Binary file not shown.
After Width: | Height: | Size: 91 KiB |
Binary file not shown.
After Width: | Height: | Size: 51 KiB |
Binary file not shown.
After Width: | Height: | Size: 82 KiB |
@ -147,7 +147,7 @@ tst@1und1.de
|
||||
\end_layout
|
||||
|
||||
\begin_layout Date
|
||||
Version 0.1a-70
|
||||
Version 0.1a-72
|
||||
\end_layout
|
||||
|
||||
\begin_layout Lowertitleback
|
||||
@ -15194,6 +15194,17 @@ For better performance, use newer MARS versions from branch
|
||||
mars0.1a.y
|
||||
\family default
|
||||
or later.
|
||||
Check the tips and tricks from section
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand vref
|
||||
reference "sec:IO-Performance-Tuning"
|
||||
plural "false"
|
||||
caps "false"
|
||||
noprefix "false"
|
||||
|
||||
\end_inset
|
||||
|
||||
.
|
||||
You may also play around with
|
||||
\family typewriter
|
||||
/proc/sys/mars/aio_sync_mode
|
||||
@ -23492,11 +23503,19 @@ natural races
|
||||
\labelwidthstring 00.00.0000
|
||||
|
||||
\family typewriter
|
||||
WriteBack
|
||||
WriteBack[
|
||||
\emph on
|
||||
amount
|
||||
\emph default
|
||||
]
|
||||
\family default
|
||||
(cf
|
||||
\family typewriter
|
||||
%is-primary{}
|
||||
\family default
|
||||
and amount via
|
||||
\family typewriter
|
||||
%writeback-rest{}
|
||||
\family default
|
||||
) Appears only at actual primaries (whether designated or not), when the
|
||||
writeback from the RAM buffer is active (see section
|
||||
@ -23506,7 +23525,13 @@ reference "sec:The-Transaction-Logger"
|
||||
|
||||
\end_inset
|
||||
|
||||
)
|
||||
).
|
||||
The
|
||||
\emph on
|
||||
amount
|
||||
\emph default
|
||||
is displayed in human readable form, and may be used for a very rough estimatio
|
||||
n of recovery time after a primary crash.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Labeling
|
||||
@ -25185,7 +25210,35 @@ resize
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
The following macros are only meaningful for secondary nodes.
|
||||
The following macros are only meaningful for resources in primary mode:
|
||||
\end_layout
|
||||
|
||||
\begin_layout Labeling
|
||||
\labelwidthstring 00.00.0000
|
||||
|
||||
\family typewriter
|
||||
writeback-rest
|
||||
\family default
|
||||
Show the amount of data which is already in the transaction logfile, but
|
||||
has not yet been written back to the underlying disk.
|
||||
This may be used for estimation of recovery time after a potential primary
|
||||
crash.
|
||||
The writeback buffer is explained by the graphics at
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand vref
|
||||
reference "sec:The-Transaction-Logger"
|
||||
plural "false"
|
||||
caps "false"
|
||||
noprefix "false"
|
||||
|
||||
\end_inset
|
||||
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
The following macros are only meaningful for resources in secondary mode.
|
||||
By information theoretic limits, they can only tell what is
|
||||
\emph on
|
||||
locally known
|
||||
@ -44963,6 +45016,685 @@ replication networks
|
||||
Tips and Tricks
|
||||
\end_layout
|
||||
|
||||
\begin_layout Section
|
||||
IO Performance Tuning
|
||||
\begin_inset CommandInset label
|
||||
LatexCommand label
|
||||
name "sec:IO-Performance-Tuning"
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
There
|
||||
\emph on
|
||||
exist
|
||||
\emph default
|
||||
some use cases where MARS
|
||||
\emph on
|
||||
can
|
||||
\emph default
|
||||
deliver better IO performance than a raw block device.
|
||||
However, this cannot be expected
|
||||
\emph on
|
||||
in general
|
||||
\emph default
|
||||
.
|
||||
In some
|
||||
\emph on
|
||||
other
|
||||
\emph default
|
||||
cases the performance may be
|
||||
\emph on
|
||||
lower
|
||||
\emph default
|
||||
than with a
|
||||
\emph on
|
||||
single
|
||||
\emph default
|
||||
local raw device.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
For demonstration, we use the
|
||||
\family typewriter
|
||||
blkreplay
|
||||
\family default
|
||||
tool from
|
||||
\begin_inset Flex URL
|
||||
status open
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
http://blkreplay.org
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
||||
and a load which has been captured from a
|
||||
\series bold
|
||||
real datacenter
|
||||
\series default
|
||||
(1&1 Ionos ShaHoLin = Shared Hosting Linux).
|
||||
The load already contains a parallelism degree of 20 LXC containers running
|
||||
in parallel at the same iron.
|
||||
This corresponds to about 60,000 web spaces running on 20 Apache instances,
|
||||
already in parallel.
|
||||
In contrast to artificial benchmarks (like pure random IO or pure sequential
|
||||
IO), this benchmark is much closer to real server operations, while
|
||||
artificial benchmarks are not meaningful for practice in general, because
|
||||
they can deviate from real server operations by
|
||||
\emph on
|
||||
factors
|
||||
\emph default
|
||||
or even by
|
||||
\series bold
|
||||
orders of magnitude
|
||||
\series default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
In order to determine the limits of the test candidates, the timing of the
|
||||
original workload was converted to a linear ramp-up, simulating an
|
||||
\series bold
|
||||
overloaded
|
||||
\series default
|
||||
system.
|
||||
Otherwise benchmarking would not be possible.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
The following
|
||||
\family typewriter
|
||||
blkreplay
|
||||
\family default
|
||||
benchmarks were executed on an otherwise unloaded Dell R630 with 40 CPU
|
||||
threads on 2 sockets, 192 GB RAM, a Dell R730 hardware RAID controller
|
||||
with 2 GB BBU cache, and 10 spindles Dell 1.8 TB 2.5 inch SAS disks configured
|
||||
as RAID-6.
|
||||
All data, including the
|
||||
\family typewriter
|
||||
/mars
|
||||
\family default
|
||||
directory, was located on the hardware RAID via LVM2.
|
||||
|
||||
\family typewriter
|
||||
/dev/vginfong/lv-0
|
||||
\family default
|
||||
was assigned a size of 8 TiB.
|
||||
For testing, vanilla kernel 4.9.x with the MARS pre-patch and
|
||||
\family typewriter
|
||||
mars0.1astable72
|
||||
\family default
|
||||
was used.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
The
|
||||
\family typewriter
|
||||
blkreplay
|
||||
\family default
|
||||
parameters were as follows:
|
||||
\begin_inset listings
|
||||
inline false
|
||||
status open
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
output_label="MARS"
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
# input description
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
input_file_list="http://blkreplay.org/loads/natural/1and1/shared-hosting/2016/Sha
|
||||
HoLin_from_bare_metal/x20/shaholin-x20-ramped/shaholin-x20.adjacent.ramped-100.load.
|
||||
gz"
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
replay_duration=110
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
speedup=10
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
threads=512
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
cmode=with-conflicts
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
scheduler="noop"
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
# hardware setup
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
replay_host_list="icpu5133"
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
replay_device_list="/dev/vginfong/lv-0"
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
# output description
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
enable_graph=1
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
graph_options="--no-static --dynamic"
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
We start with the
|
||||
\series bold
|
||||
raw
|
||||
\series default
|
||||
device
|
||||
\family typewriter
|
||||
/dev/vginfong/lv-0
|
||||
\family default
|
||||
which had a size of 8 TiB.
|
||||
The throughput is about 1418 IOPS, and the latency diagram shows that the
|
||||
system is overloaded, but can cope with that overload:
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\align center
|
||||
\begin_inset Graphics
|
||||
filename images/blkreplay/MARS.MARS.raw.iosched-noop.nr_request-128.icpu5133.vginfong.lv-0.g01.latency.realtime.png
|
||||
width 100col%
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
As you can see in the filename, the NOOP kernel IO scheduler was used, and
|
||||
the kernel parameter
|
||||
\family typewriter
|
||||
nr_requests
|
||||
\family default
|
||||
was left at its default value of 128.
|
||||
When you read the specs of the Dell R730 hardware RAID controller, you
|
||||
will notice that it can handle a much higher IO request parallelism of
|
||||
almost 1024 requests in parallel.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
So the first natural tuning attempt is
|
||||
\family typewriter
|
||||
nr_requests=1020
|
||||
\family default
|
||||
, in order to release the
|
||||
\begin_inset Quotes eld
|
||||
\end_inset
|
||||
|
||||
kernel IO handbrake
|
||||
\begin_inset Quotes erd
|
||||
\end_inset
|
||||
|
||||
.
|
||||
This results in an improved throughput of 1562 IOPS, and even the
|
||||
\emph on
|
||||
maximum
|
||||
\emph default
|
||||
latencies are improved, but the
|
||||
\emph on
|
||||
average
|
||||
\emph default
|
||||
latencies are becoming a little bit worse:
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\align center
|
||||
\begin_inset Graphics
|
||||
filename images/blkreplay/MARS.MARS.raw.iosched-noop.nr_request-1020.icpu5133.vginfong.lv-0.g01.latency.realtime.png
|
||||
width 100col%
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
It has been well known for decades that there is a fundamental tradeoff between
|
||||
throughput and latencies in IO systems.
|
||||
Thus it is not a surprising result.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
On servers, overload situations should be rare, and during overload throughput
|
||||
is typically much more important than latencies, as long as latencies are
|
||||
not exceedingly high.
|
||||
Thus we can recommend
|
||||
\family typewriter
|
||||
nr_requests=1000
|
||||
\family default
|
||||
for production.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
However, some sysadmins might be tempted to question why the NOOP scheduler
|
||||
has been used.
|
||||
On the internet, there are a ton of claims that CFQ is much better.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Well, testing with CFQ instead of NOOP is no problem for
|
||||
\family typewriter
|
||||
blkreplay
|
||||
\family default
|
||||
.
|
||||
However, the result is very surprising.
|
||||
While the IOPS are 1539, which is only a slight decrease which could result
|
||||
from measurement tolerances, the latencies are now turning almost into
|
||||
a disaster:
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\align center
|
||||
\begin_inset Graphics
|
||||
filename images/blkreplay/MARS.MARS.raw.iosched-cfq.nr_request-1020.icpu5133.vginfong.lv-0.g01.latency.realtime.png
|
||||
width 100col%
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
In production, you should never encounter IO latencies of almost 15 seconds.
|
||||
So what is going wrong here?
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Here is an explanation.
|
||||
A hardware RAID controller
|
||||
\emph on
|
||||
already
|
||||
\emph default
|
||||
has an
|
||||
\emph on
|
||||
internal
|
||||
\emph default
|
||||
IO scheduler.
|
||||
This IO scheduler is hidden in a black box, such that many sysadmins don't
|
||||
know of its existence.
|
||||
If you add another IO scheduler at kernel level, you will have
|
||||
\series bold
|
||||
two different
|
||||
\series default
|
||||
IO schedulers running in parallel, and sometimes taking
|
||||
\series bold
|
||||
contradictory decisions
|
||||
\series default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
These contradictory scheduling decisions may lead to problems in certain
|
||||
cases and scenarios.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
While kernel-level IO schedulers like CFQ certainly have their merits at
|
||||
improving your workstation's IO behaviour, they are counter-productive
|
||||
at servers with hardware RAID controllers.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
So the advice is clear:
|
||||
\series bold
|
||||
switch them off
|
||||
\series default
|
||||
|
||||
\emph on
|
||||
in such a case
|
||||
\emph default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Even if you have a software RAID, check with
|
||||
\family typewriter
|
||||
blkreplay
|
||||
\family default
|
||||
that any IO schedulers are
|
||||
\emph on
|
||||
really
|
||||
\emph default
|
||||
improving things.
|
||||
When possible, use your real workload, captured with
|
||||
\family typewriter
|
||||
blktrace
|
||||
\family default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/MatieresCorrosives.png
|
||||
lyxscale 50
|
||||
scale 17
|
||||
|
||||
\end_inset
|
||||
|
||||
Never use a benchmark which only delivers IOPS! As demonstrated, inappropriate
|
||||
IOPS tuning (or choice of inappropriate components) can worsen latencies
|
||||
so much that production can be endangered!
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/lightbulb_brightlit_benj_.png
|
||||
lyxscale 12
|
||||
scale 7
|
||||
|
||||
\end_inset
|
||||
|
||||
Always look at
|
||||
\emph on
|
||||
both
|
||||
\emph default
|
||||
IOPS
|
||||
\emph on
|
||||
and
|
||||
\emph default
|
||||
latencies!
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/MatieresCorrosives.png
|
||||
lyxscale 50
|
||||
scale 17
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\emph on
|
||||
Average
|
||||
\emph default
|
||||
latencies, even when enriched with
|
||||
\emph on
|
||||
standard deviation
|
||||
\emph default
|
||||
, are not enough.
|
||||
Classical statistics does not clearly describe operational problems like
|
||||
|
||||
\series bold
|
||||
hangs
|
||||
\series default
|
||||
and
|
||||
\series bold
|
||||
exceptionally high latency requests
|
||||
\series default
|
||||
, which may occur only rarely, but can then lead to
|
||||
\series bold
|
||||
serious incidents
|
||||
\series default
|
||||
.
|
||||
Use a tool which can clearly display
|
||||
\emph on
|
||||
any
|
||||
\emph default
|
||||
faulty behaviour, such as
|
||||
\family typewriter
|
||||
blkreplay
|
||||
\family default
|
||||
's
|
||||
\series bold
|
||||
latency diagrams
|
||||
\series default
|
||||
!
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Now we come to benchmarking
|
||||
\family typewriter
|
||||
/dev/mars/lv-0
|
||||
\family default
|
||||
placed on top of
|
||||
\family typewriter
|
||||
/dev/vginfong/lv-0
|
||||
\family default
|
||||
.
|
||||
Notice that MARS needs to write all write requests twice: once into the
|
||||
transaction logfile, and a second time by writeback into
|
||||
\family typewriter
|
||||
/dev/vginfong/lv-0
|
||||
\family default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
So you might expect that performance of
|
||||
\family typewriter
|
||||
/dev/mars/lv-0
|
||||
\family default
|
||||
could be worse than at the underlying raw device.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Nevertheless, the
|
||||
\series bold
|
||||
throughput
|
||||
\series default
|
||||
is now measured at 4338 IOPS, which means that performance has
|
||||
\series bold
|
||||
more than doubled
|
||||
\series default
|
||||
.
|
||||
You can also see it by the duration of the benchmark at the x axis.
|
||||
Even the latencies have improved in many cases:
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\align center
|
||||
\begin_inset Graphics
|
||||
filename images/blkreplay/MARS.MARS.mars.iosched-noop.nr_request-1020.icpu5133.mars.lv-0.g01.latency.realtime.png
|
||||
width 100col%
|
||||
|
||||
\end_inset
|
||||
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
How is it possible to be faster than a RAW device? How can this be explained?
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Look at the graphics and at the explanations from section
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand vref
|
||||
reference "sec:The-Transaction-Logger"
|
||||
plural "false"
|
||||
caps "false"
|
||||
noprefix "false"
|
||||
|
||||
\end_inset
|
||||
|
||||
.
|
||||
The key to local IO performance is the
|
||||
\series bold
|
||||
re-ordering of writeback
|
||||
\series default
|
||||
according to ascending sector numbers.
|
||||
This can reduce mechanical seek times of hard disks considerably, and even
|
||||
by factors, such that it can over-compensate the doubled writes to the
|
||||
transaction logfile, and even when both are residing at the same RAID set.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Notice: this effect is not only dependent on total RAM size and on the
|
||||
maximum size of the MARS temporary memory buffer (tuning parameter
|
||||
\family typewriter
|
||||
/proc/sys/mars/mars_mem_percent
|
||||
\family default
|
||||
which defaults to a limit of 20%).
|
||||
It is also highly dependent on the actual seek behaviour of the
|
||||
\series bold
|
||||
workload
|
||||
\series default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
For example, if you use
|
||||
\family typewriter
|
||||
dd
|
||||
\family default
|
||||
for sequentially overwriting /dev/mars/lv-0 with a parallelism degree of
|
||||
1, the writeback optimization of MARS cannot be exploited.
|
||||
However,
|
||||
\family typewriter
|
||||
dd
|
||||
\family default
|
||||
is not an appropriate benchmarking tool, and has almost nothing to do with
|
||||
real workloads occurring in datacenters, which typically are neither sequential,
|
||||
nor do they have a parallelism degree of only 1.
|
||||
Please don't try to lead any discussions about this: simply use
|
||||
\family typewriter
|
||||
blktrace
|
||||
\family default
|
||||
to capture your real server workload, and compare it to a run of dd.
|
||||
Only if you encounter the same behaviour as
|
||||
\family typewriter
|
||||
dd
|
||||
\family default
|
||||
, only then you can really claim that your workload is like
|
||||
\family typewriter
|
||||
dd
|
||||
\family default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/MatieresCorrosives.png
|
||||
lyxscale 50
|
||||
scale 17
|
||||
|
||||
\end_inset
|
||||
|
||||
Any assumptions about workloads are very dangerous: they can deviate from
|
||||
practice not only by factors, but sometimes even by
|
||||
\emph on
|
||||
orders of magnitude
|
||||
\emph default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Notice: the writeback optimization of MARS can typically only improve performanc
|
||||
e of HDDs, but not of SSDs.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/lightbulb_brightlit_benj_.png
|
||||
lyxscale 12
|
||||
scale 7
|
||||
|
||||
\end_inset
|
||||
|
||||
By placing
|
||||
\family typewriter
|
||||
/mars
|
||||
\family default
|
||||
onto its own physical device with appropriate speed, you can compensate
|
||||
the doubled writes to some degree.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
\noindent
|
||||
\begin_inset Graphics
|
||||
filename images/lightbulb_brightlit_benj_.png
|
||||
lyxscale 12
|
||||
scale 7
|
||||
|
||||
\end_inset
|
||||
|
||||
Depending on the workload and on RAID parameters,
|
||||
\family typewriter
|
||||
/mars
|
||||
\family default
|
||||
may be better placed onto SSDs, or better be placed on HDDs.
|
||||
There is no general rule.
|
||||
Just use
|
||||
\family typewriter
|
||||
blktrace
|
||||
\family default
|
||||
on your real workload, and check several configuration alternatives (also
|
||||
different RAID levels etc) with
|
||||
\family typewriter
|
||||
blkreplay
|
||||
\family default
|
||||
.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Section
|
||||
Avoiding Inappropriate Clustermanager Types for Medium and Long-Distance
|
||||
Replication
|
||||
|
@ -86,7 +86,6 @@ struct trans_logger_hash_anchor {
|
||||
///////////////////////// global tuning ////////////////////////
|
||||
|
||||
int trans_logger_completion_semantics = 1;
|
||||
EXPORT_SYMBOL_GPL(trans_logger_completion_semantics);
|
||||
|
||||
int trans_logger_do_crc =
|
||||
#ifdef CONFIG_MARS_DEBUG
|
||||
@ -94,26 +93,22 @@ int trans_logger_do_crc =
|
||||
#else
|
||||
false;
|
||||
#endif
|
||||
EXPORT_SYMBOL_GPL(trans_logger_do_crc);
|
||||
|
||||
int trans_logger_mem_usage; // in KB
|
||||
EXPORT_SYMBOL_GPL(trans_logger_mem_usage);
|
||||
|
||||
int trans_logger_pressure_limit = 0;
|
||||
|
||||
int trans_logger_max_interleave = -1;
|
||||
EXPORT_SYMBOL_GPL(trans_logger_max_interleave);
|
||||
|
||||
int trans_logger_resume = 1;
|
||||
EXPORT_SYMBOL_GPL(trans_logger_resume);
|
||||
|
||||
int trans_logger_replay_timeout = 1; // in s
|
||||
EXPORT_SYMBOL_GPL(trans_logger_replay_timeout);
|
||||
|
||||
struct writeback_group global_writeback = {
|
||||
.mutex = __RWSEM_INITIALIZER(global_writeback.mutex),
|
||||
.group_anchor = LIST_HEAD_INIT(global_writeback.group_anchor),
|
||||
.until_percent = 30,
|
||||
};
|
||||
EXPORT_SYMBOL_GPL(global_writeback);
|
||||
|
||||
static
|
||||
void add_to_group(struct writeback_group *gr, struct trans_logger_brick *brick)
|
||||
@ -2311,13 +2306,28 @@ struct rank_info global_rank_mref_flying[] = {
|
||||
{ RKI_DUMMY }
|
||||
};
|
||||
|
||||
/* Not checking pressure means to always have writeback pressure
|
||||
* by default. No pressure means that writeback may be postponed
|
||||
* when other IO is more important.
|
||||
*/
|
||||
static inline
|
||||
bool _check_pressure(struct trans_logger_brick *brick)
|
||||
{
|
||||
int active =
|
||||
atomic_read(&brick->any_fly_count) +
|
||||
brick->q_phase[0].q_queued + brick->q_phase[0].q_active;
|
||||
|
||||
return (active > trans_logger_pressure_limit) &&
|
||||
brick->power.button;
|
||||
}
|
||||
|
||||
static noinline
|
||||
int _do_ranking(struct trans_logger_brick *brick)
|
||||
{
|
||||
struct rank_data *rkd = brick->rkd;
|
||||
int res;
|
||||
int i;
|
||||
int floating_mode;
|
||||
int pressure_mode;
|
||||
int mref_flying;
|
||||
bool delay_callers;
|
||||
|
||||
@ -2325,15 +2335,13 @@ int _do_ranking(struct trans_logger_brick *brick)
|
||||
|
||||
// check the memory situation...
|
||||
delay_callers = false;
|
||||
floating_mode = 1;
|
||||
if (atomic_read(&brick->any_fly_count) +
|
||||
brick->q_phase[0].q_queued + brick->q_phase[0].q_active <= 0) {
|
||||
/* do not change floating_mode */
|
||||
} else if (brick_global_memlimit >= 1024) {
|
||||
pressure_mode = 1;
|
||||
if (brick_global_memlimit >= 1024) {
|
||||
int global_mem_used = atomic64_read(&global_mshadow_used) / 1024;
|
||||
trans_logger_mem_usage = global_mem_used;
|
||||
|
||||
floating_mode = (global_mem_used < brick_global_memlimit / 2) ? 0 : 1;
|
||||
if (_check_pressure(brick))
|
||||
pressure_mode = (global_mem_used < brick_global_memlimit / 2) ? 0 : 1;
|
||||
|
||||
if (global_mem_used >= brick_global_memlimit)
|
||||
delay_callers = true;
|
||||
@ -2342,7 +2350,8 @@ int _do_ranking(struct trans_logger_brick *brick)
|
||||
} else if (brick->shadow_mem_limit >= 8) {
|
||||
int local_mem_used = atomic64_read(&brick->shadow_mem_used) / 1024;
|
||||
|
||||
floating_mode = (local_mem_used < brick->shadow_mem_limit / 2) ? 0 : 1;
|
||||
if (_check_pressure(brick))
|
||||
pressure_mode = (local_mem_used < brick->shadow_mem_limit / 2) ? 0 : 1;
|
||||
|
||||
if (local_mem_used >= brick->shadow_mem_limit)
|
||||
delay_callers = true;
|
||||
@ -2403,7 +2412,7 @@ int _do_ranking(struct trans_logger_brick *brick)
|
||||
if (i == 0) {
|
||||
// limit mref IO parallelism on transaction log
|
||||
ranking_compute(&rkd[0], extra_rank_mref_flying, mref_flying);
|
||||
} else if (i == 1 && !floating_mode) {
|
||||
} else if (i == 1 && !pressure_mode) {
|
||||
struct trans_logger_brick *leader;
|
||||
int lim;
|
||||
|
||||
@ -2439,13 +2448,13 @@ int _do_ranking(struct trans_logger_brick *brick)
|
||||
}
|
||||
}
|
||||
|
||||
ranking_compute(&rkd[i], queue_ranks[floating_mode][i], queued);
|
||||
ranking_compute(&rkd[i], queue_ranks[pressure_mode][i], queued);
|
||||
|
||||
flying = brick->q_phase[i].q_active - brick->q_phase[i].q_active;
|
||||
|
||||
MARS_IO("i = %d queued = %d flying = %d\n", i, queued, flying);
|
||||
|
||||
ranking_compute(&rkd[i], fly_ranks[floating_mode][i], flying);
|
||||
ranking_compute(&rkd[i], fly_ranks[pressure_mode][i], flying);
|
||||
}
|
||||
|
||||
// finalize it
|
||||
|
@ -50,6 +50,7 @@
|
||||
extern int trans_logger_completion_semantics;
|
||||
extern int trans_logger_do_crc;
|
||||
extern int trans_logger_mem_usage; // in KB
|
||||
extern int trans_logger_pressure_limit;
|
||||
extern int trans_logger_max_interleave;
|
||||
extern int trans_logger_resume;
|
||||
extern int trans_logger_replay_timeout; // in s
|
||||
|
@ -354,6 +354,7 @@ struct ctl_table mars_table[] = {
|
||||
INT_ENTRY("delay_say_on_overflow",delay_say_on_overflow, 0600),
|
||||
INT_ENTRY("mapfree_period_sec", mapfree_period_sec, 0600),
|
||||
INT_ENTRY("mapfree_grace_keep_mb", mapfree_grace_keep_mb, 0600),
|
||||
INT_ENTRY("logger_pressure_limit", trans_logger_pressure_limit, 0600),
|
||||
INT_ENTRY("logger_max_interleave", trans_logger_max_interleave, 0600),
|
||||
INT_ENTRY("logger_resume", trans_logger_resume, 0600),
|
||||
INT_ENTRY("logger_replay_timeout_sec", trans_logger_replay_timeout, 0600),
|
||||
|
@ -4387,6 +4387,7 @@ sub eval_fn {
|
||||
if (/^is[-_]?orphan$/) {
|
||||
my $peer = parse_macro($arg1, $env);
|
||||
$peer = $$env{"host"} unless $peer;
|
||||
return 0 if eval_fn($env, "is-primary", $peer);
|
||||
my $replay = get_link($$env{"resdir"} . "/replay-$peer", 1);
|
||||
$replay =~ m/^(log-[^,]+),([0-9]*)/;
|
||||
my $logfile = $$env{"resdir"} . "/" . $1;
|
||||
@ -4475,7 +4476,9 @@ sub eval_fn {
|
||||
my $what = $1;
|
||||
my $is = "is";
|
||||
$is = "has" if $what eq "emergency";
|
||||
my $lnk = $$env{"resdir"} . "/actual-" . $$env{"host"} . "/$is-$what";
|
||||
my $peer = parse_macro($arg1, $env);
|
||||
$peer = $$env{"host"} unless $peer;
|
||||
my $lnk = $$env{"resdir"} . "/actual-$peer/$is-$what";
|
||||
$lnk = correct_path($lnk);
|
||||
return get_link($lnk, 1);
|
||||
}
|
||||
@ -4568,6 +4571,13 @@ sub eval_fn {
|
||||
my $what = $1;
|
||||
return eval_fn($env, "$what-lognr", "") - eval_fn($env, "replay-lognr", "");
|
||||
}
|
||||
if (/^writeback[-_]?rest$/) {
|
||||
my $lnk = $$env{"resdir"} . "/replay-" . $$env{"host"};
|
||||
my $link = get_link($lnk, 1);
|
||||
$link =~ m/,([0-9]+)$/;
|
||||
return $1 if defined($1);
|
||||
return 0;
|
||||
}
|
||||
if (/^(replay|work)[-_]?(pos)$/) {
|
||||
my $what = $1;
|
||||
my $op = $2;
|
||||
@ -5049,7 +5059,7 @@ my %complex_macros =
|
||||
. "%elsif{%not{%todo-primary{}}}{"
|
||||
. "InConsistent"
|
||||
. "}{%is-primary{}}{"
|
||||
. "WriteBack"
|
||||
. "WriteBack[%human-numbers{}{ }{ }{%writeback-rest{}}]"
|
||||
. "}{"
|
||||
. "Recovery"
|
||||
. "}"
|
||||
@ -5057,7 +5067,7 @@ my %complex_macros =
|
||||
. "%elsif{%not{%todo-primary{}}}{"
|
||||
. "OutDated[%call{outdated-flags}]"
|
||||
. "}{%is-primary{}}{"
|
||||
. "WriteBack"
|
||||
. "WriteBack[%human-numbers{}{ }{ }{%writeback-rest{}}]"
|
||||
. "}{"
|
||||
. "Recovery"
|
||||
. "}"
|
||||
@ -5479,6 +5489,8 @@ my %trivial_globs =
|
||||
=> "",
|
||||
"{sync,fetch,replay,work}-{rest,{almost-,threshold-,}reached,percent,permille,vector}"
|
||||
=> "",
|
||||
"writeback-rest"
|
||||
=> "",
|
||||
"{sync,fetch,replay}-{rate,remain}"
|
||||
=> "",
|
||||
"replay-basenr"
|
||||
|
Loading…
Reference in New Issue
Block a user