infra: new Lamport clock implementation

This commit is contained in:
Thomas Schoebel-Theuer 2017-04-15 08:56:20 +02:00
parent 4f071e362f
commit 104b3a522a
2 changed files with 128 additions and 33 deletions

View File

@ -3,8 +3,8 @@
* *
* This file is part of MARS project: http://schoebel.github.io/mars/ * This file is part of MARS project: http://schoebel.github.io/mars/
* *
* Copyright (C) 2010-2014 Thomas Schoebel-Theuer * Copyright (C) 2010-2017 Thomas Schoebel-Theuer
* Copyright (C) 2011-2014 1&1 Internet AG * Copyright (C) 2011-2017 1&1 Internet AG
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -24,50 +24,141 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/semaphore.h> #include <linux/rwsem.h>
#include "lamport.h" #include "lamport.h"
/* This implementation is a variant of the following:
struct semaphore lamport_sem = __SEMAPHORE_INITIALIZER(lamport_sem, 1); // TODO: replace with spinlock if possible (first check) *
struct timespec lamport_now = {}; @article{Lamport78,
author = {Leslie Lamport},
void get_lamport(struct timespec *real_now, struct timespec *lamp_now) title = {Time, Clocks, and the Ordering of Events in a Distributed System},
{ journal = {CACM},
int diff; volume = {21},
number = {7},
down(&lamport_sem); year = {1978},
pages = {558--565}
*lamp_now = CURRENT_TIME;
if (real_now)
*real_now = *lamp_now;
diff = timespec_compare(lamp_now, &lamport_now);
if (diff >= 0) {
timespec_add_ns(lamp_now, 1);
memcpy(&lamport_now, lamp_now, sizeof(lamport_now));
timespec_add_ns(&lamport_now, 1);
} else {
timespec_add_ns(&lamport_now, 1);
memcpy(lamp_now, &lamport_now, sizeof(*lamp_now));
} }
* We always get both the local real time and the Lamport time in parallel.
* The Lamport timestamp cannot fall behind the real timestamp, but
* it may go ahead (into the "future") when clocks in the distributed
* system are not synchronized precisely enough (e.g. via ntp).
*
* Thus we have a physical Lamport clock with the additional property
* that it cannot fall behind local realtime.
*/
up(&lamport_sem); /* TODO CHECK: would a different locking method be better?
* rwlocks? RCU?
*
* I did not really check it, due to lack of time.
*
* The reason why I chose rw_semaphore (against some contemporary
* "common belief") is the following:
*
* A Lamport clock is a _global_ object by definition (with respect
* to an SMP system => attention we have two levels of parallelism:
* one at the Distributed System level, and SMP at the node level).
*
* Thus it _can_ happen that the Lamport clock forms a bottleneck,
* e.g. when O(n) MARS resources are syncing in parallel over a fast
* network.
*
* Looking only at the "best case" where spinlocks or RCU might be faster
* is therefore fundamentally broken. Instead, not only the
* average case has to be observed, but also the worst case.
*
* We have some 40-core SMP/NUMA machines now (2017) at 1&1, and the number
* will likely increase to 72 cores this year.
* I know of cases where spinlock contention is really happening on
* such machines in practice. If it happens, it almost kills the machine.
*
* When O(n) processors are spinning for the same bottleneck only _once_
* each, already O(n^2) CPU cycles are burnt. When the bottleneck is
* a _continuous_ one (e.g. multiple long-lasting MARS syncs in parallel),
* then the whole machine may lose its efficiency and burn more than 90%
* of its total CPU power in spinlocks.
*
* Thus I think some kind of scheduling lock is needed because the worst
* case is an important one when the number of processors is high.
*
* Don't test this on workstations or notebooks, please test it on
* the _most_ _powerful_ _servers_ you can get.
*
* THINK: is performance really the right measure in the long-term future?
*
* I think we should consider the _power_ _consumption_ (nJ / LamportOperation)
* as a candidate for a more important measure in future.
*
* Please improve this code, but please use the right optimisation goal.
*/
/* Reader/writer lock taken around every access to lamport_stamp below. */
struct rw_semaphore lamport_sem = __RWSEM_INITIALIZER(lamport_sem);

/* Internally stored Lamport timestamp; starts out as all-zero. */
struct timespec lamport_stamp = { .tv_sec = 0, .tv_nsec = 0 };
/* get_lamport() - sample the local real time and the Lamport time together.
 *
 * @real_now:    optional output; when non-NULL it receives the CURRENT_TIME
 *               sample taken while the lock is held.
 * @lamport_now: output; receives the later of that real-time sample and the
 *               stored lamport_stamp. It is dereferenced unconditionally,
 *               so it must not be NULL.
 *
 * Only the read side of lamport_sem is taken; lamport_stamp itself is not
 * modified by this function.
 */
void get_lamport(struct timespec *real_now, struct timespec *lamport_now)
{
struct timespec _real_now;
struct timespec _lamport_now;
/* Get a consistent copy of _both_ clocks */
down_read(&lamport_sem);
_lamport_now = lamport_stamp;
/* Theoretically, the next statement could be moved behind the unlock.
* However, then we would lose strictness of real timestamps,
* or might even produce contradictory orderings between real and
* Lamport timestamps, respectively, in relation to pseudo-parallel
* calls to get_lamport().
*/
_real_now = CURRENT_TIME;
up_read(&lamport_sem);
/* real_now is optional: callers not interested in real time pass NULL */
if (real_now)
*real_now = _real_now;
/* use the maximum of both clocks as Lamport timestamp */
if (timespec_compare(&_real_now, &_lamport_now) >= 0)
*lamport_now = _real_now;
else
*lamport_now = _lamport_now;
/* NOTE(review): the doubled brace below looks like residue of a
 * side-by-side diff rendering (old and new column) -- confirm.
 */
} }
EXPORT_SYMBOL_GPL(get_lamport); EXPORT_SYMBOL_GPL(get_lamport);
/* NOTE(review): every line of this block appears to fuse the old (left) and
 * the new (right) column of a side-by-side diff -- confirm against the real
 * file before relying on it.
 *
 * Reading only the right-hand column: set_lamport() takes lamport_sem for
 * writing, adopts *lamport_old as the new lamport_stamp when it is strictly
 * later than the stored value, and otherwise advances lamport_stamp by one
 * nanosecond, so the stored stamp changes on every call.
 */
void set_lamport(struct timespec *old) void set_lamport(struct timespec *lamport_old)
{ {
int diff; /* Always advance the internal Lamport timestamp a little bit
* in order to ensure strict monotonicity between set_lamport() calls.
down(&lamport_sem); */
down_write(&lamport_sem);
diff = timespec_compare(old, &lamport_now); if (timespec_compare(lamport_old, &lamport_stamp) > 0)
if (diff >= 0) { lamport_stamp = *lamport_old;
memcpy(&lamport_now, old, sizeof(lamport_now)); else
timespec_add_ns(&lamport_now, 1); timespec_add_ns(&lamport_stamp, 1);
} up_write(&lamport_sem);
up(&lamport_sem);
} }
EXPORT_SYMBOL_GPL(set_lamport); EXPORT_SYMBOL_GPL(set_lamport);
/* set_get_lamport() - advance the Lamport clock and fetch both clocks.
 *
 * @lamport_old: Lamport timestamp supplied by the caller.
 * @real_now:    optional output; when non-NULL it receives the CURRENT_TIME
 *               sample taken while the lock is held.
 * @lamport_now: output; receives the resulting Lamport timestamp.
 *
 * Almost equivalent to calling set_lamport() followed by get_lamport(),
 * but the lock is taken only once.
 */
void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now)
{
	const struct timespec one_ns = { 0, 1 };
	struct timespec wall_clock;

	down_write(&lamport_sem);
	/* Either jump forward to the caller's stamp when it is strictly newer,
	 * or tick the stored stamp by one nanosecond.
	 */
	*lamport_now = (timespec_compare(lamport_old, &lamport_stamp) > 0)
		? *lamport_old
		: timespec_add(lamport_stamp, one_ns);
	lamport_stamp = *lamport_now;
	/* Sample real time while still holding the lock. */
	wall_clock = CURRENT_TIME;
	up_write(&lamport_sem);

	if (real_now)
		*real_now = wall_clock;

	/* The reported Lamport time is the maximum of both clocks. */
	if (timespec_compare(&wall_clock, lamport_now) > 0)
		*lamport_now = wall_clock;
}
EXPORT_SYMBOL_GPL(set_get_lamport);

View File

@ -30,9 +30,13 @@
* We always get both the local real time and the Lamport time in parallel, * We always get both the local real time and the Lamport time in parallel,
* consistently. * consistently.
* *
* The implementation ensures that the distributed Lamport timestamp can
* never fall behind the local real time.
*
* When not interested in real time, you can simply leave real_now at NULL. * When not interested in real time, you can simply leave real_now at NULL.
*/ */
extern void get_lamport(struct timespec *real_now, struct timespec *lamport_now); extern void get_lamport(struct timespec *real_now, struct timespec *lamport_now);
extern void set_lamport(struct timespec *lamport_old); extern void set_lamport(struct timespec *lamport_old);
extern void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now);
#endif #endif