mirror of https://github.com/schoebel/mars
infra: new Lamport clock implementation
This commit is contained in:
parent
4f071e362f
commit
104b3a522a
157
kernel/lamport.c
157
kernel/lamport.c
|
@ -3,8 +3,8 @@
|
|||
*
|
||||
* This file is part of MARS project: http://schoebel.github.io/mars/
|
||||
*
|
||||
* Copyright (C) 2010-2014 Thomas Schoebel-Theuer
|
||||
* Copyright (C) 2011-2014 1&1 Internet AG
|
||||
* Copyright (C) 2010-2017 Thomas Schoebel-Theuer
|
||||
* Copyright (C) 2011-2017 1&1 Internet AG
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
@ -24,50 +24,141 @@
|
|||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/semaphore.h>
|
||||
#include <linux/rwsem.h>
|
||||
|
||||
#include "lamport.h"
|
||||
|
||||
/* This implementation is a variant of the following:
|
||||
*
|
||||
@article{Lamport78,
|
||||
author = {Leslie Lamport},
|
||||
title = {Time, Clocks, and the Ordering of Events in a Distributed System},
|
||||
journal = {CACM},
|
||||
volume = {21},
|
||||
number = {7},
|
||||
year = {1978},
|
||||
pages = {558--565}
|
||||
}
|
||||
* We always get both the local real time and the Lamport time in parallel.
|
||||
* The Lamport timestamp cannot fall behind the real timestamp, but
|
||||
* it may go ahead (into the "future") when clocks in the distributed
|
||||
* system are not synchronized precisely enough (e.g. via ntp).
|
||||
*
|
||||
* Thus we have a physical Lamport clock with the additional property
|
||||
* that it cannot fall behind local realtime.
|
||||
*/
|
||||
|
||||
struct semaphore lamport_sem = __SEMAPHORE_INITIALIZER(lamport_sem, 1); // TODO: replace with spinlock if possible (first check)
|
||||
struct timespec lamport_now = {};
|
||||
/* TODO CHECK: would a different locking method be better?
|
||||
* rwlocks? RCU?
|
||||
*
|
||||
* I did not really check it, due to lack of time.
|
||||
*
|
||||
* The reason why I chose rw_semaphore (against some contemporary
|
||||
* "common belief") is the following:
|
||||
*
|
||||
* A Lamport clock is a _global_ object by definition (with respect
|
||||
* to an SMP system => attention we have two levels of parallelism:
|
||||
* one at the Distributed System level, and SMP at the node level).
|
||||
*
|
||||
* Thus it _can_ happen that the Lamport clock forms a bottleneck,
|
||||
* e.g. when O(n) MARS resources are syncing in parallel over a fast
|
||||
* network.
|
||||
*
|
||||
* Looking only at the "best case" where spinlocks or RCU might be faster
|
||||
* is therefore fundamentally broken. Instead, not only the
|
||||
* average case has to be observed, but also the worst case.
|
||||
*
|
||||
* We have some 40-core SMP/NUMA machines now (2017) at 1&1, and the number
|
||||
* will likely increase to 72 cores this year.
|
||||
* I know of cases where spinlock contention is really happening on
|
||||
* such machines in practice. If it happens, it almost kills the machine.
|
||||
*
|
||||
* When O(n) processors are spinning for the same bottleneck only _once_
|
||||
* each, already O(n^2) CPU cycles are burnt. When the bottleneck is
|
||||
* a _continuous_ one (e.g. multiple long-lasting MARS syncs in parallel),
|
||||
* then the whole machine may lose its efficiency and burn more than 90%
|
||||
* of its total CPU power in spinlocks.
|
||||
*
|
||||
* Thus I think some kind of scheduling lock is needed because the worst
|
||||
* case is an important one when the number of processors is high.
|
||||
*
|
||||
* Don't test this on workstations or notebooks, please test it on
|
||||
* the _most_ _powerful_ _servers_ you can get.
|
||||
*
|
||||
* THINK: is performance really the right measure in the long-term future?
|
||||
*
|
||||
* I think we should consider the _power_ _consumption_ (nJ / LamportOperation)
|
||||
* as a candidate for a more important measure in future.
|
||||
*
|
||||
* Please improve this code, but please use the right optimisation goal.
|
||||
*/
|
||||
struct rw_semaphore lamport_sem = __RWSEM_INITIALIZER(lamport_sem);
|
||||
|
||||
void get_lamport(struct timespec *real_now, struct timespec *lamp_now)
|
||||
struct timespec lamport_stamp = {};
|
||||
|
||||
void get_lamport(struct timespec *real_now, struct timespec *lamport_now)
|
||||
{
|
||||
int diff;
|
||||
struct timespec _real_now;
|
||||
struct timespec _lamport_now;
|
||||
|
||||
down(&lamport_sem);
|
||||
/* Get a consistent copy of _both_ clocks */
|
||||
down_read(&lamport_sem);
|
||||
_lamport_now = lamport_stamp;
|
||||
/* Theoretically, the next statement could be moved behind the unlock.
|
||||
* However, then we will lose strictness of real timestamps,
|
||||
* or even may produce contradictory orderings between real and
|
||||
* Lamport timestamps, respectively, in relation to pseudo-parallel
|
||||
* calls to get_lamport().
|
||||
*/
|
||||
_real_now = CURRENT_TIME;
|
||||
up_read(&lamport_sem);
|
||||
|
||||
*lamp_now = CURRENT_TIME;
|
||||
if (real_now)
|
||||
*real_now = *lamp_now;
|
||||
diff = timespec_compare(lamp_now, &lamport_now);
|
||||
if (diff >= 0) {
|
||||
timespec_add_ns(lamp_now, 1);
|
||||
memcpy(&lamport_now, lamp_now, sizeof(lamport_now));
|
||||
timespec_add_ns(&lamport_now, 1);
|
||||
} else {
|
||||
timespec_add_ns(&lamport_now, 1);
|
||||
memcpy(lamp_now, &lamport_now, sizeof(*lamp_now));
|
||||
}
|
||||
|
||||
up(&lamport_sem);
|
||||
*real_now = _real_now;
|
||||
/* use the maximum of both clocks as Lamport timestamp */
|
||||
if (timespec_compare(&_real_now, &_lamport_now) >= 0)
|
||||
*lamport_now = _real_now;
|
||||
else
|
||||
*lamport_now = _lamport_now;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(get_lamport);
|
||||
|
||||
void set_lamport(struct timespec *old)
|
||||
void set_lamport(struct timespec *lamport_old)
|
||||
{
|
||||
int diff;
|
||||
|
||||
down(&lamport_sem);
|
||||
|
||||
diff = timespec_compare(old, &lamport_now);
|
||||
if (diff >= 0) {
|
||||
memcpy(&lamport_now, old, sizeof(lamport_now));
|
||||
timespec_add_ns(&lamport_now, 1);
|
||||
}
|
||||
|
||||
up(&lamport_sem);
|
||||
/* Always advance the internal Lamport timestamp a little bit
|
||||
* in order to ensure strict monotonicity between set_lamport() calls.
|
||||
*/
|
||||
down_write(&lamport_sem);
|
||||
if (timespec_compare(lamport_old, &lamport_stamp) > 0)
|
||||
lamport_stamp = *lamport_old;
|
||||
else
|
||||
timespec_add_ns(&lamport_stamp, 1);
|
||||
up_write(&lamport_sem);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(set_lamport);
|
||||
|
||||
/* After advancing the Lamport time, re-get the new values.
|
||||
* This is almost equivalent to a sequence of set_lamport() ; get_lamport()
|
||||
* but more efficient because the lock is taken only once.
|
||||
*/
|
||||
void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now)
|
||||
{
|
||||
struct timespec _real_now;
|
||||
|
||||
down_write(&lamport_sem);
|
||||
if (timespec_compare(lamport_old, &lamport_stamp) > 0)
|
||||
*lamport_now = *lamport_old;
|
||||
else
|
||||
*lamport_now = timespec_add(lamport_stamp, (struct timespec){0, 1});
|
||||
lamport_stamp = *lamport_now;
|
||||
_real_now = CURRENT_TIME;
|
||||
up_write(&lamport_sem);
|
||||
|
||||
if (real_now)
|
||||
*real_now = _real_now;
|
||||
/* use the maximum of both clocks as Lamport timestamp */
|
||||
if (timespec_compare(&_real_now, lamport_now) > 0)
|
||||
*lamport_now = _real_now;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(set_get_lamport);
|
||||
|
|
|
@ -30,9 +30,13 @@
|
|||
* We always get both the local real time and the Lamport time in parallel,
|
||||
* consistently.
|
||||
*
|
||||
* The implementation ensures that the distributed Lamport timestamp can
|
||||
* never fall behind the local real time.
|
||||
*
|
||||
* When not interested in real time, you can simply leave real_now at NULL.
|
||||
*/
|
||||
extern void get_lamport(struct timespec *real_now, struct timespec *lamport_now);
|
||||
extern void set_lamport(struct timespec *lamport_old);
|
||||
extern void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now);
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue