From 104b3a522a131adf3a494c66c3f582634e22389d Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Sat, 15 Apr 2017 08:56:20 +0200 Subject: [PATCH] infra: new Lamport clock implementation --- kernel/lamport.c | 157 +++++++++++++++++++++++++++++++++++++---------- kernel/lamport.h | 4 ++ 2 files changed, 128 insertions(+), 33 deletions(-) diff --git a/kernel/lamport.c b/kernel/lamport.c index 010ba375..853530c2 100644 --- a/kernel/lamport.c +++ b/kernel/lamport.c @@ -3,8 +3,8 @@ * * This file is part of MARS project: http://schoebel.github.io/mars/ * - * Copyright (C) 2010-2014 Thomas Schoebel-Theuer - * Copyright (C) 2011-2014 1&1 Internet AG + * Copyright (C) 2010-2017 Thomas Schoebel-Theuer + * Copyright (C) 2011-2017 1&1 Internet AG * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -24,50 +24,141 @@ #include #include -#include +#include #include "lamport.h" +/* This implementation is a variant of the following: + * +@article{Lamport78, + author = {Leslie Lamport}, + title = {Time, Clocks, and the Ordering of Events in a Distributed System}, + journal = {CACM}, + volume = {21}, + number = {7}, + year = {1978}, + pages = {558--565} +} + * We always get both the local real time and the Lamport time in parallel. + * The Lamport timestamp cannot fall behind the real timestamp, but + * it may go ahead (into the "future") when clocks in the distributed + * system are not synchronized precisely enough (e.g. via ntp). + * + * Thus we have a physical Lamport clock with the additional property + * that it cannot fall behind local realtime. + */ -struct semaphore lamport_sem = __SEMAPHORE_INITIALIZER(lamport_sem, 1); // TODO: replace with spinlock if possible (first check) -struct timespec lamport_now = {}; +/* TODO CHECK: would a different locking method be better? + * rwlocks? RCU? + * + * I did not really check it, due to lack of time. 
+ * + * The reason why I chose rw_semaphore (against some contemporary + * "common belief") is the following: + * + * A Lamport clock is a _global_ object by definition (with respect + * to an SMP system => attention we have two levels of parallelism: + * one at the Distributed System level, and SMP at the node level). + * + * Thus it _can_ happen that the Lamport clock forms a bottleneck, + * e.g. when O(n) MARS resources are syncing in parallel over a fast + * network. + * + * Looking only at the "best case" where spinlocks or RCU might be faster + * is therefore fundamentally broken. Instead, not only the + * average case has to be observed, but also the worst case. + * + * We have some 40-core SMP/NUMA machines now (2017) at 1&1, and the number + * will likely increase to 72 cores this year. + * I know of cases where spinlock contention is really happening on + * such machines in practice. If it happens, it almost kills the machine. + * + * When O(n) processors are spinning for the same bottleneck only _once_ + * each, already O(n^2) CPU cycles are burnt. When the bottleneck is + * a _continuous_ one (e.g. multiple long-lasting MARS syncs in parallel), + * then the whole machine may lose its efficiency and burn more than 90% + * of its total CPU power in spinlocks. + * + * Thus I think some kind of scheduling lock is needed because the worst + * case is an important one when the number of processors is high. + * + * Don't test this on workstations or notebooks, please test it on + * the _most_ _powerful_ _servers_ you can get. + * + * THINK: is performance really the right measure in the long-term future? + * + * I think we should consider the _power_ _consumption_ (nJ / LamportOperation) + * as a candidate for a more important measure in future. + * + * Please improve this code, but please use the right optimisation goal. 
+ */ +struct rw_semaphore lamport_sem = __RWSEM_INITIALIZER(lamport_sem); -void get_lamport(struct timespec *real_now, struct timespec *lamp_now) +struct timespec lamport_stamp = {}; + +void get_lamport(struct timespec *real_now, struct timespec *lamport_now) { - int diff; + struct timespec _real_now; + struct timespec _lamport_now; - down(&lamport_sem); + /* Get a consistent copy of _both_ clocks */ + down_read(&lamport_sem); + _lamport_now = lamport_stamp; + /* Theoretically, the next statement could be moved behind the unlock. + * However, then we will loose strictness of real timestamps, + * or even may produce contradictory orderings between real and + * Lamport timestamps, respectively, in relation to pseudo-parallel + * calls to get_lamport(). + */ + _real_now = CURRENT_TIME; + up_read(&lamport_sem); - *lamp_now = CURRENT_TIME; if (real_now) - *real_now = *lamp_now; - diff = timespec_compare(lamp_now, &lamport_now); - if (diff >= 0) { - timespec_add_ns(lamp_now, 1); - memcpy(&lamport_now, lamp_now, sizeof(lamport_now)); - timespec_add_ns(&lamport_now, 1); - } else { - timespec_add_ns(&lamport_now, 1); - memcpy(lamp_now, &lamport_now, sizeof(*lamp_now)); - } - - up(&lamport_sem); + *real_now = _real_now; + /* use the maximum of both clocks as Lamport timestamp */ + if (timespec_compare(&_real_now, &_lamport_now) >= 0) + *lamport_now = _real_now; + else + *lamport_now = _lamport_now; } EXPORT_SYMBOL_GPL(get_lamport); -void set_lamport(struct timespec *old) +void set_lamport(struct timespec *lamport_old) { - int diff; - - down(&lamport_sem); - - diff = timespec_compare(old, &lamport_now); - if (diff >= 0) { - memcpy(&lamport_now, old, sizeof(lamport_now)); - timespec_add_ns(&lamport_now, 1); - } - - up(&lamport_sem); + /* Always advance the internal Lamport timestamp a little bit + * in order to ensure strict monotonicity between set_lamport() calls. 
+ */ + down_write(&lamport_sem); + if (timespec_compare(lamport_old, &lamport_stamp) > 0) + lamport_stamp = *lamport_old; + else + timespec_add_ns(&lamport_stamp, 1); + up_write(&lamport_sem); } EXPORT_SYMBOL_GPL(set_lamport); + +/* After advancing the Lamport time, re-get the new values. + * This is almost equivalent to a sequence of set_lamport() ; get_lamport() + * but more efficient because the lock is taken only once. + */ +void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now) +{ + struct timespec _real_now; + + down_write(&lamport_sem); + if (timespec_compare(lamport_old, &lamport_stamp) > 0) + *lamport_now = *lamport_old; + else + *lamport_now = timespec_add(lamport_stamp, (struct timespec){0, 1}); + lamport_stamp = *lamport_now; + _real_now = CURRENT_TIME; + up_write(&lamport_sem); + + if (real_now) + *real_now = _real_now; + /* use the maximum of both clocks as Lamport timestamp */ + if (timespec_compare(&_real_now, lamport_now) > 0) + *lamport_now = _real_now; +} +EXPORT_SYMBOL_GPL(set_get_lamport); diff --git a/kernel/lamport.h b/kernel/lamport.h index 5ac1915f..0e9d96be 100644 --- a/kernel/lamport.h +++ b/kernel/lamport.h @@ -30,9 +30,13 @@ * We always get both the local real time and the Lamport time in parallel, * consistently. * + * The implementation ensures that the distributed Lamport timestamp can + * never fall behind the local real time. + * * When not interested in real time, you can simply leave real_now at NULL. */ extern void get_lamport(struct timespec *real_now, struct timespec *lamport_now); extern void set_lamport(struct timespec *lamport_old); +extern void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now); #endif