/* * MARS Long Distance Replication Software * * This file is part of MARS project: http://schoebel.github.io/mars/ * * Copyright (C) 2010-2017 Thomas Schoebel-Theuer * Copyright (C) 2011-2017 1&1 Internet AG * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/rwsem.h> #include "lamport.h" /* This implementation is a variant of the following: * @article{Lamport78, author = {Leslie Lamport}, title = {Time, Clocks, and the Ordering of Events in a Distributed System}, journal = {CACM}, volume = {21}, number = {7}, year = {1978}, pages = {558--565} } * We always get both the local real time and the Lamport time in parallel. * The Lamport timestamp cannot fall behind the real timestamp, but * it may go ahead (into the "future") when clocks in the distributed * system are not synchronized precisely enough (e.g. via ntp). * * Thus we have a physical Lamport clock with the additional property * that it cannot fall behind local realtime. */ /* TODO CHECK: would a different locking method be better? * rwlocks? RCU? * * I did not really check it, due to lack of time. 
* * The reason why I chose rw_semaphore (against some contemporary * "common belief") is the following: * * A Lamport clock is a _global_ object by definition (with respect * to an SMP system => attention we have two levels of parallelism: * one at the Distributed System level, and SMP at the node level). * * Thus it _can_ happen that the Lamport clock forms a bottleneck, * e.g. when O(n) MARS resources are syncing in parallel over a fast * network. * * Looking only at the "best case" where spinlocks or RCU might be faster * is therefore fundamentally broken. Instead, not only the * average case has to be observed, but also the worst case. * * We have some 40-core SMP/NUMA machines now (2017) at 1&1, and the number * will likely increase to 72 cores this year. * I know of cases where spinlock contention is really happening on * such machines in practice. If it happens, it almost kills the machine. * * When O(n) processors are spinning for the same bottleneck only _once_ * each, already O(n^2) CPU cycles are burnt. When the bottleneck is * a _continuous_ one (e.g. multiple long-lasting MARS syncs in parallel), * then the whole machine may lose its efficiency and burn more than 90% * of its total CPU power in spinlocks. * * Thus I think some kind of scheduling lock is needed because the worst * case is an important one when the number of processors is high. * * Don't test this on workstations or notebooks, please test it on * the _most_ _powerful_ _servers_ you can get. * * THINK: is performance really the right measure in the long-term future? * * I think we should consider the _power_ _consumption_ (nJ / LamportOperation) * as a candidate for a more important measure in future. * * Please improve this code, but please use the right optimisation goal. 
*/ struct rw_semaphore lamport_sem = __RWSEM_INITIALIZER(lamport_sem); struct timespec lamport_stamp = {}; void get_lamport(struct timespec *real_now, struct timespec *lamport_now) { struct timespec _real_now; struct timespec _lamport_now; /* Get a consistent copy of _both_ clocks */ down_read(&lamport_sem); _lamport_now = lamport_stamp; /* Theoretically, the next statement could be moved behind the unlock. * However, then we will loose strictness of real timestamps, * or even may produce contradictory orderings between real and * Lamport timestamps, respectively, in relation to pseudo-parallel * calls to get_lamport(). */ _real_now = CURRENT_TIME; up_read(&lamport_sem); if (real_now) *real_now = _real_now; /* use the maximum of both clocks as Lamport timestamp */ if (timespec_compare(&_real_now, &_lamport_now) >= 0) *lamport_now = _real_now; else *lamport_now = _lamport_now; } EXPORT_SYMBOL_GPL(get_lamport); void set_lamport(struct timespec *lamport_old) { protect_timespec(lamport_old); /* Always advance the internal Lamport timestamp a little bit * in order to ensure strict monotonicity between set_lamport() calls. */ down_write(&lamport_sem); if (timespec_compare(lamport_old, &lamport_stamp) > 0) lamport_stamp = *lamport_old; else timespec_add_ns(&lamport_stamp, 1); up_write(&lamport_sem); } EXPORT_SYMBOL_GPL(set_lamport); void set_lamport_nonstrict(struct timespec *lamport_old) { protect_timespec(lamport_old); /* Speculate that advaning is not necessary, to avoid the lock */ if (timespec_compare(lamport_old, &lamport_stamp) > 0) { down_write(&lamport_sem); if (timespec_compare(lamport_old, &lamport_stamp) > 0) lamport_stamp = *lamport_old; up_write(&lamport_sem); } } EXPORT_SYMBOL_GPL(set_lamport_nonstrict); /* After advancing the Lamport time, re-get the new values. * This is almost equivalent to a sequence of set_lamport() ; get_lamport() * but more efficient because the lock is taken only once. 
*/ void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now) { struct timespec _real_now; protect_timespec(lamport_old); down_write(&lamport_sem); if (timespec_compare(lamport_old, &lamport_stamp) > 0) *lamport_now = *lamport_old; else *lamport_now = timespec_add(lamport_stamp, (struct timespec){0, 1}); lamport_stamp = *lamport_now; _real_now = CURRENT_TIME; up_write(&lamport_sem); if (real_now) *real_now = _real_now; /* use the maximum of both clocks as Lamport timestamp */ if (timespec_compare(&_real_now, lamport_now) > 0) *lamport_now = _real_now; } EXPORT_SYMBOL_GPL(set_get_lamport); /* Protect against illegal values, e.g. from currupt filesystems etc. */ int max_lamport_future = 30 * 24 * 3600; bool protect_timespec(struct timespec *check) { struct timespec limit = CURRENT_TIME; bool res = false; limit.tv_sec += max_lamport_future; if (unlikely(check->tv_sec >= limit.tv_sec)) { down_write(&lamport_sem); timespec_add_ns(&lamport_stamp, 1); memcpy(check, &lamport_stamp, sizeof(*check)); if (unlikely(check->tv_sec > limit.tv_sec)) max_lamport_future += check->tv_sec - limit.tv_sec; up_write(&lamport_sem); res = true; } return res; }