/* mars/kernel/lamport.c */

/*
 * MARS Long Distance Replication Software
 *
 * This file is part of MARS project: http://schoebel.github.io/mars/
 *
 * Copyright (C) 2010-2017 Thomas Schoebel-Theuer
 * Copyright (C) 2011-2017 1&1 Internet AG
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */


#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rwsem.h>

#include "lamport.h"

/* This implementation is a variant of the following:
 *
@article{Lamport78,
  author = {Leslie Lamport},
  title = {Time, Clocks, and the Ordering of Events in a Distributed System},
  journal = {CACM},
  volume = {21},
  number = {7},
  year = {1978},
  pages = {558--565}
}
 * We always get both the local real time and the Lamport time in parallel.
 * The Lamport timestamp cannot fall behind the real timestamp, but
 * it may go ahead (into the "future") when clocks in the distributed
 * system are not synchronized precisely enough (e.g. via ntp).
 *
 * Thus we have a physical Lamport clock with the additional property
 * that it cannot fall behind local realtime.
 */

/* TODO CHECK: would a different locking method be better?
 * rwlocks? RCU?
 *
 * I did not really check it, due to lack of time.
 *
 * The reason why I chose rw_semaphore (against some contemporary
 * "common belief") is the following:
 *
 * A Lamport clock is a _global_ object by definition (with respect
 * to an SMP system => attention we have two levels of parallelism:
 * one at the Distributed System level, and SMP at the node level).
 *
 * Thus it _can_ happen that the Lamport clock forms a bottleneck,
 * e.g. when O(n) MARS resources are syncing in parallel over a fast
 * network.
 *
 * Looking only at the "best case" where spinlocks or RCU might be faster
 * is therefore fundamentally broken. Instead, not only the
 * average case has to be observed, but also the worst case.
 *
 * We have some 40-core SMP/NUMA machines now (2017) at 1&1, and the number
 * will likely increase to 72 cores this year.
 * I know of cases where spinlock contention is really happening on
 * such machines in practice. If it happens, it almost kills the machine.
 *
 * When O(n) processors are spinning for the same bottleneck only _once_
 * each, already O(n^2) CPU cycles are burnt. When the bottleneck is
 * a _continuous_ one (e.g. multiple long-lasting MARS syncs in parallel),
 * then the whole machine may lose its efficiency and burn more than 90%
 * of its total CPU power in spinlocks.
 *
 * Thus I think some kind of scheduling lock is needed because the worst
 * case is an important one when the number of processors is high.
 *
 * Don't test this on workstations or notebooks, please test it on
 * the _most_ _powerful_ _servers_ you can get.
 *
 * THINK: is performance really the right measure in the long-term future?
 *
 * I think we should consider the _power_ _consumption_ (nJ / LamportOperation)
 * as a candidate for a more important measure in future.
 *
 * Please improve this code, but please use the right optimisation goal.
 */
/* The exported clock instance defined by this file.
 *
 * Only lamport_sem gets an explicit (static) initializer; lamport_stamp
 * is left out of the initializer list and therefore starts out zeroed,
 * as for any object with static storage duration.
 */
struct lamport_clock global_lamport = {
	.lamport_sem = __RWSEM_INITIALIZER(global_lamport.lamport_sem),
};
EXPORT_SYMBOL_GPL(global_lamport);

void _get_lamport(struct lamport_clock *clock,
		  struct lamport_time *real_now,
		  struct lamport_time *lamport_now)
{
	struct lamport_time _real_now;
	struct lamport_time _lamport_now;

	/* Get a consistent copy of _both_ clocks */
	down_read(&clock->lamport_sem);
	_lamport_now = clock->lamport_stamp;
	/* Theoretically, the next statement could be moved behind the unlock.
	 * However, then we will loose strictness of real timestamps,
	 * or even may produce contradictory orderings between real and
	 * Lamport timestamps, respectively, in relation to pseudo-parallel
	 * calls to get_lamport().
	 */
	get_real_lamport(&_real_now);

	up_read(&clock->lamport_sem);

	if (real_now)
		*real_now = _real_now;
	/* use the maximum of both clocks as Lamport timestamp */
	if (lamport_time_compare(&_real_now, &_lamport_now) >= 0)
		*lamport_now = _real_now;
	else
		*lamport_now = _lamport_now;
}
EXPORT_SYMBOL_GPL(_get_lamport);

/* Advance the Lamport stamp of @clock, strictly.
 *
 * @clock:           clock to advance; its lamport_sem is taken for writing.
 * @lamport_advance: proposed new stamp. It is first passed through
 *                   protect_lamport_time(), which is declared outside
 *                   this file and presumably may rewrite the value in
 *                   place -- TODO confirm.
 *
 * Every call moves clock->lamport_stamp forward: either onto
 * @lamport_advance when that is later than the current stamp, or
 * otherwise by lamport_time_add_ns(..., 1). This ensures strict
 * monotonicity between set_lamport() calls.
 *
 * NOTE(review): protect_lamport_time() is not handed @clock here;
 * confirm which clock it operates on when @clock is not the default one.
 */
void _set_lamport(struct lamport_clock *clock,
		  struct lamport_time *lamport_advance)
{
	protect_lamport_time(lamport_advance);

	down_write(&clock->lamport_sem);
	if (lamport_time_compare(lamport_advance, &clock->lamport_stamp) <= 0) {
		/* Nothing newer was offered: still tick a little bit. */
		lamport_time_add_ns(&clock->lamport_stamp, 1);
	} else {
		clock->lamport_stamp = *lamport_advance;
	}
	up_write(&clock->lamport_sem);
}
EXPORT_SYMBOL_GPL(_set_lamport);

/* Advance the Lamport stamp of @clock only when necessary.
 *
 * @clock:           clock to advance.
 * @lamport_advance: proposed new stamp; first passed through
 *                   protect_lamport_time() (declared outside this file).
 *
 * Unlike _set_lamport(), the stamp is left untouched when
 * @lamport_advance is not later than it, so no strict monotonicity
 * is provided here.
 */
void _set_lamport_nonstrict(struct lamport_clock *clock,
			    struct lamport_time *lamport_advance)
{
	protect_lamport_time(lamport_advance);

	/* Speculate that advancing is not necessary, to avoid the lock.
	 * This first comparison reads the stamp without holding
	 * lamport_sem.
	 */
	if (lamport_time_compare(lamport_advance, &clock->lamport_stamp) <= 0)
		return;

	down_write(&clock->lamport_sem);
	/* Compare once more, now that the write lock is held. */
	if (lamport_time_compare(lamport_advance, &clock->lamport_stamp) > 0)
		clock->lamport_stamp = *lamport_advance;
	up_write(&clock->lamport_sem);
}
EXPORT_SYMBOL_GPL(_set_lamport_nonstrict);

/* After advancing the Lamport time, re-get the new values.
 * This is almost equivalent to a sequence of set_lamport() ; get_lamport()
 * but more efficient because the lock is taken only once.
 */
void _set_get_lamport(struct lamport_clock *clock,
		      struct lamport_time *lamport_advance,
		      struct lamport_time *real_now,
		      struct lamport_time *lamport_now)
{
	struct lamport_time _real_now;

	protect_lamport_time(lamport_advance);

	down_write(&clock->lamport_sem);
	if (lamport_time_compare(lamport_advance, &clock->lamport_stamp) > 0)
		*lamport_now = *lamport_advance;
	else
		*lamport_now = lamport_time_add(clock->lamport_stamp,
						(struct lamport_time){0, 1});
	clock->lamport_stamp = *lamport_now;
	get_real_lamport(&_real_now);
	up_write(&clock->lamport_sem);

	if (real_now)
		*real_now = _real_now;
	/* use the maximum of both clocks as Lamport timestamp */
	if (lamport_time_compare(&_real_now, lamport_now) > 0)
		*lamport_now = _real_now;
}
EXPORT_SYMBOL_GPL(_set_get_lamport);

/* Protect against illegal values, e.g. from currupt filesystems etc.
 */

int max_lamport_future = 30 * 24 * 3600;

bool _protect_lamport_time(struct lamport_clock *clock,
			   struct lamport_time *check)
{
	struct lamport_time limit;
	bool res = false;

	get_real_lamport(&limit);
	limit.tv_sec += max_lamport_future;
	if (unlikely(check->tv_sec >= limit.tv_sec)) {
		down_write(&clock->lamport_sem);
		lamport_time_add_ns(&clock->lamport_stamp, 1);
		lamport_time_add_ns(&clock->lamport_stamp, 1);
		memcpy(check, &clock->lamport_stamp, sizeof(*check));
		if (unlikely(check->tv_sec > limit.tv_sec))
			max_lamport_future += check->tv_sec - limit.tv_sec;
		up_write(&clock->lamport_sem);
		res = true;
	}
	return res;
}
EXPORT_SYMBOL_GPL(_protect_lamport_time);