From 104b3a522a131adf3a494c66c3f582634e22389d Mon Sep 17 00:00:00 2001
From: Thomas Schoebel-Theuer <tst@schoebel-theuer.de>
Date: Sat, 15 Apr 2017 08:56:20 +0200
Subject: [PATCH] infra: new Lamport clock implementation

---
 kernel/lamport.c | 157 +++++++++++++++++++++++++++++++++++++----------
 kernel/lamport.h |   4 ++
 2 files changed, 128 insertions(+), 33 deletions(-)

diff --git a/kernel/lamport.c b/kernel/lamport.c
index 010ba375..853530c2 100644
--- a/kernel/lamport.c
+++ b/kernel/lamport.c
@@ -3,8 +3,8 @@
  *
  * This file is part of MARS project: http://schoebel.github.io/mars/
  *
- * Copyright (C) 2010-2014 Thomas Schoebel-Theuer
- * Copyright (C) 2011-2014 1&1 Internet AG
+ * Copyright (C) 2010-2017 Thomas Schoebel-Theuer
+ * Copyright (C) 2011-2017 1&1 Internet AG
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -24,50 +24,141 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/semaphore.h>
+#include <linux/rwsem.h>
 
 #include "lamport.h"
 
+/* This implementation is a variant of the following:
+ *
+@article{Lamport78,
+  author = {Leslie Lamport},
+  title = {Time, Clocks, and the Ordering of Events in a Distributed System},
+  journal = {CACM},
+  volume = {21},
+  number = {7},
+  year = {1978},
+  pages = {558--565}
+}
+ * We always get both the local real time and the Lamport time in parallel.
+ * The Lamport timestamp cannot fall behind the real timestamp, but
+ * it may go ahead (into the "future") when clocks in the distributed
+ * system are not synchronized precisely enough (e.g. via ntp).
+ *
+ * Thus we have a physical Lamport clock with the additional property
+ * that it cannot fall behind local realtime.
+ */
 
-struct semaphore lamport_sem = __SEMAPHORE_INITIALIZER(lamport_sem, 1); // TODO: replace with spinlock if possible (first check)
-struct timespec lamport_now = {};
+/* TODO CHECK: would a different locking method be better?
+ * rwlocks? RCU?
+ *
+ * I did not really check it, due to lack of time.
+ *
+ * The reason why I chose rw_semaphore (against some contemporary
+ * "common belief") is the following:
+ *
+ * A Lamport clock is a _global_ object by definition (with respect
+ * to an SMP system => attention we have two levels of parallelism:
+ * one at the Distributed System level, and SMP at the node level).
+ *
+ * Thus it _can_ happen that the Lamport clock forms a bottleneck,
+ * e.g. when O(n) MARS resources are syncing in parallel over a fast
+ * network.
+ *
+ * Looking only at the "best case" where spinlocks or RCU might be faster
+ * is therefore fundamentally broken. Instead, not only the
+ * average case has to be observed, but also the worst case.
+ *
+ * We have some 40-core SMP/NUMA machines now (2017) at 1&1, and the number
+ * will likely increase to 72 cores this year.
+ * I know of cases where spinlock contention is really happening on
+ * such machines in practice. If it happens, it almost kills the machine.
+ *
+ * When O(n) processors are spinning for the same bottleneck only _once_
+ * each, already O(n^2) CPU cycles are burnt. When the bottleneck is
+ * a _continuous_ one (e.g. multiple long-lasting MARS syncs in parallel),
+ * then the whole machine may lose its efficiency and burn more than 90%
+ * of its total CPU power in spinlocks.
+ *
+ * Thus I think some kind of scheduling lock is needed because the worst
+ * case is an important one when the number of processors is high.
+ *
+ * Don't test this on workstations or notebooks, please test it on
+ * the _most_ _powerful_ _servers_ you can get.
+ *
+ * THINK: is performance really the right measure in the long-term future?
+ *
+ * I think we should consider the _power_ _consumption_ (nJ / LamportOperation)
+ * as a candidate for a more important measure in future.
+ *
+ * Please improve this code, but please use the right optimisation goal.
+ */
+struct rw_semaphore lamport_sem = __RWSEM_INITIALIZER(lamport_sem);
 
-void get_lamport(struct timespec *real_now, struct timespec *lamp_now)
+struct timespec lamport_stamp = {};
+
+void get_lamport(struct timespec *real_now, struct timespec *lamport_now)
 {
-	int diff;
+	struct timespec _real_now;
+	struct timespec _lamport_now;
 
-	down(&lamport_sem);
+	/* Get a consistent copy of _both_ clocks */
+	down_read(&lamport_sem);
+	_lamport_now = lamport_stamp;
+	/* Theoretically, the next statement could be moved after the unlock.
+	 * However, then we would lose strictness of real timestamps,
+	 * or might even produce contradictory orderings between real and
+	 * Lamport timestamps, respectively, in relation to pseudo-parallel
+	 * calls to get_lamport().
+	 */
+	_real_now = CURRENT_TIME;
+	up_read(&lamport_sem);
 
-	*lamp_now = CURRENT_TIME;
 	if (real_now)
-		*real_now = *lamp_now;
-	diff = timespec_compare(lamp_now, &lamport_now);
-	if (diff >= 0) {
-		timespec_add_ns(lamp_now, 1);
-		memcpy(&lamport_now, lamp_now, sizeof(lamport_now));
-		timespec_add_ns(&lamport_now, 1);
-	} else {
-		timespec_add_ns(&lamport_now, 1);
-		memcpy(lamp_now, &lamport_now, sizeof(*lamp_now));
-	}
-
-	up(&lamport_sem);
+		*real_now = _real_now;
+	/* use the maximum of both clocks as Lamport timestamp */
+	if (timespec_compare(&_real_now, &_lamport_now) >= 0)
+		*lamport_now = _real_now;
+	else
+		*lamport_now = _lamport_now;
 }
 
 EXPORT_SYMBOL_GPL(get_lamport);
 
-void set_lamport(struct timespec *old)
+void set_lamport(struct timespec *lamport_old)
 {
-	int diff;
-
-	down(&lamport_sem);
-
-	diff = timespec_compare(old, &lamport_now);
-	if (diff >= 0) {
-		memcpy(&lamport_now, old, sizeof(lamport_now));
-		timespec_add_ns(&lamport_now, 1);
-	}
-
-	up(&lamport_sem);
+	/* Always advance the internal Lamport timestamp a little bit
+	 * in order to ensure strict monotonicity between set_lamport() calls.
+	 */
+	down_write(&lamport_sem);
+	if (timespec_compare(lamport_old, &lamport_stamp) > 0)
+		lamport_stamp = *lamport_old;
+	else
+		timespec_add_ns(&lamport_stamp, 1);
+	up_write(&lamport_sem);
 }
 EXPORT_SYMBOL_GPL(set_lamport);
+
+/* After advancing the Lamport time, re-get the new values.
+ * This is almost equivalent to a sequence of set_lamport() ; get_lamport()
+ * but more efficient because the lock is taken only once.
+ */
+void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now)
+{
+	struct timespec _real_now;
+
+	down_write(&lamport_sem);
+	if (timespec_compare(lamport_old, &lamport_stamp) > 0)
+		*lamport_now = *lamport_old;
+	else
+		*lamport_now = timespec_add(lamport_stamp, (struct timespec){0, 1});
+	lamport_stamp = *lamport_now;
+	_real_now = CURRENT_TIME;
+	up_write(&lamport_sem);
+
+	if (real_now)
+		*real_now = _real_now;
+	/* use the maximum of both clocks as Lamport timestamp */
+	if (timespec_compare(&_real_now, lamport_now) > 0)
+		*lamport_now = _real_now;
+}
+EXPORT_SYMBOL_GPL(set_get_lamport);
diff --git a/kernel/lamport.h b/kernel/lamport.h
index 5ac1915f..0e9d96be 100644
--- a/kernel/lamport.h
+++ b/kernel/lamport.h
@@ -30,9 +30,13 @@
  * We always get both the local real time and the Lamport time in parallel,
  * consistently.
  *
+ * The implementation ensures that the distributed Lamport timestamp can
+ * never fall behind the local real time.
+ *
  * When not interested in real time, you can simply leave real_now at NULL.
  */
 extern void get_lamport(struct timespec *real_now, struct timespec *lamport_now);
 extern void set_lamport(struct timespec *lamport_old);
+extern void set_get_lamport(struct timespec *lamport_old, struct timespec *real_now, struct timespec *lamport_now);
 
 #endif