Merge pull request #500 from euspectre/examples

examples: Added an example of a problematic patch with an explanation
This commit is contained in:
Seth Jennings 2015-10-01 08:32:49 -05:00
commit f049dcf6e0
2 changed files with 163 additions and 0 deletions

View File

@ -0,0 +1,105 @@
The original patch changes the initialization of 'cubictcp' instance of
struct tcp_congestion_ops ('cubictcp.cwnd_event' field). Kpatch
intentionally rejects to process such changes.
This modification of the patch uses Kpatch load/unload hooks to set
'cubictcp.cwnd_event' when the binary patch is loaded and reset it to NULL
when the patch is unloaded.
It is still needed to check if changing that field could be problematic
due to concurrency issues, etc.
'cwnd_event' callback is used only via tcp_ca_event() function.
include/net/tcp.h:
static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->cwnd_event)
icsk->icsk_ca_ops->cwnd_event(sk, event);
}
In turn, tcp_ca_event() is called in a number of places in
net/ipv4/tcp_output.c and net/ipv4/tcp_input.c.
One problem with this modification of the patch is that it may not be safe
to unload it. If it is possible for tcp_ca_event() to run concurrently with
the unloading of the patch, it may happen that 'icsk->icsk_ca_ops->cwnd_event'
is the address of bictcp_cwnd_event() when tcp_ca_event() checks it but is
set to NULL right after. So 'icsk->icsk_ca_ops->cwnd_event(sk, event)' would
result in a kernel oops.
Whether such scenario is possible or not, it should be analyzed. If it is,
then at least, the body of tcp_ca_event() should be made atomic w.r.t.
changing 'cwnd_event' in the patch somehow. Perhaps, RCU could be suitable
for that: a read-side critical section for the body of tcp_ca_event() with
a single read of icsk->icsk_ca_ops->cwnd_event pointer with rcu_dereference().
The pointer could be set by the patch with rcu_assign_pointer().
An alternative suggested by Josh Poimboeuf would be to patch the functions
that call 'cwnd_event' callback (tcp_ca_event() in this case) so that they
call bictcp_cwnd_event() directly when they detect the cubictcp struct [1].
Note that tcp_ca_event() is inlined in a number of places, so the binary
patch will provide replacements for all of the corresponding functions
rather than for just one. It is still needed to check if replacing these
functions in runtime is safe.
References:
[1] https://www.redhat.com/archives/kpatch/2015-September/msg00005.html
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 894b7ce..9bff8a0 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -153,6 +153,27 @@ static void bictcp_init(struct sock *sk)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
+static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_TX_START) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 now = tcp_time_stamp;
+ s32 delta;
+
+ delta = now - tcp_sk(sk)->lsndtime;
+
+ /* We were application limited (idle) for a while.
+ * Shift epoch_start to keep cwnd growth to cubic curve.
+ */
+ if (ca->epoch_start && delta > 0) {
+ ca->epoch_start += delta;
+ if (after(ca->epoch_start, now))
+ ca->epoch_start = now;
+ }
+ return;
+ }
+}
+
/* calculate the cubic root of x using a table lookup followed by one
* Newton-Raphson iteration.
* Avg err ~= 0.195%
@@ -444,6 +465,20 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.name = "cubic",
};
+void kpatch_load_cubictcp_cwnd_event(void)
+{
+ cubictcp.cwnd_event = bictcp_cwnd_event;
+}
+
+void kpatch_unload_cubictcp_cwnd_event(void)
+{
+ cubictcp.cwnd_event = NULL;
+}
+
+#include "kpatch-macros.h"
+KPATCH_LOAD_HOOK(kpatch_load_cubictcp_cwnd_event);
+KPATCH_UNLOAD_HOOK(kpatch_unload_cubictcp_cwnd_event);
+
static int __init cubictcp_register(void)
{
BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

View File

@ -0,0 +1,58 @@
This patch is for 3.10.x.
It combines the following commits from the mainline:
commit 30927520dbae297182990bb21d08762bcc35ce1d
Author: Eric Dumazet <edumazet@google.com>
Date: Wed Sep 9 21:55:07 2015 -0700
tcp_cubic: better follow cubic curve after idle period
commit c2e7204d180f8efc80f27959ca9cf16fa17f67db
Author: Eric Dumazet <edumazet@google.com>
Date: Thu Sep 17 08:38:00 2015 -0700
tcp_cubic: do not set epoch_start in the future
References:
http://www.phoronix.com/scan.php?page=news_item&px=Google-Fixes-TCP-Linux
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 894b7ce..872b3a0 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -153,6 +153,27 @@ static void bictcp_init(struct sock *sk)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
+static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_TX_START) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 now = tcp_time_stamp;
+ s32 delta;
+
+ delta = now - tcp_sk(sk)->lsndtime;
+
+ /* We were application limited (idle) for a while.
+ * Shift epoch_start to keep cwnd growth to cubic curve.
+ */
+ if (ca->epoch_start && delta > 0) {
+ ca->epoch_start += delta;
+ if (after(ca->epoch_start, now))
+ ca->epoch_start = now;
+ }
+ return;
+ }
+}
+
/* calculate the cubic root of x using a table lookup followed by one
* Newton-Raphson iteration.
* Avg err ~= 0.195%
@@ -439,6 +460,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
.undo_cwnd = bictcp_undo_cwnd,
+ .cwnd_event = bictcp_cwnd_event,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
.name = "cubic",