From d04adf44dc4f524a02cd74188704cad05d65a98a Mon Sep 17 00:00:00 2001 From: Frederic Lecaille <flecaille@haproxy.com> Date: Thu, 5 Sep 2024 10:33:38 +0200 Subject: [PATCH] MINOR: quic: implement BBR congestion control algorithm for QUIC Implement version 3 of BBR for QUIC, as specified by the IETF in this draft: https://datatracker.ietf.org/doc/draft-ietf-ccwg-bbr/ Here is an extract from the Abstract part to sum up the capabilities of BBR: BBR ("Bottleneck Bandwidth and Round-trip propagation time") uses recent measurements of a transport connection's delivery rate, round-trip time, and packet loss rate to build an explicit model of the network path. BBR then uses this model to control both how fast it sends data and the maximum volume of data it allows in flight in the network at any time. Relative to loss-based congestion control algorithms such as Reno [RFC5681] or CUBIC [RFC9438], BBR offers substantially higher throughput for bottlenecks with shallow buffers or random losses, and substantially lower queueing delays for bottlenecks with deep buffers (avoiding "bufferbloat"). BBR can be implemented in any transport protocol that supports packet-delivery acknowledgment. Thus far, open source implementations are available for TCP [RFC9293] and QUIC [RFC9000]. In haproxy, this implementation is still considered experimental. It depends on the newly implemented pacing feature. BBR was requested in GH #2516 by @KazuyaKanemura, @osevan and @kennyZ96. --- Makefile | 3 +- include/haproxy/quic_cc-t.h | 6 +- include/haproxy/quic_cc.h | 2 + src/quic_cc_bbr.c | 1534 +++++++++++++++++++++++++++++++++++ 4 files changed, 1543 insertions(+), 2 deletions(-) create mode 100644 src/quic_cc_bbr.c diff --git a/Makefile b/Makefile index 4d22f2eaa8..6a9f84cf3f 100644 --- a/Makefile +++ b/Makefile @@ -654,7 +654,8 @@ OPTIONS_OBJS += src/quic_rx.o src/mux_quic.o src/h3.o src/quic_tx.o \ src/cfgparse-quic.o src/qmux_trace.o src/qpack-enc.o \ src/qpack-tbl.o src/h3_stats.o src/quic_stats.o \ src/quic_fctl.o src/cbuf.o src/quic_rules.o \ - src/quic_token.o src/quic_pacing.o src/quic_cc_drs.o + src/quic_token.o src/quic_pacing.o src/quic_cc_drs.o \ + src/quic_cc_bbr.o endif ifneq ($(USE_QUIC_OPENSSL_COMPAT:0=),) diff --git a/include/haproxy/quic_cc-t.h b/include/haproxy/quic_cc-t.h index 430a6167ce..a4a43cadc3 100644 --- a/include/haproxy/quic_cc-t.h +++ b/include/haproxy/quic_cc-t.h @@ -36,6 +36,7 @@ extern struct quic_cc_algo quic_cc_algo_nr; extern struct quic_cc_algo quic_cc_algo_cubic; +extern struct quic_cc_algo quic_cc_algo_bbr; extern struct quic_cc_algo *default_quic_cc_algo; /* Fake algorithm with its fixed window */ @@ -81,6 +82,7 @@ struct quic_cc_event { enum quic_cc_algo_type { QUIC_CC_ALGO_TP_NEWRENO, QUIC_CC_ALGO_TP_CUBIC, + QUIC_CC_ALGO_TP_BBR, QUIC_CC_ALGO_TP_NOCC, }; @@ -88,7 +90,7 @@ struct quic_cc { /* is there only for debugging purpose. */ struct quic_conn *qc; struct quic_cc_algo *algo; - uint32_t priv[20]; + uint32_t priv[158]; }; struct quic_cc_path { @@ -118,6 +120,8 @@ struct quic_cc_path { /* Burst size if pacing is used. Not used if congestion algo handle pacing itself.
*/ uint32_t pacing_burst; uint64_t delivery_rate; /* bytes per second */ + size_t send_quantum; + uint32_t recovery_start_ts; }; struct quic_cc_algo { diff --git a/include/haproxy/quic_cc.h b/include/haproxy/quic_cc.h index db4cc58fa2..6537556c77 100644 --- a/include/haproxy/quic_cc.h +++ b/include/haproxy/quic_cc.h @@ -101,6 +101,8 @@ static inline void quic_cc_path_init(struct quic_cc_path *path, int ipv4, unsign path->pacing_burst = burst; quic_cc_init(&path->cc, algo, qc); path->delivery_rate = 0; + path->send_quantum = 64 * 1024; + path->recovery_start_ts = TICK_ETERNITY; } /* Return the remaining available on QUIC path for prepared data diff --git a/src/quic_cc_bbr.c b/src/quic_cc_bbr.c new file mode 100644 index 0000000000..c409d1703a --- /dev/null +++ b/src/quic_cc_bbr.c @@ -0,0 +1,1534 @@ +#include <haproxy/api.h> + +#include <haproxy/chunk.h> +#include <haproxy/clock.h> +#include <haproxy/quic_cc.h> +#include <haproxy/quic_cc_drs.h> +#include <haproxy/quic_tx-t.h> +#include <haproxy/ticks.h> +#include <haproxy/tools.h> +#include <haproxy/window_filter.h> + + +/* Bottleneck Bandwidth and Round-trip propagation time version 3 (BBRv3) + * congestion algorithm implementation for QUIC. + * https://datatracker.ietf.org/doc/draft-ietf-ccwg-bbr/ + * + * This algorithm builds a model of the network path, producing delivery rate + * samples from acknowledgement information to sequentially estimate the maximum + * bandwidth and round-trip time. + */ + + +/* BBR constant definitions */ + +/* BBRStartupFullLossCnt(6): + * the maximum number of tolerated discontiguous sequence ranges lost in a round + * trip during Startup. + */ +#define BBR_STARTUP_FULL_LOSS_COUNT 6 + +/* BBRStartupPacingGain(2.77): + * a constant specifying the minimum gain value for calculating the pacing rate + * that will allow the sending rate to double each round (4 * ln(2) ~= 2.77); + * used in Startup mode for BBR.pacing_gain. + */ +#define BBR_STARTUP_PACING_GAIN_MULT 277 /* percents */ + +/* BBRDrainPacingGain(0.35): + * a constant specifying the pacing gain value used in Drain mode, to attempt + * to drain the estimated queue at the bottleneck link in one round-trip or less. + */ +#define BBR_DRAIN_PACING_GAIN_MULT 35 /* percents */ + +/* BBRDefaultCwndGain: + * a constant specifying the minimum gain value that + * allows the sending rate to double each round (2). Used by default in most + * phases for BBR.cwnd_gain. + */ +#define BBR_DEFAULT_CWND_GAIN_MULT 200 /* percents */ + +/* BBRPacingMarginPercent(1%): + * the static discount factor of 1% used to scale BBR.bw to produce + * BBR.pacing_rate. + */ +#define BBR_PACING_MARGIN_PERCENT 1 /* percents */ + +/* BBRLossThresh(2%): + * the maximum tolerated per-round-trip packet loss rate when probing for + * bandwidth. + */ +#define BBR_LOSS_THRESH_MULT 2 +#define BBR_LOSS_THRESH_DIVI 100 + +/* BBRBeta(0.7): + * the default multiplicative decrease to make upon each round trip during + * which the connection detects packet loss. + */ +#define BBR_BETA_MULT 7 +#define BBR_BETA_DIVI 10 + +/* BBRHeadroom(0.15): + * the multiplicative factor to apply to BBR.inflight_hi when calculating a + * volume of free headroom to try to leave unused in the path (e.g. free space + * in the bottleneck buffer or free time slots in the bottleneck link) that can + * be used by cross traffic.
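 + * Note: the fractional constants from the draft are encoded in this file as + * integer MULT/DIVI pairs (e.g. 0.15 -> 15/100), since this code avoids + * floating point arithmetic.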
+ */ +#define BBR_HEADROOM_MULT 15 +#define BBR_HEADROOM_DIVI 100 + +/* MaxBwFilterLen(2): + * the window length of the BBR.MaxBwFilter max filter, in ProbeBW cycles */ +#define BBR_MAX_BW_FILTERLEN 2 + +/* BBRExtraAckedFilterLen(10): + * The window length of the BBR.ExtraACKedFilter max filter in steady-state, + * in units of packet-timed round trips. + */ +#define BBR_EXTRA_ACKED_FILTERLEN 10 + +/* MinRTTFilterLen(10s): + * A constant specifying the length of the BBR.min_rtt min filter window. + */ +#define BBR_MIN_RTT_FILTERLEN 10000 /* ms */ + +/* BBRProbeRTTCwndGain(0.5): + * A constant specifying the gain value for calculating the cwnd during ProbeRTT: + * 0.5 (meaning that ProbeRTT attempts to reduce in-flight data to 50% of the + * estimated BDP) + */ +#define BBR_PROBE_RTT_CWND_GAIN_MULT 50 /* percents */ + +/* ProbeRTTDuration(200ms): + * A constant specifying the minimum duration for which ProbeRTT state holds + * inflight to BBRMinPipeCwnd or fewer packets + */ +#define BBR_PROBE_RTT_DURATION 200 /* ms */ + +/* ProbeRTTInterval(5s): + * A constant specifying the minimum time interval between ProbeRTT states + */ +#define BBR_PROBE_RTT_INTERVAL 5000 /* ms */ + +/* The divisor to apply to the gain multiplicands above (BBR_*_GAIN_MULT), + * whose unit is the percent. + */ +#define BBR_GAIN_DIVI 100 + +/* 4.1.1: State Transition Diagram + * + * + * | + * V + * +---> Startup ------------+ + * | | | + * | V | + * | Drain --------------+ + * | | | + * | V | + * +---> ProbeBW_DOWN -------+ + * | ^ | | + * | | V | + * | | ProbeBW_CRUISE ------+ + * | | | | + * | | V | + * | | ProbeBW_REFILL -----+ + * | | | | + * | | V | + * | | ProbeBW_UP ---------+ + * | | | | + * | +------+ | + * | | + * +---- ProbeRTT <-----------+ + * + */ + +/* + * 4.1.2. State Machine Operation Overview + * + * When starting up, BBR probes to try to quickly build a model of the network + * path; to adapt to later changes to the path or its traffic, BBR must + * continue to probe to update its model. If the available bottleneck bandwidth + * increases, BBR must send faster to discover this. Likewise, if the round-trip + * propagation delay changes, this changes the BDP, and thus BBR must send slower + * to get inflight below the new BDP in order to measure the new BBR.min_rtt. + * Thus, BBR's state machine runs periodic, sequential experiments, sending faster + * to check for BBR.bw increases or sending slower to yield bandwidth, drain the + * queue, and check for BBR.min_rtt decreases. The frequency, magnitude, duration, + * and structure of these experiments differ depending on what's already known + * (startup or steady-state) and application sending behavior (intermittent or + * continuous). + * This state machine has several goals: + * + * - Achieve high throughput by efficiently utilizing available bandwidth. + * - Achieve low latency and packet loss rates by keeping queues bounded and + * small. + * - Share bandwidth with other flows in an approximately fair manner. + * - Feed samples to the model estimators to refresh and update the model. + */ + +/* BBR states */ +enum bbr_state { + BBR_ST_STARTUP, + BBR_ST_DRAIN, + BBR_ST_PROBE_BW_DOWN, + BBR_ST_PROBE_BW_CRUISE, + BBR_ST_PROBE_BW_REFILL, + BBR_ST_PROBE_BW_UP, + BBR_ST_PROBE_RTT, +}; + +enum bbr_ack_phase { + BBR_ACK_PHASE_ACKS_PROBE_STARTING, + BBR_ACK_PHASE_ACKS_PROBE_STOPPING, + BBR_ACK_PHASE_ACKS_PROBE_FEEDBACK, + BBR_ACK_PHASE_ACKS_REFILLING, +}; + +struct bbr { + /* Delivery rate sampling information.
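 + * (the samples are produced by the delivery rate sampling module in + * src/quic_cc_drs.c and consumed by the estimators below)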
*/ + struct quic_cc_drs drs; + /* 2.4 Output Control Parameters */ + uint64_t pacing_rate; + /* 2.5 Pacing State and Parameters */ + /* BBR.pacing_gain: The dynamic gain factor used to scale BBR.bw to + * produce BBR.pacing_rate. + */ + uint64_t pacing_gain; // percents + /* 2.6. cwnd State and Parameters */ + /* BBR.cwnd_gain: The dynamic gain factor used to scale the estimated BDP + * to produce a congestion window (cwnd). + */ + uint64_t cwnd_gain; // percents + /* 2.7 General Algorithm State */ + enum bbr_state state; + uint64_t round_count; + int round_start; /* boolean */ + uint64_t next_round_delivered; + int idle_restart; /* boolean */ + /* 2.9.1 Data Rate Network Path Model Parameters */ + uint64_t max_bw; + uint64_t bw_lo; + uint64_t bw; + uint64_t prior_cwnd; + /* 2.9.2 Data Volume Network Path Model Parameters */ + uint32_t min_rtt; + uint64_t extra_acked; + uint64_t bytes_lost_in_round; + uint64_t loss_events_in_round; + uint64_t offload_budget; + uint64_t probe_up_cnt; + uint32_t cycle_stamp; + enum bbr_ack_phase ack_phase; + unsigned int bw_probe_wait; + int bw_probe_samples; + int bw_probe_up_rounds; + uint64_t bw_probe_up_acks; + uint64_t max_inflight; + uint64_t inflight_hi; + uint64_t inflight_lo; + /* 2.10 State for Responding to Congestion */ + int loss_round_start; /* boolean */ + uint64_t bw_latest; + int loss_in_round; /* boolean */ + uint64_t loss_round_delivered; + unsigned int rounds_since_bw_probe; + uint64_t inflight_latest; + /* 2.11 Estimating BBR.max_bw */ + struct wf max_bw_filter; + uint64_t cycle_count; + /* 2.12 Estimating BBR.extra_acked */ + uint32_t extra_acked_interval_start; + uint64_t extra_acked_delivered; + struct wf extra_acked_filter; + /* 2.13 Startup Parameters and State */ + int full_bw_reached; /* boolean */ + int full_bw_now; /* boolean */ + uint64_t full_bw; + int full_bw_count; + /* 2.14 ProbeRTT and min_rtt Parameters and State */ + /* 2.14.1 Parameters for Estimating BBR.min_rtt */ + uint32_t min_rtt_stamp; + /* 2.14.2 Parameters for Scheduling ProbeRTT */ + uint32_t probe_rtt_min_delay; /* ms */ + uint32_t probe_rtt_min_stamp; /* ms */ + uint32_t probe_rtt_done_stamp; + int probe_rtt_round_done; /* boolean */ + int probe_rtt_expired; /* boolean */ + int packet_conservation; /* boolean */ + uint64_t round_count_at_recovery; + int in_loss_recovery; /* boolean */ + uint32_t recovery_start_ts; +}; + +/* BBR function definitions. + * The CamelCase naming convention is used by the BBR draft for the function + * names and constants. To help match the code below with the draft, all the + * function names have been translated this way: the uppercase letters have + * been replaced by lowercase ones, and the words have been separated + * by underscores as follows: + * + * ex: BBRMinPipeCwnd() -> bbr_min_pipe_cwnd() + */ + +/* BBRMinPipeCwnd: + * Return the minimal cwnd value BBR targets, to allow pipelining with TCP + * endpoints that follow an "ACK every other packet" delayed-ACK policy: 4 * SMSS.
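 + * For example, with a 1252-byte MTU (a typical QUIC datagram size), this + * floor is 4 * 1252 = 5008 bytes.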
+ */ +static inline uint64_t bbr_min_pipe_cwnd(struct quic_cc_path *p) +{ + return 4 * p->mtu; +} + +static inline int is_inflight_too_high(struct quic_cc_rs *rs) +{ + return rs->lost * BBR_LOSS_THRESH_DIVI > + rs->tx_in_flight * BBR_LOSS_THRESH_MULT; +} + +static inline int bbr_is_probing_bw(struct bbr *bbr) +{ + switch (bbr->state) { + case BBR_ST_PROBE_BW_DOWN: + case BBR_ST_PROBE_BW_CRUISE: + case BBR_ST_PROBE_BW_REFILL: + case BBR_ST_PROBE_BW_UP: + return 1; + default: + return 0; + } +} + +static void bbr_reset_congestion_signals(struct bbr *bbr) +{ + bbr->loss_in_round = 0; + bbr->bw_latest = 0; + bbr->inflight_latest = 0; +} + +static void bbr_reset_lower_bounds(struct bbr *bbr) +{ + bbr->bw_lo = UINT64_MAX; + bbr->inflight_lo = UINT64_MAX; +} + +static void bbr_init_round_counting(struct bbr *bbr) +{ + bbr->next_round_delivered = 0; + bbr->round_start = 0; + bbr->round_count = 0; +} + +static void bbr_reset_full_bw(struct bbr *bbr) +{ + bbr->full_bw = 0; + bbr->full_bw_count = 0; + bbr->full_bw_now = 0; +} + +static void bbr_init_pacing_rate(struct quic_cc *cc, struct bbr *bbr) +{ + struct quic_cc_path *p = container_of(cc, struct quic_cc_path, cc); + unsigned int srtt = p->loss.srtt; + + bbr->pacing_rate = 1000 * p->initial_wnd * BBR_STARTUP_PACING_GAIN_MULT / + BBR_GAIN_DIVI / (srtt ? srtt : 1); +} + +/* 4.6.3. Send Quantum: BBR.send_quantum + * + * In order to amortize per-packet overheads involved in the sending process + * (host CPU, NIC processing, and interrupt processing delays), high-performance + * transport sender implementations (e.g., Linux TCP) often schedule an + * aggregate containing multiple packets (multiple SMSS) worth of data as a + * single quantum (using TSO, GSO, or other offload mechanisms). The BBR + * congestion control algorithm makes this control decision explicitly, + * dynamically calculating a quantum control parameter that specifies the + * maximum size of these transmission aggregates. This decision is based on a + * trade-off: + * + * A smaller quantum is preferred at lower data rates because it results in + * shorter packet bursts, shorter queues, lower queueing delays, and lower rates + * of packet loss. + * + * A bigger quantum can be required at higher data rates because it results in + * lower CPU overheads at the sending and receiving hosts, who can ship larger + * amounts of data with a single trip through the networking stack. + */ + +/* Set ->send_quantum. Must be called on each ack receipt. */ +static void bbr_set_send_quantum(struct bbr *bbr, struct quic_cc_path *p) +{ + p->send_quantum = bbr->pacing_rate / 1000; + p->send_quantum = MIN(p->send_quantum, 64 * 1024); + p->send_quantum = MAX(p->send_quantum, 2 * p->mtu); +} + +/* 4.3.1. Startup + * + * 4.3.1.1. Startup Dynamics + * + * When a BBR flow starts up, it performs its first (and most rapid) sequential + * probe/drain process in the Startup and Drain states. Network link bandwidths + * currently span a range of at least 11 orders of magnitude, from a few bps to + * hundreds of Gbps. To quickly learn BBR.max_bw, given this huge range to + * explore, BBR's Startup state does an exponential search of the rate space, + * doubling the sending rate each round. This finds BBR.max_bw in O(log_2(BDP)) + * round trips. 
+ * To achieve this rapid probing smoothly, in Startup BBR uses the minimum gain + * values that will allow the sending rate to double each round: in Startup BBR + * sets BBR.pacing_gain to BBRStartupPacingGain (2.77) [BBRStartupPacingGain] + * and BBR.cwnd_gain to BBRDefaultCwndGain (2) [BBRStartupCwndGain]. + * As BBR grows its sending rate rapidly, it obtains higher delivery rate + * samples, BBR.max_bw increases, and the pacing rate and cwnd both adapt by + * smoothly growing in proportion. Once the pipe is full, a queue typically + * forms, but the cwnd_gain bounds any queue to (cwnd_gain - 1) * estimated_BDP, + * which is approximately (2 - 1) * estimated_BDP = estimated_BDP. The + * immediately following Drain state is designed to quickly drain that queue. + */ +static void bbr_enter_startup(struct bbr *bbr) +{ + bbr->state = BBR_ST_STARTUP; + bbr->pacing_gain = BBR_STARTUP_PACING_GAIN_MULT; + bbr->cwnd_gain = BBR_DEFAULT_CWND_GAIN_MULT; +} + +/* 4.3.2. Drain + * + * Upon exiting Startup, BBR enters its Drain state. In Drain, BBR aims to + * quickly drain any queue at the bottleneck link that was created in Startup + * by switching to a pacing_gain well below 1.0, until any estimated queue has + * been drained. It uses a pacing_gain of BBRDrainPacingGain = 0.35, chosen via + * analysis [BBRDrainPacingGain] and experimentation to try to drain the queue + * in less than one round-trip. + */ +static void bbr_enter_drain(struct bbr *bbr) +{ + bbr->state = BBR_ST_DRAIN; + bbr->pacing_gain = BBR_DRAIN_PACING_GAIN_MULT; /* pace slowly */ + bbr->cwnd_gain = BBR_DEFAULT_CWND_GAIN_MULT; +} + +static void bbr_enter_probe_rtt(struct bbr *bbr) +{ + bbr->state = BBR_ST_PROBE_RTT; + bbr->pacing_gain = 100; + bbr->cwnd_gain = BBR_PROBE_RTT_CWND_GAIN_MULT; +} + +static void bbr_save_cwnd(struct bbr *bbr, struct quic_cc_path *p) +{ + if (!bbr->in_loss_recovery && bbr->state != BBR_ST_PROBE_RTT) { + bbr->prior_cwnd = p->cwnd; + } + else { + bbr->prior_cwnd = MAX(bbr->prior_cwnd, p->cwnd); + } +} + +static void bbr_restore_cwnd(struct bbr *bbr, struct quic_cc_path *p) +{ + p->cwnd = MAX(p->cwnd, bbr->prior_cwnd); +} + +/* The gain parameter must be provided in percents. */ +static uint64_t bbr_bdp_multiple(struct bbr *bbr, struct quic_cc_path *p, + uint64_t bw, uint64_t gain) +{ + uint64_t bdp; + + if (bbr->min_rtt == UINT32_MAX) + return p->initial_wnd; /* no valid RTT samples yet */ + + bdp = bw * bbr->min_rtt / 1000; + + /* Note that the gain unit is the percent.
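 + * Example: bw = 1,250,000 bytes/s and min_rtt = 40 ms give bdp = 50,000 + * bytes; with gain = 200 (i.e. 2.0x), the result is 100,000 bytes.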
*/ + return gain * bdp / BBR_GAIN_DIVI; +} + +static void bbr_update_offload_budget(struct bbr *bbr, struct quic_cc_path *p) +{ + bbr->offload_budget = 3 * p->send_quantum; +} + +static uint64_t bbr_quantization_budget(struct bbr *bbr, struct quic_cc_path *p, + uint64_t inflight) +{ + bbr_update_offload_budget(bbr, p); + inflight = MAX(inflight, bbr->offload_budget); + inflight = MAX(inflight, bbr_min_pipe_cwnd(p)); + if (bbr->state == BBR_ST_PROBE_BW_UP) + inflight += 2 * p->mtu; + + return inflight; +} + +static uint64_t bbr_inflight(struct bbr *bbr, struct quic_cc_path *p, + uint64_t bw, uint64_t gain) +{ + uint64_t inflight = bbr_bdp_multiple(bbr, p, bw, gain); + return bbr_quantization_budget(bbr, p, inflight); +} + +static void bbr_update_max_inflight(struct bbr *bbr, struct quic_cc_path *p) +{ + uint64_t inflight; + + /* Not defined by RFC */ + //BBRUpdateAggregationBudget(); + inflight = bbr_bdp_multiple(bbr, p, bbr->bw, bbr->cwnd_gain); + inflight += bbr->extra_acked; + bbr->max_inflight = bbr_quantization_budget(bbr, p, inflight); +} + +static void bbr_set_pacing_rate_with_gain(struct bbr *bbr, + struct quic_cc_path *p, + uint64_t pacing_gain) +{ + uint64_t rate; + + if (!bbr->bw) + return; + + /* pacing_gain unit is percent */ + rate = pacing_gain * bbr->bw * (100 - BBR_PACING_MARGIN_PERCENT) / + BBR_GAIN_DIVI / BBR_GAIN_DIVI; + if (bbr->full_bw_reached || rate > bbr->pacing_rate) + bbr->pacing_rate = rate; +} + +static void bbr_set_pacing_rate(struct bbr *bbr, struct quic_cc_path *p) +{ + bbr_set_pacing_rate_with_gain(bbr, p, bbr->pacing_gain); +} + +static uint64_t bbr_probe_rtt_cwnd(struct bbr *bbr, struct quic_cc_path *p) +{ + uint64_t probe_rtt_cwnd = + bbr_bdp_multiple(bbr, p, bbr->bw, BBR_PROBE_RTT_CWND_GAIN_MULT); + + return MAX(probe_rtt_cwnd, bbr_min_pipe_cwnd(p)); +} + +static void bbr_bound_cwnd_for_probe_rtt(struct bbr *bbr, struct quic_cc_path *p) +{ + if (bbr->state == BBR_ST_PROBE_RTT) + p->cwnd = MIN(p->cwnd, bbr_probe_rtt_cwnd(bbr, p)); +} + +/* Return a volume of data that tries to leave free headroom in the bottleneck + * buffer or link for other flows, for fairness convergence and lower RTTs and + * loss. 
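 + * Example: with inflight_hi = 100,000 bytes, headroom = max(mtu, 15,000) and + * this function returns 85,000 bytes (never less than bbr_min_pipe_cwnd()).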
+ */ +static uint64_t bbr_inflight_with_headroom(struct bbr *bbr, struct quic_cc_path *p) +{ + uint64_t headroom; + + if (bbr->inflight_hi == UINT64_MAX) + return UINT64_MAX; + + headroom = + MAX(p->mtu, bbr->inflight_hi * BBR_HEADROOM_MULT / BBR_HEADROOM_DIVI); + return MAX(bbr->inflight_hi - headroom, bbr_min_pipe_cwnd(p)); +} + +static void bbr_bound_cwnd_for_model(struct bbr *bbr, struct quic_cc_path *p) +{ + uint64_t cap = UINT64_MAX; + + if (bbr_is_probing_bw(bbr) && bbr->state != BBR_ST_PROBE_BW_CRUISE) + cap = bbr->inflight_hi; + else if (bbr->state == BBR_ST_PROBE_RTT || bbr->state == BBR_ST_PROBE_BW_CRUISE) + cap = bbr_inflight_with_headroom(bbr, p); + + /* apply inflight_lo (possibly infinite): */ + cap = MIN(cap, bbr->inflight_lo); + cap = MAX(cap, bbr_min_pipe_cwnd(p)); + p->cwnd = MIN(p->cwnd, cap); +} + +static void bbr_set_cwnd(struct bbr *bbr, struct quic_cc_path *p, uint32_t acked) +{ + bbr_update_max_inflight(bbr, p); + if (bbr->full_bw_reached) { + p->cwnd += acked; + p->cwnd = MIN(p->cwnd, bbr->max_inflight); + } + else if (p->cwnd < bbr->max_inflight || bbr->drs.delivered < p->initial_wnd) { + p->cwnd += acked; + } + p->cwnd = MAX(p->cwnd, bbr_min_pipe_cwnd(p)); + bbr_bound_cwnd_for_probe_rtt(bbr, p); + bbr_bound_cwnd_for_model(bbr, p); +} + +static int bbr_init(struct quic_cc *cc) +{ + struct bbr *bbr = quic_cc_priv(cc); + + quic_cc_drs_init(&bbr->drs); + wf_init(&bbr->max_bw_filter, BBR_MAX_BW_FILTERLEN, 0, ~0U); + wf_init(&bbr->extra_acked_filter, BBR_EXTRA_ACKED_FILTERLEN, 0, ~0U); + bbr->min_rtt = UINT32_MAX; + bbr->min_rtt_stamp = now_ms; + bbr->probe_rtt_done_stamp = TICK_ETERNITY; + bbr->probe_rtt_round_done = 0; + bbr->prior_cwnd = 0; + bbr->idle_restart = 0; + bbr->extra_acked_interval_start = now_ms; + bbr->extra_acked_delivered = 0; + bbr->full_bw_reached = 0; + + bbr_reset_congestion_signals(bbr); + bbr_reset_lower_bounds(bbr); + bbr_init_round_counting(bbr); + bbr_reset_full_bw(bbr); + bbr_init_pacing_rate(cc, bbr); + bbr_enter_startup(bbr); + + /* Not in RFC */ + bbr->loss_round_start = 0; + bbr->loss_round_delivered = UINT64_MAX; + bbr->rounds_since_bw_probe = 0; + bbr->max_bw = 0; + bbr->bw = 0; + bbr->extra_acked = 0; + bbr->bytes_lost_in_round = 0; + bbr->loss_events_in_round = 0; + bbr->offload_budget = 0; + bbr->probe_up_cnt = UINT64_MAX; + bbr->cycle_stamp = TICK_ETERNITY; + bbr->ack_phase = 0; + bbr->bw_probe_wait = 0; + bbr->bw_probe_samples = 0; + bbr->bw_probe_up_rounds = 0; + bbr->bw_probe_up_acks = 0; + bbr->max_inflight = 0; + bbr->inflight_hi = UINT64_MAX; + bbr->cycle_count = 0; + bbr->probe_rtt_min_delay = UINT32_MAX; + bbr->probe_rtt_min_stamp = now_ms; + bbr->probe_rtt_expired = 0; + bbr->in_loss_recovery = 0; + bbr->packet_conservation = 0; + bbr->recovery_start_ts = TICK_ETERNITY; + bbr->round_count_at_recovery = UINT64_MAX; + + return 1; +} + +/* 4.3.1.2. Exiting Acceleration Based on Bandwidth Plateau + * + * In phases where BBR is accelerating to probe the available bandwidth + * - Startup and ProbeBW_UP - BBR runs a state machine to estimate whether an + * accelerating sending rate has saturated the available per-flow bandwidth + * ("filled the pipe") by looking for a plateau in the measured rs.delivery_rate. + * BBR tracks the status of the current full-pipe estimation process in the + * boolean BBR.full_bw_now, and uses BBR.full_bw_now to exit ProbeBW_UP. 
BBR + * records in the boolean BBR.full_bw_reached whether BBR estimates that it has + * ever fully utilized its available bandwidth (over the lifetime of the + * connection), and uses BBR.full_bw_reached to decide when to exit Startup and + * enter Drain. The full pipe estimator works as follows: if BBR counts several + * (three) non-application-limited rounds where attempts to significantly increase + * the delivery rate actually result in little increase (less than 25 percent), + * then it estimates that it has fully utilized the per-flow available bandwidth, + * and sets both BBR.full_bw_now and BBR.full_bw_reached to true. + */ +static void bbr_check_full_bw_reached(struct bbr *bbr, struct quic_cc_path *p) +{ + struct quic_cc_rs *rs = &bbr->drs.rs; + + if (bbr->full_bw_now || rs->is_app_limited) + return; /* no need to check for a full pipe now */ + + if (p->delivery_rate * 100 >= bbr->full_bw * 125) { + bbr_reset_full_bw(bbr); /* bw is still growing, so reset */ + bbr->full_bw = p->delivery_rate; /* record new baseline bw */ + return; + } + + if (!bbr->round_start) + return; + + bbr->full_bw_count++; /* another round w/o much growth */ + bbr->full_bw_now = bbr->full_bw_count >= 3; + if (bbr->full_bw_now) + bbr->full_bw_reached = 1; +} + +/* 4.3.1.3. Exiting Startup Based on Packet Loss + * + * A second method BBR uses for estimating whether the bottleneck is full in + * Startup is by looking at packet losses. Specifically, BBRCheckStartupHighLoss() + * checks whether all of the following criteria are met: + * + * The connection has been in fast recovery for at least one full packet-timed + * round trip. + * + * The loss rate over the time scale of a single full round trip exceeds + * BBRLossThresh (2%). + * + * There are at least BBRStartupFullLossCnt=6 discontiguous sequence ranges lost + * in that round trip. + * + * If these criteria are all met, then BBRCheckStartupHighLoss() takes the + * following steps. First, it sets BBR.full_bw_reached = true. Then it sets + * BBR.inflight_hi to its estimate of a safe level of in-flight data suggested + * by these losses, which is max(BBR.bdp, BBR.inflight_latest), where + * BBR.inflight_latest is the max delivered volume of data (rs.delivered) over + * the last round trip. Finally, it exits Startup and enters Drain. The algorithm + * waits until all three criteria are met to filter out noise from burst losses, + * and to try to ensure the bottleneck is fully utilized on a sustained basis, + * and the full bottleneck bandwidth has been measured, before attempting to drain + * the level of in-flight data to the estimated BDP.
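 + * For example, with BBRLossThresh = 2%, a round that had 100,000 bytes in + * flight tolerates up to 2,000 lost bytes before the loss-rate criterion fires.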
+ */ +static void bbr_check_startup_high_loss(struct bbr *bbr, struct quic_cc_path *p) +{ + if (bbr->full_bw_reached || + bbr->loss_events_in_round <= BBR_STARTUP_FULL_LOSS_COUNT || + (bbr->in_loss_recovery && + bbr->round_count <= bbr->round_count_at_recovery) || + !is_inflight_too_high(&bbr->drs.rs)) { + return; + } + + bbr->full_bw_reached = 1; + bbr->inflight_hi = + MAX(bbr_bdp_multiple(bbr, p, bbr->bw, bbr->cwnd_gain), bbr->inflight_latest); +} + +static void bbr_start_round(struct bbr *bbr) +{ + bbr->next_round_delivered = bbr->drs.delivered; +} + +static void bbr_update_round(struct bbr *bbr, uint32_t delivered) +{ + if (delivered >= bbr->next_round_delivered) { + bbr_start_round(bbr); + bbr->round_count++; + bbr->rounds_since_bw_probe++; + bbr->round_start = 1; + bbr->bytes_lost_in_round = 0; + bbr->loss_events_in_round = 0; + bbr->drs.is_cwnd_limited = 0; + } + else { + bbr->round_start = 0; + } +} + +static void bbr_pick_probe_wait(struct bbr *bbr) +{ + uint32_t rand = ha_random32(); + + bbr->rounds_since_bw_probe = rand & 0x1; /* 0 or 1 */ + /* Decide the random wall clock bound for wait: */ + bbr->bw_probe_wait = 2000 + (rand % 1000); +} + +static void bbr_raise_inflight_hi_slope(struct bbr *bbr, struct quic_cc_path *p) +{ + uint64_t growth_this_round = p->mtu << bbr->bw_probe_up_rounds; + + bbr->bw_probe_up_rounds = MIN(bbr->bw_probe_up_rounds + 1, 30); + bbr->probe_up_cnt = MAX(p->cwnd / growth_this_round, 1) * p->mtu; +} + +/* 4.3.3. ProbeBW + * + * Long-lived BBR flows tend to spend the vast majority of their time in the + * ProbeBW states. In the ProbeBW states, a BBR flow sequentially accelerates, + * decelerates, and cruises, to measure the network path, improve its operating + * point (increase throughput and reduce queue pressure), and converge toward a + * more fair allocation of bottleneck bandwidth. To do this, the flow + * sequentially cycles through all three tactics: trying to send faster than, + * slower than, and at the same rate as the network delivery process. To achieve + * this, a BBR flow in ProbeBW mode cycles through the four Probe bw states + * (DOWN, CRUISE, REFILL, and UP). + */ + +/* 4.3.3.1. ProbeBW_DOWN + * + * In the ProbeBW_DOWN phase of the cycle, a BBR flow pursues the deceleration + * tactic, to try to send slower than the network is delivering data, to reduce + * the amount of data in flight, with all of the standard motivations for the + * deceleration tactic (discussed in "State Machine Tactics" in Section 4.1.3). + * It does this by switching to a BBR.pacing_gain of 0.90, sending at 90% of + * BBR.bw. The pacing_gain value of 0.90 is derived based on the ProbeBW_UP + * pacing gain of 1.25, as the minimum pacing_gain value that allows + * bandwidth-based convergence to approximate fairness, and validated through + * experiments. + */ +static void bbr_start_probe_bw_down(struct bbr *bbr) +{ + bbr_reset_congestion_signals(bbr); + bbr->probe_up_cnt = UINT64_MAX; + bbr_pick_probe_wait(bbr); + bbr->cycle_stamp = now_ms; + bbr->ack_phase = BBR_ACK_PHASE_ACKS_PROBE_STOPPING; + bbr_start_round(bbr); + bbr->state = BBR_ST_PROBE_BW_DOWN; + bbr->pacing_gain = 90; + bbr->cwnd_gain = BBR_DEFAULT_CWND_GAIN_MULT; +} + +/* 4.3.3.2. ProbeBW_CRUISE + * + * In the ProbeBW_CRUISE phase of the cycle, a BBR flow pursues the "cruising" + * tactic (discussed in "State Machine Tactics" in Section 4.1.3), attempting + * to send at the same rate the network is delivering data. 
It tries to match + * the sending rate to the flow's current available bandwidth, to try to + * achieve high utilization of the available bandwidth without increasing + * queue pressure. It does this by switching to a pacing_gain of 1.0, sending + * at 100% of BBR.bw. Notably, while in this state it responds to concrete + * congestion signals (loss) by reducing BBR.bw_lo and BBR.inflight_lo, + * because these signals suggest that the available bandwidth and deliverable + * volume of in-flight data have likely reduced, and the flow needs to change + * to adapt, slowing down to match the latest delivery process. + */ +static void bbr_start_probe_bw_cruise(struct bbr *bbr) +{ + bbr->state = BBR_ST_PROBE_BW_CRUISE; + bbr->pacing_gain = 100; + bbr->cwnd_gain = BBR_DEFAULT_CWND_GAIN_MULT; +} + +static void bbr_start_probe_bw_refill(struct bbr *bbr) +{ + bbr_reset_lower_bounds(bbr); + bbr->bw_probe_up_rounds = 0; + bbr->bw_probe_up_acks = 0; + bbr->ack_phase = BBR_ACK_PHASE_ACKS_REFILLING; + bbr_start_round(bbr); + bbr->state = BBR_ST_PROBE_BW_REFILL; + bbr->pacing_gain = 100; + bbr->cwnd_gain = BBR_DEFAULT_CWND_GAIN_MULT; +} + +static void bbr_start_probe_bw_up(struct bbr *bbr, struct quic_cc_path *p) +{ + bbr->ack_phase = BBR_ACK_PHASE_ACKS_PROBE_STARTING; + bbr_start_round(bbr); + bbr_reset_full_bw(bbr); + bbr->full_bw = p->delivery_rate; + bbr->state = BBR_ST_PROBE_BW_UP; + bbr->pacing_gain = 125; + bbr->cwnd_gain = 225; + bbr_raise_inflight_hi_slope(bbr, p); +} + +/* 4.3.4.5. Exiting ProbeRTT + * + * When exiting ProbeRTT, BBR transitions to ProbeBW if it estimates the pipe + * was filled already, or Startup otherwise. + * When transitioning out of ProbeRTT, BBR calls BBRResetLowerBounds() to reset + * the lower bounds, since any congestion encountered in ProbeRTT may have + * pulled the short-term model far below the capacity of the path. But the + * algorithm is cautious in timing the next bandwidth probe: raising inflight + * after ProbeRTT may cause loss, so the algorithm resets the bandwidth-probing + * clock by starting the cycle at ProbeBW_DOWN(). But then as an optimization, + * since the connection is exiting ProbeRTT, we know that inflight is already + * below the estimated BDP, so the connection can proceed immediately to ProbeBW_CRUISE. + */ +static void bbr_exit_probe_rtt(struct bbr *bbr) +{ + bbr_reset_lower_bounds(bbr); + if (bbr->full_bw_reached) { + bbr_start_probe_bw_down(bbr); + bbr_start_probe_bw_cruise(bbr); + } else { + bbr_enter_startup(bbr); + } +} + +static void bbr_advance_max_bw_filter(struct bbr *bbr) +{ + bbr->cycle_count++; +} + +static uint64_t bbr_target_inflight(struct bbr *bbr, struct quic_cc_path *p) +{ + uint64_t bdp = bbr_inflight(bbr, p, bbr->bw, 100); + return MIN(bdp, p->cwnd); +} + +static void bbr_handle_inflight_too_high(struct bbr *bbr, + struct quic_cc_path *p, + struct quic_cc_rs *rs) +{ + bbr->bw_probe_samples = 0; + if (!rs->is_app_limited) + bbr->inflight_hi = + MAX(rs->tx_in_flight, + bbr_target_inflight(bbr, p) * BBR_BETA_MULT / BBR_BETA_DIVI); + + if (bbr->state == BBR_ST_PROBE_BW_UP) + bbr_start_probe_bw_down(bbr); +} + +/* 4.5.10.2. Probing for Bandwidth In ProbeBW + * IsInflightTooHigh() implementation at BBR state level. This function + * calls is_inflight_too_high() at the delivery rate sampling level.
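 + * The sample-level check compares rs->lost against BBRLossThresh (2%) of + * rs->tx_in_flight.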
+ */ +static int bbr_is_inflight_too_high(struct bbr *bbr, struct quic_cc_path *p) +{ + if (!is_inflight_too_high(&bbr->drs.rs)) + return 0; + + if (bbr->bw_probe_samples) + bbr_handle_inflight_too_high(bbr, p, &bbr->drs.rs); + + return 1; +} + +static void bbr_probe_inflight_hi_upward(struct bbr *bbr, struct quic_cc_path *p, uint32_t acked) +{ + if (!bbr->drs.is_cwnd_limited || p->cwnd < bbr->inflight_hi) + return; /* not fully using inflight_hi, so don't grow it */ + + bbr->bw_probe_up_acks += acked; + if (bbr->bw_probe_up_acks >= bbr->probe_up_cnt) { + uint64_t delta; + + delta = bbr->bw_probe_up_acks / bbr->probe_up_cnt; + bbr->bw_probe_up_acks -= delta * bbr->probe_up_cnt; + bbr->inflight_hi += delta * p->mtu; + } + + if (bbr->round_start) + bbr_raise_inflight_hi_slope(bbr, p); +} + +/* Track ACK state and update BBR.max_bw window and + * BBR.inflight_hi. + */ +static void bbr_adapt_upper_bounds(struct bbr *bbr, struct quic_cc_path *p, + uint32_t acked) +{ + if (bbr->ack_phase == BBR_ACK_PHASE_ACKS_PROBE_STARTING && bbr->round_start) + /* starting to get bw probing samples */ + bbr->ack_phase = BBR_ACK_PHASE_ACKS_PROBE_FEEDBACK; + + if (bbr->ack_phase == BBR_ACK_PHASE_ACKS_PROBE_STOPPING && bbr->round_start) { + /* end of samples from bw probing phase */ + if (bbr_is_probing_bw(bbr) && !bbr->drs.rs.is_app_limited) + bbr_advance_max_bw_filter(bbr); + } + + if (bbr_is_inflight_too_high(bbr, p)) + return; + + if (bbr->inflight_hi == UINT64_MAX) + return; + + if (bbr->drs.rs.tx_in_flight > bbr->inflight_hi) + bbr->inflight_hi = bbr->drs.rs.tx_in_flight; + + if (bbr->state == BBR_ST_PROBE_BW_UP) + bbr_probe_inflight_hi_upward(bbr, p, acked); +} + + +static inline int bbr_has_elapsed_in_phase(struct bbr *bbr, + uint32_t interval) +{ + return tick_is_lt(tick_add(bbr->cycle_stamp, interval), now_ms); +} + +static int bbr_is_reno_coexistence_probe_time(struct bbr *bbr, struct quic_cc_path *p) +{ + uint64_t reno_rounds; + + reno_rounds = bbr_target_inflight(bbr, p) / p->mtu; + return bbr->rounds_since_bw_probe >= MIN(reno_rounds, 63); +} + +/* Is it time to transition from DOWN or CRUISE to REFILL? */ +static int bbr_is_time_to_probe_bw(struct bbr *bbr, struct quic_cc_path *p) +{ + if (bbr_has_elapsed_in_phase(bbr, bbr->bw_probe_wait) || + bbr_is_reno_coexistence_probe_time(bbr, p)) { + bbr_start_probe_bw_refill(bbr); + return 1; + } + + return 0; +} + +/* Called to exit from ProbeBW_DOWN. + * 4.3.3.1. ProbeBW_DOWN + * + * Exit conditions: The flow exits the ProbeBW_DOWN phase and enters CRUISE when + * the flow estimates that both of the following conditions have been met: + * + * There is free headroom: If inflight_hi is set, then BBR remains in + * ProbeBW_DOWN at least until the volume of in-flight data is less than or equal + * to a target calculated based on (1 - BBRHeadroom)*BBR.inflight_hi. The goal of + * this constraint is to ensure that in cases where loss signals suggest an upper + * limit on the volume of in-flight data, then the flow attempts to leave some + * free headroom in the path (e.g. free space in the bottleneck buffer or free + * time slots in the bottleneck link) that can be used by cross traffic (both for + * convergence of bandwidth shares and for burst tolerance). + * + * The volume of in-flight data is less than or equal to BBR.bdp, i.e. the flow + * estimates that it has drained any queue at the bottleneck. + */ + +/* Time to transition from DOWN to CRUISE?
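 + * Example (assuming inflight_hi = 100,000 bytes and an MTU well under 15,000 + * bytes): the flow stays in ProbeBW_DOWN until in_flight <= 85,000 bytes, and + * cruises once in_flight is also at or below the estimated BDP.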
*/ +static int bbr_is_time_to_cruise(struct bbr *bbr, struct quic_cc_path *p) +{ + if (p->in_flight > bbr_inflight_with_headroom(bbr, p)) + return 0; /* not enough headroom */ + + if (p->in_flight <= bbr_inflight(bbr, p, bbr->max_bw, 100)) + return 1; /* inflight <= estimated BDP */ + + return 0; +} + +/* Time to transition from UP to DOWN? */ +static int bbr_is_time_to_go_down(struct bbr *bbr, struct quic_cc_path *p) +{ + if (bbr->drs.is_cwnd_limited && p->cwnd >= bbr->inflight_hi) { + bbr_reset_full_bw(bbr); /* bw is limited by inflight_hi */ + bbr->full_bw = p->delivery_rate; + } + else if (bbr->full_bw_now) { + return 1; /* we estimate we've fully used path bw */ + } + + return 0; +} + +static void bbr_check_probe_rtt_done(struct bbr *bbr, struct quic_cc_path *p) +{ + if (tick_isset(bbr->probe_rtt_done_stamp) && + tick_is_lt(bbr->probe_rtt_done_stamp, now_ms)) { + /* schedule next ProbeRTT: */ + bbr->probe_rtt_min_stamp = now_ms; + bbr_restore_cwnd(bbr, p); + bbr_exit_probe_rtt(bbr); + } +} + +static void bbr_mark_connection_app_limited(struct bbr *bbr, struct quic_cc_path *p) +{ + uint64_t app_limited = bbr->drs.delivered + p->in_flight; + + bbr->drs.app_limited = app_limited ? app_limited : p->mtu; +} + +static void bbr_update_max_bw(struct bbr *bbr, struct quic_cc_path *p, + uint32_t delivered) +{ + struct quic_cc_rs *rs = &bbr->drs.rs; + + bbr_update_round(bbr, delivered); + if (p->delivery_rate >= bbr->max_bw || !rs->is_app_limited) + bbr->max_bw = wf_max_update(&bbr->max_bw_filter, + p->delivery_rate, bbr->cycle_count); +} + +static void bbr_init_lower_bounds(struct bbr *bbr, struct quic_cc_path *p) +{ + if (bbr->bw_lo == UINT64_MAX) + bbr->bw_lo = bbr->max_bw; + if (bbr->inflight_lo == UINT64_MAX) + bbr->inflight_lo = p->cwnd; +} + +static void bbr_loss_lower_bounds(struct bbr *bbr) +{ + bbr->bw_lo = MAX(bbr->bw_latest, bbr->bw_lo * BBR_BETA_MULT / BBR_BETA_DIVI); + bbr->inflight_lo = MAX(bbr->inflight_latest, + bbr->inflight_lo * BBR_BETA_MULT / BBR_BETA_DIVI); +} + +static void bbr_adapt_lower_bounds_from_congestion(struct bbr *bbr, struct quic_cc_path *p) +{ + if (bbr_is_probing_bw(bbr)) + return; + + if (bbr->loss_in_round) { + bbr_init_lower_bounds(bbr, p); + bbr_loss_lower_bounds(bbr); + } +} + +static void bbr_update_latest_delivery_signals(struct bbr *bbr, + struct quic_cc_path *p) +{ + struct quic_cc_drs *drs = &bbr->drs; + + bbr->loss_round_start = 0; + bbr->bw_latest = MAX(bbr->bw_latest, p->delivery_rate); + bbr->inflight_latest = MAX(bbr->inflight_latest, drs->rs.delivered); + if (drs->rs.prior_delivered >= bbr->loss_round_delivered) { + bbr->loss_round_delivered = drs->delivered; + bbr->loss_round_start = 1; + } +} + +static void bbr_update_congestion_signals(struct bbr *bbr, struct quic_cc_path *p, + uint64_t bytes_lost, uint64_t delivered) +{ + bbr_update_max_bw(bbr, p, delivered); + if (bytes_lost) { + bbr->bytes_lost_in_round += bytes_lost; + ++bbr->loss_events_in_round; + + if (!bbr->loss_in_round) { + bbr->loss_in_round = 1; + bbr->loss_round_delivered = bbr->drs.delivered; + } + } + + if (!bbr->loss_round_start) + return; /* wait until end of round trip */ + + bbr_adapt_lower_bounds_from_congestion(bbr, p); /* once per round, adapt */ + bbr->loss_in_round = 0; +} + +static void bbr_update_ack_aggregation(struct bbr *bbr, + struct quic_cc_path *p, + uint32_t acked) +{ + uint32_t interval = now_ms - bbr->extra_acked_interval_start; + uint64_t expected_delivered = bbr->bw * interval / 1000; + uint64_t extra; + + if (bbr->extra_acked_delivered <=
expected_delivered) { + bbr->extra_acked_delivered = 0; + bbr->extra_acked_interval_start = now_ms; + expected_delivered = 0; + } + + bbr->extra_acked_delivered += acked; + extra = bbr->extra_acked_delivered - expected_delivered; + extra = MIN(extra, p->cwnd); + + bbr->extra_acked = wf_max_update(&bbr->extra_acked_filter, extra, bbr->round_count); +} + +/* 4.3.1. Startup + * + * During Startup, BBR estimates whether the pipe is full using two estimators. + * The first looks for a plateau in the BBR.max_bw estimate. The second looks + * for packet loss. + */ +static void bbr_check_startup_done(struct bbr *bbr, struct quic_cc_path *p) +{ + bbr_check_startup_high_loss(bbr, p); + if (bbr->state == BBR_ST_STARTUP && bbr->full_bw_reached) + bbr_enter_drain(bbr); +} + +/* 4.3.2. Drain + * In Drain, when the amount of data in flight is less than or equal to the + * estimated BDP, meaning BBR estimates that the queue at the bottleneck link + * has been fully drained, then BBR exits Drain and enters ProbeBW. + */ +static void bbr_check_drain_done(struct bbr *bbr, + struct quic_cc_path *p) +{ + if (bbr->state == BBR_ST_DRAIN && + p->in_flight <= bbr_inflight(bbr, p, bbr->bw, 100)) + bbr_start_probe_bw_down(bbr); +} + +/* The core state machine logic for ProbeBW: */ +static void bbr_update_probe_bw_cycle_phase(struct bbr *bbr, struct quic_cc_path *p, + uint32_t acked) +{ + if (!bbr->full_bw_reached) + return; /* only handling steady-state behavior here */ + + bbr_adapt_upper_bounds(bbr, p, acked); + if (!bbr_is_probing_bw(bbr)) + return; /* only handling ProbeBW states here: */ + + switch (bbr->state) { + case BBR_ST_PROBE_BW_DOWN: + if (bbr_is_time_to_probe_bw(bbr, p)) + return; /* already decided state transition */ + + if (bbr_is_time_to_cruise(bbr, p)) + bbr_start_probe_bw_cruise(bbr); + break; + + case BBR_ST_PROBE_BW_CRUISE: + if (bbr_is_time_to_probe_bw(bbr, p)) + return; /* already decided state transition */ + break; + + case BBR_ST_PROBE_BW_REFILL: + /* After one round of REFILL, start UP */ + if (bbr->round_start) { + bbr->bw_probe_samples = 1; + bbr_start_probe_bw_up(bbr, p); + } + break; + + case BBR_ST_PROBE_BW_UP: + if (bbr_is_time_to_go_down(bbr, p)) + bbr_start_probe_bw_down(bbr); + break; + + default: + break; + } +} + +/* 4.3.4.4. ProbeRTT Logic + * + * On every ACK BBR executes BBRUpdateMinRTT() to update its ProbeRTT scheduling + * state (BBR.probe_rtt_min_delay and BBR.probe_rtt_min_stamp) and its + * BBR.min_rtt estimate. + * Here BBR.probe_rtt_expired is a boolean recording whether the + * BBR.probe_rtt_min_delay has expired and is due for a refresh, via either an + * application idle period or a transition into ProbeRTT state.
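 + * Example: with ProbeRTTInterval = 5s, a probe_rtt_min_delay sample stamped at + * t=0 ms is considered expired by any ACK processed after t=5000 ms.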
+ */ +static void bbr_update_min_rtt(struct bbr *bbr, uint32_t ack_rtt) +{ + int min_rtt_expired; + + bbr->probe_rtt_expired = + tick_is_lt(tick_add(bbr->probe_rtt_min_stamp, BBR_PROBE_RTT_INTERVAL), now_ms); + if (ack_rtt != UINT32_MAX && (ack_rtt < bbr->probe_rtt_min_delay || + bbr->probe_rtt_expired)) { + bbr->probe_rtt_min_delay = ack_rtt; + bbr->probe_rtt_min_stamp = now_ms; + } + + min_rtt_expired = + tick_is_lt(tick_add(bbr->min_rtt_stamp, BBR_MIN_RTT_FILTERLEN), now_ms); + if (bbr->probe_rtt_min_delay < bbr->min_rtt || min_rtt_expired) { + bbr->min_rtt = bbr->probe_rtt_min_delay; + bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } +} + +static void bbr_handle_probe_rtt(struct bbr *bbr, struct quic_cc_path *p) +{ + /* Ignore low rate samples during ProbeRTT: */ + bbr_mark_connection_app_limited(bbr, p); + if (!tick_isset(bbr->probe_rtt_done_stamp) && + p->in_flight <= bbr_probe_rtt_cwnd(bbr, p)) { + /* Wait for at least ProbeRTTDuration to elapse: */ + bbr->probe_rtt_done_stamp = tick_add(now_ms, BBR_PROBE_RTT_DURATION); + /* Wait for at least one round to elapse: */ + bbr->probe_rtt_round_done = 0; + bbr_start_round(bbr); + } + else if (tick_isset(bbr->probe_rtt_done_stamp)) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done) + bbr_check_probe_rtt_done(bbr, p); + } +} + +/* On every ACK BBR executes BBRCheckProbeRTT() to handle the steps related to + * the ProbeRTT state. + */ +static inline void bbr_check_probe_rtt(struct bbr *bbr, struct quic_cc_path *p) +{ + if (bbr->state != BBR_ST_PROBE_RTT && + bbr->probe_rtt_expired && !bbr->idle_restart) { + bbr_enter_probe_rtt(bbr); + bbr_save_cwnd(bbr, p); + bbr->probe_rtt_done_stamp = TICK_ETERNITY; + bbr->ack_phase = BBR_ACK_PHASE_ACKS_PROBE_STOPPING; + bbr_start_round(bbr); + } + + if (bbr->state == BBR_ST_PROBE_RTT) + bbr_handle_probe_rtt(bbr, p); + if (bbr->drs.rs.delivered > 0) + bbr->idle_restart = 0; +} + +static inline void bbr_advance_latest_delivery_signals(struct bbr *bbr, + struct quic_cc_path *p) +{ + if (bbr->loss_round_start) { + bbr->bw_latest = p->delivery_rate; + bbr->inflight_latest = bbr->drs.rs.delivered; + } +} + +static inline void bbr_bound_bw_for_model(struct bbr *bbr) +{ + bbr->bw = MIN(bbr->max_bw, bbr->bw_lo); +} + +static void bbr_update_model_and_state(struct bbr *bbr, + struct quic_cc_path *p, + uint32_t acked, + uint32_t delivered, + uint32_t ack_rtt, + uint32_t bytes_lost) +{ + bbr_update_latest_delivery_signals(bbr, p); + bbr_update_congestion_signals(bbr, p, bytes_lost, delivered); + bbr_update_ack_aggregation(bbr, p, acked); + bbr_check_full_bw_reached(bbr, p); + bbr_check_startup_done(bbr, p); + bbr_check_drain_done(bbr, p); + bbr_update_probe_bw_cycle_phase(bbr, p, acked); + bbr_update_min_rtt(bbr, ack_rtt); + bbr_check_probe_rtt(bbr, p); + bbr_advance_latest_delivery_signals(bbr, p); + bbr_bound_bw_for_model(bbr); +} + +static void bbr_update_control_parameters(struct bbr *bbr, + struct quic_cc_path *p, + uint32_t acked) +{ + bbr_set_pacing_rate(bbr, p); + bbr_set_send_quantum(bbr, p); + bbr_set_cwnd(bbr, p, acked); +} + +static inline int in_recovery_period(struct quic_cc_path *p, uint32_t ts) +{ + return tick_isset(p->recovery_start_ts) && + tick_is_le(ts, p->recovery_start_ts); +} + +static void bbr_handle_recovery(struct bbr *bbr, struct quic_cc_path *p, + unsigned int largest_pkt_sent_ts, + uint32_t acked) +{ + if (bbr->in_loss_recovery) { + if (tick_isset(largest_pkt_sent_ts) && + !in_recovery_period(p, largest_pkt_sent_ts)) { + bbr->in_loss_recovery =
0; + bbr->round_count_at_recovery = UINT64_MAX; + bbr_restore_cwnd(bbr, p); + } + + return; + } + + if (!tick_isset(bbr->recovery_start_ts)) + return; + + bbr->in_loss_recovery = 1; + bbr->round_count_at_recovery = + bbr->round_start ? bbr->round_count : bbr->round_count + 1; + bbr_save_cwnd(bbr, p); + p->cwnd = p->in_flight + MAX(acked, p->mtu); + p->recovery_start_ts = bbr->recovery_start_ts; + bbr->recovery_start_ts = TICK_ETERNITY; +} + +/* On every ACK, BBR updates its model, its state machine and its control + * parameters. + */ +static void bbr_update_on_ack(struct quic_cc *cc, + uint32_t acked, uint32_t delivered, uint32_t rtt, + uint32_t bytes_lost, unsigned largest_pkt_sent_ts) +{ + struct bbr *bbr = quic_cc_priv(cc); + struct quic_cc_path *p = container_of(cc, struct quic_cc_path, cc); + + bbr_handle_recovery(bbr, p, largest_pkt_sent_ts, acked); + bbr_update_model_and_state(bbr, p, acked, delivered, rtt, bytes_lost); + bbr_update_control_parameters(bbr, p, acked); +} + +/* At what prefix of packet did losses exceed BBRLossThresh? */ +static uint64_t bbr_inflight_hi_from_lost_packet(struct quic_cc_rs *rs, + struct quic_tx_packet *pkt) +{ + uint64_t inflight_prev, lost_prev, lost_prefix; + uint64_t size = pkt->len; + + BUG_ON(rs->tx_in_flight < size); + /* What was in flight before this packet? */ + inflight_prev = rs->tx_in_flight - size; + BUG_ON(rs->lost < size); + /* What was lost before this packet? */ + lost_prev = rs->lost - size; + lost_prefix = + (BBR_LOSS_THRESH_MULT * inflight_prev - lost_prev * BBR_LOSS_THRESH_DIVI) / + (BBR_LOSS_THRESH_DIVI - BBR_LOSS_THRESH_MULT); + return inflight_prev + lost_prefix; +} + +static void bbr_note_loss(struct bbr *bbr, uint64_t C_delivered) +{ + if (!bbr->loss_in_round) /* first loss in this round trip? */ + bbr->loss_round_delivered = C_delivered; + bbr->loss_in_round = 1; +} + +static void bbr_handle_lost_packet(struct bbr *bbr, struct quic_cc_path *p, + struct quic_tx_packet *pkt, + uint32_t lost) +{ + struct quic_cc_rs rs = {0}; + + /* C.delivered = bbr->drs.delivered */ + bbr_note_loss(bbr, bbr->drs.delivered); + if (!bbr->bw_probe_samples) + return; /* not a packet sent while probing bandwidth */ + + rs.tx_in_flight = pkt->rs.tx_in_flight; /* inflight at transmit */ + BUG_ON(bbr->drs.lost + pkt->len < lost); + /* bbr->drs.lost is not yet incremented */ + rs.lost = bbr->drs.lost + pkt->len - lost; /* data lost since transmit */ + rs.is_app_limited = pkt->rs.is_app_limited; + if (is_inflight_too_high(&rs)) { + rs.tx_in_flight = bbr_inflight_hi_from_lost_packet(&rs, pkt); + bbr_handle_inflight_too_high(bbr, p, &rs); + } +} + +/* 4.2.4. Per-Loss Steps + * + * On every packet loss event, where some sequence range "packet" is marked lost, + * the BBR algorithm executes the following BBRUpdateOnLoss() steps in order to + * update its network path model. + */ +static void bbr_update_on_loss(struct quic_cc *cc, struct quic_tx_packet *pkt, + uint32_t lost) +{ + struct bbr *bbr = quic_cc_priv(cc); + struct quic_cc_path *p = container_of(cc, struct quic_cc_path, cc); + + if (bbr->state == BBR_ST_STARTUP) { + /* Not in RFC. That said, during Startup, the packet loss is handled by + * bbr_check_startup_high_loss().
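 + * Loss accounting still happens on ACK receipt: bbr_update_congestion_signals() + * feeds loss_events_in_round, which bbr_check_startup_high_loss() consumes.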
+ */ + return; + } + + bbr_handle_lost_packet(bbr, p, pkt, lost); +} + +static void bbr_handle_restart_from_idle(struct bbr *bbr, struct quic_cc_path *p) +{ + if (p->in_flight != 0 || !bbr->drs.app_limited) + return; + + bbr->idle_restart = 1; + bbr->extra_acked_interval_start = now_ms; + + if (bbr_is_probing_bw(bbr)) + bbr_set_pacing_rate_with_gain(bbr, p, 100); + else if (bbr->state == BBR_ST_PROBE_RTT) + bbr_check_probe_rtt_done(bbr, p); +} + +/* To be called upon packet transmissions. */ +static void bbr_on_transmit(struct quic_cc *cc) +{ + struct bbr *bbr = quic_cc_priv(cc); + struct quic_cc_path *p = container_of(cc, struct quic_cc_path, cc); + + bbr_handle_restart_from_idle(bbr, p); +} + +/* Update the delivery rate sampling state upon packet transmission */ +static void bbr_drs_on_transmit(struct quic_cc *cc, struct quic_tx_packet *pkt) +{ + struct bbr *bbr = quic_cc_priv(cc); + struct quic_cc_path *p = container_of(cc, struct quic_cc_path, cc); + + quic_cc_drs_on_pkt_sent(p, pkt, &bbr->drs); +} + +/* Callback to be called on every congestion event detection. */ +static void bbr_congestion_event(struct quic_cc *cc, uint32_t ts) +{ + struct quic_cc_path *p = container_of(cc, struct quic_cc_path, cc); + struct bbr *bbr = quic_cc_priv(&p->cc); + + if (bbr->in_loss_recovery || + tick_isset(bbr->recovery_start_ts) || in_recovery_period(p, ts)) + return; + + bbr->recovery_start_ts = ts; +} + +/* Callback to return the delivery rate sampling state attached to the + * congestion controller. + */ +struct quic_cc_drs *bbr_get_drs(struct quic_cc *cc) +{ + return &((struct bbr *)quic_cc_priv(cc))->drs; +} + +/* Return the pacing delay in nanoseconds, i.e. the time needed to emit one + * MTU-sized datagram at the current pacing rate. + */ +uint bbr_pacing_rate(const struct quic_cc *cc) +{ + struct bbr *bbr = quic_cc_priv(cc); + struct quic_cc_path *p = container_of(cc, struct quic_cc_path, cc); + + return p->mtu * 1000000000 / bbr->pacing_rate; +} + +/* Return the pacing burst size in datagrams */ +uint bbr_pacing_burst(const struct quic_cc *cc) +{ + struct quic_cc_path *p = container_of(cc, struct quic_cc_path, cc); + + return p->send_quantum / p->mtu; +} + +static inline const char *bbr_state_str(struct bbr *bbr) +{ + switch (bbr->state) { + case BBR_ST_STARTUP: + return "s"; + case BBR_ST_DRAIN: + return "d"; + case BBR_ST_PROBE_BW_DOWN: + return "pbd"; + case BBR_ST_PROBE_BW_CRUISE: + return "pbc"; + case BBR_ST_PROBE_BW_REFILL: + return "pbf"; + case BBR_ST_PROBE_BW_UP: + return "pbu"; + case BBR_ST_PROBE_RTT: + return "pr"; + default: + return "uk"; + } +} + +/* Callback used to dump BBR specific information from "show quic" CLI command. */ +static void bbr_state_cli(struct buffer *buf, const struct quic_cc_path *p) +{ + struct bbr *bbr = quic_cc_priv(&p->cc); + + chunk_appendf(buf, " bbr: st=%s max_bw=%llu min_rtt=%llu bw=%llu" + " sq=%llu pacing_rate=%llu\n", + bbr_state_str(bbr), (ull)bbr->max_bw, (ull)bbr->min_rtt, + (ull)bbr->bw, (ull)p->send_quantum, (ull)bbr->pacing_rate); +} + +struct quic_cc_algo quic_cc_algo_bbr = { + .type = QUIC_CC_ALGO_TP_BBR, + .init = bbr_init, + .pacing_rate = bbr_pacing_rate, + .pacing_burst = bbr_pacing_burst, + .get_drs = bbr_get_drs, + .on_transmit = bbr_on_transmit, + .drs_on_transmit = bbr_drs_on_transmit, + .on_ack_rcvd = bbr_update_on_ack, + .congestion_event = bbr_congestion_event, + .on_pkt_lost = bbr_update_on_loss, + .state_cli = bbr_state_cli, +}; + +void bbr_check(void) +{ + struct quic_cc *cc; + BUG_ON(sizeof(struct bbr) > sizeof(cc->priv)); +} + +INITCALL0(STG_REGISTER, bbr_check);