xref: /freebsd/sys/netinet/cc/cc_cdg.c (revision e9e8876a4d6afc1ad5315faaa191b25121a813d7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2009-2013
5  * 	Swinburne University of Technology, Melbourne, Australia
6  * All rights reserved.
7  *
8  * This software was developed at the Centre for Advanced Internet
9  * Architectures, Swinburne University of Technology, by David Hayes, made
10  * possible in part by a gift from The Cisco University Research Program Fund,
11  * a corporate advised fund of Silicon Valley Community Foundation. Development
12  * and testing were further assisted by a grant from the FreeBSD Foundation.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 /*
37  * CAIA Delay-Gradient (CDG) congestion control algorithm
38  *
39  * An implementation of the delay-gradient congestion control algorithm proposed
40  * in the following paper:
41  *
42  * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay
43  * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011.
44  *
45  * Developed as part of the NewTCP research project at Swinburne University of
46  * Technology's Centre for Advanced Internet Architectures, Melbourne,
47  * Australia. More details are available at:
48  *   http://caia.swin.edu.au/urp/newtcp/
49  */
50 
51 #include <sys/cdefs.h>
52 __FBSDID("$FreeBSD$");
53 
54 #include <sys/param.h>
55 #include <sys/hhook.h>
56 #include <sys/kernel.h>
57 #include <sys/khelp.h>
58 #include <sys/limits.h>
59 #include <sys/lock.h>
60 #include <sys/malloc.h>
61 #include <sys/module.h>
62 #include <sys/queue.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sysctl.h>
66 #include <sys/systm.h>
67 
68 #include <net/vnet.h>
69 
70 #include <net/route.h>
71 #include <net/route/nhop.h>
72 
73 #include <netinet/in_pcb.h>
74 #include <netinet/tcp.h>
75 #include <netinet/tcp_seq.h>
76 #include <netinet/tcp_timer.h>
77 #include <netinet/tcp_var.h>
78 #include <netinet/cc/cc.h>
79 #include <netinet/cc/cc_module.h>
80 
81 #include <netinet/khelp/h_ertt.h>
82 
83 #include <vm/uma.h>
84 
/* Version string exported through the read-only "version" sysctl. */
#define	CDG_VERSION "0.1"

/* Private delay-gradient induced congestion control signal. */
#define	CC_CDG_DELAY 0x01000000

/* NewReno window deflation factor on loss (as a percentage). */
#define	RENO_BETA 50

/*
 * Queue states.
 *
 * cdg_cong_signal() compares the stored state numerically against
 * CDG_Q_FULL, so the relative ordering of these values is significant.
 */
#define	CDG_Q_EMPTY	1
#define	CDG_Q_RISING	2
#define	CDG_Q_FALLING	3
#define	CDG_Q_FULL	4
#define	CDG_Q_UNKNOWN	9999

/* Number of bit shifts used in probexp lookup table. */
#define	EXP_PREC 15

/* Largest gradient represented in probexp lookup table. */
#define	MAXGRAD 5

/*
 * Delay Precision Enhance - number of bit shifts used for qtrend related
 * integer arithmetic precision.
 */
#define	D_P_E 7
111 
/*
 * One entry of a moving-average sample queue: a single RTT difference (as
 * passed to calc_moving_average()) plus its STAILQ linkage.  Entries are
 * allocated from qdiffsample_zone.
 */
struct qdiff_sample {
	/* the stored RTT difference sample */
	long qdiff;
	/* linkage within qdiffmin_q or qdiffmax_q */
	STAILQ_ENTRY(qdiff_sample) qdiff_lnk;
};
116 
/* Per-connection CDG state, reached through ccv->cc_data. */
struct cdg {
	/*
	 * gradient estimate of the per-period maximum rtt, carrying D_P_E
	 * fractional bits (a moving average unless sample_q_size is 0)
	 */
	long max_qtrend;
	/* as max_qtrend, but for the per-period minimum rtt */
	long min_qtrend;
	/* samples backing the min_qtrend moving average */
	STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q;
	/* samples backing the max_qtrend moving average */
	STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q;
	/* segments added per rtt in congestion avoidance when alpha_inc != 0 */
	long window_incr;
	/* rttcount for window increase when in congestion avoidance */
	long rtt_count;
	/* maximum measured rtt within an rtt period */
	int maxrtt_in_rtt;
	/* maximum measured rtt within prev rtt period */
	int maxrtt_in_prevrtt;
	/* minimum measured rtt within an rtt period */
	int minrtt_in_rtt;
	/* minimum measured rtt within prev rtt period */
	int minrtt_in_prevrtt;
	/* consecutive congestion episode counter */
	uint32_t consec_cong_cnt;
	/* when tracking a new reno type loss window */
	uint32_t shadow_w;
	/* maximum number of samples in the moving average queue */
	int sample_q_size;
	/* number of samples in the moving average queue */
	int num_samples;
	/* estimate of the queue state of the path (one of CDG_Q_*) */
	int queue_state;
};
144 
/*
 * Lookup table for:
 *   (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments
 *
 * That gives MAXGRAD * 2^7 + 1 = 641 entries.  prob_backoff() indexes the
 * table with a gradient that has been divided by the exp_backoff_scale
 * tunable, and multiplies the entry up to the range of random().
 *
 * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase
 * gradients.
 */
static const int probexp[641] = {
   10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624,
   3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846,
   7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713,
   9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939,
   12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951,
   14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768,
   15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410,
   17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893,
   19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233,
   20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444,
   21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537,
   22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525,
   23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418,
   24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224,
   25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953,
   26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611,
   26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206,
   27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743,
   27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228,
   28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667,
   28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063,
   29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421,
   29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744,
   29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036,
   30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300,
   30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538,
   30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753,
   30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948,
   30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124,
   31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283,
   31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426,
   31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556,
   31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673,
   31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778,
   31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874,
   31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960,
   31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038,
   32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109,
   32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173,
   32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230,
   32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282,
   32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329,
   32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371,
   32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410,
   32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444,
   32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476,
   32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504,
   32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529,
   32531,32533,32535,32537,32538,32540,32542,32544,32545,32547};
202 
/* UMA zone from which struct qdiff_sample entries are allocated. */
static uma_zone_t qdiffsample_zone;
/* Khelp id of the "ertt" helper, looked up in cdg_mod_init(). */
static int ertt_id;

/*
 * Per-vnet tunables.  Their defaults are assigned in cdg_init_vnet() and
 * each is exported under the net.inet.tcp.cc.cdg sysctl node.
 */
VNET_DEFINE_STATIC(uint32_t, cdg_alpha_inc);
VNET_DEFINE_STATIC(uint32_t, cdg_beta_delay);
VNET_DEFINE_STATIC(uint32_t, cdg_beta_loss);
VNET_DEFINE_STATIC(uint32_t, cdg_smoothing_factor);
VNET_DEFINE_STATIC(uint32_t, cdg_exp_backoff_scale);
VNET_DEFINE_STATIC(uint32_t, cdg_consec_cong);
VNET_DEFINE_STATIC(uint32_t, cdg_hold_backoff);
/* Accessors resolving each tunable in the current vnet. */
#define	V_cdg_alpha_inc		VNET(cdg_alpha_inc)
#define	V_cdg_beta_delay	VNET(cdg_beta_delay)
#define	V_cdg_beta_loss		VNET(cdg_beta_loss)
#define	V_cdg_smoothing_factor	VNET(cdg_smoothing_factor)
#define	V_cdg_exp_backoff_scale	VNET(cdg_exp_backoff_scale)
#define	V_cdg_consec_cong	VNET(cdg_consec_cong)
#define	V_cdg_hold_backoff	VNET(cdg_hold_backoff)
220 
/* Function prototypes. */

/* Module load/unload and private data sizing. */
static int cdg_mod_init(void);
static int cdg_mod_destroy(void);
static size_t cdg_data_sz(void);

/* Per-connection control block lifecycle. */
static int cdg_cb_init(struct cc_var *ccv, void *ptr);
static void cdg_cb_destroy(struct cc_var *ccv);
static void cdg_conn_init(struct cc_var *ccv);

/* Per-connection event hooks. */
static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type);
static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type);
230 
231 struct cc_algo cdg_cc_algo = {
232 	.name = "cdg",
233 	.mod_init = cdg_mod_init,
234 	.ack_received = cdg_ack_received,
235 	.cb_destroy = cdg_cb_destroy,
236 	.cb_init = cdg_cb_init,
237 	.conn_init = cdg_conn_init,
238 	.cong_signal = cdg_cong_signal,
239 	.mod_destroy = cdg_mod_destroy,
240 	.cc_data_sz = cdg_data_sz,
241 	.post_recovery = newreno_cc_post_recovery,
242 	.after_idle = newreno_cc_after_idle,
243 };
244 
245 /* Vnet created and being initialised. */
246 static void
247 cdg_init_vnet(const void *unused __unused)
248 {
249 
250 	V_cdg_alpha_inc = 0;
251 	V_cdg_beta_delay = 70;
252 	V_cdg_beta_loss = 50;
253 	V_cdg_smoothing_factor = 8;
254 	V_cdg_exp_backoff_scale = 3;
255 	V_cdg_consec_cong = 5;
256 	V_cdg_hold_backoff = 5;
257 }
258 
259 static int
260 cdg_mod_init(void)
261 {
262 	VNET_ITERATOR_DECL(v);
263 
264 	ertt_id = khelp_get_id("ertt");
265 	if (ertt_id <= 0)
266 		return (EINVAL);
267 
268 	qdiffsample_zone = uma_zcreate("cdg_qdiffsample",
269 	    sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0);
270 
271 	VNET_LIST_RLOCK();
272 	VNET_FOREACH(v) {
273 		CURVNET_SET(v);
274 		cdg_init_vnet(NULL);
275 		CURVNET_RESTORE();
276 	}
277 	VNET_LIST_RUNLOCK();
278 	return (0);
279 }
280 
281 static int
282 cdg_mod_destroy(void)
283 {
284 
285 	uma_zdestroy(qdiffsample_zone);
286 	return (0);
287 }
288 
289 static size_t
290 cdg_data_sz(void)
291 {
292 	return (sizeof(struct cdg));
293 }
294 
295 static int
296 cdg_cb_init(struct cc_var *ccv, void *ptr)
297 {
298 	struct cdg *cdg_data;
299 
300 	INP_WLOCK_ASSERT(ccv->ccvc.tcp->t_inpcb);
301 	if (ptr == NULL) {
302 		cdg_data = malloc(sizeof(struct cdg), M_CC_MEM, M_NOWAIT);
303 		if (cdg_data == NULL)
304 			return (ENOMEM);
305 	} else {
306 		cdg_data = ptr;
307 	}
308 	cdg_data->shadow_w = 0;
309 	cdg_data->max_qtrend = 0;
310 	cdg_data->min_qtrend = 0;
311 	cdg_data->queue_state = CDG_Q_UNKNOWN;
312 	cdg_data->maxrtt_in_rtt = 0;
313 	cdg_data->maxrtt_in_prevrtt = 0;
314 	cdg_data->minrtt_in_rtt = INT_MAX;
315 	cdg_data->minrtt_in_prevrtt = 0;
316 	cdg_data->window_incr = 0;
317 	cdg_data->rtt_count = 0;
318 	cdg_data->consec_cong_cnt = 0;
319 	cdg_data->sample_q_size = V_cdg_smoothing_factor;
320 	cdg_data->num_samples = 0;
321 	STAILQ_INIT(&cdg_data->qdiffmin_q);
322 	STAILQ_INIT(&cdg_data->qdiffmax_q);
323 
324 	ccv->cc_data = cdg_data;
325 
326 	return (0);
327 }
328 
329 static void
330 cdg_conn_init(struct cc_var *ccv)
331 {
332 	struct cdg *cdg_data = ccv->cc_data;
333 
334 	/*
335 	 * Initialise the shadow_cwnd in case we are competing with loss based
336 	 * flows from the start
337 	 */
338 	cdg_data->shadow_w = CCV(ccv, snd_cwnd);
339 }
340 
341 static void
342 cdg_cb_destroy(struct cc_var *ccv)
343 {
344 	struct cdg *cdg_data;
345 	struct qdiff_sample *qds, *qds_n;
346 
347 	cdg_data = ccv->cc_data;
348 
349 	qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
350 	while (qds != NULL) {
351 		qds_n = STAILQ_NEXT(qds, qdiff_lnk);
352 		uma_zfree(qdiffsample_zone,qds);
353 		qds = qds_n;
354 	}
355 
356 	qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
357 	while (qds != NULL) {
358 		qds_n = STAILQ_NEXT(qds, qdiff_lnk);
359 		uma_zfree(qdiffsample_zone,qds);
360 		qds = qds_n;
361 	}
362 
363 	free(ccv->cc_data, M_CC_MEM);
364 }
365 
366 static int
367 cdg_beta_handler(SYSCTL_HANDLER_ARGS)
368 {
369 	int error;
370 	uint32_t new;
371 
372 	new = *(uint32_t *)arg1;
373 	error = sysctl_handle_int(oidp, &new, 0, req);
374 	if (error == 0 && req->newptr != NULL) {
375 		if (new == 0 || new > 100)
376 			error = EINVAL;
377 		else
378 			*(uint32_t *)arg1 = new;
379 	}
380 
381 	return (error);
382 }
383 
384 static int
385 cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS)
386 {
387 	int error;
388 	uint32_t new;
389 
390 	new = *(uint32_t *)arg1;
391 	error = sysctl_handle_int(oidp, &new, 0, req);
392 	if (error == 0 && req->newptr != NULL) {
393 		if (new < 1)
394 			error = EINVAL;
395 		else
396 			*(uint32_t *)arg1 = new;
397 	}
398 
399 	return (error);
400 }
401 
402 static inline uint32_t
403 cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta)
404 {
405 
406 	return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100);
407 }
408 
409 /*
410  * Window increase function
411  * This window increase function is independent of the initial window size
412  * to ensure small window flows are not discriminated against (i.e. fairness).
413  * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for
414  * the next alpha_inc rtts, etc.
415  */
416 static void
417 cdg_window_increase(struct cc_var *ccv, int new_measurement)
418 {
419 	struct cdg *cdg_data;
420 	int incr, s_w_incr;
421 
422 	cdg_data = ccv->cc_data;
423 	incr = s_w_incr = 0;
424 
425 	if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
426 		/* Slow start. */
427 		incr = CCV(ccv, t_maxseg);
428 		s_w_incr = incr;
429 		cdg_data->window_incr = cdg_data->rtt_count = 0;
430 	} else {
431 		/* Congestion avoidance. */
432 		if (new_measurement) {
433 			s_w_incr = CCV(ccv, t_maxseg);
434 			if (V_cdg_alpha_inc == 0) {
435 				incr = CCV(ccv, t_maxseg);
436 			} else {
437 				if (++cdg_data->rtt_count >= V_cdg_alpha_inc) {
438 					cdg_data->window_incr++;
439 					cdg_data->rtt_count = 0;
440 				}
441 				incr = CCV(ccv, t_maxseg) *
442 				    cdg_data->window_incr;
443 			}
444 		}
445 	}
446 
447 	if (cdg_data->shadow_w > 0)
448 		cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr,
449 		    TCP_MAXWIN << CCV(ccv, snd_scale));
450 
451 	CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr,
452 	    TCP_MAXWIN << CCV(ccv, snd_scale));
453 }
454 
455 static void
456 cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type)
457 {
458 	struct cdg *cdg_data = ccv->cc_data;
459 
460 	switch(signal_type) {
461 	case CC_CDG_DELAY:
462 		CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv,
463 		    CCV(ccv, snd_cwnd), V_cdg_beta_delay);
464 		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
465 		CCV(ccv, snd_recover) = CCV(ccv, snd_max);
466 		cdg_data->window_incr = cdg_data->rtt_count = 0;
467 		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
468 		break;
469 	case CC_NDUPACK:
470 		/*
471 		 * If already responding to congestion OR we have guessed no
472 		 * queue in the path is full.
473 		 */
474 		if (IN_CONGRECOVERY(CCV(ccv, t_flags)) ||
475 		    cdg_data->queue_state < CDG_Q_FULL) {
476 			CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
477 			CCV(ccv, snd_recover) = CCV(ccv, snd_max);
478 		} else {
479 			/*
480 			 * Loss is likely to be congestion related. We have
481 			 * inferred a queue full state, so have shadow window
482 			 * react to loss as NewReno would.
483 			 */
484 			if (cdg_data->shadow_w > 0)
485 				cdg_data->shadow_w = cdg_window_decrease(ccv,
486 				    cdg_data->shadow_w, RENO_BETA);
487 
488 			CCV(ccv, snd_ssthresh) = max(cdg_data->shadow_w,
489 			    cdg_window_decrease(ccv, CCV(ccv, snd_cwnd),
490 			    V_cdg_beta_loss));
491 
492 			cdg_data->window_incr = cdg_data->rtt_count = 0;
493 		}
494 		ENTER_RECOVERY(CCV(ccv, t_flags));
495 		break;
496 	default:
497 		newreno_cc_cong_signal(ccv, signal_type);
498 		break;
499 	}
500 }
501 
502 /*
503  * Using a negative exponential probabilistic backoff so that sources with
504  * varying RTTs which share the same link will, on average, have the same
505  * probability of backoff over time.
506  *
507  * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where
508  * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff.
509  */
510 static inline int
511 prob_backoff(long qtrend)
512 {
513 	int backoff, idx, p;
514 
515 	backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E));
516 
517 	if (!backoff) {
518 		if (V_cdg_exp_backoff_scale > 1)
519 			idx = (qtrend + V_cdg_exp_backoff_scale / 2) /
520 			    V_cdg_exp_backoff_scale;
521 		else
522 			idx = qtrend;
523 
524 		/* Backoff probability proportional to rate of queue growth. */
525 		p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx];
526 		backoff = (random() < p);
527 	}
528 
529 	return (backoff);
530 }
531 
532 static inline void
533 calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min)
534 {
535 	struct qdiff_sample *qds;
536 
537 	++cdg_data->num_samples;
538 	if (cdg_data->num_samples > cdg_data->sample_q_size) {
539 		/* Minimum RTT. */
540 		qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
541 		cdg_data->min_qtrend =  cdg_data->min_qtrend +
542 		    (qdiff_min - qds->qdiff) / cdg_data->sample_q_size;
543 		STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk);
544 		qds->qdiff = qdiff_min;
545 		STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk);
546 
547 		/* Maximum RTT. */
548 		qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
549 		cdg_data->max_qtrend =  cdg_data->max_qtrend +
550 		    (qdiff_max - qds->qdiff) / cdg_data->sample_q_size;
551 		STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk);
552 		qds->qdiff = qdiff_max;
553 		STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk);
554 		--cdg_data->num_samples;
555 	} else {
556 		qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
557 		if (qds != NULL) {
558 			cdg_data->min_qtrend = cdg_data->min_qtrend +
559 			    qdiff_min / cdg_data->sample_q_size;
560 			qds->qdiff = qdiff_min;
561 			STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds,
562 			    qdiff_lnk);
563 		}
564 
565 		qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
566 		if (qds) {
567 			cdg_data->max_qtrend = cdg_data->max_qtrend +
568 			    qdiff_max / cdg_data->sample_q_size;
569 			qds->qdiff = qdiff_max;
570 			STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds,
571 			    qdiff_lnk);
572 		}
573 	}
574 }
575 
576 static void
577 cdg_ack_received(struct cc_var *ccv, uint16_t ack_type)
578 {
579 	struct cdg *cdg_data;
580 	struct ertt *e_t;
581 	long qdiff_max, qdiff_min;
582 	int congestion, new_measurement, slowstart;
583 
584 	cdg_data = ccv->cc_data;
585 	e_t = (struct ertt *)khelp_get_osd(CCV(ccv, osd), ertt_id);
586 	new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
587 	congestion = 0;
588 	cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt);
589 	cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt);
590 
591 	if (new_measurement) {
592 		slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh));
593 		/*
594 		 * Update smoothed gradient measurements. Since we are only
595 		 * using one measurement per RTT, use max or min rtt_in_rtt.
596 		 * This is also less noisy than a sample RTT measurement. Max
597 		 * RTT measurements can have trouble due to OS issues.
598 		 */
599 		if (cdg_data->maxrtt_in_prevrtt) {
600 			qdiff_max = ((long)(cdg_data->maxrtt_in_rtt -
601 			    cdg_data->maxrtt_in_prevrtt) << D_P_E );
602 			qdiff_min = ((long)(cdg_data->minrtt_in_rtt -
603 			    cdg_data->minrtt_in_prevrtt) << D_P_E );
604 
605 			if (cdg_data->sample_q_size == 0) {
606 				cdg_data->max_qtrend = qdiff_max;
607 				cdg_data->min_qtrend = qdiff_min;
608 			} else
609 				calc_moving_average(cdg_data, qdiff_max, qdiff_min);
610 
611 			/* Probabilistic backoff with respect to gradient. */
612 			if (slowstart && qdiff_min > 0)
613 				congestion = prob_backoff(qdiff_min);
614 			else if (cdg_data->min_qtrend > 0)
615 				congestion = prob_backoff(cdg_data->min_qtrend);
616 			else if (slowstart && qdiff_max > 0)
617 				congestion = prob_backoff(qdiff_max);
618 			else if (cdg_data->max_qtrend > 0)
619 				congestion = prob_backoff(cdg_data->max_qtrend);
620 
621 			/* Update estimate of queue state. */
622 			if (cdg_data->min_qtrend > 0 &&
623 			    cdg_data->max_qtrend <= 0) {
624 				cdg_data->queue_state = CDG_Q_FULL;
625 			} else if (cdg_data->min_qtrend >= 0 &&
626 			    cdg_data->max_qtrend < 0) {
627 				cdg_data->queue_state = CDG_Q_EMPTY;
628 				cdg_data->shadow_w = 0;
629 			} else if (cdg_data->min_qtrend > 0 &&
630 			    cdg_data->max_qtrend > 0) {
631 				cdg_data->queue_state = CDG_Q_RISING;
632 			} else if (cdg_data->min_qtrend < 0 &&
633 			    cdg_data->max_qtrend < 0) {
634 				cdg_data->queue_state = CDG_Q_FALLING;
635 			}
636 
637 			if (cdg_data->min_qtrend < 0 ||
638 			    cdg_data->max_qtrend < 0)
639 				cdg_data->consec_cong_cnt = 0;
640 		}
641 
642 		cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt;
643 		cdg_data->minrtt_in_rtt = INT_MAX;
644 		cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt;
645 		cdg_data->maxrtt_in_rtt = 0;
646 		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
647 	}
648 
649 	if (congestion) {
650 		cdg_data->consec_cong_cnt++;
651 		if (!IN_RECOVERY(CCV(ccv, t_flags))) {
652 			if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong)
653 				cdg_cong_signal(ccv, CC_CDG_DELAY);
654 			else
655 				/*
656 				 * We have been backing off but the queue is not
657 				 * falling. Assume we are competing with
658 				 * loss-based flows and don't back off for the
659 				 * next V_cdg_hold_backoff RTT periods.
660 				 */
661 				if (cdg_data->consec_cong_cnt >=
662 				    V_cdg_consec_cong + V_cdg_hold_backoff)
663 					cdg_data->consec_cong_cnt = 0;
664 
665 			/* Won't see effect until 2nd RTT. */
666 			cdg_data->maxrtt_in_prevrtt = 0;
667 			/*
668 			 * Resync shadow window in case we are competing with a
669 			 * loss based flow
670 			 */
671 			cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd),
672 			    cdg_data->shadow_w);
673 		}
674 	} else if (ack_type == CC_ACK)
675 		cdg_window_increase(ccv, new_measurement);
676 }
677 
678 /* When a vnet is created and being initialised, init the per-stack CDG vars. */
679 VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
680     cdg_init_vnet, NULL);
681 
682 SYSCTL_DECL(_net_inet_tcp_cc_cdg);
683 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
684     "CAIA delay-gradient congestion control related settings");
685 
686 SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version,
687     CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1,
688     "Current algorithm/implementation version number");
689 
690 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc,
691     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0,
692     "Increment the window increase factor alpha by 1 MSS segment every "
693     "alpha_inc RTTs during congestion avoidance mode.");
694 
695 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay,
696     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
697     &VNET_NAME(cdg_beta_delay), 70, &cdg_beta_handler, "IU",
698     "Delay-based window decrease factor as a percentage "
699     "(on delay-based backoff, w = w * beta_delay / 100)");
700 
701 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss,
702     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
703     &VNET_NAME(cdg_beta_loss), 50, &cdg_beta_handler, "IU",
704     "Loss-based window decrease factor as a percentage "
705     "(on loss-based backoff, w = w * beta_loss / 100)");
706 
707 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale,
708     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
709     &VNET_NAME(cdg_exp_backoff_scale), 2, &cdg_exp_backoff_scale_handler, "IU",
710     "Scaling parameter for the probabilistic exponential backoff");
711 
712 SYSCTL_UINT(_net_inet_tcp_cc_cdg,  OID_AUTO, smoothing_factor,
713     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8,
714     "Number of samples used for moving average smoothing (0 = no smoothing)");
715 
716 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong,
717     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5,
718     "Number of consecutive delay-gradient based congestion episodes which will "
719     "trigger loss based CC compatibility");
720 
721 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff,
722     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5,
723     "Number of consecutive delay-gradient based congestion episodes to hold "
724     "the window backoff for loss based CC compatibility");
725 
726 DECLARE_CC_MODULE(cdg, &cdg_cc_algo);
727 MODULE_VERSION(cdg, 2);
728 MODULE_DEPEND(cdg, ertt, 1, 1, 1);
729