xref: /freebsd/sys/netinet/cc/cc_dctcp.c (revision 6871d4882591c9a8fcab24d084c93f0a2972e1af)
1 /*-
2  * Copyright (c) 2007-2008
3  *	Swinburne University of Technology, Melbourne, Australia
4  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
5  * Copyright (c) 2014 Midori Kato <katoon@sfc.wide.ad.jp>
6  * Copyright (c) 2014 The FreeBSD Foundation
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
/*
 * An implementation of the DCTCP algorithm for FreeBSD, based on
 * "Data Center TCP (DCTCP)" by M. Alizadeh, A. Greenberg, D. A. Maltz,
 * J. Padhye, P. Patel, B. Prabhakar, S. Sengupta, and M. Sridharan,
 * in ACM Conference on SIGCOMM 2010, New York, USA.
 * Originally released as a contribution of the Microsoft Research project.
 */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include <sys/param.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/module.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/systm.h>
50 
51 #include <net/vnet.h>
52 
53 #include <netinet/tcp.h>
54 #include <netinet/tcp_seq.h>
55 #include <netinet/tcp_var.h>
56 #include <netinet/cc/cc.h>
57 #include <netinet/cc/cc_module.h>
58 
59 #define MAX_ALPHA_VALUE 1024
60 VNET_DEFINE_STATIC(uint32_t, dctcp_alpha) = 0;
61 #define V_dctcp_alpha	    VNET(dctcp_alpha)
62 VNET_DEFINE_STATIC(uint32_t, dctcp_shift_g) = 4;
63 #define	V_dctcp_shift_g	    VNET(dctcp_shift_g)
64 VNET_DEFINE_STATIC(uint32_t, dctcp_slowstart) = 0;
65 #define	V_dctcp_slowstart   VNET(dctcp_slowstart)
66 
/* Per-connection DCTCP state, reachable through ccv->cc_data. */
struct dctcp {
	/* Bytes counted as ECN-marked since the last alpha update. */
	int	bytes_ecn;
	/* Bytes acknowledged since the last alpha update. */
	int	bytes_total;
	/* Estimated fraction of marked bytes, scaled by MAX_ALPHA_VALUE. */
	int	alpha;
	/* Non-zero when the previously received segment carried CE. */
	int	ce_prev;
	/* snd_nxt recorded when the current observation window started. */
	int	save_sndnxt;
	/* Set by the CC_ECN signal; consumed and cleared on the next ACK. */
	int	ece_curr;
	/* ECE state remembered from the previously processed ACK. */
	int	ece_prev;
	/* Number of congestion events counted so far. */
	uint32_t	num_cong_events;
};
77 
78 static MALLOC_DEFINE(M_dctcp, "dctcp data",
79     "Per connection data required for the dctcp algorithm");
80 
/* Hooks installed in dctcp_cc_algo. */
static int	dctcp_cb_init(struct cc_var *ccv);
static void	dctcp_cb_destroy(struct cc_var *ccv);
static void	dctcp_conn_init(struct cc_var *ccv);
static void	dctcp_ack_received(struct cc_var *ccv, uint16_t type);
static void	dctcp_cong_signal(struct cc_var *ccv, uint32_t type);
static void	dctcp_post_recovery(struct cc_var *ccv);
static void	dctcp_after_idle(struct cc_var *ccv);
static void	dctcp_ecnpkt_handler(struct cc_var *ccv);

/* Internal helper shared by several of the hooks above. */
static void	dctcp_update_alpha(struct cc_var *ccv);
90 
91 struct cc_algo dctcp_cc_algo = {
92 	.name = "dctcp",
93 	.ack_received = dctcp_ack_received,
94 	.cb_destroy = dctcp_cb_destroy,
95 	.cb_init = dctcp_cb_init,
96 	.cong_signal = dctcp_cong_signal,
97 	.conn_init = dctcp_conn_init,
98 	.post_recovery = dctcp_post_recovery,
99 	.ecnpkt_handler = dctcp_ecnpkt_handler,
100 	.after_idle = dctcp_after_idle,
101 };
102 
103 static void
104 dctcp_ack_received(struct cc_var *ccv, uint16_t type)
105 {
106 	struct dctcp *dctcp_data;
107 	int bytes_acked = 0;
108 
109 	dctcp_data = ccv->cc_data;
110 
111 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
112 		/*
113 		 * DCTCP doesn't treat receipt of ECN marked packet as a
114 		 * congestion event. Thus, DCTCP always executes the ACK
115 		 * processing out of congestion recovery.
116 		 */
117 		if (IN_CONGRECOVERY(CCV(ccv, t_flags))) {
118 			EXIT_CONGRECOVERY(CCV(ccv, t_flags));
119 			newreno_cc_algo.ack_received(ccv, type);
120 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
121 		} else
122 			newreno_cc_algo.ack_received(ccv, type);
123 
124 		if (type == CC_DUPACK)
125 			bytes_acked = CCV(ccv, t_maxseg);
126 
127 		if (type == CC_ACK)
128 			bytes_acked = ccv->bytes_this_ack;
129 
130 		/* Update total bytes. */
131 		dctcp_data->bytes_total += bytes_acked;
132 
133 		/* Update total marked bytes. */
134 		if (dctcp_data->ece_curr) {
135 			if (!dctcp_data->ece_prev
136 			    && bytes_acked > CCV(ccv, t_maxseg)) {
137 				dctcp_data->bytes_ecn +=
138 				    (bytes_acked - CCV(ccv, t_maxseg));
139 			} else
140 				dctcp_data->bytes_ecn += bytes_acked;
141 			dctcp_data->ece_prev = 1;
142 		} else {
143 			if (dctcp_data->ece_prev
144 			    && bytes_acked > CCV(ccv, t_maxseg))
145 				dctcp_data->bytes_ecn += CCV(ccv, t_maxseg);
146 			dctcp_data->ece_prev = 0;
147 		}
148 		dctcp_data->ece_curr = 0;
149 
150 		/*
151 		 * Update the fraction of marked bytes at the end of
152 		 * current window size.
153 		 */
154 		if ((IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
155 		    SEQ_GEQ(ccv->curack, CCV(ccv, snd_recover))) ||
156 		    (!IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
157 		    SEQ_GT(ccv->curack, dctcp_data->save_sndnxt)))
158 			dctcp_update_alpha(ccv);
159 	} else
160 		newreno_cc_algo.ack_received(ccv, type);
161 }
162 
163 static void
164 dctcp_after_idle(struct cc_var *ccv)
165 {
166 	struct dctcp *dctcp_data;
167 
168 	dctcp_data = ccv->cc_data;
169 
170 	/* Initialize internal parameters after idle time */
171 	dctcp_data->bytes_ecn = 0;
172 	dctcp_data->bytes_total = 0;
173 	dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
174 	dctcp_data->alpha = V_dctcp_alpha;
175 	dctcp_data->ece_curr = 0;
176 	dctcp_data->ece_prev = 0;
177 	dctcp_data->num_cong_events = 0;
178 
179 	dctcp_cc_algo.after_idle = newreno_cc_algo.after_idle;
180 }
181 
182 static void
183 dctcp_cb_destroy(struct cc_var *ccv)
184 {
185 	free(ccv->cc_data, M_dctcp);
186 }
187 
188 static int
189 dctcp_cb_init(struct cc_var *ccv)
190 {
191 	struct dctcp *dctcp_data;
192 
193 	dctcp_data = malloc(sizeof(struct dctcp), M_dctcp, M_NOWAIT|M_ZERO);
194 
195 	if (dctcp_data == NULL)
196 		return (ENOMEM);
197 
198 	/* Initialize some key variables with sensible defaults. */
199 	dctcp_data->bytes_ecn = 0;
200 	dctcp_data->bytes_total = 0;
201 	/*
202 	 * When alpha is set to 0 in the beginning, DCTCP sender transfers as
203 	 * much data as possible until the value converges which may expand the
204 	 * queueing delay at the switch. When alpha is set to 1, queueing delay
205 	 * is kept small.
206 	 * Throughput-sensitive applications should have alpha = 0
207 	 * Latency-sensitive applications should have alpha = 1
208 	 *
209 	 * Note: DCTCP draft suggests initial alpha to be 1 but we've decided to
210 	 * keep it 0 as default.
211 	 */
212 	dctcp_data->alpha = V_dctcp_alpha;
213 	dctcp_data->save_sndnxt = 0;
214 	dctcp_data->ce_prev = 0;
215 	dctcp_data->ece_curr = 0;
216 	dctcp_data->ece_prev = 0;
217 	dctcp_data->num_cong_events = 0;
218 
219 	ccv->cc_data = dctcp_data;
220 	return (0);
221 }
222 
223 /*
224  * Perform any necessary tasks before we enter congestion recovery.
225  */
226 static void
227 dctcp_cong_signal(struct cc_var *ccv, uint32_t type)
228 {
229 	struct dctcp *dctcp_data;
230 	u_int win, mss;
231 
232 	dctcp_data = ccv->cc_data;
233 	win = CCV(ccv, snd_cwnd);
234 	mss = CCV(ccv, t_maxseg);
235 
236 	switch (type) {
237 	case CC_NDUPACK:
238 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
239 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
240 				CCV(ccv, snd_ssthresh) = mss *
241 				    max(win / 2 / mss, 2);
242 				dctcp_data->num_cong_events++;
243 			} else {
244 				/* cwnd has already updated as congestion
245 				 * recovery. Reverse cwnd value using
246 				 * snd_cwnd_prev and recalculate snd_ssthresh
247 				 */
248 				win = CCV(ccv, snd_cwnd_prev);
249 				CCV(ccv, snd_ssthresh) =
250 				    max(win / 2 / mss, 2) * mss;
251 			}
252 			ENTER_RECOVERY(CCV(ccv, t_flags));
253 		}
254 		break;
255 	case CC_ECN:
256 		/*
257 		 * Save current snd_cwnd when the host encounters both
258 		 * congestion recovery and fast recovery.
259 		 */
260 		CCV(ccv, snd_cwnd_prev) = win;
261 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
262 			if (V_dctcp_slowstart &&
263 			    dctcp_data->num_cong_events++ == 0) {
264 				CCV(ccv, snd_ssthresh) =
265 				    mss * max(win / 2 / mss, 2);
266 				dctcp_data->alpha = MAX_ALPHA_VALUE;
267 				dctcp_data->bytes_ecn = 0;
268 				dctcp_data->bytes_total = 0;
269 				dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
270 			} else
271 				CCV(ccv, snd_ssthresh) = max((win - ((win *
272 				    dctcp_data->alpha) >> 11)) / mss, 2) * mss;
273 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
274 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
275 		}
276 		dctcp_data->ece_curr = 1;
277 		break;
278 	case CC_RTO:
279 		if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
280 			CCV(ccv, t_flags) |= TF_ECN_SND_CWR;
281 			dctcp_update_alpha(ccv);
282 			dctcp_data->save_sndnxt += CCV(ccv, t_maxseg);
283 			dctcp_data->num_cong_events++;
284 		}
285 		break;
286 	}
287 }
288 
289 static void
290 dctcp_conn_init(struct cc_var *ccv)
291 {
292 	struct dctcp *dctcp_data;
293 
294 	dctcp_data = ccv->cc_data;
295 
296 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT)
297 		dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
298 }
299 
300 /*
301  * Perform any necessary tasks before we exit congestion recovery.
302  */
303 static void
304 dctcp_post_recovery(struct cc_var *ccv)
305 {
306 	dctcp_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
307 
308 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT)
309 		dctcp_update_alpha(ccv);
310 }
311 
312 /*
313  * Execute an additional ECN processing using ECN field in IP header and the CWR
314  * bit in TCP header.
315  *
316  * delay_ack == 0 - Delayed ACK disabled
317  * delay_ack == 1 - Delayed ACK enabled
318  */
319 
320 static void
321 dctcp_ecnpkt_handler(struct cc_var *ccv)
322 {
323 	struct dctcp *dctcp_data;
324 	uint32_t ccflag;
325 	int delay_ack;
326 
327 	dctcp_data = ccv->cc_data;
328 	ccflag = ccv->flags;
329 	delay_ack = 1;
330 
331 	/*
332 	 * DCTCP responses an ACK immediately when the CE state
333 	 * in between this segment and the last segment is not same.
334 	 */
335 	if (ccflag & CCF_IPHDR_CE) {
336 		if (!dctcp_data->ce_prev && (ccflag & CCF_DELACK))
337 			delay_ack = 0;
338 		dctcp_data->ce_prev = 1;
339 		CCV(ccv, t_flags) |= TF_ECN_SND_ECE;
340 	} else {
341 		if (dctcp_data->ce_prev && (ccflag & CCF_DELACK))
342 			delay_ack = 0;
343 		dctcp_data->ce_prev = 0;
344 		CCV(ccv, t_flags) &= ~TF_ECN_SND_ECE;
345 	}
346 
347 	/* DCTCP sets delayed ack when this segment sets the CWR flag. */
348 	if ((ccflag & CCF_DELACK) && (ccflag & CCF_TCPHDR_CWR))
349 		delay_ack = 1;
350 
351 	if (delay_ack == 0)
352 		ccv->flags |= CCF_ACKNOW;
353 	else
354 		ccv->flags &= ~CCF_ACKNOW;
355 }
356 
357 /*
358  * Update the fraction of marked bytes represented as 'alpha'.
359  * Also initialize several internal parameters at the end of this function.
360  */
361 static void
362 dctcp_update_alpha(struct cc_var *ccv)
363 {
364 	struct dctcp *dctcp_data;
365 	int alpha_prev;
366 
367 	dctcp_data = ccv->cc_data;
368 	alpha_prev = dctcp_data->alpha;
369 	dctcp_data->bytes_total = max(dctcp_data->bytes_total, 1);
370 
371 	/*
372 	 * Update alpha: alpha = (1 - g) * alpha + g * F.
373 	 * Here:
374 	 * g is weight factor
375 	 *	recommaded to be set to 1/16
376 	 *	small g = slow convergence between competitive DCTCP flows
377 	 *	large g = impacts low utilization of bandwidth at switches
378 	 * F is fraction of marked segments in last RTT
379 	 *	updated every RTT
380 	 * Alpha must be round to 0 - MAX_ALPHA_VALUE.
381 	 */
382 	dctcp_data->alpha = min(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
383 	    (dctcp_data->bytes_ecn << (10 - V_dctcp_shift_g)) /
384 	    dctcp_data->bytes_total, MAX_ALPHA_VALUE);
385 
386 	/* Initialize internal parameters for next alpha calculation */
387 	dctcp_data->bytes_ecn = 0;
388 	dctcp_data->bytes_total = 0;
389 	dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
390 }
391 
392 static int
393 dctcp_alpha_handler(SYSCTL_HANDLER_ARGS)
394 {
395 	uint32_t new;
396 	int error;
397 
398 	new = V_dctcp_alpha;
399 	error = sysctl_handle_int(oidp, &new, 0, req);
400 	if (error == 0 && req->newptr != NULL) {
401 		if (new > 1)
402 			error = EINVAL;
403 		else {
404 			if (new > MAX_ALPHA_VALUE)
405 				V_dctcp_alpha = MAX_ALPHA_VALUE;
406 			else
407 				V_dctcp_alpha = new;
408 		}
409 	}
410 
411 	return (error);
412 }
413 
414 static int
415 dctcp_shift_g_handler(SYSCTL_HANDLER_ARGS)
416 {
417 	uint32_t new;
418 	int error;
419 
420 	new = V_dctcp_shift_g;
421 	error = sysctl_handle_int(oidp, &new, 0, req);
422 	if (error == 0 && req->newptr != NULL) {
423 		if (new > 1)
424 			error = EINVAL;
425 		else
426 			V_dctcp_shift_g = new;
427 	}
428 
429 	return (error);
430 }
431 
432 static int
433 dctcp_slowstart_handler(SYSCTL_HANDLER_ARGS)
434 {
435 	uint32_t new;
436 	int error;
437 
438 	new = V_dctcp_slowstart;
439 	error = sysctl_handle_int(oidp, &new, 0, req);
440 	if (error == 0 && req->newptr != NULL) {
441 		if (new > 1)
442 			error = EINVAL;
443 		else
444 			V_dctcp_slowstart = new;
445 	}
446 
447 	return (error);
448 }
449 
450 SYSCTL_DECL(_net_inet_tcp_cc_dctcp);
451 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, dctcp, CTLFLAG_RW, NULL,
452     "dctcp congestion control related settings");
453 
454 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, alpha,
455     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_alpha), 0,
456     &dctcp_alpha_handler,
457     "IU", "dctcp alpha parameter");
458 
459 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, shift_g,
460     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_shift_g), 4,
461     &dctcp_shift_g_handler,
462     "IU", "dctcp shift parameter");
463 
464 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, slowstart,
465     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_slowstart), 0,
466     &dctcp_slowstart_handler,
467     "IU", "half CWND reduction after the first slow start");
468 
469 DECLARE_CC_MODULE(dctcp, &dctcp_cc_algo);
470