xref: /freebsd/sys/netinet/cc/cc_dctcp.c (revision 64807b300fb6711a37fbf34980f7d1bb45c75c8d)
1*64807b30SHiren Panchasara /*-
2*64807b30SHiren Panchasara  * Copyright (c) 2007-2008
3*64807b30SHiren Panchasara  *	Swinburne University of Technology, Melbourne, Australia
4*64807b30SHiren Panchasara  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
5*64807b30SHiren Panchasara  * Copyright (c) 2014 Midori Kato <katoon@sfc.wide.ad.jp>
6*64807b30SHiren Panchasara  * Copyright (c) 2014 The FreeBSD Foundation
7*64807b30SHiren Panchasara  * All rights reserved.
8*64807b30SHiren Panchasara  *
9*64807b30SHiren Panchasara  * Redistribution and use in source and binary forms, with or without
10*64807b30SHiren Panchasara  * modification, are permitted provided that the following conditions
11*64807b30SHiren Panchasara  * are met:
12*64807b30SHiren Panchasara  * 1. Redistributions of source code must retain the above copyright
13*64807b30SHiren Panchasara  *    notice, this list of conditions and the following disclaimer.
14*64807b30SHiren Panchasara  * 2. Redistributions in binary form must reproduce the above copyright
15*64807b30SHiren Panchasara  *    notice, this list of conditions and the following disclaimer in the
16*64807b30SHiren Panchasara  *    documentation and/or other materials provided with the distribution.
17*64807b30SHiren Panchasara  *
18*64807b30SHiren Panchasara  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19*64807b30SHiren Panchasara  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20*64807b30SHiren Panchasara  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21*64807b30SHiren Panchasara  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22*64807b30SHiren Panchasara  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23*64807b30SHiren Panchasara  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24*64807b30SHiren Panchasara  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25*64807b30SHiren Panchasara  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26*64807b30SHiren Panchasara  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27*64807b30SHiren Panchasara  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28*64807b30SHiren Panchasara  * SUCH DAMAGE.
29*64807b30SHiren Panchasara  */
30*64807b30SHiren Panchasara 
31*64807b30SHiren Panchasara /*
32*64807b30SHiren Panchasara  * An implementation of the DCTCP algorithm for FreeBSD, based on
33*64807b30SHiren Panchasara  * "Data Center TCP (DCTCP)" by M. Alizadeh, A. Greenberg, D. A. Maltz,
34*64807b30SHiren Panchasara  * J. Padhye, P. Patel, B. Prabhakar, S. Sengupta, and M. Sridharan.,
35*64807b30SHiren Panchasara  * in ACM Conference on SIGCOMM 2010, New York, USA,
36*64807b30SHiren Panchasara  * Originally released as the contribution of Microsoft Research project.
37*64807b30SHiren Panchasara  */
38*64807b30SHiren Panchasara 
39*64807b30SHiren Panchasara #include <sys/cdefs.h>
40*64807b30SHiren Panchasara __FBSDID("$FreeBSD$");
41*64807b30SHiren Panchasara 
42*64807b30SHiren Panchasara #include <sys/param.h>
43*64807b30SHiren Panchasara #include <sys/kernel.h>
44*64807b30SHiren Panchasara #include <sys/malloc.h>
45*64807b30SHiren Panchasara #include <sys/module.h>
46*64807b30SHiren Panchasara #include <sys/socket.h>
47*64807b30SHiren Panchasara #include <sys/socketvar.h>
48*64807b30SHiren Panchasara #include <sys/sysctl.h>
49*64807b30SHiren Panchasara #include <sys/systm.h>
50*64807b30SHiren Panchasara 
51*64807b30SHiren Panchasara #include <net/vnet.h>
52*64807b30SHiren Panchasara 
53*64807b30SHiren Panchasara #include <netinet/in.h>
54*64807b30SHiren Panchasara #include <netinet/ip.h>
55*64807b30SHiren Panchasara #include <netinet/cc.h>
56*64807b30SHiren Panchasara #include <netinet/tcp_seq.h>
57*64807b30SHiren Panchasara #include <netinet/tcp_var.h>
58*64807b30SHiren Panchasara 
59*64807b30SHiren Panchasara #include <netinet/cc/cc_module.h>
60*64807b30SHiren Panchasara 
61*64807b30SHiren Panchasara #define	CAST_PTR_INT(X)	(*((int*)(X)))
62*64807b30SHiren Panchasara 
63*64807b30SHiren Panchasara #define MAX_ALPHA_VALUE 1024
64*64807b30SHiren Panchasara static VNET_DEFINE(uint32_t, dctcp_alpha) = 0;
65*64807b30SHiren Panchasara #define V_dctcp_alpha	    VNET(dctcp_alpha)
66*64807b30SHiren Panchasara static VNET_DEFINE(uint32_t, dctcp_shift_g) = 4;
67*64807b30SHiren Panchasara #define	V_dctcp_shift_g	    VNET(dctcp_shift_g)
68*64807b30SHiren Panchasara static VNET_DEFINE(uint32_t, dctcp_slowstart) = 0;
69*64807b30SHiren Panchasara #define	V_dctcp_slowstart   VNET(dctcp_slowstart)
70*64807b30SHiren Panchasara 
/*
 * Per-connection DCTCP state.  One instance is allocated by dctcp_cb_init()
 * and stored in the cc_data pointer of the connection's struct cc_var.
 */
struct dctcp {
	/* Marking statistics for the current observation window. */
	int		bytes_ecn;	/* bytes acked with ECE set */
	int		bytes_total;	/* all bytes acked */

	/* Marked fraction, fixed point: 0 .. MAX_ALPHA_VALUE. */
	int		alpha;

	/* Receiver side: was CE set on the previous segment? */
	int		ce_prev;

	/* snd_nxt sampled when the current observation window began. */
	int		save_sndnxt;

	/* Sender side: ECE seen for this ACK / for the previous ACK. */
	int		ece_curr;
	int		ece_prev;

	/* Congestion events counted so far on this connection. */
	uint32_t	num_cong_events;
};
81*64807b30SHiren Panchasara 
82*64807b30SHiren Panchasara static MALLOC_DEFINE(M_dctcp, "dctcp data",
83*64807b30SHiren Panchasara     "Per connection data required for the dctcp algorithm");
84*64807b30SHiren Panchasara 
85*64807b30SHiren Panchasara static void	dctcp_ack_received(struct cc_var *ccv, uint16_t type);
86*64807b30SHiren Panchasara static void	dctcp_after_idle(struct cc_var *ccv);
87*64807b30SHiren Panchasara static void	dctcp_cb_destroy(struct cc_var *ccv);
88*64807b30SHiren Panchasara static int	dctcp_cb_init(struct cc_var *ccv);
89*64807b30SHiren Panchasara static void	dctcp_cong_signal(struct cc_var *ccv, uint32_t type);
90*64807b30SHiren Panchasara static void	dctcp_conn_init(struct cc_var *ccv);
91*64807b30SHiren Panchasara static void	dctcp_post_recovery(struct cc_var *ccv);
92*64807b30SHiren Panchasara static void	dctcp_ecnpkt_handler(struct cc_var *ccv);
93*64807b30SHiren Panchasara static void	dctcp_update_alpha(struct cc_var *ccv);
94*64807b30SHiren Panchasara 
95*64807b30SHiren Panchasara struct cc_algo dctcp_cc_algo = {
96*64807b30SHiren Panchasara 	.name = "dctcp",
97*64807b30SHiren Panchasara 	.ack_received = dctcp_ack_received,
98*64807b30SHiren Panchasara 	.cb_destroy = dctcp_cb_destroy,
99*64807b30SHiren Panchasara 	.cb_init = dctcp_cb_init,
100*64807b30SHiren Panchasara 	.cong_signal = dctcp_cong_signal,
101*64807b30SHiren Panchasara 	.conn_init = dctcp_conn_init,
102*64807b30SHiren Panchasara 	.post_recovery = dctcp_post_recovery,
103*64807b30SHiren Panchasara 	.ecnpkt_handler = dctcp_ecnpkt_handler,
104*64807b30SHiren Panchasara 	.after_idle = dctcp_after_idle,
105*64807b30SHiren Panchasara };
106*64807b30SHiren Panchasara 
107*64807b30SHiren Panchasara static void
108*64807b30SHiren Panchasara dctcp_ack_received(struct cc_var *ccv, uint16_t type)
109*64807b30SHiren Panchasara {
110*64807b30SHiren Panchasara 	struct dctcp *dctcp_data;
111*64807b30SHiren Panchasara 	int bytes_acked = 0;
112*64807b30SHiren Panchasara 
113*64807b30SHiren Panchasara 	dctcp_data = ccv->cc_data;
114*64807b30SHiren Panchasara 
115*64807b30SHiren Panchasara 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
116*64807b30SHiren Panchasara 		/*
117*64807b30SHiren Panchasara 		 * DCTCP doesn't treat receipt of ECN marked packet as a
118*64807b30SHiren Panchasara 		 * congestion event. Thus, DCTCP always executes the ACK
119*64807b30SHiren Panchasara 		 * processing out of congestion recovery.
120*64807b30SHiren Panchasara 		 */
121*64807b30SHiren Panchasara 		if (IN_CONGRECOVERY(CCV(ccv, t_flags))) {
122*64807b30SHiren Panchasara 			EXIT_CONGRECOVERY(CCV(ccv, t_flags));
123*64807b30SHiren Panchasara 			newreno_cc_algo.ack_received(ccv, type);
124*64807b30SHiren Panchasara 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
125*64807b30SHiren Panchasara 		} else
126*64807b30SHiren Panchasara 			newreno_cc_algo.ack_received(ccv, type);
127*64807b30SHiren Panchasara 
128*64807b30SHiren Panchasara 		if (type == CC_DUPACK)
129*64807b30SHiren Panchasara 			bytes_acked = CCV(ccv, t_maxseg);
130*64807b30SHiren Panchasara 
131*64807b30SHiren Panchasara 		if (type == CC_ACK)
132*64807b30SHiren Panchasara 			bytes_acked = ccv->bytes_this_ack;
133*64807b30SHiren Panchasara 
134*64807b30SHiren Panchasara 		/* Update total bytes. */
135*64807b30SHiren Panchasara 		dctcp_data->bytes_total += bytes_acked;
136*64807b30SHiren Panchasara 
137*64807b30SHiren Panchasara 		/* Update total marked bytes. */
138*64807b30SHiren Panchasara 		if (dctcp_data->ece_curr) {
139*64807b30SHiren Panchasara 			if (!dctcp_data->ece_prev
140*64807b30SHiren Panchasara 			    && bytes_acked > CCV(ccv, t_maxseg)) {
141*64807b30SHiren Panchasara 				dctcp_data->bytes_ecn +=
142*64807b30SHiren Panchasara 				    (bytes_acked - CCV(ccv, t_maxseg));
143*64807b30SHiren Panchasara 			} else
144*64807b30SHiren Panchasara 				dctcp_data->bytes_ecn += bytes_acked;
145*64807b30SHiren Panchasara 			dctcp_data->ece_prev = 1;
146*64807b30SHiren Panchasara 		} else {
147*64807b30SHiren Panchasara 			if (dctcp_data->ece_prev
148*64807b30SHiren Panchasara 			    && bytes_acked > CCV(ccv, t_maxseg))
149*64807b30SHiren Panchasara 				dctcp_data->bytes_ecn += CCV(ccv, t_maxseg);
150*64807b30SHiren Panchasara 			dctcp_data->ece_prev = 0;
151*64807b30SHiren Panchasara 		}
152*64807b30SHiren Panchasara 		dctcp_data->ece_curr = 0;
153*64807b30SHiren Panchasara 
154*64807b30SHiren Panchasara 		/*
155*64807b30SHiren Panchasara 		 * Update the fraction of marked bytes at the end of
156*64807b30SHiren Panchasara 		 * current window size.
157*64807b30SHiren Panchasara 		 */
158*64807b30SHiren Panchasara 		if ((IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
159*64807b30SHiren Panchasara 		    SEQ_GEQ(ccv->curack, CCV(ccv, snd_recover))) ||
160*64807b30SHiren Panchasara 		    (!IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
161*64807b30SHiren Panchasara 		    SEQ_GT(ccv->curack, dctcp_data->save_sndnxt)))
162*64807b30SHiren Panchasara 			dctcp_update_alpha(ccv);
163*64807b30SHiren Panchasara 	} else
164*64807b30SHiren Panchasara 		newreno_cc_algo.ack_received(ccv, type);
165*64807b30SHiren Panchasara }
166*64807b30SHiren Panchasara 
167*64807b30SHiren Panchasara static void
168*64807b30SHiren Panchasara dctcp_after_idle(struct cc_var *ccv)
169*64807b30SHiren Panchasara {
170*64807b30SHiren Panchasara 	struct dctcp *dctcp_data;
171*64807b30SHiren Panchasara 
172*64807b30SHiren Panchasara 	dctcp_data = ccv->cc_data;
173*64807b30SHiren Panchasara 
174*64807b30SHiren Panchasara 	/* Initialize internal parameters after idle time */
175*64807b30SHiren Panchasara 	dctcp_data->bytes_ecn = 0;
176*64807b30SHiren Panchasara 	dctcp_data->bytes_total = 0;
177*64807b30SHiren Panchasara 	dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
178*64807b30SHiren Panchasara 	dctcp_data->alpha = V_dctcp_alpha;
179*64807b30SHiren Panchasara 	dctcp_data->ece_curr = 0;
180*64807b30SHiren Panchasara 	dctcp_data->ece_prev = 0;
181*64807b30SHiren Panchasara 	dctcp_data->num_cong_events = 0;
182*64807b30SHiren Panchasara 
183*64807b30SHiren Panchasara 	dctcp_cc_algo.after_idle = newreno_cc_algo.after_idle;
184*64807b30SHiren Panchasara }
185*64807b30SHiren Panchasara 
186*64807b30SHiren Panchasara static void
187*64807b30SHiren Panchasara dctcp_cb_destroy(struct cc_var *ccv)
188*64807b30SHiren Panchasara {
189*64807b30SHiren Panchasara 	if (ccv->cc_data != NULL)
190*64807b30SHiren Panchasara 		free(ccv->cc_data, M_dctcp);
191*64807b30SHiren Panchasara }
192*64807b30SHiren Panchasara 
193*64807b30SHiren Panchasara static int
194*64807b30SHiren Panchasara dctcp_cb_init(struct cc_var *ccv)
195*64807b30SHiren Panchasara {
196*64807b30SHiren Panchasara 	struct dctcp *dctcp_data;
197*64807b30SHiren Panchasara 
198*64807b30SHiren Panchasara 	dctcp_data = malloc(sizeof(struct dctcp), M_dctcp, M_NOWAIT|M_ZERO);
199*64807b30SHiren Panchasara 
200*64807b30SHiren Panchasara 	if (dctcp_data == NULL)
201*64807b30SHiren Panchasara 		return (ENOMEM);
202*64807b30SHiren Panchasara 
203*64807b30SHiren Panchasara 	/* Initialize some key variables with sensible defaults. */
204*64807b30SHiren Panchasara 	dctcp_data->bytes_ecn = 0;
205*64807b30SHiren Panchasara 	dctcp_data->bytes_total = 0;
206*64807b30SHiren Panchasara 	/*
207*64807b30SHiren Panchasara 	 * When alpha is set to 0 in the beggining, DCTCP sender transfers as
208*64807b30SHiren Panchasara 	 * much data as possible until the value converges which may expand the
209*64807b30SHiren Panchasara 	 * queueing delay at the switch. When alpha is set to 1, queueing delay
210*64807b30SHiren Panchasara 	 * is kept small.
211*64807b30SHiren Panchasara 	 * Throughput-sensitive applications should have alpha = 0
212*64807b30SHiren Panchasara 	 * Latency-sensitive applications should have alpha = 1
213*64807b30SHiren Panchasara 	 *
214*64807b30SHiren Panchasara 	 * Note: DCTCP draft suggests initial alpha to be 1 but we've decided to
215*64807b30SHiren Panchasara 	 * keep it 0 as default.
216*64807b30SHiren Panchasara 	 */
217*64807b30SHiren Panchasara 	dctcp_data->alpha = V_dctcp_alpha;
218*64807b30SHiren Panchasara 	dctcp_data->save_sndnxt = 0;
219*64807b30SHiren Panchasara 	dctcp_data->ce_prev = 0;
220*64807b30SHiren Panchasara 	dctcp_data->ece_curr = 0;
221*64807b30SHiren Panchasara 	dctcp_data->ece_prev = 0;
222*64807b30SHiren Panchasara 	dctcp_data->num_cong_events = 0;
223*64807b30SHiren Panchasara 
224*64807b30SHiren Panchasara 	ccv->cc_data = dctcp_data;
225*64807b30SHiren Panchasara 	return (0);
226*64807b30SHiren Panchasara }
227*64807b30SHiren Panchasara 
228*64807b30SHiren Panchasara /*
229*64807b30SHiren Panchasara  * Perform any necessary tasks before we enter congestion recovery.
230*64807b30SHiren Panchasara  */
231*64807b30SHiren Panchasara static void
232*64807b30SHiren Panchasara dctcp_cong_signal(struct cc_var *ccv, uint32_t type)
233*64807b30SHiren Panchasara {
234*64807b30SHiren Panchasara 	struct dctcp *dctcp_data;
235*64807b30SHiren Panchasara 	u_int win, mss;
236*64807b30SHiren Panchasara 
237*64807b30SHiren Panchasara 	dctcp_data = ccv->cc_data;
238*64807b30SHiren Panchasara 	win = CCV(ccv, snd_cwnd);
239*64807b30SHiren Panchasara 	mss = CCV(ccv, t_maxseg);
240*64807b30SHiren Panchasara 
241*64807b30SHiren Panchasara 	switch (type) {
242*64807b30SHiren Panchasara 	case CC_NDUPACK:
243*64807b30SHiren Panchasara 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
244*64807b30SHiren Panchasara 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
245*64807b30SHiren Panchasara 				CCV(ccv, snd_ssthresh) = mss *
246*64807b30SHiren Panchasara 				    max(win / 2 / mss, 2);
247*64807b30SHiren Panchasara 				dctcp_data->num_cong_events++;
248*64807b30SHiren Panchasara 			} else {
249*64807b30SHiren Panchasara 				/* cwnd has already updated as congestion
250*64807b30SHiren Panchasara 				 * recovery. Reverse cwnd value using
251*64807b30SHiren Panchasara 				 * snd_cwnd_prev and recalculate snd_ssthresh
252*64807b30SHiren Panchasara 				 */
253*64807b30SHiren Panchasara 				win = CCV(ccv, snd_cwnd_prev);
254*64807b30SHiren Panchasara 				CCV(ccv, snd_ssthresh) =
255*64807b30SHiren Panchasara 				    max(win / 2 / mss, 2) * mss;
256*64807b30SHiren Panchasara 			}
257*64807b30SHiren Panchasara 			ENTER_RECOVERY(CCV(ccv, t_flags));
258*64807b30SHiren Panchasara 		}
259*64807b30SHiren Panchasara 		break;
260*64807b30SHiren Panchasara 	case CC_ECN:
261*64807b30SHiren Panchasara 		/*
262*64807b30SHiren Panchasara 		 * Save current snd_cwnd when the host encounters both
263*64807b30SHiren Panchasara 		 * congestion recovery and fast recovery.
264*64807b30SHiren Panchasara 		 */
265*64807b30SHiren Panchasara 		CCV(ccv, snd_cwnd_prev) = win;
266*64807b30SHiren Panchasara 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
267*64807b30SHiren Panchasara 			if (V_dctcp_slowstart &&
268*64807b30SHiren Panchasara 			    dctcp_data->num_cong_events++ == 0) {
269*64807b30SHiren Panchasara 				CCV(ccv, snd_ssthresh) =
270*64807b30SHiren Panchasara 				    mss * max(win / 2 / mss, 2);
271*64807b30SHiren Panchasara 				dctcp_data->alpha = MAX_ALPHA_VALUE;
272*64807b30SHiren Panchasara 				dctcp_data->bytes_ecn = 0;
273*64807b30SHiren Panchasara 				dctcp_data->bytes_total = 0;
274*64807b30SHiren Panchasara 				dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
275*64807b30SHiren Panchasara 			} else
276*64807b30SHiren Panchasara 				CCV(ccv, snd_ssthresh) = max((win - ((win *
277*64807b30SHiren Panchasara 				    dctcp_data->alpha) >> 11)) / mss, 2) * mss;
278*64807b30SHiren Panchasara 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
279*64807b30SHiren Panchasara 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
280*64807b30SHiren Panchasara 		}
281*64807b30SHiren Panchasara 		dctcp_data->ece_curr = 1;
282*64807b30SHiren Panchasara 		break;
283*64807b30SHiren Panchasara 	case CC_RTO:
284*64807b30SHiren Panchasara 		if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
285*64807b30SHiren Panchasara 			CCV(ccv, t_flags) |= TF_ECN_SND_CWR;
286*64807b30SHiren Panchasara 			dctcp_update_alpha(ccv);
287*64807b30SHiren Panchasara 			dctcp_data->save_sndnxt += CCV(ccv, t_maxseg);
288*64807b30SHiren Panchasara 			dctcp_data->num_cong_events++;
289*64807b30SHiren Panchasara 		}
290*64807b30SHiren Panchasara 		break;
291*64807b30SHiren Panchasara 	}
292*64807b30SHiren Panchasara }
293*64807b30SHiren Panchasara 
294*64807b30SHiren Panchasara static void
295*64807b30SHiren Panchasara dctcp_conn_init(struct cc_var *ccv)
296*64807b30SHiren Panchasara {
297*64807b30SHiren Panchasara 	struct dctcp *dctcp_data;
298*64807b30SHiren Panchasara 
299*64807b30SHiren Panchasara 	dctcp_data = ccv->cc_data;
300*64807b30SHiren Panchasara 
301*64807b30SHiren Panchasara 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT)
302*64807b30SHiren Panchasara 		dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
303*64807b30SHiren Panchasara }
304*64807b30SHiren Panchasara 
305*64807b30SHiren Panchasara /*
306*64807b30SHiren Panchasara  * Perform any necessary tasks before we exit congestion recovery.
307*64807b30SHiren Panchasara  */
308*64807b30SHiren Panchasara static void
309*64807b30SHiren Panchasara dctcp_post_recovery(struct cc_var *ccv)
310*64807b30SHiren Panchasara {
311*64807b30SHiren Panchasara 	dctcp_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
312*64807b30SHiren Panchasara 
313*64807b30SHiren Panchasara 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT)
314*64807b30SHiren Panchasara 		dctcp_update_alpha(ccv);
315*64807b30SHiren Panchasara }
316*64807b30SHiren Panchasara 
317*64807b30SHiren Panchasara /*
318*64807b30SHiren Panchasara  * Execute an additional ECN processing using ECN field in IP header and the CWR
319*64807b30SHiren Panchasara  * bit in TCP header.
320*64807b30SHiren Panchasara  *
321*64807b30SHiren Panchasara  * delay_ack == 0 - Delayed ACK disabled
322*64807b30SHiren Panchasara  * delay_ack == 1 - Delayed ACK enabled
323*64807b30SHiren Panchasara  */
324*64807b30SHiren Panchasara 
325*64807b30SHiren Panchasara static void
326*64807b30SHiren Panchasara dctcp_ecnpkt_handler(struct cc_var *ccv)
327*64807b30SHiren Panchasara {
328*64807b30SHiren Panchasara 	struct dctcp *dctcp_data;
329*64807b30SHiren Panchasara 	uint32_t ccflag;
330*64807b30SHiren Panchasara 	int delay_ack;
331*64807b30SHiren Panchasara 
332*64807b30SHiren Panchasara 	dctcp_data = ccv->cc_data;
333*64807b30SHiren Panchasara 	ccflag = ccv->flags;
334*64807b30SHiren Panchasara 	delay_ack = 1;
335*64807b30SHiren Panchasara 
336*64807b30SHiren Panchasara 	/*
337*64807b30SHiren Panchasara 	 * DCTCP responses an ACK immediately when the CE state
338*64807b30SHiren Panchasara 	 * in between this segment and the last segment is not same.
339*64807b30SHiren Panchasara 	 */
340*64807b30SHiren Panchasara 	if (ccflag & CCF_IPHDR_CE) {
341*64807b30SHiren Panchasara 		if (!dctcp_data->ce_prev && (ccflag & CCF_DELACK))
342*64807b30SHiren Panchasara 			delay_ack = 0;
343*64807b30SHiren Panchasara 		dctcp_data->ce_prev = 1;
344*64807b30SHiren Panchasara 		CCV(ccv, t_flags) |= TF_ECN_SND_ECE;
345*64807b30SHiren Panchasara 	} else {
346*64807b30SHiren Panchasara 		if (dctcp_data->ce_prev && (ccflag & CCF_DELACK))
347*64807b30SHiren Panchasara 			delay_ack = 0;
348*64807b30SHiren Panchasara 		dctcp_data->ce_prev = 0;
349*64807b30SHiren Panchasara 		CCV(ccv, t_flags) &= ~TF_ECN_SND_ECE;
350*64807b30SHiren Panchasara 	}
351*64807b30SHiren Panchasara 
352*64807b30SHiren Panchasara 	/* DCTCP sets delayed ack when this segment sets the CWR flag. */
353*64807b30SHiren Panchasara 	if ((ccflag & CCF_DELACK) && (ccflag & CCF_TCPHDR_CWR))
354*64807b30SHiren Panchasara 		delay_ack = 1;
355*64807b30SHiren Panchasara 
356*64807b30SHiren Panchasara 	if (delay_ack == 0)
357*64807b30SHiren Panchasara 		ccv->flags |= CCF_ACKNOW;
358*64807b30SHiren Panchasara 	else
359*64807b30SHiren Panchasara 		ccv->flags &= ~CCF_ACKNOW;
360*64807b30SHiren Panchasara }
361*64807b30SHiren Panchasara 
362*64807b30SHiren Panchasara /*
363*64807b30SHiren Panchasara  * Update the fraction of marked bytes represented as 'alpha'.
364*64807b30SHiren Panchasara  * Also initialize several internal parameters at the end of this function.
365*64807b30SHiren Panchasara  */
366*64807b30SHiren Panchasara static void
367*64807b30SHiren Panchasara dctcp_update_alpha(struct cc_var *ccv)
368*64807b30SHiren Panchasara {
369*64807b30SHiren Panchasara 	struct dctcp *dctcp_data;
370*64807b30SHiren Panchasara 	int alpha_prev;
371*64807b30SHiren Panchasara 
372*64807b30SHiren Panchasara 	dctcp_data = ccv->cc_data;
373*64807b30SHiren Panchasara 	alpha_prev = dctcp_data->alpha;
374*64807b30SHiren Panchasara 	dctcp_data->bytes_total = max(dctcp_data->bytes_total, 1);
375*64807b30SHiren Panchasara 
376*64807b30SHiren Panchasara 	/*
377*64807b30SHiren Panchasara 	 * Update alpha: alpha = (1 - g) * alpha + g * F.
378*64807b30SHiren Panchasara 	 * Here:
379*64807b30SHiren Panchasara 	 * g is weight factor
380*64807b30SHiren Panchasara 	 *	recommaded to be set to 1/16
381*64807b30SHiren Panchasara 	 *	small g = slow convergence between competitive DCTCP flows
382*64807b30SHiren Panchasara 	 *	large g = impacts low utilization of bandwidth at switches
383*64807b30SHiren Panchasara 	 * F is fraction of marked segments in last RTT
384*64807b30SHiren Panchasara 	 *	updated every RTT
385*64807b30SHiren Panchasara 	 * Alpha must be round to 0 - MAX_ALPHA_VALUE.
386*64807b30SHiren Panchasara 	 */
387*64807b30SHiren Panchasara 	dctcp_data->alpha = min(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
388*64807b30SHiren Panchasara 	    (dctcp_data->bytes_ecn << (10 - V_dctcp_shift_g)) /
389*64807b30SHiren Panchasara 	    dctcp_data->bytes_total, MAX_ALPHA_VALUE);
390*64807b30SHiren Panchasara 
391*64807b30SHiren Panchasara 	/* Initialize internal parameters for next alpha calculation */
392*64807b30SHiren Panchasara 	dctcp_data->bytes_ecn = 0;
393*64807b30SHiren Panchasara 	dctcp_data->bytes_total = 0;
394*64807b30SHiren Panchasara 	dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
395*64807b30SHiren Panchasara }
396*64807b30SHiren Panchasara 
397*64807b30SHiren Panchasara static int
398*64807b30SHiren Panchasara dctcp_alpha_handler(SYSCTL_HANDLER_ARGS)
399*64807b30SHiren Panchasara {
400*64807b30SHiren Panchasara 	uint32_t new;
401*64807b30SHiren Panchasara 	int error;
402*64807b30SHiren Panchasara 
403*64807b30SHiren Panchasara 	new = V_dctcp_alpha;
404*64807b30SHiren Panchasara 	error = sysctl_handle_int(oidp, &new, 0, req);
405*64807b30SHiren Panchasara 	if (error == 0 && req->newptr != NULL) {
406*64807b30SHiren Panchasara 		if (CAST_PTR_INT(req->newptr) > 1)
407*64807b30SHiren Panchasara 			error = EINVAL;
408*64807b30SHiren Panchasara 		else {
409*64807b30SHiren Panchasara 			if (new > MAX_ALPHA_VALUE)
410*64807b30SHiren Panchasara 				V_dctcp_alpha = MAX_ALPHA_VALUE;
411*64807b30SHiren Panchasara 			else
412*64807b30SHiren Panchasara 				V_dctcp_alpha = new;
413*64807b30SHiren Panchasara 		}
414*64807b30SHiren Panchasara 	}
415*64807b30SHiren Panchasara 
416*64807b30SHiren Panchasara 	return (error);
417*64807b30SHiren Panchasara }
418*64807b30SHiren Panchasara 
419*64807b30SHiren Panchasara static int
420*64807b30SHiren Panchasara dctcp_shift_g_handler(SYSCTL_HANDLER_ARGS)
421*64807b30SHiren Panchasara {
422*64807b30SHiren Panchasara 	uint32_t new;
423*64807b30SHiren Panchasara 	int error;
424*64807b30SHiren Panchasara 
425*64807b30SHiren Panchasara 	new = V_dctcp_shift_g;
426*64807b30SHiren Panchasara 	error = sysctl_handle_int(oidp, &new, 0, req);
427*64807b30SHiren Panchasara 	if (error == 0 && req->newptr != NULL) {
428*64807b30SHiren Panchasara 		if (CAST_PTR_INT(req->newptr) > 1)
429*64807b30SHiren Panchasara 			error = EINVAL;
430*64807b30SHiren Panchasara 		else
431*64807b30SHiren Panchasara 			V_dctcp_shift_g = new;
432*64807b30SHiren Panchasara 	}
433*64807b30SHiren Panchasara 
434*64807b30SHiren Panchasara 	return (error);
435*64807b30SHiren Panchasara }
436*64807b30SHiren Panchasara 
437*64807b30SHiren Panchasara static int
438*64807b30SHiren Panchasara dctcp_slowstart_handler(SYSCTL_HANDLER_ARGS)
439*64807b30SHiren Panchasara {
440*64807b30SHiren Panchasara 	uint32_t new;
441*64807b30SHiren Panchasara 	int error;
442*64807b30SHiren Panchasara 
443*64807b30SHiren Panchasara 	new = V_dctcp_slowstart;
444*64807b30SHiren Panchasara 	error = sysctl_handle_int(oidp, &new, 0, req);
445*64807b30SHiren Panchasara 	if (error == 0 && req->newptr != NULL) {
446*64807b30SHiren Panchasara 		if (CAST_PTR_INT(req->newptr) > 1)
447*64807b30SHiren Panchasara 			error = EINVAL;
448*64807b30SHiren Panchasara 		else
449*64807b30SHiren Panchasara 			V_dctcp_slowstart = new;
450*64807b30SHiren Panchasara 	}
451*64807b30SHiren Panchasara 
452*64807b30SHiren Panchasara 	return (error);
453*64807b30SHiren Panchasara }
454*64807b30SHiren Panchasara 
455*64807b30SHiren Panchasara SYSCTL_DECL(_net_inet_tcp_cc_dctcp);
456*64807b30SHiren Panchasara SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, dctcp, CTLFLAG_RW, NULL,
457*64807b30SHiren Panchasara     "dctcp congestion control related settings");
458*64807b30SHiren Panchasara 
459*64807b30SHiren Panchasara SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, alpha,
460*64807b30SHiren Panchasara     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_alpha), 0,
461*64807b30SHiren Panchasara     &dctcp_alpha_handler,
462*64807b30SHiren Panchasara     "IU", "dctcp alpha parameter");
463*64807b30SHiren Panchasara 
464*64807b30SHiren Panchasara SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, shift_g,
465*64807b30SHiren Panchasara     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_shift_g), 4,
466*64807b30SHiren Panchasara     &dctcp_shift_g_handler,
467*64807b30SHiren Panchasara     "IU", "dctcp shift parameter");
468*64807b30SHiren Panchasara 
469*64807b30SHiren Panchasara SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, slowstart,
470*64807b30SHiren Panchasara     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_slowstart), 0,
471*64807b30SHiren Panchasara     &dctcp_slowstart_handler,
472*64807b30SHiren Panchasara     "IU", "half CWND reduction after the first slow start");
473*64807b30SHiren Panchasara 
474*64807b30SHiren Panchasara DECLARE_CC_MODULE(dctcp, &dctcp_cc_algo);
475