cc.c (75dfc66c1b2b44609e5a7c3e1d6a751be4922689) cc.c (b8d60729deefa0bd13e6a395fcab4928e6e10445)
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2007-2008
5 * Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * All rights reserved.

--- 36 unchanged lines hidden (view full) ---

45 * Australia, which was made possible in part by a grant from the Cisco
46 * University Research Program Fund at Community Foundation Silicon Valley.
47 * More details are available at:
48 * http://caia.swin.edu.au/urp/newtcp/
49 */
50
51#include <sys/cdefs.h>
52__FBSDID("$FreeBSD$");
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2007-2008
5 * Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * All rights reserved.

--- 36 unchanged lines hidden (view full) ---

45 * Australia, which was made possible in part by a grant from the Cisco
46 * University Research Program Fund at Community Foundation Silicon Valley.
47 * More details are available at:
48 * http://caia.swin.edu.au/urp/newtcp/
49 */
50
51#include <sys/cdefs.h>
52__FBSDID("$FreeBSD$");
53
53#include <opt_cc.h>
54#include <sys/param.h>
55#include <sys/kernel.h>
56#include <sys/libkern.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/module.h>
60#include <sys/mutex.h>
61#include <sys/queue.h>
62#include <sys/rwlock.h>
63#include <sys/sbuf.h>
64#include <sys/socket.h>
65#include <sys/socketvar.h>
66#include <sys/sysctl.h>
67
68#include <net/vnet.h>
69
70#include <netinet/in.h>
71#include <netinet/in_pcb.h>
72#include <netinet/tcp.h>
54#include <sys/param.h>
55#include <sys/kernel.h>
56#include <sys/libkern.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/module.h>
60#include <sys/mutex.h>
61#include <sys/queue.h>
62#include <sys/rwlock.h>
63#include <sys/sbuf.h>
64#include <sys/socket.h>
65#include <sys/socketvar.h>
66#include <sys/sysctl.h>
67
68#include <net/vnet.h>
69
70#include <netinet/in.h>
71#include <netinet/in_pcb.h>
72#include <netinet/tcp.h>
73#include <netinet/tcp_seq.h>
73#include <netinet/tcp_var.h>
74#include <netinet/tcp_var.h>
75#include <netinet/tcp_log_buf.h>
76#include <netinet/tcp_hpts.h>
74#include <netinet/cc/cc.h>
77#include <netinet/cc/cc.h>
75
76#include <netinet/cc/cc_module.h>
77
78#include <netinet/cc/cc_module.h>
79
80MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");
81
78/*
79 * List of available cc algorithms on the current system. First element
80 * is used as the system default CC algorithm.
81 */
82struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
83
84/* Protects the cc_list STAILQ. */
85struct rwlock cc_list_lock;
86
82/*
83 * List of available cc algorithms on the current system. First element
84 * is used as the system default CC algorithm.
85 */
86struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
87
88/* Protects the cc_list STAILQ. */
89struct rwlock cc_list_lock;
90
87VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;
91VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;
88
92
93VNET_DEFINE(uint32_t, newreno_beta) = 50;
94#define V_newreno_beta VNET(newreno_beta)
95
89/*
90 * Sysctl handler to show and change the default CC algorithm.
91 */
92static int
93cc_default_algo(SYSCTL_HANDLER_ARGS)
94{
95 char default_cc[TCP_CA_NAME_MAX];
96 struct cc_algo *funcs;
97 int error;
98
99 /* Get the current default: */
100 CC_LIST_RLOCK();
96/*
97 * Sysctl handler to show and change the default CC algorithm.
98 */
99static int
100cc_default_algo(SYSCTL_HANDLER_ARGS)
101{
102 char default_cc[TCP_CA_NAME_MAX];
103 struct cc_algo *funcs;
104 int error;
105
106 /* Get the current default: */
107 CC_LIST_RLOCK();
101 strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc));
108 if (CC_DEFAULT_ALGO() != NULL)
109 strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc));
110 else
111 memset(default_cc, 0, TCP_CA_NAME_MAX);
102 CC_LIST_RUNLOCK();
103
104 error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
105
106 /* Check for error or no change */
107 if (error != 0 || req->newptr == NULL)
108 goto done;
109
110 error = ESRCH;
112 CC_LIST_RUNLOCK();
113
114 error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
115
116 /* Check for error or no change */
117 if (error != 0 || req->newptr == NULL)
118 goto done;
119
120 error = ESRCH;
111
112 /* Find algo with specified name and set it to default. */
113 CC_LIST_RLOCK();
114 STAILQ_FOREACH(funcs, &cc_list, entries) {
115 if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
116 continue;
117 V_default_cc_ptr = funcs;
118 error = 0;
119 break;

--- 16 unchanged lines hidden (view full) ---

136 err = nalgos = 0;
137 first = 1;
138
139 CC_LIST_RLOCK();
140 STAILQ_FOREACH(algo, &cc_list, entries) {
141 nalgos++;
142 }
143 CC_LIST_RUNLOCK();
121 /* Find algo with specified name and set it to default. */
122 CC_LIST_RLOCK();
123 STAILQ_FOREACH(funcs, &cc_list, entries) {
124 if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
125 continue;
126 V_default_cc_ptr = funcs;
127 error = 0;
128 break;

--- 16 unchanged lines hidden (view full) ---

145 err = nalgos = 0;
146 first = 1;
147
148 CC_LIST_RLOCK();
149 STAILQ_FOREACH(algo, &cc_list, entries) {
150 nalgos++;
151 }
152 CC_LIST_RUNLOCK();
144
153 if (nalgos == 0) {
154 return (ENOENT);
155 }
145 s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
146
147 if (s == NULL)
148 return (ENOMEM);
149
150 /*
151 * It is theoretically possible for the CC list to have grown in size
152 * since the call to sbuf_new() and therefore for the sbuf to be too

--- 18 unchanged lines hidden (view full) ---

171 err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
172 }
173
174 sbuf_delete(s);
175 return (err);
176}
177
178/*
156 s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
157
158 if (s == NULL)
159 return (ENOMEM);
160
161 /*
162 * It is theoretically possible for the CC list to have grown in size
163 * since the call to sbuf_new() and therefore for the sbuf to be too

--- 18 unchanged lines hidden (view full) ---

182 err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
183 }
184
185 sbuf_delete(s);
186 return (err);
187}
188
189/*
179 * Reset the default CC algo to NewReno for any netstack which is using the algo
180 * that is about to go away as its default.
190 * Return the number of vnets whose default CC algorithm has the
191 * same name as remove_cc, the algorithm proposed for removal.
181 */
192 */
182static void
183cc_checkreset_default(struct cc_algo *remove_cc)
193static int
194cc_check_default(struct cc_algo *remove_cc)
184{
195{
196 int cnt = 0;
185 VNET_ITERATOR_DECL(vnet_iter);
186
187 CC_LIST_LOCK_ASSERT();
188
189 VNET_LIST_RLOCK_NOSLEEP();
190 VNET_FOREACH(vnet_iter) {
191 CURVNET_SET(vnet_iter);
197 VNET_ITERATOR_DECL(vnet_iter);
198
199 CC_LIST_LOCK_ASSERT();
200
201 VNET_LIST_RLOCK_NOSLEEP();
202 VNET_FOREACH(vnet_iter) {
203 CURVNET_SET(vnet_iter);
192 if (strncmp(CC_DEFAULT()->name, remove_cc->name,
193 TCP_CA_NAME_MAX) == 0)
194 V_default_cc_ptr = &newreno_cc_algo;
204 if ((CC_DEFAULT_ALGO() != NULL) &&
205 strncmp(CC_DEFAULT_ALGO()->name,
206 remove_cc->name,
207 TCP_CA_NAME_MAX) == 0) {
208 cnt++;
209 }
195 CURVNET_RESTORE();
196 }
197 VNET_LIST_RUNLOCK_NOSLEEP();
210 CURVNET_RESTORE();
211 }
212 VNET_LIST_RUNLOCK_NOSLEEP();
213 return (cnt);
198}
199
200/*
201 * Initialise CC subsystem on system boot.
202 */
203static void
204cc_init(void)
205{

--- 7 unchanged lines hidden (view full) ---

213int
214cc_deregister_algo(struct cc_algo *remove_cc)
215{
216 struct cc_algo *funcs, *tmpfuncs;
217 int err;
218
219 err = ENOENT;
220
214}
215
216/*
217 * Initialise CC subsystem on system boot.
218 */
219static void
220cc_init(void)
221{

--- 7 unchanged lines hidden (view full) ---

229int
230cc_deregister_algo(struct cc_algo *remove_cc)
231{
232 struct cc_algo *funcs, *tmpfuncs;
233 int err;
234
235 err = ENOENT;
236
221 /* Never allow newreno to be deregistered. */
222 if (&newreno_cc_algo == remove_cc)
223 return (EPERM);
224
225 /* Remove algo from cc_list so that new connections can't use it. */
226 CC_LIST_WLOCK();
227 STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
228 if (funcs == remove_cc) {
237 /* Remove algo from cc_list so that new connections can't use it. */
238 CC_LIST_WLOCK();
239 STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
240 if (funcs == remove_cc) {
229 cc_checkreset_default(remove_cc);
230 STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
231 err = 0;
241 if (cc_check_default(remove_cc)) {
242 err = EBUSY;
243 break;
244 }
245 /* Add a temp flag to stop new adds to it */
246 funcs->flags |= CC_MODULE_BEING_REMOVED;
232 break;
233 }
234 }
235 CC_LIST_WUNLOCK();
247 break;
248 }
249 }
250 CC_LIST_WUNLOCK();
236
237 if (!err)
238 /*
239 * XXXLAS:
240 * - We may need to handle non-zero return values in future.
241 * - If we add CC framework support for protocols other than
242 * TCP, we may want a more generic way to handle this step.
243 */
244 tcp_ccalgounload(remove_cc);
245
251 err = tcp_ccalgounload(remove_cc);
252 /*
253 * Now back through and we either remove the temp flag
254 * or pull the registration.
255 */
256 CC_LIST_WLOCK();
257 STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
258 if (funcs == remove_cc) {
259 if (err == 0)
260 STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
261 else
262 funcs->flags &= ~CC_MODULE_BEING_REMOVED;
263 break;
264 }
265 }
266 CC_LIST_WUNLOCK();
246 return (err);
247}
248
249/*
250 * Returns 0 on success, non-zero on failure.
251 */
252int
253cc_register_algo(struct cc_algo *add_cc)

--- 4 unchanged lines hidden (view full) ---

258 err = 0;
259
260 /*
261 * Iterate over list of registered CC algorithms and make sure
262 * we're not trying to add a duplicate.
263 */
264 CC_LIST_WLOCK();
265 STAILQ_FOREACH(funcs, &cc_list, entries) {
267 return (err);
268}
269
270/*
271 * Returns 0 on success, non-zero on failure.
272 */
273int
274cc_register_algo(struct cc_algo *add_cc)

--- 4 unchanged lines hidden (view full) ---

279 err = 0;
280
281 /*
282 * Iterate over list of registered CC algorithms and make sure
283 * we're not trying to add a duplicate.
284 */
285 CC_LIST_WLOCK();
286 STAILQ_FOREACH(funcs, &cc_list, entries) {
266 if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
267 TCP_CA_NAME_MAX) == 0)
287 if (funcs == add_cc ||
288 strncmp(funcs->name, add_cc->name,
289 TCP_CA_NAME_MAX) == 0) {
268 err = EEXIST;
290 err = EEXIST;
291 break;
292 }
269 }
293 }
270
271 if (!err)
294 /*
295 * The first loaded congestion control module will become
296 * the default until we find the "CC_DEFAULT" defined in
297 * the config (if we do).
298 */
299 if (!err) {
272 STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
300 STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
273
301 if (strcmp(add_cc->name, CC_DEFAULT) == 0) {
302 V_default_cc_ptr = add_cc;
303 } else if (V_default_cc_ptr == NULL) {
304 V_default_cc_ptr = add_cc;
305 }
306 }
274 CC_LIST_WUNLOCK();
275
276 return (err);
277}
278
279/*
307 CC_LIST_WUNLOCK();
308
309 return (err);
310}
311
312/*
313 * Perform any necessary tasks before we exit congestion recovery.
314 */
315void
316newreno_cc_post_recovery(struct cc_var *ccv)
317{
318 int pipe;
319
320 if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
321 /*
322 * Fast recovery will conclude after returning from this
323 * function. Window inflation should have left us with
324 * approximately snd_ssthresh outstanding data. But in case we
325 * would be inclined to send a burst, better to do it via the
326 * slow start mechanism.
327 *
328 * XXXLAS: Find a way to do this without needing curack
329 */
330 if (V_tcp_do_newsack)
331 pipe = tcp_compute_pipe(ccv->ccvc.tcp);
332 else
333 pipe = CCV(ccv, snd_max) - ccv->curack;
334 if (pipe < CCV(ccv, snd_ssthresh))
335 /*
336 * Ensure that cwnd does not collapse to 1 MSS under
337 * adverse conditons. Implements RFC6582
338 */
339 CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
340 CCV(ccv, t_maxseg);
341 else
342 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
343 }
344}
345
346void
347newreno_cc_after_idle(struct cc_var *ccv)
348{
349 uint32_t rw;
350 /*
351 * If we've been idle for more than one retransmit timeout the old
352 * congestion window is no longer current and we have to reduce it to
353 * the restart window before we can transmit again.
354 *
355 * The restart window is the initial window or the last CWND, whichever
356 * is smaller.
357 *
358 * This is done to prevent us from flooding the path with a full CWND at
359 * wirespeed, overloading router and switch buffers along the way.
360 *
361 * See RFC5681 Section 4.1. "Restarting Idle Connections".
362 *
363 * In addition, per RFC2861 Section 2, the ssthresh is set to the
364 * maximum of the former ssthresh or 3/4 of the old cwnd, to
365 * not exit slow-start prematurely.
366 */
367 rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp));
368
369 CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
370 CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2));
371
372 CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
373}
374
375/*
376 * Perform any necessary tasks before we enter congestion recovery.
377 */
378void
379newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type)
380{
381 uint32_t cwin, factor;
382 u_int mss;
383
384 cwin = CCV(ccv, snd_cwnd);
385 mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
386 /*
387 * Other TCP congestion controls use newreno_cong_signal(), but
388 * with their own private cc_data. Make sure the cc_data is used
389 * correctly.
390 */
391 factor = V_newreno_beta;
392
393 /* Catch algos which mistakenly leak private signal types. */
394 KASSERT((type & CC_SIGPRIVMASK) == 0,
395 ("%s: congestion signal type 0x%08x is private\n", __func__, type));
396
397 cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss),
398 2) * mss;
399
400 switch (type) {
401 case CC_NDUPACK:
402 if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
403 if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
404 CCV(ccv, snd_ssthresh) = cwin;
405 ENTER_RECOVERY(CCV(ccv, t_flags));
406 }
407 break;
408 case CC_ECN:
409 if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
410 CCV(ccv, snd_ssthresh) = cwin;
411 CCV(ccv, snd_cwnd) = cwin;
412 ENTER_CONGRECOVERY(CCV(ccv, t_flags));
413 }
414 break;
415 case CC_RTO:
416 CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd),
417 CCV(ccv, snd_cwnd)) / 2 / mss,
418 2) * mss;
419 CCV(ccv, snd_cwnd) = mss;
420 break;
421 }
422}
423
424void
425newreno_cc_ack_received(struct cc_var *ccv, uint16_t type)
426{
427 if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
428 (ccv->flags & CCF_CWND_LIMITED)) {
429 u_int cw = CCV(ccv, snd_cwnd);
430 u_int incr = CCV(ccv, t_maxseg);
431
432 /*
433 * Regular in-order ACK, open the congestion window.
434 * Method depends on which congestion control state we're
435 * in (slow start or cong avoid) and if ABC (RFC 3465) is
436 * enabled.
437 *
438 * slow start: cwnd <= ssthresh
439 * cong avoid: cwnd > ssthresh
440 *
441 * slow start and ABC (RFC 3465):
442 * Grow cwnd exponentially by the amount of data
443 * ACKed capping the max increment per ACK to
444 * (abc_l_var * maxseg) bytes.
445 *
446 * slow start without ABC (RFC 5681):
447 * Grow cwnd exponentially by maxseg per ACK.
448 *
449 * cong avoid and ABC (RFC 3465):
450 * Grow cwnd linearly by maxseg per RTT for each
451 * cwnd worth of ACKed data.
452 *
453 * cong avoid without ABC (RFC 5681):
454 * Grow cwnd linearly by approximately maxseg per RTT using
455 * maxseg^2 / cwnd per ACK as the increment.
456 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
457 * avoid capping cwnd.
458 */
459 if (cw > CCV(ccv, snd_ssthresh)) {
460 if (V_tcp_do_rfc3465) {
461 if (ccv->flags & CCF_ABC_SENTAWND)
462 ccv->flags &= ~CCF_ABC_SENTAWND;
463 else
464 incr = 0;
465 } else
466 incr = max((incr * incr / cw), 1);
467 } else if (V_tcp_do_rfc3465) {
468 /*
469 * In slow-start with ABC enabled and no RTO in sight?
470 * (Must not use abc_l_var > 1 if slow starting after
471 * an RTO. On RTO, snd_nxt = snd_una, so the
472 * snd_nxt == snd_max check is sufficient to
473 * handle this).
474 *
475 * XXXLAS: Find a way to signal SS after RTO that
476 * doesn't rely on tcpcb vars.
477 */
478 uint16_t abc_val;
479
480 if (ccv->flags & CCF_USE_LOCAL_ABC)
481 abc_val = ccv->labc;
482 else
483 abc_val = V_tcp_abc_l_var;
484 if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
485 incr = min(ccv->bytes_this_ack,
486 ccv->nsegs * abc_val *
487 CCV(ccv, t_maxseg));
488 else
489 incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
490
491 }
492 /* ABC is on by default, so incr equals 0 frequently. */
493 if (incr > 0)
494 CCV(ccv, snd_cwnd) = min(cw + incr,
495 TCP_MAXWIN << CCV(ccv, snd_scale));
496 }
497}
498
499/*
280 * Handles kld related events. Returns 0 on success, non-zero on failure.
281 */
282int
283cc_modevent(module_t mod, int event_type, void *data)
284{
285 struct cc_algo *algo;
286 int err;
287
288 err = 0;
289 algo = (struct cc_algo *)data;
290
291 switch(event_type) {
292 case MOD_LOAD:
500 * Handles kld related events. Returns 0 on success, non-zero on failure.
501 */
502int
503cc_modevent(module_t mod, int event_type, void *data)
504{
505 struct cc_algo *algo;
506 int err;
507
508 err = 0;
509 algo = (struct cc_algo *)data;
510
511 switch(event_type) {
512 case MOD_LOAD:
513 if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) {
514 /*
515 * A module must have a cc_data_sz function
516 * even if it has no data it should return 0.
517 */
518 printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n");
519 err = EINVAL;
520 break;
521 }
293 if (algo->mod_init != NULL)
294 err = algo->mod_init();
295 if (!err)
296 err = cc_register_algo(algo);
297 break;
298
299 case MOD_QUIESCE:
300 case MOD_SHUTDOWN:

--- 42 unchanged lines hidden ---
522 if (algo->mod_init != NULL)
523 err = algo->mod_init();
524 if (!err)
525 err = cc_register_algo(algo);
526 break;
527
528 case MOD_QUIESCE:
529 case MOD_SHUTDOWN:

--- 42 unchanged lines hidden ---